Diffstat (limited to 'drivers/infiniband/core')
74 files changed, 19670 insertions, 12785 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 69dee36e0e89..f483e0c12444 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -6,24 +6,23 @@ obj-$(CONFIG_INFINIBAND) += ib_core.o ib_cm.o iw_cm.o \ $(infiniband-y) obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o $(user_access-y) -obj-$(CONFIG_INFINIBAND_USER_ACCESS_UCM) += ib_ucm.o $(user_access-y) ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ - device.o fmr_pool.o cache.o netlink.o \ + device.o cache.o netlink.o \ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ multicast.o mad.o smi.o agent.o mad_rmpp.o \ - nldev.o restrack.o + nldev.o restrack.o counters.o ib_core_uverbs.o \ + trace.o lag.o ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o -ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o -ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o -ib_cm-y := cm.o +ib_cm-y := cm.o cm_trace.o iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o -rdma_cm-y := cma.o +CFLAGS_cma_trace.o += -I$(src) +rdma_cm-y := cma.o cma_trace.o rdma_cm-$(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) += cma_configfs.o @@ -31,11 +30,17 @@ rdma_ucm-y := ucma.o ib_umad-y := user_mad.o -ib_ucm-y := ucm.o - ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ uverbs_std_types_cq.o \ + uverbs_std_types_dmah.o \ uverbs_std_types_flow_action.o uverbs_std_types_dm.o \ uverbs_std_types_mr.o uverbs_std_types_counters.o \ - uverbs_uapi.o uverbs_std_types_device.o + uverbs_uapi.o uverbs_std_types_device.o \ + uverbs_std_types_async_fd.o \ + uverbs_std_types_srq.o \ + uverbs_std_types_wq.o \ + uverbs_std_types_qp.o \ + ucaps.o +ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o +ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 0dce94e3c495..61596cda2b65 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -37,14 +37,14 @@ #include <linux/inetdevice.h> #include <linux/slab.h> #include <linux/workqueue.h> -#include <linux/module.h> #include <net/arp.h> #include <net/neighbour.h> #include <net/route.h> #include <net/netevent.h> -#include <net/addrconf.h> +#include <net/ipv6_stubs.h> #include <net/ip6_route.h> #include <rdma/ib_addr.h> +#include <rdma/ib_cache.h> #include <rdma/ib_sa.h> #include <rdma/ib.h> #include <rdma/rdma_netlink.h> @@ -75,7 +75,9 @@ static struct workqueue_struct *addr_wq; static const struct nla_policy ib_nl_addr_policy[LS_NLA_TYPE_MAX] = { [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY, - .len = sizeof(struct rdma_nla_ls_gid)}, + .len = sizeof(struct rdma_nla_ls_gid), + .validation_type = NLA_VALIDATE_MIN, + .min = sizeof(struct rdma_nla_ls_gid)}, }; static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh) @@ -86,8 +88,8 @@ static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh) if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR) return false; - ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), - nlmsg_len(nlh), ib_nl_addr_policy, NULL); + ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), + nlmsg_len(nlh), ib_nl_addr_policy, NULL); if (ret) return false; @@ -138,7 +140,7 @@ int ib_nl_handle_ip_res_resp(struct sk_buff *skb, if (ib_nl_is_good_ip_resp(nlh)) ib_nl_process_good_ip_rsep(nlh); - return skb->len; + return 0; } static int ib_nl_ip_send_msg(struct 
rdma_dev_addr *dev_addr, @@ -182,7 +184,7 @@ static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr, /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, GFP_KERNEL); + rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_LS, GFP_KERNEL); /* Make the request retry, so when we get the response from userspace * we will have something. @@ -336,7 +338,7 @@ static int dst_fetch_ha(const struct dst_entry *dst, neigh_event_send(n, NULL); ret = -ENODATA; } else { - memcpy(dev_addr->dst_dev_addr, n->ha, MAX_ADDR_LEN); + neigh_ha_snapshot(dev_addr->dst_dev_addr, n, dst->dev); } neigh_release(n); @@ -346,16 +348,10 @@ static int dst_fetch_ha(const struct dst_entry *dst, static bool has_gateway(const struct dst_entry *dst, sa_family_t family) { - struct rtable *rt; - struct rt6_info *rt6; - - if (family == AF_INET) { - rt = container_of(dst, struct rtable, dst); - return rt->rt_uses_gateway; - } + if (family == AF_INET) + return dst_rtable(dst)->rt_uses_gateway; - rt6 = container_of(dst, struct rt6_info, dst); - return rt6->rt6i_flags & RTF_GATEWAY; + return dst_rt6_info(dst)->rt6i_flags & RTF_GATEWAY; } static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, @@ -370,6 +366,8 @@ static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, (const void *)&dst_in6->sin6_addr; sa_family_t family = dst_in->sa_family; + might_sleep(); + /* If we have a gateway in IB mode then it must be an IB network */ if (has_gateway(dst, family) && dev_addr->network == RDMA_NETWORK_IB) return ib_nl_fetch_ha(dev_addr, daddr, seq, family); @@ -420,16 +418,15 @@ static int addr6_resolve(struct sockaddr *src_sock, (const struct sockaddr_in6 *)dst_sock; struct flowi6 fl6; struct dst_entry *dst; - int ret; memset(&fl6, 0, sizeof fl6); fl6.daddr = dst_in->sin6_addr; fl6.saddr = src_in->sin6_addr; fl6.flowi6_oif = addr->bound_dev_if; - ret = ipv6_stub->ipv6_dst_lookup(addr->net, NULL, &dst, &fl6); - if (ret < 0) - return ret; + dst = ipv6_stub->ipv6_dst_lookup_flow(addr->net, NULL, &fl6, NULL); + if (IS_ERR(dst)) + return PTR_ERR(dst); if (ipv6_addr_any(&src_in->sin6_addr)) src_in->sin6_addr = fl6.saddr; @@ -449,63 +446,41 @@ static int addr6_resolve(struct sockaddr *src_sock, } #endif +static bool is_dst_local(const struct dst_entry *dst) +{ + if (dst->ops->family == AF_INET) + return !!(dst_rtable(dst)->rt_type & RTN_LOCAL); + else if (dst->ops->family == AF_INET6) + return !!(dst_rt6_info(dst)->rt6i_flags & RTF_LOCAL); + else + return false; +} + static int addr_resolve_neigh(const struct dst_entry *dst, const struct sockaddr *dst_in, struct rdma_dev_addr *addr, - unsigned int ndev_flags, u32 seq) { - int ret = 0; - - if (ndev_flags & IFF_LOOPBACK) { + if (is_dst_local(dst)) { + /* When the destination is local entry, source and destination + * are same. Skip the neighbour lookup. 
+ */ memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); - } else { - if (!(ndev_flags & IFF_NOARP)) { - /* If the device doesn't do ARP internally */ - ret = fetch_ha(dst, addr, dst_in, seq); - } + return 0; } - return ret; -} - -static int copy_src_l2_addr(struct rdma_dev_addr *dev_addr, - const struct sockaddr *dst_in, - const struct dst_entry *dst, - const struct net_device *ndev) -{ - int ret = 0; - - if (dst->dev->flags & IFF_LOOPBACK) - ret = rdma_translate_ip(dst_in, dev_addr); - else - rdma_copy_src_l2_addr(dev_addr, dst->dev); - - /* - * If there's a gateway and type of device not ARPHRD_INFINIBAND, - * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the - * network type accordingly. - */ - if (has_gateway(dst, dst_in->sa_family) && - ndev->type != ARPHRD_INFINIBAND) - dev_addr->network = dst_in->sa_family == AF_INET ? - RDMA_NETWORK_IPV4 : - RDMA_NETWORK_IPV6; - else - dev_addr->network = RDMA_NETWORK_IB; - return ret; + return fetch_ha(dst, addr, dst_in, seq); } static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr, - unsigned int *ndev_flags, const struct sockaddr *dst_in, const struct dst_entry *dst) { struct net_device *ndev = READ_ONCE(dst->dev); - *ndev_flags = ndev->flags; /* A physical device must be the RDMA device to use */ - if (ndev->flags & IFF_LOOPBACK) { + if (is_dst_local(dst)) { + int ret; /* * RDMA (IB/RoCE, iWarp) doesn't run on lo interface or * loopback IP address. So if route is resolved to loopback @@ -515,9 +490,27 @@ static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr, ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev), dst_in); if (IS_ERR(ndev)) return -ENODEV; + ret = rdma_translate_ip(dst_in, dev_addr); + if (ret) + return ret; + } else { + rdma_copy_src_l2_addr(dev_addr, dst->dev); } - return copy_src_l2_addr(dev_addr, dst_in, dst, ndev); + /* + * If there's a gateway and type of device not ARPHRD_INFINIBAND, + * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the + * network type accordingly. + */ + if (has_gateway(dst, dst_in->sa_family) && + ndev->type != ARPHRD_INFINIBAND) + dev_addr->network = dst_in->sa_family == AF_INET ? + RDMA_NETWORK_IPV4 : + RDMA_NETWORK_IPV6; + else + dev_addr->network = RDMA_NETWORK_IB; + + return 0; } static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr) @@ -554,7 +547,6 @@ static int addr_resolve(struct sockaddr *src_in, u32 seq) { struct dst_entry *dst = NULL; - unsigned int ndev_flags = 0; struct rtable *rt = NULL; int ret; @@ -591,7 +583,7 @@ static int addr_resolve(struct sockaddr *src_in, rcu_read_unlock(); goto done; } - ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst); + ret = rdma_set_src_addr_rcu(addr, dst_in, dst); rcu_read_unlock(); /* @@ -599,7 +591,7 @@ static int addr_resolve(struct sockaddr *src_in, * only if src addr translation didn't fail. */ if (!ret && resolve_neigh) - ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq); + ret = addr_resolve_neigh(dst, dst_in, addr, seq); if (src_in->sa_family == AF_INET) ip_rt_put(rt); @@ -645,13 +637,12 @@ static void process_one_req(struct work_struct *_work) req->callback = NULL; spin_lock_bh(&lock); + /* + * Although the work will normally have been canceled by the workqueue, + * it can still be requeued as long as it is on the req_list. + */ + cancel_delayed_work(&req->work); if (!list_empty(&req->list)) { - /* - * Although the work will normally have been canceled by the - * workqueue, it can still be requeued as long as it is on the - * req_list. 
- */ - cancel_delayed_work(&req->work); list_del_init(&req->list); kfree(req); } @@ -727,11 +718,13 @@ int roce_resolve_route_from_path(struct sa_path_rec *rec, struct rdma_dev_addr dev_addr = {}; int ret; + might_sleep(); + if (rec->roce.route_resolved) return 0; - rdma_gid2ip(&sgid._sockaddr, &rec->sgid); - rdma_gid2ip(&dgid._sockaddr, &rec->dgid); + rdma_gid2ip((struct sockaddr *)&sgid, &rec->sgid); + rdma_gid2ip((struct sockaddr *)&dgid, &rec->dgid); if (sgid._sockaddr.sa_family != dgid._sockaddr.sa_family) return -EINVAL; @@ -742,7 +735,7 @@ int roce_resolve_route_from_path(struct sa_path_rec *rec, dev_addr.net = &init_net; dev_addr.sgid_attr = attr; - ret = addr_resolve(&sgid._sockaddr, &dgid._sockaddr, + ret = addr_resolve((struct sockaddr *)&sgid, (struct sockaddr *)&dgid, &dev_addr, false, true, 0); if (ret) return ret; @@ -814,22 +807,22 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, struct rdma_dev_addr dev_addr; struct resolve_cb_context ctx; union { - struct sockaddr _sockaddr; struct sockaddr_in _sockaddr_in; struct sockaddr_in6 _sockaddr_in6; } sgid_addr, dgid_addr; int ret; - rdma_gid2ip(&sgid_addr._sockaddr, sgid); - rdma_gid2ip(&dgid_addr._sockaddr, dgid); + rdma_gid2ip((struct sockaddr *)&sgid_addr, sgid); + rdma_gid2ip((struct sockaddr *)&dgid_addr, dgid); memset(&dev_addr, 0, sizeof(dev_addr)); dev_addr.net = &init_net; dev_addr.sgid_attr = sgid_attr; init_completion(&ctx.comp); - ret = rdma_resolve_ip(&sgid_addr._sockaddr, &dgid_addr._sockaddr, - &dev_addr, 1000, resolve_cb, true, &ctx); + ret = rdma_resolve_ip((struct sockaddr *)&sgid_addr, + (struct sockaddr *)&dgid_addr, &dev_addr, 1000, + resolve_cb, true, &ctx); if (ret) return ret; diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index f82b4260de42..25a060a28301 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -59,7 +59,16 @@ __ib_get_agent_port(const struct ib_device *device, int port_num) struct ib_agent_port_private *entry; list_for_each_entry(entry, &ib_agent_port_list, port_list) { - if (entry->agent[1]->device == device && + /* Need to check both agent[0] and agent[1], as an agent port + * may only have one of them + */ + if (entry->agent[0] && + entry->agent[0]->device == device && + entry->agent[0]->port_num == port_num) + return entry; + + if (entry->agent[1] && + entry->agent[1]->device == device && entry->agent[1]->port_num == port_num) return entry; } @@ -101,8 +110,7 @@ void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh * agent = port_priv->agent[qpn]; ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num); if (IS_ERR(ah)) { - dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n", - PTR_ERR(ah)); + dev_err(&device->dev, "ib_create_ah_from_wc error %pe\n", ah); return; } @@ -172,14 +180,16 @@ int ib_agent_port_open(struct ib_device *device, int port_num) } } - /* Obtain send only MAD agent for GSI QP */ - port_priv->agent[1] = ib_register_mad_agent(device, port_num, - IB_QPT_GSI, NULL, 0, - &agent_send_handler, - NULL, NULL, 0); - if (IS_ERR(port_priv->agent[1])) { - ret = PTR_ERR(port_priv->agent[1]); - goto error3; + if (rdma_cap_ib_cm(device, port_num)) { + /* Obtain send only MAD agent for GSI QP */ + port_priv->agent[1] = ib_register_mad_agent(device, port_num, + IB_QPT_GSI, NULL, 0, + &agent_send_handler, + NULL, NULL, 0); + if (IS_ERR(port_priv->agent[1])) { + ret = PTR_ERR(port_priv->agent[1]); + goto error3; + } } spin_lock_irqsave(&ib_agent_port_list_lock, flags); @@ -212,7 
+222,8 @@ int ib_agent_port_close(struct ib_device *device, int port_num) list_del(&port_priv->port_list); spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); - ib_unregister_mad_agent(port_priv->agent[1]); + if (port_priv->agent[1]) + ib_unregister_mad_agent(port_priv->agent[1]); if (port_priv->agent[0]) ib_unregister_mad_agent(port_priv->agent[0]); diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 7b04590f307f..81cf3c902e81 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -33,7 +33,7 @@ * SOFTWARE. */ -#include <linux/module.h> +#include <linux/if_vlan.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/workqueue.h> @@ -46,14 +46,13 @@ struct ib_pkey_cache { int table_len; - u16 table[0]; + u16 table[] __counted_by(table_len); }; struct ib_update_work { struct work_struct work; - struct ib_device *device; - u8 port_num; - bool enforce_security; + struct ib_event event; + bool enforce_security; }; union ib_gid zgid; @@ -78,11 +77,22 @@ enum gid_table_entry_state { GID_TABLE_ENTRY_PENDING_DEL = 3, }; +struct roce_gid_ndev_storage { + struct rcu_head rcu_head; + struct net_device *ndev; +}; + struct ib_gid_table_entry { struct kref kref; struct work_struct del_work; struct ib_gid_attr attr; void *context; + /* Store the ndev pointer to release reference later on in + * call_rcu context because by that time gid_table_entry + * and attr might be already freed. So keep a copy of it. + * ndev_storage is freed by rcu callback. + */ + struct roce_gid_ndev_storage *ndev_storage; enum gid_table_entry_state state; }; @@ -111,7 +121,7 @@ struct ib_gid_table { u32 default_gid_indices; }; -static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) +static void dispatch_gid_change_event(struct ib_device *ib_dev, u32 port) { struct ib_event event; @@ -119,11 +129,15 @@ static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) event.element.port_num = port; event.event = IB_EVENT_GID_CHANGE; - ib_dispatch_event(&event); + ib_dispatch_event_clients(&event); } static const char * const gid_type_str[] = { + /* IB/RoCE v1 value is set for IB_GID_TYPE_IB and IB_GID_TYPE_ROCE for + * user space compatibility reasons. + */ [IB_GID_TYPE_IB] = "IB/RoCE v1", + [IB_GID_TYPE_ROCE] = "IB/RoCE v1", [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2", }; @@ -183,9 +197,9 @@ int ib_cache_gid_parse_type_str(const char *buf) } EXPORT_SYMBOL(ib_cache_gid_parse_type_str); -static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u8 port) +static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u32 port) { - return device->cache.ports[port - rdma_start_port(device)].gid; + return device->port_data[port].cache.gid; } static bool is_gid_entry_free(const struct ib_gid_table_entry *entry) @@ -206,13 +220,27 @@ static void schedule_free_gid(struct kref *kref) queue_work(ib_wq, &entry->del_work); } +static void put_gid_ndev(struct rcu_head *head) +{ + struct roce_gid_ndev_storage *storage = + container_of(head, struct roce_gid_ndev_storage, rcu_head); + + WARN_ON(!storage->ndev); + /* At this point its safe to release netdev reference, + * as all callers working on gid_attr->ndev are done + * using this netdev. 
+ */ + dev_put(storage->ndev); + kfree(storage); +} + static void free_gid_entry_locked(struct ib_gid_table_entry *entry) { struct ib_device *device = entry->attr.device; - u8 port_num = entry->attr.port_num; + u32 port_num = entry->attr.port_num; struct ib_gid_table *table = rdma_gid_table(device, port_num); - dev_dbg(&device->dev, "%s port=%d index=%d gid %pI6\n", __func__, + dev_dbg(&device->dev, "%s port=%u index=%u gid %pI6\n", __func__, port_num, entry->attr.index, entry->attr.gid.raw); write_lock_irq(&table->rwlock); @@ -228,8 +256,8 @@ static void free_gid_entry_locked(struct ib_gid_table_entry *entry) /* Now this index is ready to be allocated */ write_unlock_irq(&table->rwlock); - if (entry->attr.ndev) - dev_put(entry->attr.ndev); + if (entry->ndev_storage) + call_rcu(&entry->ndev_storage->rcu_head, put_gid_ndev); kfree(entry); } @@ -254,7 +282,7 @@ static void free_gid_work(struct work_struct *work) struct ib_gid_table_entry *entry = container_of(work, struct ib_gid_table_entry, del_work); struct ib_device *device = entry->attr.device; - u8 port_num = entry->attr.port_num; + u32 port_num = entry->attr.port_num; struct ib_gid_table *table = rdma_gid_table(device, port_num); mutex_lock(&table->lock); @@ -266,14 +294,25 @@ static struct ib_gid_table_entry * alloc_gid_entry(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry; + struct net_device *ndev; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return NULL; + + ndev = rcu_dereference_protected(attr->ndev, 1); + if (ndev) { + entry->ndev_storage = kzalloc(sizeof(*entry->ndev_storage), + GFP_KERNEL); + if (!entry->ndev_storage) { + kfree(entry); + return NULL; + } + dev_hold(ndev); + entry->ndev_storage->ndev = ndev; + } kref_init(&entry->kref); memcpy(&entry->attr, attr, sizeof(*attr)); - if (entry->attr.ndev) - dev_hold(entry->attr.ndev); INIT_WORK(&entry->del_work, free_gid_work); entry->state = GID_TABLE_ENTRY_INVALID; return entry; @@ -284,7 +323,7 @@ static void store_gid_entry(struct ib_gid_table *table, { entry->state = GID_TABLE_ENTRY_VALID; - dev_dbg(&entry->attr.device->dev, "%s port=%d index=%d gid %pI6\n", + dev_dbg(&entry->attr.device->dev, "%s port=%u index=%u gid %pI6\n", __func__, entry->attr.port_num, entry->attr.index, entry->attr.gid.raw); @@ -315,7 +354,7 @@ static int add_roce_gid(struct ib_gid_table_entry *entry) int ret; if (!attr->ndev) { - dev_err(&attr->device->dev, "%s NULL netdev port=%d index=%d\n", + dev_err(&attr->device->dev, "%s NULL netdev port=%u index=%u\n", __func__, attr->port_num, attr->index); return -EINVAL; } @@ -323,7 +362,7 @@ static int add_roce_gid(struct ib_gid_table_entry *entry) ret = attr->device->ops.add_gid(attr, &entry->context); if (ret) { dev_err(&attr->device->dev, - "%s GID add failed port=%d index=%d\n", + "%s GID add failed port=%u index=%u\n", __func__, attr->port_num, attr->index); return ret; } @@ -340,14 +379,15 @@ static int add_roce_gid(struct ib_gid_table_entry *entry) * @ix: GID entry index to delete * */ -static void del_gid(struct ib_device *ib_dev, u8 port, +static void del_gid(struct ib_device *ib_dev, u32 port, struct ib_gid_table *table, int ix) { + struct roce_gid_ndev_storage *ndev_storage; struct ib_gid_table_entry *entry; lockdep_assert_held(&table->lock); - dev_dbg(&ib_dev->dev, "%s port=%d index=%d gid %pI6\n", __func__, port, + dev_dbg(&ib_dev->dev, "%s port=%u index=%d gid %pI6\n", __func__, port, ix, table->data_vec[ix]->attr.gid.raw); write_lock_irq(&table->rwlock); @@ -363,6 +403,13 @@ static void del_gid(struct ib_device 
*ib_dev, u8 port, if (rdma_cap_roce_gid_table(ib_dev, port)) ib_dev->ops.del_gid(&entry->attr, &entry->context); + ndev_storage = entry->ndev_storage; + if (ndev_storage) { + entry->ndev_storage = NULL; + rcu_assign_pointer(entry->attr.ndev, NULL); + call_rcu(&ndev_storage->rcu_head, put_gid_ndev); + } + put_gid_entry_locked(entry); } @@ -496,7 +543,7 @@ static void make_default_gid(struct net_device *dev, union ib_gid *gid) addrconf_ifid_eui48(&gid->raw[8], dev); } -static int __ib_cache_gid_add(struct ib_device *ib_dev, u8 port, +static int __ib_cache_gid_add(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr, unsigned long mask, bool default_gid) { @@ -535,44 +582,23 @@ static int __ib_cache_gid_add(struct ib_device *ib_dev, u8 port, out_unlock: mutex_unlock(&table->lock); if (ret) - pr_warn("%s: unable to add gid %pI6 error=%d\n", - __func__, gid->raw, ret); + pr_warn_ratelimited("%s: unable to add gid %pI6 error=%d\n", + __func__, gid->raw, ret); return ret; } -int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, +int ib_cache_gid_add(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr) { - struct net_device *idev; - unsigned long mask; - int ret; - - if (ib_dev->ops.get_netdev) { - idev = ib_dev->ops.get_netdev(ib_dev, port); - if (idev && attr->ndev != idev) { - union ib_gid default_gid; - - /* Adding default GIDs in not permitted */ - make_default_gid(idev, &default_gid); - if (!memcmp(gid, &default_gid, sizeof(*gid))) { - dev_put(idev); - return -EPERM; - } - } - if (idev) - dev_put(idev); - } - - mask = GID_ATTR_FIND_MASK_GID | - GID_ATTR_FIND_MASK_GID_TYPE | - GID_ATTR_FIND_MASK_NETDEV; + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_NETDEV; - ret = __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false); - return ret; + return __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false); } static int -_ib_cache_gid_del(struct ib_device *ib_dev, u8 port, +_ib_cache_gid_del(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr, unsigned long mask, bool default_gid) { @@ -601,7 +627,7 @@ out_unlock: return ret; } -int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, +int ib_cache_gid_del(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr) { unsigned long mask = GID_ATTR_FIND_MASK_GID | @@ -612,7 +638,7 @@ int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, return _ib_cache_gid_del(ib_dev, port, gid, attr, mask, false); } -int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, +int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct ib_gid_table *table; @@ -643,11 +669,10 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, * rdma_find_gid_by_port - Returns the GID entry attributes when it finds * a valid GID entry for given search parameters. It searches for the specified * GID value in the local software cache. - * @device: The device to query. + * @ib_dev: The device to query. * @gid: The GID value to search for. * @gid_type: The GID type to search for. - * @port_num: The port number of the device where the GID value should be - * searched. + * @port: The port number of the device where the GID value should be searched. * @ndev: In RoCE, the net device of the device. NULL means ignore. 
* * Returns sgid attributes if the GID is found with valid reference or @@ -658,7 +683,7 @@ const struct ib_gid_attr * rdma_find_gid_by_port(struct ib_device *ib_dev, const union ib_gid *gid, enum ib_gid_type gid_type, - u8 port, struct net_device *ndev) + u32 port, struct net_device *ndev) { int local_index; struct ib_gid_table *table; @@ -693,7 +718,7 @@ EXPORT_SYMBOL(rdma_find_gid_by_port); /** * rdma_find_gid_by_filter - Returns the GID table attribute where a * specified GID value occurs - * @device: The device to query. + * @ib_dev: The device to query. * @gid: The GID value to search for. * @port: The port number of the device where the GID value could be * searched. @@ -702,13 +727,14 @@ EXPORT_SYMBOL(rdma_find_gid_by_port); * otherwise, we continue searching the GID table. It's guaranteed that * while filter is executed, ndev field is valid and the structure won't * change. filter is executed in an atomic context. filter must not be NULL. + * @context: Private data to pass into the call-back. * * rdma_find_gid_by_filter() searches for the specified GID value * of which the filter function returns true in the port's GID table. * */ const struct ib_gid_attr *rdma_find_gid_by_filter( - struct ib_device *ib_dev, const union ib_gid *gid, u8 port, + struct ib_device *ib_dev, const union ib_gid *gid, u32 port, bool (*filter)(const union ib_gid *gid, const struct ib_gid_attr *, void *), void *context) @@ -765,10 +791,9 @@ err_free_table: return NULL; } -static void release_gid_table(struct ib_device *device, u8 port, +static void release_gid_table(struct ib_device *device, struct ib_gid_table *table) { - bool leak = false; int i; if (!table) @@ -777,43 +802,35 @@ static void release_gid_table(struct ib_device *device, u8 port, for (i = 0; i < table->sz; i++) { if (is_gid_entry_free(table->data_vec[i])) continue; - if (kref_read(&table->data_vec[i]->kref) > 1) { - dev_err(&device->dev, - "GID entry ref leak for index %d ref=%d\n", i, - kref_read(&table->data_vec[i]->kref)); - leak = true; - } + + WARN_ONCE(true, + "GID entry ref leak for dev %s index %d ref=%u\n", + dev_name(&device->dev), i, + kref_read(&table->data_vec[i]->kref)); } - if (leak) - return; + mutex_destroy(&table->lock); kfree(table->data_vec); kfree(table); } -static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, +static void cleanup_gid_table_port(struct ib_device *ib_dev, u32 port, struct ib_gid_table *table) { int i; - bool deleted = false; if (!table) return; mutex_lock(&table->lock); for (i = 0; i < table->sz; ++i) { - if (is_gid_entry_valid(table->data_vec[i])) { + if (is_gid_entry_valid(table->data_vec[i])) del_gid(ib_dev, port, table, i); - deleted = true; - } } mutex_unlock(&table->lock); - - if (deleted) - dispatch_gid_change_event(ib_dev, port); } -void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, +void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u32 port, struct net_device *ndev, unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode) @@ -846,7 +863,7 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, } } -static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port, +static void gid_table_reserve_default(struct ib_device *ib_dev, u32 port, struct ib_gid_table *table) { unsigned int i; @@ -863,31 +880,27 @@ static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port, static void gid_table_release_one(struct ib_device *ib_dev) { - struct ib_gid_table *table; - u8 port; + u32 p; - for (port = 0; port < 
ib_dev->phys_port_cnt; port++) { - table = ib_dev->cache.ports[port].gid; - release_gid_table(ib_dev, port, table); - ib_dev->cache.ports[port].gid = NULL; + rdma_for_each_port (ib_dev, p) { + release_gid_table(ib_dev, ib_dev->port_data[p].cache.gid); + ib_dev->port_data[p].cache.gid = NULL; } } static int _gid_table_setup_one(struct ib_device *ib_dev) { - u8 port; struct ib_gid_table *table; + u32 rdma_port; - for (port = 0; port < ib_dev->phys_port_cnt; port++) { - u8 rdma_port = port + rdma_start_port(ib_dev); - - table = alloc_gid_table( - ib_dev->port_immutable[rdma_port].gid_tbl_len); + rdma_for_each_port (ib_dev, rdma_port) { + table = alloc_gid_table( + ib_dev->port_data[rdma_port].immutable.gid_tbl_len); if (!table) goto rollback_table_setup; gid_table_reserve_default(ib_dev, rdma_port, table); - ib_dev->cache.ports[port].gid = table; + ib_dev->port_data[rdma_port].cache.gid = table; } return 0; @@ -898,14 +911,11 @@ rollback_table_setup: static void gid_table_cleanup_one(struct ib_device *ib_dev) { - struct ib_gid_table *table; - u8 port; + u32 p; - for (port = 0; port < ib_dev->phys_port_cnt; port++) { - table = ib_dev->cache.ports[port].gid; - cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), - table); - } + rdma_for_each_port (ib_dev, p) + cleanup_gid_table_port(ib_dev, p, + ib_dev->port_data[p].cache.gid); } static int gid_table_setup_one(struct ib_device *ib_dev) @@ -936,12 +946,12 @@ static int gid_table_setup_one(struct ib_device *ib_dev) * Returns 0 on success or appropriate error code. * */ -int rdma_query_gid(struct ib_device *device, u8 port_num, +int rdma_query_gid(struct ib_device *device, u32 port_num, int index, union ib_gid *gid) { struct ib_gid_table *table; unsigned long flags; - int res = -EINVAL; + int res; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; @@ -949,9 +959,15 @@ int rdma_query_gid(struct ib_device *device, u8 port_num, table = rdma_gid_table(device, port_num); read_lock_irqsave(&table->rwlock, flags); - if (index < 0 || index >= table->sz || - !is_gid_entry_valid(table->data_vec[index])) + if (index < 0 || index >= table->sz) { + res = -EINVAL; goto done; + } + + if (!is_gid_entry_valid(table->data_vec[index])) { + res = -ENOENT; + goto done; + } memcpy(gid, &table->data_vec[index]->attr.gid, sizeof(*gid)); res = 0; @@ -963,6 +979,23 @@ done: EXPORT_SYMBOL(rdma_query_gid); /** + * rdma_read_gid_hw_context - Read the HW GID context from GID attribute + * @attr: Potinter to the GID attribute + * + * rdma_read_gid_hw_context() reads the drivers GID HW context corresponding + * to the SGID attr. Callers are required to already be holding the reference + * to an existing GID entry. + * + * Returns the HW GID context + * + */ +void *rdma_read_gid_hw_context(const struct ib_gid_attr *attr) +{ + return container_of(attr, struct ib_gid_table_entry, attr)->context; +} +EXPORT_SYMBOL(rdma_read_gid_hw_context); + +/** * rdma_find_gid - Returns SGID attributes if the matching GID is found. * @device: The device to query. * @gid: The GID value to search for. 
@@ -983,17 +1016,17 @@ const struct ib_gid_attr *rdma_find_gid(struct ib_device *device, unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE; struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; - u8 p; + u32 p; if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; - for (p = 0; p < device->phys_port_cnt; p++) { + rdma_for_each_port(device, p) { struct ib_gid_table *table; unsigned long flags; int index; - table = device->cache.ports[p].gid; + table = device->port_data[p].cache.gid; read_lock_irqsave(&table->rwlock, flags); index = find_gid(table, gid, &gid_attr_val, false, mask, NULL); if (index >= 0) { @@ -1012,7 +1045,7 @@ const struct ib_gid_attr *rdma_find_gid(struct ib_device *device, EXPORT_SYMBOL(rdma_find_gid); int ib_get_cached_pkey(struct ib_device *device, - u8 port_num, + u32 port_num, int index, u16 *pkey) { @@ -1023,44 +1056,34 @@ int ib_get_cached_pkey(struct ib_device *device, if (!rdma_is_port_valid(device, port_num)) return -EINVAL; - read_lock_irqsave(&device->cache.lock, flags); + read_lock_irqsave(&device->cache_lock, flags); - cache = device->cache.ports[port_num - rdma_start_port(device)].pkey; + cache = device->port_data[port_num].cache.pkey; - if (index < 0 || index >= cache->table_len) + if (!cache || index < 0 || index >= cache->table_len) ret = -EINVAL; else *pkey = cache->table[index]; - read_unlock_irqrestore(&device->cache.lock, flags); + read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_cached_pkey); -int ib_get_cached_subnet_prefix(struct ib_device *device, - u8 port_num, - u64 *sn_pfx) +void ib_get_cached_subnet_prefix(struct ib_device *device, u32 port_num, + u64 *sn_pfx) { unsigned long flags; - int p; - if (!rdma_is_port_valid(device, port_num)) - return -EINVAL; - - p = port_num - rdma_start_port(device); - read_lock_irqsave(&device->cache.lock, flags); - *sn_pfx = device->cache.ports[p].subnet_prefix; - read_unlock_irqrestore(&device->cache.lock, flags); - - return 0; + read_lock_irqsave(&device->cache_lock, flags); + *sn_pfx = device->port_data[port_num].cache.subnet_prefix; + read_unlock_irqrestore(&device->cache_lock, flags); } EXPORT_SYMBOL(ib_get_cached_subnet_prefix); -int ib_find_cached_pkey(struct ib_device *device, - u8 port_num, - u16 pkey, - u16 *index) +int ib_find_cached_pkey(struct ib_device *device, u32 port_num, + u16 pkey, u16 *index) { struct ib_pkey_cache *cache; unsigned long flags; @@ -1071,9 +1094,13 @@ int ib_find_cached_pkey(struct ib_device *device, if (!rdma_is_port_valid(device, port_num)) return -EINVAL; - read_lock_irqsave(&device->cache.lock, flags); + read_lock_irqsave(&device->cache_lock, flags); - cache = device->cache.ports[port_num - rdma_start_port(device)].pkey; + cache = device->port_data[port_num].cache.pkey; + if (!cache) { + ret = -EINVAL; + goto err; + } *index = -1; @@ -1083,8 +1110,9 @@ int ib_find_cached_pkey(struct ib_device *device, *index = i; ret = 0; break; - } else + } else { partial_ix = i; + } } if (ret && partial_ix >= 0) { @@ -1092,47 +1120,14 @@ int ib_find_cached_pkey(struct ib_device *device, ret = 0; } - read_unlock_irqrestore(&device->cache.lock, flags); +err: + read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_find_cached_pkey); -int ib_find_exact_cached_pkey(struct ib_device *device, - u8 port_num, - u16 pkey, - u16 *index) -{ - struct ib_pkey_cache *cache; - unsigned long flags; - int i; - int ret = -ENOENT; - - if (!rdma_is_port_valid(device, port_num)) - return -EINVAL; - - 
read_lock_irqsave(&device->cache.lock, flags); - - cache = device->cache.ports[port_num - rdma_start_port(device)].pkey; - - *index = -1; - - for (i = 0; i < cache->table_len; ++i) - if (cache->table[i] == pkey) { - *index = i; - ret = 0; - break; - } - - read_unlock_irqrestore(&device->cache.lock, flags); - - return ret; -} -EXPORT_SYMBOL(ib_find_exact_cached_pkey); - -int ib_get_cached_lmc(struct ib_device *device, - u8 port_num, - u8 *lmc) +int ib_get_cached_lmc(struct ib_device *device, u32 port_num, u8 *lmc) { unsigned long flags; int ret = 0; @@ -1140,16 +1135,15 @@ int ib_get_cached_lmc(struct ib_device *device, if (!rdma_is_port_valid(device, port_num)) return -EINVAL; - read_lock_irqsave(&device->cache.lock, flags); - *lmc = device->cache.ports[port_num - rdma_start_port(device)].lmc; - read_unlock_irqrestore(&device->cache.lock, flags); + read_lock_irqsave(&device->cache_lock, flags); + *lmc = device->port_data[port_num].cache.lmc; + read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_cached_lmc); -int ib_get_cached_port_state(struct ib_device *device, - u8 port_num, +int ib_get_cached_port_state(struct ib_device *device, u32 port_num, enum ib_port_state *port_state) { unsigned long flags; @@ -1158,10 +1152,9 @@ int ib_get_cached_port_state(struct ib_device *device, if (!rdma_is_port_valid(device, port_num)) return -EINVAL; - read_lock_irqsave(&device->cache.lock, flags); - *port_state = device->cache.ports[port_num - - rdma_start_port(device)].port_state; - read_unlock_irqrestore(&device->cache.lock, flags); + read_lock_irqsave(&device->cache_lock, flags); + *port_state = device->port_data[port_num].cache.port_state; + read_unlock_irqrestore(&device->cache_lock, flags); return ret; } @@ -1184,9 +1177,9 @@ EXPORT_SYMBOL(ib_get_cached_port_state); * code. */ const struct ib_gid_attr * -rdma_get_gid_attr(struct ib_device *device, u8 port_num, int index) +rdma_get_gid_attr(struct ib_device *device, u32 port_num, int index) { - const struct ib_gid_attr *attr = ERR_PTR(-EINVAL); + const struct ib_gid_attr *attr = ERR_PTR(-ENODATA); struct ib_gid_table *table; unsigned long flags; @@ -1210,6 +1203,63 @@ done: EXPORT_SYMBOL(rdma_get_gid_attr); /** + * rdma_query_gid_table - Reads GID table entries of all the ports of a device up to max_entries. + * @device: The device to query. + * @entries: Entries where GID entries are returned. + * @max_entries: Maximum number of entries that can be returned. + * Entries array must be allocated to hold max_entries number of entries. + * + * Returns number of entries on success or appropriate error code. 
+ */ +ssize_t rdma_query_gid_table(struct ib_device *device, + struct ib_uverbs_gid_entry *entries, + size_t max_entries) +{ + const struct ib_gid_attr *gid_attr; + ssize_t num_entries = 0, ret; + struct ib_gid_table *table; + u32 port_num, i; + struct net_device *ndev; + unsigned long flags; + + rdma_for_each_port(device, port_num) { + table = rdma_gid_table(device, port_num); + read_lock_irqsave(&table->rwlock, flags); + for (i = 0; i < table->sz; i++) { + if (!is_gid_entry_valid(table->data_vec[i])) + continue; + if (num_entries >= max_entries) { + ret = -EINVAL; + goto err; + } + + gid_attr = &table->data_vec[i]->attr; + + memcpy(&entries->gid, &gid_attr->gid, + sizeof(gid_attr->gid)); + entries->gid_index = gid_attr->index; + entries->port_num = gid_attr->port_num; + entries->gid_type = gid_attr->gid_type; + ndev = rcu_dereference_protected( + gid_attr->ndev, + lockdep_is_held(&table->rwlock)); + if (ndev) + entries->netdev_ifindex = ndev->ifindex; + + num_entries++; + entries++; + } + read_unlock_irqrestore(&table->rwlock, flags); + } + + return num_entries; +err: + read_unlock_irqrestore(&table->rwlock, flags); + return ret; +} +EXPORT_SYMBOL(rdma_query_gid_table); + +/** * rdma_put_gid_attr - Release reference to the GID attribute * @attr: Pointer to the GID attribute whose reference * needs to be released. @@ -1265,8 +1315,8 @@ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) struct ib_gid_table_entry *entry = container_of(attr, struct ib_gid_table_entry, attr); struct ib_device *device = entry->attr.device; - struct net_device *ndev = ERR_PTR(-ENODEV); - u8 port_num = entry->attr.port_num; + struct net_device *ndev = ERR_PTR(-EINVAL); + u32 port_num = entry->attr.port_num; struct ib_gid_table *table; unsigned long flags; bool valid; @@ -1275,14 +1325,78 @@ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) read_lock_irqsave(&table->rwlock, flags); valid = is_gid_entry_valid(table->data_vec[attr->index]); - if (valid && attr->ndev && (READ_ONCE(attr->ndev->flags) & IFF_UP)) - ndev = attr->ndev; + if (valid) { + ndev = rcu_dereference(attr->ndev); + if (!ndev) + ndev = ERR_PTR(-ENODEV); + } read_unlock_irqrestore(&table->rwlock, flags); return ndev; } +EXPORT_SYMBOL(rdma_read_gid_attr_ndev_rcu); + +static int get_lower_dev_vlan(struct net_device *lower_dev, + struct netdev_nested_priv *priv) +{ + u16 *vlan_id = (u16 *)priv->data; + + if (is_vlan_dev(lower_dev)) + *vlan_id = vlan_dev_vlan_id(lower_dev); + + /* We are interested only in first level vlan device, so + * always return 1 to stop iterating over next level devices. + */ + return 1; +} + +/** + * rdma_read_gid_l2_fields - Read the vlan ID and source MAC address + * of a GID entry. + * + * @attr: GID attribute pointer whose L2 fields to be read + * @vlan_id: Pointer to vlan id to fill up if the GID entry has + * vlan id. It is optional. + * @smac: Pointer to smac to fill up for a GID entry. It is optional. + * + * rdma_read_gid_l2_fields() returns 0 on success and returns vlan id + * (if gid entry has vlan) and source MAC, or returns error. 
+ */ +int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr, + u16 *vlan_id, u8 *smac) +{ + struct netdev_nested_priv priv = { + .data = (void *)vlan_id, + }; + struct net_device *ndev; + + rcu_read_lock(); + ndev = rcu_dereference(attr->ndev); + if (!ndev) { + rcu_read_unlock(); + return -ENODEV; + } + if (smac) + ether_addr_copy(smac, ndev->dev_addr); + if (vlan_id) { + *vlan_id = 0xffff; + if (is_vlan_dev(ndev)) { + *vlan_id = vlan_dev_vlan_id(ndev); + } else { + /* If the netdev is upper device and if it's lower + * device is vlan device, consider vlan id of + * the lower vlan device for this gid entry. + */ + netdev_walk_all_lower_dev_rcu(attr->ndev, + get_lower_dev_vlan, &priv); + } + } + rcu_read_unlock(); + return 0; +} +EXPORT_SYMBOL(rdma_read_gid_l2_fields); static int config_non_roce_gid_cache(struct ib_device *device, - u8 port, int gid_tbl_len) + u32 port, struct ib_port_attr *tprops) { struct ib_gid_attr gid_attr = {}; struct ib_gid_table *table; @@ -1294,7 +1408,7 @@ static int config_non_roce_gid_cache(struct ib_device *device, table = rdma_gid_table(device, port); mutex_lock(&table->lock); - for (i = 0; i < gid_tbl_len; ++i) { + for (i = 0; i < tprops->gid_tbl_len; ++i) { if (!device->ops.query_gid) continue; ret = device->ops.query_gid(device, port, i, &gid_attr.gid); @@ -1304,7 +1418,20 @@ static int config_non_roce_gid_cache(struct ib_device *device, i); goto err; } + + if (rdma_protocol_iwarp(device, port)) { + struct net_device *ndev; + + ndev = ib_device_get_netdev(device, port); + if (!ndev) + continue; + RCU_INIT_POINTER(gid_attr.ndev, ndev); + dev_put(ndev); + } + gid_attr.index = i; + tprops->subnet_prefix = + be64_to_cpu(gid_attr.gid.global.subnet_prefix); add_modify_gid(table, &gid_attr); } err: @@ -1312,21 +1439,22 @@ err: return ret; } -static void ib_cache_update(struct ib_device *device, - u8 port, - bool enforce_security) +static int +ib_cache_update(struct ib_device *device, u32 port, bool update_gids, + bool update_pkeys, bool enforce_security) { struct ib_port_attr *tprops = NULL; - struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache; + struct ib_pkey_cache *pkey_cache = NULL; + struct ib_pkey_cache *old_pkey_cache = NULL; int i; int ret; if (!rdma_is_port_valid(device, port)) - return; + return -EINVAL; tprops = kmalloc(sizeof *tprops, GFP_KERNEL); if (!tprops) - return; + return -ENOMEM; ret = ib_query_port(device, port, tprops); if (ret) { @@ -1334,44 +1462,55 @@ static void ib_cache_update(struct ib_device *device, goto err; } - if (!rdma_protocol_roce(device, port)) { + if (!rdma_protocol_roce(device, port) && update_gids) { ret = config_non_roce_gid_cache(device, port, - tprops->gid_tbl_len); + tprops); if (ret) goto err; } - pkey_cache = kmalloc(struct_size(pkey_cache, table, - tprops->pkey_tbl_len), - GFP_KERNEL); - if (!pkey_cache) - goto err; - - pkey_cache->table_len = tprops->pkey_tbl_len; + update_pkeys &= !!tprops->pkey_tbl_len; - for (i = 0; i < pkey_cache->table_len; ++i) { - ret = ib_query_pkey(device, port, i, pkey_cache->table + i); - if (ret) { - dev_warn(&device->dev, - "ib_query_pkey failed (%d) for index %d\n", - ret, i); + if (update_pkeys) { + pkey_cache = kmalloc(struct_size(pkey_cache, table, + tprops->pkey_tbl_len), + GFP_KERNEL); + if (!pkey_cache) { + ret = -ENOMEM; goto err; } + + pkey_cache->table_len = tprops->pkey_tbl_len; + + for (i = 0; i < pkey_cache->table_len; ++i) { + ret = ib_query_pkey(device, port, i, + pkey_cache->table + i); + if (ret) { + dev_warn(&device->dev, + "ib_query_pkey failed (%d) for 
index %d\n", + ret, i); + goto err; + } + } } - write_lock_irq(&device->cache.lock); + write_lock_irq(&device->cache_lock); + + if (update_pkeys) { + old_pkey_cache = device->port_data[port].cache.pkey; + device->port_data[port].cache.pkey = pkey_cache; + } + device->port_data[port].cache.lmc = tprops->lmc; - old_pkey_cache = device->cache.ports[port - - rdma_start_port(device)].pkey; + if (device->port_data[port].cache.port_state != IB_PORT_NOP && + device->port_data[port].cache.port_state != tprops->state) + ibdev_info(device, "Port: %d Link %s\n", port, + ib_port_state_to_str(tprops->state)); - device->cache.ports[port - rdma_start_port(device)].pkey = pkey_cache; - device->cache.ports[port - rdma_start_port(device)].lmc = tprops->lmc; - device->cache.ports[port - rdma_start_port(device)].port_state = - tprops->state; + device->port_data[port].cache.port_state = tprops->state; - device->cache.ports[port - rdma_start_port(device)].subnet_prefix = - tprops->subnet_prefix; - write_unlock_irq(&device->cache.lock); + device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix; + write_unlock_irq(&device->cache_lock); if (enforce_security) ib_security_cache_change(device, @@ -1380,85 +1519,110 @@ static void ib_cache_update(struct ib_device *device, kfree(old_pkey_cache); kfree(tprops); - return; + return 0; err: kfree(pkey_cache); kfree(tprops); + return ret; +} + +static void ib_cache_event_task(struct work_struct *_work) +{ + struct ib_update_work *work = + container_of(_work, struct ib_update_work, work); + int ret; + + /* Before distributing the cache update event, first sync + * the cache. + */ + ret = ib_cache_update(work->event.device, work->event.element.port_num, + work->event.event == IB_EVENT_GID_CHANGE, + work->event.event == IB_EVENT_PKEY_CHANGE, + work->enforce_security); + + /* GID event is notified already for individual GID entries by + * dispatch_gid_change_event(). Hence, notifiy for rest of the + * events. + */ + if (!ret && work->event.event != IB_EVENT_GID_CHANGE) + ib_dispatch_event_clients(&work->event); + + kfree(work); } -static void ib_cache_task(struct work_struct *_work) +static void ib_generic_event_task(struct work_struct *_work) { struct ib_update_work *work = container_of(_work, struct ib_update_work, work); - ib_cache_update(work->device, - work->port_num, - work->enforce_security); + ib_dispatch_event_clients(&work->event); kfree(work); } -static void ib_cache_event(struct ib_event_handler *handler, - struct ib_event *event) +static bool is_cache_update_event(const struct ib_event *event) +{ + return (event->event == IB_EVENT_PORT_ERR || + event->event == IB_EVENT_PORT_ACTIVE || + event->event == IB_EVENT_LID_CHANGE || + event->event == IB_EVENT_PKEY_CHANGE || + event->event == IB_EVENT_CLIENT_REREGISTER || + event->event == IB_EVENT_GID_CHANGE); +} + +/** + * ib_dispatch_event - Dispatch an asynchronous event + * @event:Event to dispatch + * + * Low-level drivers must call ib_dispatch_event() to dispatch the + * event to all registered event handlers when an asynchronous event + * occurs. 
+ */ +void ib_dispatch_event(const struct ib_event *event) { struct ib_update_work *work; - if (event->event == IB_EVENT_PORT_ERR || - event->event == IB_EVENT_PORT_ACTIVE || - event->event == IB_EVENT_LID_CHANGE || - event->event == IB_EVENT_PKEY_CHANGE || - event->event == IB_EVENT_SM_CHANGE || - event->event == IB_EVENT_CLIENT_REREGISTER || - event->event == IB_EVENT_GID_CHANGE) { - work = kmalloc(sizeof *work, GFP_ATOMIC); - if (work) { - INIT_WORK(&work->work, ib_cache_task); - work->device = event->device; - work->port_num = event->element.port_num; - if (event->event == IB_EVENT_PKEY_CHANGE || - event->event == IB_EVENT_GID_CHANGE) - work->enforce_security = true; - else - work->enforce_security = false; - - queue_work(ib_wq, &work->work); - } - } + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return; + + if (is_cache_update_event(event)) + INIT_WORK(&work->work, ib_cache_event_task); + else + INIT_WORK(&work->work, ib_generic_event_task); + + work->event = *event; + if (event->event == IB_EVENT_PKEY_CHANGE || + event->event == IB_EVENT_GID_CHANGE) + work->enforce_security = true; + + queue_work(ib_wq, &work->work); } +EXPORT_SYMBOL(ib_dispatch_event); int ib_cache_setup_one(struct ib_device *device) { - int p; + u32 p; int err; - rwlock_init(&device->cache.lock); - - device->cache.ports = - kcalloc(rdma_end_port(device) - rdma_start_port(device) + 1, - sizeof(*device->cache.ports), - GFP_KERNEL); - if (!device->cache.ports) - return -ENOMEM; - err = gid_table_setup_one(device); - if (err) { - kfree(device->cache.ports); - device->cache.ports = NULL; + if (err) return err; - } - for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) - ib_cache_update(device, p + rdma_start_port(device), true); + rdma_for_each_port (device, p) { + err = ib_cache_update(device, p, true, true, true); + if (err) { + gid_table_cleanup_one(device); + return err; + } + } - INIT_IB_EVENT_HANDLER(&device->cache.event_handler, - device, ib_cache_event); - ib_register_event_handler(&device->cache.event_handler); return 0; } void ib_cache_release_one(struct ib_device *device) { - int p; + u32 p; /* * The release function frees all the cache elements. @@ -1466,23 +1630,20 @@ void ib_cache_release_one(struct ib_device *device) * all the device's resources when the cache could no * longer be accessed. */ - for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) - kfree(device->cache.ports[p].pkey); + rdma_for_each_port (device, p) + kfree(device->port_data[p].cache.pkey); gid_table_release_one(device); - kfree(device->cache.ports); } void ib_cache_cleanup_one(struct ib_device *device) { - /* The cleanup function unregisters the event handler, - * waits for all in-progress workqueue elements and cleans - * up the GID cache. This function should be called after - * the device was removed from the devices list and all - * clients were removed, so the cache exists but is + /* The cleanup function waits for all in-progress workqueue + * elements and cleans up the GID cache. This function should be + * called after the device was removed from the devices list and + * all clients were removed, so the cache exists but is * non-functional and shouldn't be updated anymore. 
*/ - ib_unregister_event_handler(&device->cache.event_handler); flush_workqueue(ib_wq); gid_table_cleanup_one(device); diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c index 126ac5f99db7..1f037fe01450 100644 --- a/drivers/infiniband/core/cgroup.c +++ b/drivers/infiniband/core/cgroup.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include "core_priv.h" @@ -21,12 +13,11 @@ * Register with the rdma cgroup. Should be called before * exposing rdma device to user space applications to avoid * resource accounting leak. - * Returns 0 on success or otherwise failure code. */ -int ib_device_register_rdmacg(struct ib_device *device) +void ib_device_register_rdmacg(struct ib_device *device) { device->cg_device.name = device->name; - return rdmacg_register_device(&device->cg_device); + rdmacg_register_device(&device->cg_device); } /** diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 37980c7564c0..024df6ee239d 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1,36 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2004-2007 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. 
*/ #include <linux/completion.h> @@ -51,12 +25,18 @@ #include <rdma/ib_cache.h> #include <rdma/ib_cm.h> +#include <rdma/ib_sysfs.h> #include "cm_msgs.h" +#include "core_priv.h" +#include "cm_trace.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("InfiniBand CM"); MODULE_LICENSE("Dual BSD/GPL"); +#define CM_DIRECT_RETRY_CTX ((void *) 1UL) +#define CM_MRA_SETTING 24 /* 4.096us * 2^24 = ~68.7 seconds */ + static const char * const ibcm_rej_reason_strs[] = { [IB_CM_REJ_NO_QP] = "no QP", [IB_CM_REJ_NO_EEC] = "no EEC", @@ -91,6 +71,8 @@ static const char * const ibcm_rej_reason_strs[] = { [IB_CM_REJ_INVALID_CLASS_VERSION] = "invalid class version", [IB_CM_REJ_INVALID_FLOW_LABEL] = "invalid flow label", [IB_CM_REJ_INVALID_ALT_FLOW_LABEL] = "invalid alt flow label", + [IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED] = + "vendor option is not supported", }; const char *__attribute_const__ ibcm_reject_msg(int reason) @@ -105,8 +87,21 @@ const char *__attribute_const__ ibcm_reject_msg(int reason) } EXPORT_SYMBOL(ibcm_reject_msg); -static void cm_add_one(struct ib_device *device); +struct cm_id_private; +struct cm_work; +static int cm_add_one(struct ib_device *device); static void cm_remove_one(struct ib_device *device, void *client_data); +static void cm_process_work(struct cm_id_private *cm_id_priv, + struct cm_work *work); +static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv, + struct ib_cm_sidr_rep_param *param); +static void cm_issue_dreq(struct cm_id_private *cm_id_priv); +static int cm_send_drep_locked(struct cm_id_private *cm_id_priv, + void *private_data, u8 private_data_len); +static int cm_send_rej_locked(struct cm_id_private *cm_id_priv, + enum ib_cm_rej_reason reason, void *ari, + u8 ari_length, const void *private_data, + u8 private_data_len); static struct ib_client cm_client = { .name = "cm", @@ -124,12 +119,11 @@ static struct ib_cm { struct rb_root remote_qp_table; struct rb_root remote_id_table; struct rb_root remote_sidr_table; - struct idr local_id_table; + struct xarray local_id_table; + u32 local_id_next; __be32 random_id_operand; struct list_head timewait_list; struct workqueue_struct *wq; - /* Sync on cm change port state */ - spinlock_t state_lock; } cm; /* Counter indexes ordered by attribute ID */ @@ -157,78 +151,34 @@ enum { CM_COUNTER_GROUPS }; -static char const counter_group_names[CM_COUNTER_GROUPS] - [sizeof("cm_rx_duplicates")] = { - "cm_tx_msgs", "cm_tx_retries", - "cm_rx_msgs", "cm_rx_duplicates" -}; - -struct cm_counter_group { - struct kobject obj; - atomic_long_t counter[CM_ATTR_COUNT]; -}; - struct cm_counter_attribute { - struct attribute attr; - int index; -}; - -#define CM_COUNTER_ATTR(_name, _index) \ -struct cm_counter_attribute cm_##_name##_counter_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .index = _index \ -} - -static CM_COUNTER_ATTR(req, CM_REQ_COUNTER); -static CM_COUNTER_ATTR(mra, CM_MRA_COUNTER); -static CM_COUNTER_ATTR(rej, CM_REJ_COUNTER); -static CM_COUNTER_ATTR(rep, CM_REP_COUNTER); -static CM_COUNTER_ATTR(rtu, CM_RTU_COUNTER); -static CM_COUNTER_ATTR(dreq, CM_DREQ_COUNTER); -static CM_COUNTER_ATTR(drep, CM_DREP_COUNTER); -static CM_COUNTER_ATTR(sidr_req, CM_SIDR_REQ_COUNTER); -static CM_COUNTER_ATTR(sidr_rep, CM_SIDR_REP_COUNTER); -static CM_COUNTER_ATTR(lap, CM_LAP_COUNTER); -static CM_COUNTER_ATTR(apr, CM_APR_COUNTER); - -static struct attribute *cm_counter_default_attrs[] = { - &cm_req_counter_attr.attr, - &cm_mra_counter_attr.attr, - &cm_rej_counter_attr.attr, - &cm_rep_counter_attr.attr, - 
&cm_rtu_counter_attr.attr, - &cm_dreq_counter_attr.attr, - &cm_drep_counter_attr.attr, - &cm_sidr_req_counter_attr.attr, - &cm_sidr_rep_counter_attr.attr, - &cm_lap_counter_attr.attr, - &cm_apr_counter_attr.attr, - NULL + struct ib_port_attribute attr; + unsigned short group; + unsigned short index; }; struct cm_port { struct cm_device *cm_dev; struct ib_mad_agent *mad_agent; - struct kobject port_obj; - u8 port_num; - struct list_head cm_priv_prim_list; - struct list_head cm_priv_altr_list; - struct cm_counter_group counter_group[CM_COUNTER_GROUPS]; + struct ib_mad_agent *rep_agent; + u32 port_num; + atomic_long_t counters[CM_COUNTER_GROUPS][CM_ATTR_COUNT]; }; struct cm_device { + struct kref kref; struct list_head list; + rwlock_t mad_agent_lock; struct ib_device *ib_device; - struct device *device; u8 ack_delay; int going_down; - struct cm_port *port[0]; + struct cm_port *port[]; }; struct cm_av { struct cm_port *port; - union ib_gid dgid; struct rdma_ah_attr ah_attr; + u16 dlid_datapath; u16 pkey_index; u8 timeout; }; @@ -241,11 +191,11 @@ struct cm_work { __be32 local_id; /* Established / timewait */ __be32 remote_id; struct ib_cm_event cm_event; - struct sa_path_rec path[0]; + struct sa_path_rec path[]; }; struct cm_timewait_info { - struct cm_work work; /* Must be first. */ + struct cm_work work; struct list_head list; struct rb_node remote_qp_node; struct rb_node remote_id_node; @@ -260,12 +210,15 @@ struct cm_id_private { struct rb_node service_node; struct rb_node sidr_id_node; + u32 sidr_slid; spinlock_t lock; /* Do not acquire inside cm.lock */ struct completion comp; - atomic_t refcount; + refcount_t refcount; /* Number of clients sharing this ib_cm_id. Only valid for listeners. - * Protected by the cm.lock spinlock. */ + * Protected by the cm.lock spinlock. 
+ */ int listen_sharecount; + struct rcu_head rcu; struct ib_mad_send_buf *msg; struct cm_timewait_info *timewait_info; @@ -285,101 +238,155 @@ struct cm_id_private { __be16 pkey; u8 private_data_len; u8 max_cm_retries; - u8 peer_to_peer; u8 responder_resources; u8 initiator_depth; u8 retry_count; u8 rnr_retry_count; - u8 service_timeout; u8 target_ack_delay; - struct list_head prim_list; - struct list_head altr_list; - /* Indicates that the send port mad is registered and av is set */ - int prim_send_port_not_ready; - int altr_send_port_not_ready; - struct list_head work_list; atomic_t work_count; + + struct rdma_ucm_ece ece; }; +static void cm_dev_release(struct kref *kref) +{ + struct cm_device *cm_dev = container_of(kref, struct cm_device, kref); + u32 i; + + rdma_for_each_port(cm_dev->ib_device, i) + kfree(cm_dev->port[i - 1]); + + kfree(cm_dev); +} + +static void cm_device_put(struct cm_device *cm_dev) +{ + kref_put(&cm_dev->kref, cm_dev_release); +} + static void cm_work_handler(struct work_struct *work); static inline void cm_deref_id(struct cm_id_private *cm_id_priv) { - if (atomic_dec_and_test(&cm_id_priv->refcount)) + if (refcount_dec_and_test(&cm_id_priv->refcount)) complete(&cm_id_priv->comp); } -static int cm_alloc_msg(struct cm_id_private *cm_id_priv, - struct ib_mad_send_buf **msg) +static struct ib_mad_send_buf * +cm_alloc_msg_agent(struct cm_id_private *cm_id_priv, bool rep_agent) { struct ib_mad_agent *mad_agent; struct ib_mad_send_buf *m; struct ib_ah *ah; - struct cm_av *av; - unsigned long flags, flags2; - int ret = 0; - /* don't let the port to be released till the agent is down */ - spin_lock_irqsave(&cm.state_lock, flags2); - spin_lock_irqsave(&cm.lock, flags); - if (!cm_id_priv->prim_send_port_not_ready) - av = &cm_id_priv->av; - else if (!cm_id_priv->altr_send_port_not_ready && - (cm_id_priv->alt_av.port)) - av = &cm_id_priv->alt_av; - else { - pr_info("%s: not valid CM id\n", __func__); - ret = -ENODEV; - spin_unlock_irqrestore(&cm.lock, flags); - goto out; - } - spin_unlock_irqrestore(&cm.lock, flags); - /* Make sure the port haven't released the mad yet */ - mad_agent = cm_id_priv->av.port->mad_agent; + lockdep_assert_held(&cm_id_priv->lock); + + if (!cm_id_priv->av.port) + return ERR_PTR(-EINVAL); + + read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + mad_agent = rep_agent ? cm_id_priv->av.port->rep_agent : + cm_id_priv->av.port->mad_agent; if (!mad_agent) { - pr_info("%s: not a valid MAD agent\n", __func__); - ret = -ENODEV; + m = ERR_PTR(-EINVAL); goto out; } - ah = rdma_create_ah(mad_agent->qp->pd, &av->ah_attr, 0); + + ah = rdma_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr, 0); if (IS_ERR(ah)) { - ret = PTR_ERR(ah); + m = ERR_CAST(ah); goto out; } m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn, - av->pkey_index, + cm_id_priv->av.pkey_index, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, GFP_ATOMIC, IB_MGMT_BASE_VERSION); if (IS_ERR(m)) { rdma_destroy_ah(ah, 0); - ret = PTR_ERR(m); goto out; } - /* Timeout set by caller if response is expected. 
*/ m->ah = ah; - m->retries = cm_id_priv->max_cm_retries; - - atomic_inc(&cm_id_priv->refcount); - m->context[0] = cm_id_priv; - *msg = m; out: - spin_unlock_irqrestore(&cm.state_lock, flags2); - return ret; + read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + return m; +} + +static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) +{ + return cm_alloc_msg_agent(cm_id_priv, false); +} + +static void cm_free_msg(struct ib_mad_send_buf *msg) +{ + if (msg->ah) + rdma_destroy_ah(msg->ah, 0); + ib_free_send_mad(msg); +} + +static struct ib_mad_send_buf * +cm_alloc_priv_msg_rep(struct cm_id_private *cm_id_priv, enum ib_cm_state state, + bool rep_agent) +{ + struct ib_mad_send_buf *msg; + + lockdep_assert_held(&cm_id_priv->lock); + + msg = cm_alloc_msg_agent(cm_id_priv, rep_agent); + if (IS_ERR(msg)) + return msg; + + cm_id_priv->msg = msg; + refcount_inc(&cm_id_priv->refcount); + msg->context[0] = cm_id_priv; + msg->context[1] = (void *) (unsigned long) state; + + msg->retries = cm_id_priv->max_cm_retries; + msg->timeout_ms = cm_id_priv->timeout_ms; + + return msg; } -static struct ib_mad_send_buf *cm_alloc_response_msg_no_ah(struct cm_port *port, - struct ib_mad_recv_wc *mad_recv_wc) +static struct ib_mad_send_buf * +cm_alloc_priv_msg(struct cm_id_private *cm_id_priv, enum ib_cm_state state) { - return ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, - 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, - GFP_ATOMIC, - IB_MGMT_BASE_VERSION); + return cm_alloc_priv_msg_rep(cm_id_priv, state, false); +} + +static void cm_free_priv_msg(struct ib_mad_send_buf *msg) +{ + struct cm_id_private *cm_id_priv = msg->context[0]; + + lockdep_assert_held(&cm_id_priv->lock); + + if (!WARN_ON(cm_id_priv->msg != msg)) + cm_id_priv->msg = NULL; + + if (msg->ah) + rdma_destroy_ah(msg->ah, 0); + cm_deref_id(cm_id_priv); + ib_free_send_mad(msg); +} + +static struct ib_mad_send_buf * +cm_alloc_response_msg_no_ah(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + bool direct_retry) +{ + struct ib_mad_send_buf *m; + + m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, + 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, + GFP_ATOMIC, IB_MGMT_BASE_VERSION); + if (!IS_ERR(m)) + m->context[0] = direct_retry ? 
CM_DIRECT_RETRY_CTX : NULL; + + return m; } static int cm_create_response_msg_ah(struct cm_port *port, @@ -397,29 +404,21 @@ static int cm_create_response_msg_ah(struct cm_port *port, return 0; } -static void cm_free_msg(struct ib_mad_send_buf *msg) -{ - if (msg->ah) - rdma_destroy_ah(msg->ah, 0); - if (msg->context[0]) - cm_deref_id(msg->context[0]); - ib_free_send_mad(msg); -} - static int cm_alloc_response_msg(struct cm_port *port, struct ib_mad_recv_wc *mad_recv_wc, + bool direct_retry, struct ib_mad_send_buf **msg) { struct ib_mad_send_buf *m; int ret; - m = cm_alloc_response_msg_no_ah(port, mad_recv_wc); + m = cm_alloc_response_msg_no_ah(port, mad_recv_wc, direct_retry); if (IS_ERR(m)) return PTR_ERR(m); ret = cm_create_response_msg_ah(port, mad_recv_wc, m); if (ret) { - cm_free_msg(m); + ib_free_send_mad(m); return ret; } @@ -427,8 +426,7 @@ static int cm_alloc_response_msg(struct cm_port *port, return 0; } -static void * cm_copy_private_data(const void *private_data, - u8 private_data_len) +static void *cm_copy_private_data(const void *private_data, u8 private_data_len) { void *data; @@ -452,62 +450,38 @@ static void cm_set_private_data(struct cm_id_private *cm_id_priv, cm_id_priv->private_data_len = private_data_len; } -static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc, - struct ib_grh *grh, struct cm_av *av) +static void cm_set_av_port(struct cm_av *av, struct cm_port *port) { - struct rdma_ah_attr new_ah_attr; - int ret; + struct cm_port *old_port = av->port; - av->port = port; - av->pkey_index = wc->pkey_index; + if (old_port == port) + return; - /* - * av->ah_attr might be initialized based on past wc during incoming - * connect request or while sending out connect request. So initialize - * a new ah_attr on stack. If initialization fails, old ah_attr is - * used for sending any responses. If initialization is successful, - * than new ah_attr is used by overwriting old one. 
- */ - ret = ib_init_ah_attr_from_wc(port->cm_dev->ib_device, - port->port_num, wc, - grh, &new_ah_attr); - if (ret) - return ret; + av->port = port; + if (old_port) + cm_device_put(old_port->cm_dev); + if (port) + kref_get(&port->cm_dev->kref); +} - rdma_move_ah_attr(&av->ah_attr, &new_ah_attr); - return 0; +static void cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc, + struct rdma_ah_attr *ah_attr, struct cm_av *av) +{ + cm_set_av_port(av, port); + av->pkey_index = wc->pkey_index; + rdma_move_ah_attr(&av->ah_attr, ah_attr); } static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, struct ib_grh *grh, struct cm_av *av) { - av->port = port; + cm_set_av_port(av, port); av->pkey_index = wc->pkey_index; return ib_init_ah_attr_from_wc(port->cm_dev->ib_device, port->port_num, wc, grh, &av->ah_attr); } -static int add_cm_id_to_port_list(struct cm_id_private *cm_id_priv, - struct cm_av *av, - struct cm_port *port) -{ - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&cm.lock, flags); - - if (&cm_id_priv->av == av) - list_add_tail(&cm_id_priv->prim_list, &port->cm_priv_prim_list); - else if (&cm_id_priv->alt_av == av) - list_add_tail(&cm_id_priv->altr_list, &port->cm_priv_altr_list); - else - ret = -EINVAL; - - spin_unlock_irqrestore(&cm.lock, flags); - return ret; -} - static struct cm_port * get_cm_port_from_path(struct sa_path_rec *path, const struct ib_gid_attr *attr) { @@ -551,8 +525,7 @@ get_cm_port_from_path(struct sa_path_rec *path, const struct ib_gid_attr *attr) static int cm_init_av_by_path(struct sa_path_rec *path, const struct ib_gid_attr *sgid_attr, - struct cm_av *av, - struct cm_id_private *cm_id_priv) + struct cm_av *av) { struct rdma_ah_attr new_ah_attr; struct cm_device *cm_dev; @@ -569,7 +542,7 @@ static int cm_init_av_by_path(struct sa_path_rec *path, if (ret) return ret; - av->port = port; + cm_set_av_port(av, port); /* * av->ah_attr might be initialized based on wc or during @@ -586,64 +559,41 @@ static int cm_init_av_by_path(struct sa_path_rec *path, return ret; av->timeout = path->packet_life_time + 1; - - ret = add_cm_id_to_port_list(cm_id_priv, av, port); - if (ret) { - rdma_destroy_ah_attr(&new_ah_attr); - return ret; - } rdma_move_ah_attr(&av->ah_attr, &new_ah_attr); return 0; } -static int cm_alloc_id(struct cm_id_private *cm_id_priv) +/* Move av created by cm_init_av_by_path(), so av.dgid is not moved */ +static void cm_move_av_from_path(struct cm_av *dest, struct cm_av *src) { - unsigned long flags; - int id; - - idr_preload(GFP_KERNEL); - spin_lock_irqsave(&cm.lock, flags); - - id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT); - - spin_unlock_irqrestore(&cm.lock, flags); - idr_preload_end(); - - cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand; - return id < 0 ? 
id : 0; + cm_set_av_port(dest, src->port); + cm_set_av_port(src, NULL); + dest->pkey_index = src->pkey_index; + rdma_move_ah_attr(&dest->ah_attr, &src->ah_attr); + dest->timeout = src->timeout; } -static void cm_free_id(__be32 local_id) +static void cm_destroy_av(struct cm_av *av) { - spin_lock_irq(&cm.lock); - idr_remove(&cm.local_id_table, - (__force int) (local_id ^ cm.random_id_operand)); - spin_unlock_irq(&cm.lock); + rdma_destroy_ah_attr(&av->ah_attr); + cm_set_av_port(av, NULL); } -static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id) +static u32 cm_local_id(__be32 local_id) { - struct cm_id_private *cm_id_priv; - - cm_id_priv = idr_find(&cm.local_id_table, - (__force int) (local_id ^ cm.random_id_operand)); - if (cm_id_priv) { - if (cm_id_priv->id.remote_id == remote_id) - atomic_inc(&cm_id_priv->refcount); - else - cm_id_priv = NULL; - } - - return cm_id_priv; + return (__force u32) (local_id ^ cm.random_id_operand); } -static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id) +static struct cm_id_private *cm_acquire_id(__be32 local_id, __be32 remote_id) { struct cm_id_private *cm_id_priv; - spin_lock_irq(&cm.lock); - cm_id_priv = cm_get_id(local_id, remote_id); - spin_unlock_irq(&cm.lock); + rcu_read_lock(); + cm_id_priv = xa_load(&cm.local_id_table, cm_local_id(local_id)); + if (!cm_id_priv || cm_id_priv->id.remote_id != remote_id || + !refcount_inc_not_zero(&cm_id_priv->refcount)) + cm_id_priv = NULL; + rcu_read_unlock(); return cm_id_priv; } @@ -673,22 +623,25 @@ static int be64_gt(__be64 a, __be64 b) return (__force u64) a > (__force u64) b; } -static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv) +/* + * Inserts a new cm_id_priv into the listen_service_table. Returns cm_id_priv + * if the new ID was inserted, NULL if it could not be inserted due to a + * collision, or the existing cm_id_priv ready for shared usage. 
+ */ +static struct cm_id_private *cm_insert_listen(struct cm_id_private *cm_id_priv, + ib_cm_handler shared_handler) { struct rb_node **link = &cm.listen_service_table.rb_node; struct rb_node *parent = NULL; struct cm_id_private *cur_cm_id_priv; __be64 service_id = cm_id_priv->id.service_id; - __be64 service_mask = cm_id_priv->id.service_mask; + unsigned long flags; + spin_lock_irqsave(&cm.lock, flags); while (*link) { parent = *link; cur_cm_id_priv = rb_entry(parent, struct cm_id_private, service_node); - if ((cur_cm_id_priv->id.service_mask & service_id) == - (service_mask & cur_cm_id_priv->id.service_id) && - (cm_id_priv->id.device == cur_cm_id_priv->id.device)) - return cur_cm_id_priv; if (cm_id_priv->id.device < cur_cm_id_priv->id.device) link = &(*link)->rb_left; @@ -698,26 +651,38 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv) link = &(*link)->rb_left; else if (be64_gt(service_id, cur_cm_id_priv->id.service_id)) link = &(*link)->rb_right; - else - link = &(*link)->rb_right; + else { + /* + * Sharing an ib_cm_id with different handlers is not + * supported + */ + if (cur_cm_id_priv->id.cm_handler != shared_handler || + cur_cm_id_priv->id.context || + WARN_ON(!cur_cm_id_priv->id.cm_handler)) { + spin_unlock_irqrestore(&cm.lock, flags); + return NULL; + } + refcount_inc(&cur_cm_id_priv->refcount); + cur_cm_id_priv->listen_sharecount++; + spin_unlock_irqrestore(&cm.lock, flags); + return cur_cm_id_priv; + } } + cm_id_priv->listen_sharecount++; rb_link_node(&cm_id_priv->service_node, parent, link); rb_insert_color(&cm_id_priv->service_node, &cm.listen_service_table); - return NULL; + spin_unlock_irqrestore(&cm.lock, flags); + return cm_id_priv; } -static struct cm_id_private * cm_find_listen(struct ib_device *device, - __be64 service_id) +static struct cm_id_private *cm_find_listen(struct ib_device *device, + __be64 service_id) { struct rb_node *node = cm.listen_service_table.rb_node; struct cm_id_private *cm_id_priv; while (node) { cm_id_priv = rb_entry(node, struct cm_id_private, service_node); - if ((cm_id_priv->id.service_mask & service_id) == - cm_id_priv->id.service_id && - (cm_id_priv->id.device == device)) - return cm_id_priv; if (device < cm_id_priv->id.device) node = node->rb_left; @@ -727,14 +692,16 @@ static struct cm_id_private * cm_find_listen(struct ib_device *device, node = node->rb_left; else if (be64_gt(service_id, cm_id_priv->id.service_id)) node = node->rb_right; - else - node = node->rb_right; + else { + refcount_inc(&cm_id_priv->refcount); + return cm_id_priv; + } } return NULL; } -static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info - *timewait_info) +static struct cm_timewait_info * +cm_insert_remote_id(struct cm_timewait_info *timewait_info) { struct rb_node **link = &cm.remote_id_table.rb_node; struct rb_node *parent = NULL; @@ -763,12 +730,14 @@ static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info return NULL; } -static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid, - __be32 remote_id) +static struct cm_id_private *cm_find_remote_id(__be64 remote_ca_guid, + __be32 remote_id) { struct rb_node *node = cm.remote_id_table.rb_node; struct cm_timewait_info *timewait_info; + struct cm_id_private *res = NULL; + spin_lock_irq(&cm.lock); while (node) { timewait_info = rb_entry(node, struct cm_timewait_info, remote_id_node); @@ -780,14 +749,18 @@ static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid, node = node->rb_left; else if 
(be64_gt(remote_ca_guid, timewait_info->remote_ca_guid)) node = node->rb_right; - else - return timewait_info; + else { + res = cm_acquire_id(timewait_info->work.local_id, + timewait_info->work.remote_id); + break; + } } - return NULL; + spin_unlock_irq(&cm.lock); + return res; } -static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info - *timewait_info) +static struct cm_timewait_info * +cm_insert_remote_qpn(struct cm_timewait_info *timewait_info) { struct rb_node **link = &cm.remote_qp_table.rb_node; struct rb_node *parent = NULL; @@ -816,13 +789,12 @@ static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info return NULL; } -static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private - *cm_id_priv) +static struct cm_id_private * +cm_insert_remote_sidr(struct cm_id_private *cm_id_priv) { struct rb_node **link = &cm.remote_sidr_table.rb_node; struct rb_node *parent = NULL; struct cm_id_private *cur_cm_id_priv; - union ib_gid *port_gid = &cm_id_priv->av.dgid; __be32 remote_id = cm_id_priv->id.remote_id; while (*link) { @@ -834,12 +806,9 @@ static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private else if (be32_gt(remote_id, cur_cm_id_priv->id.remote_id)) link = &(*link)->rb_right; else { - int cmp; - cmp = memcmp(port_gid, &cur_cm_id_priv->av.dgid, - sizeof *port_gid); - if (cmp < 0) + if (cur_cm_id_priv->sidr_slid < cm_id_priv->sidr_slid) link = &(*link)->rb_left; - else if (cmp > 0) + else if (cur_cm_id_priv->sidr_slid > cm_id_priv->sidr_slid) link = &(*link)->rb_right; else return cur_cm_id_priv; @@ -850,21 +819,12 @@ static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private return NULL; } -static void cm_reject_sidr_req(struct cm_id_private *cm_id_priv, - enum ib_cm_sidr_status status) -{ - struct ib_cm_sidr_rep_param param; - - memset(¶m, 0, sizeof param); - param.status = status; - ib_send_cm_sidr_rep(&cm_id_priv->id, ¶m); -} - -struct ib_cm_id *ib_create_cm_id(struct ib_device *device, - ib_cm_handler cm_handler, - void *context) +static struct cm_id_private *cm_alloc_id_priv(struct ib_device *device, + ib_cm_handler cm_handler, + void *context) { struct cm_id_private *cm_id_priv; + u32 id; int ret; cm_id_priv = kzalloc(sizeof *cm_id_priv, GFP_KERNEL); @@ -876,26 +836,54 @@ struct ib_cm_id *ib_create_cm_id(struct ib_device *device, cm_id_priv->id.cm_handler = cm_handler; cm_id_priv->id.context = context; cm_id_priv->id.remote_cm_qpn = 1; - ret = cm_alloc_id(cm_id_priv); - if (ret) - goto error; + RB_CLEAR_NODE(&cm_id_priv->service_node); + RB_CLEAR_NODE(&cm_id_priv->sidr_id_node); spin_lock_init(&cm_id_priv->lock); init_completion(&cm_id_priv->comp); INIT_LIST_HEAD(&cm_id_priv->work_list); - INIT_LIST_HEAD(&cm_id_priv->prim_list); - INIT_LIST_HEAD(&cm_id_priv->altr_list); atomic_set(&cm_id_priv->work_count, -1); - atomic_set(&cm_id_priv->refcount, 1); - return &cm_id_priv->id; + refcount_set(&cm_id_priv->refcount, 1); + + ret = xa_alloc_cyclic(&cm.local_id_table, &id, NULL, xa_limit_32b, + &cm.local_id_next, GFP_KERNEL); + if (ret < 0) + goto error; + cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand; + + return cm_id_priv; error: kfree(cm_id_priv); - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); +} + +/* + * Make the ID visible to the MAD handlers and other threads that use the + * xarray. 
+ */ +static void cm_finalize_id(struct cm_id_private *cm_id_priv) +{ + xa_store(&cm.local_id_table, cm_local_id(cm_id_priv->id.local_id), + cm_id_priv, GFP_ATOMIC); +} + +struct ib_cm_id *ib_create_cm_id(struct ib_device *device, + ib_cm_handler cm_handler, + void *context) +{ + struct cm_id_private *cm_id_priv; + + cm_id_priv = cm_alloc_id_priv(device, cm_handler, context); + if (IS_ERR(cm_id_priv)) + return ERR_CAST(cm_id_priv); + + cm_finalize_id(cm_id_priv); + return &cm_id_priv->id; } EXPORT_SYMBOL(ib_create_cm_id); -static struct cm_work * cm_dequeue_work(struct cm_id_private *cm_id_priv) +static struct cm_work *cm_dequeue_work(struct cm_id_private *cm_id_priv) { struct cm_work *work; @@ -914,6 +902,36 @@ static void cm_free_work(struct cm_work *work) kfree(work); } +static void cm_queue_work_unlock(struct cm_id_private *cm_id_priv, + struct cm_work *work) + __releases(&cm_id_priv->lock) +{ + bool immediate; + + /* + * To deliver the event to the user callback we have the drop the + * spinlock, however, we need to ensure that the user callback is single + * threaded and receives events in the temporal order. If there are + * already events being processed then thread new events onto a list, + * the thread currently processing will pick them up. + */ + immediate = atomic_inc_and_test(&cm_id_priv->work_count); + if (!immediate) { + list_add_tail(&work->list, &cm_id_priv->work_list); + /* + * This routine always consumes incoming reference. Once queued + * to the work_list then a reference is held by the thread + * currently running cm_process_work() and this reference is not + * needed. + */ + cm_deref_id(cm_id_priv); + } + spin_unlock_irq(&cm_id_priv->lock); + + if (immediate) + cm_process_work(cm_id_priv, work); +} + static inline int cm_convert_to_ms(int iba_time) { /* approximate conversion to ms from 4.096us x 2^iba_time */ @@ -939,8 +957,10 @@ static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time) return min(31, ack_timeout); } -static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info) +static void cm_remove_remote(struct cm_id_private *cm_id_priv) { + struct cm_timewait_info *timewait_info = cm_id_priv->timewait_info; + if (timewait_info->inserted_remote_id) { rb_erase(&timewait_info->remote_id_node, &cm.remote_id_table); timewait_info->inserted_remote_id = 0; @@ -952,7 +972,7 @@ static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info) } } -static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id) +static struct cm_timewait_info *cm_create_timewait_info(__be32 local_id) { struct cm_timewait_info *timewait_info; @@ -972,12 +992,14 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv) unsigned long flags; struct cm_device *cm_dev; + lockdep_assert_held(&cm_id_priv->lock); + cm_dev = ib_get_client_data(cm_id_priv->id.device, &cm_client); if (!cm_dev) return; spin_lock_irqsave(&cm.lock, flags); - cm_cleanup_timewait(cm_id_priv->timewait_info); + cm_remove_remote(cm_id_priv); list_add_tail(&cm_id_priv->timewait_info->list, &cm.timewait_list); spin_unlock_irqrestore(&cm.lock, flags); @@ -996,6 +1018,11 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv) msecs_to_jiffies(wait_time)); spin_unlock_irqrestore(&cm.lock, flags); + /* + * The timewait_info is converted into a work and gets freed during + * cm_free_work() in cm_timewait_handler(). 
+ */ + BUILD_BUG_ON(offsetof(struct cm_timewait_info, work) != 0); cm_id_priv->timewait_info = NULL; } @@ -1003,122 +1030,157 @@ static void cm_reset_to_idle(struct cm_id_private *cm_id_priv) { unsigned long flags; + lockdep_assert_held(&cm_id_priv->lock); + cm_id_priv->id.state = IB_CM_IDLE; if (cm_id_priv->timewait_info) { spin_lock_irqsave(&cm.lock, flags); - cm_cleanup_timewait(cm_id_priv->timewait_info); + cm_remove_remote(cm_id_priv); spin_unlock_irqrestore(&cm.lock, flags); kfree(cm_id_priv->timewait_info); cm_id_priv->timewait_info = NULL; } } +static noinline void cm_destroy_id_wait_timeout(struct ib_cm_id *cm_id, + enum ib_cm_state old_state) +{ + struct cm_id_private *cm_id_priv; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + pr_err_ratelimited("%s: cm_id=%p timed out. state %d -> %d, refcnt=%d\n", __func__, + cm_id, old_state, cm_id->state, refcount_read(&cm_id_priv->refcount)); +} + static void cm_destroy_id(struct ib_cm_id *cm_id, int err) { struct cm_id_private *cm_id_priv; + enum ib_cm_state old_state; + unsigned long timeout; struct cm_work *work; + int ret; cm_id_priv = container_of(cm_id, struct cm_id_private, id); -retest: spin_lock_irq(&cm_id_priv->lock); + old_state = cm_id->state; +retest: switch (cm_id->state) { case IB_CM_LISTEN: - spin_unlock_irq(&cm_id_priv->lock); - - spin_lock_irq(&cm.lock); + spin_lock(&cm.lock); if (--cm_id_priv->listen_sharecount > 0) { /* The id is still shared. */ + WARN_ON(refcount_read(&cm_id_priv->refcount) == 1); + spin_unlock(&cm.lock); + spin_unlock_irq(&cm_id_priv->lock); cm_deref_id(cm_id_priv); - spin_unlock_irq(&cm.lock); return; } + cm_id->state = IB_CM_IDLE; rb_erase(&cm_id_priv->service_node, &cm.listen_service_table); - spin_unlock_irq(&cm.lock); + RB_CLEAR_NODE(&cm_id_priv->service_node); + spin_unlock(&cm.lock); break; case IB_CM_SIDR_REQ_SENT: cm_id->state = IB_CM_IDLE; - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - spin_unlock_irq(&cm_id_priv->lock); + ib_cancel_mad(cm_id_priv->msg); break; case IB_CM_SIDR_REQ_RCVD: - spin_unlock_irq(&cm_id_priv->lock); - cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT); - spin_lock_irq(&cm.lock); - if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) - rb_erase(&cm_id_priv->sidr_id_node, - &cm.remote_sidr_table); - spin_unlock_irq(&cm.lock); + cm_send_sidr_rep_locked(cm_id_priv, + &(struct ib_cm_sidr_rep_param){ + .status = IB_SIDR_REJECT }); + /* cm_send_sidr_rep_locked will not move to IDLE if it fails */ + cm_id->state = IB_CM_IDLE; break; case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - spin_unlock_irq(&cm_id_priv->lock); - ib_send_cm_rej(cm_id, IB_CM_REJ_TIMEOUT, - &cm_id_priv->id.device->node_guid, - sizeof cm_id_priv->id.device->node_guid, - NULL, 0); + ib_cancel_mad(cm_id_priv->msg); + cm_send_rej_locked(cm_id_priv, IB_CM_REJ_TIMEOUT, + &cm_id_priv->id.device->node_guid, + sizeof(cm_id_priv->id.device->node_guid), + NULL, 0); break; case IB_CM_REQ_RCVD: if (err == -ENOMEM) { /* Do not reject to allow future retries. 
*/ cm_reset_to_idle(cm_id_priv); - spin_unlock_irq(&cm_id_priv->lock); } else { - spin_unlock_irq(&cm_id_priv->lock); - ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, - NULL, 0, NULL, 0); + cm_send_rej_locked(cm_id_priv, + IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, + NULL, 0); } break; case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - /* Fall through */ + ib_cancel_mad(cm_id_priv->msg); + cm_send_rej_locked(cm_id_priv, IB_CM_REJ_CONSUMER_DEFINED, NULL, + 0, NULL, 0); + goto retest; case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: - spin_unlock_irq(&cm_id_priv->lock); - ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, - NULL, 0, NULL, 0); + cm_send_rej_locked(cm_id_priv, IB_CM_REJ_CONSUMER_DEFINED, NULL, + 0, NULL, 0); break; case IB_CM_ESTABLISHED: - spin_unlock_irq(&cm_id_priv->lock); - if (cm_id_priv->qp_type == IB_QPT_XRC_TGT) + if (cm_id_priv->qp_type == IB_QPT_XRC_TGT) { + cm_id->state = IB_CM_IDLE; break; - ib_send_cm_dreq(cm_id, NULL, 0); + } + cm_issue_dreq(cm_id_priv); + cm_enter_timewait(cm_id_priv); goto retest; case IB_CM_DREQ_SENT: - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + ib_cancel_mad(cm_id_priv->msg); cm_enter_timewait(cm_id_priv); - spin_unlock_irq(&cm_id_priv->lock); - break; + goto retest; case IB_CM_DREQ_RCVD: - spin_unlock_irq(&cm_id_priv->lock); - ib_send_cm_drep(cm_id, NULL, 0); + cm_send_drep_locked(cm_id_priv, NULL, 0); + WARN_ON(cm_id->state != IB_CM_TIMEWAIT); + goto retest; + case IB_CM_TIMEWAIT: + /* + * The cm_acquire_id in cm_timewait_handler will stop working + * once we do xa_erase below, so just move to idle here for + * consistency. + */ + cm_id->state = IB_CM_IDLE; break; - default: - spin_unlock_irq(&cm_id_priv->lock); + case IB_CM_IDLE: break; } + WARN_ON(cm_id->state != IB_CM_IDLE); - spin_lock_irq(&cm.lock); - if (!list_empty(&cm_id_priv->altr_list) && - (!cm_id_priv->altr_send_port_not_ready)) - list_del(&cm_id_priv->altr_list); - if (!list_empty(&cm_id_priv->prim_list) && - (!cm_id_priv->prim_send_port_not_ready)) - list_del(&cm_id_priv->prim_list); - spin_unlock_irq(&cm.lock); + spin_lock(&cm.lock); + /* Required for cleanup paths related cm_req_handler() */ + if (cm_id_priv->timewait_info) { + cm_remove_remote(cm_id_priv); + kfree(cm_id_priv->timewait_info); + cm_id_priv->timewait_info = NULL; + } + + WARN_ON(cm_id_priv->listen_sharecount); + WARN_ON(!RB_EMPTY_NODE(&cm_id_priv->service_node)); + if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) + rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + spin_unlock(&cm.lock); + spin_unlock_irq(&cm_id_priv->lock); - cm_free_id(cm_id->local_id); + xa_erase(&cm.local_id_table, cm_local_id(cm_id->local_id)); cm_deref_id(cm_id_priv); - wait_for_completion(&cm_id_priv->comp); + timeout = msecs_to_jiffies((cm_id_priv->max_cm_retries * cm_id_priv->timeout_ms * 5) / 4); + do { + ret = wait_for_completion_timeout(&cm_id_priv->comp, timeout); + if (!ret) /* timeout happened */ + cm_destroy_id_wait_timeout(cm_id, old_state); + } while (!ret); + while ((work = cm_dequeue_work(cm_id_priv)) != NULL) cm_free_work(work); - rdma_destroy_ah_attr(&cm_id_priv->av.ah_attr); - rdma_destroy_ah_attr(&cm_id_priv->alt_av.ah_attr); + cm_destroy_av(&cm_id_priv->av); + cm_destroy_av(&cm_id_priv->alt_av); kfree(cm_id_priv->private_data); - kfree(cm_id_priv); + kfree_rcu(cm_id_priv, rcu); } void ib_destroy_cm_id(struct ib_cm_id *cm_id) @@ -1127,70 +1189,63 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id) } 
EXPORT_SYMBOL(ib_destroy_cm_id); +static int cm_init_listen(struct cm_id_private *cm_id_priv, __be64 service_id) +{ + if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID && + (service_id != IB_CM_ASSIGN_SERVICE_ID)) + return -EINVAL; + + if (service_id == IB_CM_ASSIGN_SERVICE_ID) + cm_id_priv->id.service_id = cpu_to_be64(cm.listen_service_id++); + else + cm_id_priv->id.service_id = service_id; + + return 0; +} + /** - * __ib_cm_listen - Initiates listening on the specified service ID for + * ib_cm_listen - Initiates listening on the specified service ID for * connection and service ID resolution requests. * @cm_id: Connection identifier associated with the listen request. * @service_id: Service identifier matched against incoming connection * and service ID resolution requests. The service ID should be specified * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will * assign a service ID to the caller. - * @service_mask: Mask applied to service ID used to listen across a - * range of service IDs. If set to 0, the service ID is matched - * exactly. This parameter is ignored if %service_id is set to - * IB_CM_ASSIGN_SERVICE_ID. */ -static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, - __be64 service_mask) +int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id) { - struct cm_id_private *cm_id_priv, *cur_cm_id_priv; - int ret = 0; - - service_mask = service_mask ? service_mask : ~cpu_to_be64(0); - service_id &= service_mask; - if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID && - (service_id != IB_CM_ASSIGN_SERVICE_ID)) - return -EINVAL; - - cm_id_priv = container_of(cm_id, struct cm_id_private, id); - if (cm_id->state != IB_CM_IDLE) - return -EINVAL; - - cm_id->state = IB_CM_LISTEN; - ++cm_id_priv->listen_sharecount; + struct cm_id_private *cm_id_priv = + container_of(cm_id, struct cm_id_private, id); + unsigned long flags; + int ret; - if (service_id == IB_CM_ASSIGN_SERVICE_ID) { - cm_id->service_id = cpu_to_be64(cm.listen_service_id++); - cm_id->service_mask = ~cpu_to_be64(0); - } else { - cm_id->service_id = service_id; - cm_id->service_mask = service_mask; + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id_priv->id.state != IB_CM_IDLE) { + ret = -EINVAL; + goto out; } - cur_cm_id_priv = cm_insert_listen(cm_id_priv); - if (cur_cm_id_priv) { - cm_id->state = IB_CM_IDLE; - --cm_id_priv->listen_sharecount; + ret = cm_init_listen(cm_id_priv, service_id); + if (ret) + goto out; + + if (!cm_insert_listen(cm_id_priv, NULL)) { ret = -EBUSY; + goto out; } - return ret; -} -int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&cm.lock, flags); - ret = __ib_cm_listen(cm_id, service_id, service_mask); - spin_unlock_irqrestore(&cm.lock, flags); + cm_id_priv->id.state = IB_CM_LISTEN; + ret = 0; +out: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_cm_listen); /** - * Create a new listening ib_cm_id and listen on the given service ID. + * ib_cm_insert_listen - Create a new listening ib_cm_id and listen on + * the given service ID. * * If there's an existing ID listening on that same device and service ID, * return it. 
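/*
 * Illustrative sketch (editor's addition, not part of the commit): one way a
 * ULP could rely on the share-or-create behaviour that cm_insert_listen()
 * implements and that the ib_cm_insert_listen() comment above describes. The
 * handler my_cm_handler() and the service ID 0x1234 are hypothetical; only
 * ib_cm_insert_listen() and ib_destroy_cm_id() from this diff are assumed,
 * with the ib_cm_handler prototype taken from <rdma/ib_cm.h>.
 */
#include <linux/err.h>
#include <rdma/ib_cm.h>

static int my_cm_handler(struct ib_cm_id *cm_id,
			 const struct ib_cm_event *event)
{
	/* Shared by every listener on this service ID; context stays NULL. */
	return 0;
}

static struct ib_cm_id *my_listen(struct ib_device *device)
{
	struct ib_cm_id *id;

	/*
	 * If an ID already listens on this device and service ID with the
	 * same handler and a NULL context, that ID is returned with its
	 * listen_sharecount incremented; otherwise a fresh listening ID is
	 * created. A handler mismatch makes cm_insert_listen() return NULL
	 * and the caller sees ERR_PTR(-EINVAL).
	 */
	id = ib_cm_insert_listen(device, my_cm_handler, cpu_to_be64(0x1234));
	if (IS_ERR(id))
		return id;

	/* Each successful call must later be balanced by ib_destroy_cm_id(). */
	return id;
}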
@@ -1209,59 +1264,57 @@ struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device, ib_cm_handler cm_handler, __be64 service_id) { + struct cm_id_private *listen_id_priv; struct cm_id_private *cm_id_priv; - struct ib_cm_id *cm_id; - unsigned long flags; int err = 0; /* Create an ID in advance, since the creation may sleep */ - cm_id = ib_create_cm_id(device, cm_handler, NULL); - if (IS_ERR(cm_id)) - return cm_id; + cm_id_priv = cm_alloc_id_priv(device, cm_handler, NULL); + if (IS_ERR(cm_id_priv)) + return ERR_CAST(cm_id_priv); - spin_lock_irqsave(&cm.lock, flags); + err = cm_init_listen(cm_id_priv, service_id); + if (err) { + ib_destroy_cm_id(&cm_id_priv->id); + return ERR_PTR(err); + } - if (service_id == IB_CM_ASSIGN_SERVICE_ID) - goto new_id; - - /* Find an existing ID */ - cm_id_priv = cm_find_listen(device, service_id); - if (cm_id_priv) { - if (cm_id->cm_handler != cm_handler || cm_id->context) { - /* Sharing an ib_cm_id with different handlers is not - * supported */ - spin_unlock_irqrestore(&cm.lock, flags); + spin_lock_irq(&cm_id_priv->lock); + listen_id_priv = cm_insert_listen(cm_id_priv, cm_handler); + if (listen_id_priv != cm_id_priv) { + spin_unlock_irq(&cm_id_priv->lock); + ib_destroy_cm_id(&cm_id_priv->id); + if (!listen_id_priv) return ERR_PTR(-EINVAL); - } - atomic_inc(&cm_id_priv->refcount); - ++cm_id_priv->listen_sharecount; - spin_unlock_irqrestore(&cm.lock, flags); - - ib_destroy_cm_id(cm_id); - cm_id = &cm_id_priv->id; - return cm_id; + return &listen_id_priv->id; } + cm_id_priv->id.state = IB_CM_LISTEN; + spin_unlock_irq(&cm_id_priv->lock); -new_id: - /* Use newly created ID */ - err = __ib_cm_listen(cm_id, service_id, 0); - - spin_unlock_irqrestore(&cm.lock, flags); + /* + * A listen ID does not need to be in the xarray since it does not + * receive mads, is not placed in the remote_id or remote_qpn rbtree, + * and does not enter timewait. 
+ */ - if (err) { - ib_destroy_cm_id(cm_id); - return ERR_PTR(err); - } - return cm_id; + return &cm_id_priv->id; } EXPORT_SYMBOL(ib_cm_insert_listen); static __be64 cm_form_tid(struct cm_id_private *cm_id_priv) { - u64 hi_tid, low_tid; + u64 hi_tid = 0, low_tid; + + lockdep_assert_held(&cm_id_priv->lock); + + low_tid = (u64)cm_id_priv->id.local_id; + if (!cm_id_priv->av.port) + return cpu_to_be64(low_tid); - hi_tid = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32; - low_tid = (u64)cm_id_priv->id.local_id; + read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + if (cm_id_priv->av.port->mad_agent) + hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) << 32; + read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); return cpu_to_be64(hi_tid | low_tid); } @@ -1276,6 +1329,13 @@ static void cm_format_mad_hdr(struct ib_mad_hdr *hdr, hdr->tid = tid; } +static void cm_format_mad_ece_hdr(struct ib_mad_hdr *hdr, __be16 attr_id, + __be64 tid, u32 attr_mod) +{ + cm_format_mad_hdr(hdr, attr_id, tid); + hdr->attr_mod = cpu_to_be32(attr_mod); +} + static void cm_format_req(struct cm_req_msg *req_msg, struct cm_id_private *cm_id_priv, struct ib_cm_req_param *param) @@ -1283,62 +1343,88 @@ static void cm_format_req(struct cm_req_msg *req_msg, struct sa_path_rec *pri_path = param->primary_path; struct sa_path_rec *alt_path = param->alternate_path; bool pri_ext = false; + __be16 lid; if (pri_path->rec_type == SA_PATH_REC_TYPE_OPA) pri_ext = opa_is_extended_lid(pri_path->opa.dlid, pri_path->opa.slid); - cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, - cm_form_tid(cm_id_priv)); - - req_msg->local_comm_id = cm_id_priv->id.local_id; - req_msg->service_id = param->service_id; - req_msg->local_ca_guid = cm_id_priv->id.device->node_guid; - cm_req_set_local_qpn(req_msg, cpu_to_be32(param->qp_num)); - cm_req_set_init_depth(req_msg, param->initiator_depth); - cm_req_set_remote_resp_timeout(req_msg, - param->remote_cm_response_timeout); + cm_format_mad_ece_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, + cm_form_tid(cm_id_priv), param->ece.attr_mod); + + IBA_SET(CM_REQ_LOCAL_COMM_ID, req_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_REQ_SERVICE_ID, req_msg, be64_to_cpu(param->service_id)); + IBA_SET(CM_REQ_LOCAL_CA_GUID, req_msg, + be64_to_cpu(cm_id_priv->id.device->node_guid)); + IBA_SET(CM_REQ_LOCAL_QPN, req_msg, param->qp_num); + IBA_SET(CM_REQ_INITIATOR_DEPTH, req_msg, param->initiator_depth); + IBA_SET(CM_REQ_REMOTE_CM_RESPONSE_TIMEOUT, req_msg, + param->remote_cm_response_timeout); cm_req_set_qp_type(req_msg, param->qp_type); - cm_req_set_flow_ctrl(req_msg, param->flow_control); - cm_req_set_starting_psn(req_msg, cpu_to_be32(param->starting_psn)); - cm_req_set_local_resp_timeout(req_msg, - param->local_cm_response_timeout); - req_msg->pkey = param->primary_path->pkey; - cm_req_set_path_mtu(req_msg, param->primary_path->mtu); - cm_req_set_max_cm_retries(req_msg, param->max_cm_retries); + IBA_SET(CM_REQ_END_TO_END_FLOW_CONTROL, req_msg, param->flow_control); + IBA_SET(CM_REQ_STARTING_PSN, req_msg, param->starting_psn); + IBA_SET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg, + param->local_cm_response_timeout); + IBA_SET(CM_REQ_PARTITION_KEY, req_msg, + be16_to_cpu(param->primary_path->pkey)); + IBA_SET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg, + param->primary_path->mtu); + IBA_SET(CM_REQ_MAX_CM_RETRIES, req_msg, param->max_cm_retries); if (param->qp_type != IB_QPT_XRC_INI) { - cm_req_set_resp_res(req_msg, param->responder_resources); - cm_req_set_retry_count(req_msg, param->retry_count); - 
cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count); - cm_req_set_srq(req_msg, param->srq); + IBA_SET(CM_REQ_RESPONDER_RESOURCES, req_msg, + param->responder_resources); + IBA_SET(CM_REQ_RETRY_COUNT, req_msg, param->retry_count); + IBA_SET(CM_REQ_RNR_RETRY_COUNT, req_msg, + param->rnr_retry_count); + IBA_SET(CM_REQ_SRQ, req_msg, param->srq); } - req_msg->primary_local_gid = pri_path->sgid; - req_msg->primary_remote_gid = pri_path->dgid; + *IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg) = + pri_path->sgid; + *IBA_GET_MEM_PTR(CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg) = + pri_path->dgid; if (pri_ext) { - req_msg->primary_local_gid.global.interface_id - = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.slid)); - req_msg->primary_remote_gid.global.interface_id - = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.dlid)); + IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg) + ->global.interface_id = + OPA_MAKE_ID(be32_to_cpu(pri_path->opa.slid)); + IBA_GET_MEM_PTR(CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg) + ->global.interface_id = + OPA_MAKE_ID(be32_to_cpu(pri_path->opa.dlid)); } if (pri_path->hop_limit <= 1) { - req_msg->primary_local_lid = pri_ext ? 0 : - htons(ntohl(sa_path_get_slid(pri_path))); - req_msg->primary_remote_lid = pri_ext ? 0 : - htons(ntohl(sa_path_get_dlid(pri_path))); + IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg, + be16_to_cpu(pri_ext ? 0 : + htons(ntohl(sa_path_get_slid( + pri_path))))); + IBA_SET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg, + be16_to_cpu(pri_ext ? 0 : + htons(ntohl(sa_path_get_dlid( + pri_path))))); } else { + + if (param->primary_path_inbound) { + lid = param->primary_path_inbound->ib.dlid; + IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg, + be16_to_cpu(lid)); + } else + IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg, + be16_to_cpu(IB_LID_PERMISSIVE)); + /* Work-around until there's a way to obtain remote LID info */ - req_msg->primary_local_lid = IB_LID_PERMISSIVE; - req_msg->primary_remote_lid = IB_LID_PERMISSIVE; + IBA_SET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg, + be16_to_cpu(IB_LID_PERMISSIVE)); } - cm_req_set_primary_flow_label(req_msg, pri_path->flow_label); - cm_req_set_primary_packet_rate(req_msg, pri_path->rate); - req_msg->primary_traffic_class = pri_path->traffic_class; - req_msg->primary_hop_limit = pri_path->hop_limit; - cm_req_set_primary_sl(req_msg, pri_path->sl); - cm_req_set_primary_subnet_local(req_msg, (pri_path->hop_limit <= 1)); - cm_req_set_primary_local_ack_timeout(req_msg, + IBA_SET(CM_REQ_PRIMARY_FLOW_LABEL, req_msg, + be32_to_cpu(pri_path->flow_label)); + IBA_SET(CM_REQ_PRIMARY_PACKET_RATE, req_msg, pri_path->rate); + IBA_SET(CM_REQ_PRIMARY_TRAFFIC_CLASS, req_msg, pri_path->traffic_class); + IBA_SET(CM_REQ_PRIMARY_HOP_LIMIT, req_msg, pri_path->hop_limit); + IBA_SET(CM_REQ_PRIMARY_SL, req_msg, pri_path->sl); + IBA_SET(CM_REQ_PRIMARY_SUBNET_LOCAL, req_msg, + (pri_path->hop_limit <= 1)); + IBA_SET(CM_REQ_PRIMARY_LOCAL_ACK_TIMEOUT, req_msg, cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, pri_path->packet_life_time)); @@ -1349,46 +1435,60 @@ static void cm_format_req(struct cm_req_msg *req_msg, alt_ext = opa_is_extended_lid(alt_path->opa.dlid, alt_path->opa.slid); - req_msg->alt_local_gid = alt_path->sgid; - req_msg->alt_remote_gid = alt_path->dgid; + *IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_LOCAL_PORT_GID, req_msg) = + alt_path->sgid; + *IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_REMOTE_PORT_GID, req_msg) = + alt_path->dgid; if (alt_ext) { - req_msg->alt_local_gid.global.interface_id - = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.slid)); - 
req_msg->alt_remote_gid.global.interface_id - = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.dlid)); + IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_LOCAL_PORT_GID, + req_msg) + ->global.interface_id = + OPA_MAKE_ID(be32_to_cpu(alt_path->opa.slid)); + IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_REMOTE_PORT_GID, + req_msg) + ->global.interface_id = + OPA_MAKE_ID(be32_to_cpu(alt_path->opa.dlid)); } if (alt_path->hop_limit <= 1) { - req_msg->alt_local_lid = alt_ext ? 0 : - htons(ntohl(sa_path_get_slid(alt_path))); - req_msg->alt_remote_lid = alt_ext ? 0 : - htons(ntohl(sa_path_get_dlid(alt_path))); + IBA_SET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg, + be16_to_cpu( + alt_ext ? 0 : + htons(ntohl(sa_path_get_slid( + alt_path))))); + IBA_SET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, req_msg, + be16_to_cpu( + alt_ext ? 0 : + htons(ntohl(sa_path_get_dlid( + alt_path))))); } else { - req_msg->alt_local_lid = IB_LID_PERMISSIVE; - req_msg->alt_remote_lid = IB_LID_PERMISSIVE; + IBA_SET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg, + be16_to_cpu(IB_LID_PERMISSIVE)); + IBA_SET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, req_msg, + be16_to_cpu(IB_LID_PERMISSIVE)); } - cm_req_set_alt_flow_label(req_msg, - alt_path->flow_label); - cm_req_set_alt_packet_rate(req_msg, alt_path->rate); - req_msg->alt_traffic_class = alt_path->traffic_class; - req_msg->alt_hop_limit = alt_path->hop_limit; - cm_req_set_alt_sl(req_msg, alt_path->sl); - cm_req_set_alt_subnet_local(req_msg, (alt_path->hop_limit <= 1)); - cm_req_set_alt_local_ack_timeout(req_msg, + IBA_SET(CM_REQ_ALTERNATE_FLOW_LABEL, req_msg, + be32_to_cpu(alt_path->flow_label)); + IBA_SET(CM_REQ_ALTERNATE_PACKET_RATE, req_msg, alt_path->rate); + IBA_SET(CM_REQ_ALTERNATE_TRAFFIC_CLASS, req_msg, + alt_path->traffic_class); + IBA_SET(CM_REQ_ALTERNATE_HOP_LIMIT, req_msg, + alt_path->hop_limit); + IBA_SET(CM_REQ_ALTERNATE_SL, req_msg, alt_path->sl); + IBA_SET(CM_REQ_ALTERNATE_SUBNET_LOCAL, req_msg, + (alt_path->hop_limit <= 1)); + IBA_SET(CM_REQ_ALTERNATE_LOCAL_ACK_TIMEOUT, req_msg, cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, alt_path->packet_life_time)); } + IBA_SET(CM_REQ_VENDOR_ID, req_msg, param->ece.vendor_id); if (param->private_data && param->private_data_len) - memcpy(req_msg->private_data, param->private_data, - param->private_data_len); + IBA_SET_MEM(CM_REQ_PRIVATE_DATA, req_msg, param->private_data, + param->private_data_len); } static int cm_validate_req_param(struct ib_cm_req_param *param) { - /* peer-to-peer not supported */ - if (param->peer_to_peer) - return -EINVAL; - if (!param->primary_path) return -EINVAL; @@ -1411,7 +1511,9 @@ static int cm_validate_req_param(struct ib_cm_req_param *param) int ib_send_cm_req(struct ib_cm_id *cm_id, struct ib_cm_req_param *param) { + struct cm_av av = {}, alt_av = {}; struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; struct cm_req_msg *req_msg; unsigned long flags; int ret; @@ -1423,10 +1525,9 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, /* Verify that we're not in timewait. 
*/ cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); - if (cm_id->state != IB_CM_IDLE) { + if (cm_id->state != IB_CM_IDLE || WARN_ON(cm_id_priv->timewait_info)) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ret = -EINVAL; - goto out; + return -EINVAL; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -1434,22 +1535,23 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, id.local_id); if (IS_ERR(cm_id_priv->timewait_info)) { ret = PTR_ERR(cm_id_priv->timewait_info); - goto out; + cm_id_priv->timewait_info = NULL; + return ret; } ret = cm_init_av_by_path(param->primary_path, - param->ppath_sgid_attr, &cm_id_priv->av, - cm_id_priv); + param->ppath_sgid_attr, &av); if (ret) - goto error1; + return ret; if (param->alternate_path) { ret = cm_init_av_by_path(param->alternate_path, NULL, - &cm_id_priv->alt_av, cm_id_priv); - if (ret) - goto error1; + &alt_av); + if (ret) { + cm_destroy_av(&av); + return ret; + } } cm_id->service_id = param->service_id; - cm_id->service_mask = ~cpu_to_be64(0); cm_id_priv->timeout_ms = cm_convert_to_ms( param->primary_path->packet_life_time) * 2 + cm_convert_to_ms( @@ -1462,33 +1564,42 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, cm_id_priv->pkey = param->primary_path->pkey; cm_id_priv->qp_type = param->qp_type; - ret = cm_alloc_msg(cm_id_priv, &cm_id_priv->msg); - if (ret) - goto error1; + spin_lock_irqsave(&cm_id_priv->lock, flags); + + cm_move_av_from_path(&cm_id_priv->av, &av); + if (param->primary_path_outbound) + cm_id_priv->av.dlid_datapath = + be16_to_cpu(param->primary_path_outbound->ib.dlid); - req_msg = (struct cm_req_msg *) cm_id_priv->msg->mad; + if (param->alternate_path) + cm_move_av_from_path(&cm_id_priv->alt_av, &alt_av); + + msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_REQ_SENT); + if (IS_ERR(msg)) { + ret = PTR_ERR(msg); + goto out_unlock; + } + + req_msg = (struct cm_req_msg *)msg->mad; cm_format_req(req_msg, cm_id_priv, param); cm_id_priv->tid = req_msg->hdr.tid; - cm_id_priv->msg->timeout_ms = cm_id_priv->timeout_ms; - cm_id_priv->msg->context[1] = (void *) (unsigned long) IB_CM_REQ_SENT; - cm_id_priv->local_qpn = cm_req_get_local_qpn(req_msg); - cm_id_priv->rq_psn = cm_req_get_starting_psn(req_msg); + cm_id_priv->local_qpn = cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg)); + cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg)); - spin_lock_irqsave(&cm_id_priv->lock, flags); - ret = ib_post_send_mad(cm_id_priv->msg, NULL); - if (ret) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - goto error2; - } + trace_icm_send_req(&cm_id_priv->id); + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto out_free; BUG_ON(cm_id->state != IB_CM_IDLE); cm_id->state = IB_CM_REQ_SENT; spin_unlock_irqrestore(&cm_id_priv->lock, flags); return 0; - -error2: cm_free_msg(cm_id_priv->msg); -error1: kfree(cm_id_priv->timewait_info); -out: return ret; +out_free: + cm_free_priv_msg(msg); +out_unlock: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; } EXPORT_SYMBOL(ib_send_cm_req); @@ -1502,7 +1613,7 @@ static int cm_issue_rej(struct cm_port *port, struct cm_rej_msg *rej_msg, *rcv_msg; int ret; - ret = cm_alloc_response_msg(port, mad_recv_wc, &msg); + ret = cm_alloc_response_msg(port, mad_recv_wc, false, &msg); if (ret) return ret; @@ -1511,16 +1622,21 @@ static int cm_issue_rej(struct cm_port *port, rej_msg = (struct cm_rej_msg *) msg->mad; cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, rcv_msg->hdr.tid); - rej_msg->remote_comm_id = rcv_msg->local_comm_id; - 
rej_msg->local_comm_id = rcv_msg->remote_comm_id; - cm_rej_set_msg_rejected(rej_msg, msg_rejected); - rej_msg->reason = cpu_to_be16(reason); + IBA_SET(CM_REJ_REMOTE_COMM_ID, rej_msg, + IBA_GET(CM_REJ_LOCAL_COMM_ID, rcv_msg)); + IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, + IBA_GET(CM_REJ_REMOTE_COMM_ID, rcv_msg)); + IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, msg_rejected); + IBA_SET(CM_REJ_REASON, rej_msg, reason); if (ari && ari_length) { - cm_rej_set_reject_info_len(rej_msg, ari_length); - memcpy(rej_msg->ari, ari, ari_length); + IBA_SET(CM_REJ_REJECTED_INFO_LENGTH, rej_msg, ari_length); + IBA_SET_MEM(CM_REJ_ARI, rej_msg, ari, ari_length); } + trace_icm_issue_rej( + IBA_GET(CM_REJ_LOCAL_COMM_ID, rcv_msg), + IBA_GET(CM_REJ_REMOTE_COMM_ID, rcv_msg)); ret = ib_post_send_mad(msg, NULL); if (ret) cm_free_msg(msg); @@ -1528,21 +1644,15 @@ static int cm_issue_rej(struct cm_port *port, return ret; } -static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid, - __be32 local_qpn, __be32 remote_qpn) -{ - return (be64_to_cpu(local_ca_guid) > be64_to_cpu(remote_ca_guid) || - ((local_ca_guid == remote_ca_guid) && - (be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn)))); -} - static bool cm_req_has_alt_path(struct cm_req_msg *req_msg) { - return ((req_msg->alt_local_lid) || - (ib_is_opa_gid(&req_msg->alt_local_gid))); + return ((cpu_to_be16( + IBA_GET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg))) || + (ib_is_opa_gid(IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_LOCAL_PORT_GID, + req_msg)))); } -static void cm_path_set_rec_type(struct ib_device *ib_device, u8 port_num, +static void cm_path_set_rec_type(struct ib_device *ib_device, u32 port_num, struct sa_path_rec *path, union ib_gid *gid) { if (ib_is_opa_gid(gid) && rdma_cap_opa_ah(ib_device, port_num)) @@ -1553,20 +1663,23 @@ static void cm_path_set_rec_type(struct ib_device *ib_device, u8 port_num, static void cm_format_path_lid_from_req(struct cm_req_msg *req_msg, struct sa_path_rec *primary_path, - struct sa_path_rec *alt_path) + struct sa_path_rec *alt_path, + struct ib_wc *wc) { u32 lid; if (primary_path->rec_type != SA_PATH_REC_TYPE_OPA) { - sa_path_set_dlid(primary_path, - ntohs(req_msg->primary_local_lid)); + sa_path_set_dlid(primary_path, wc->slid); sa_path_set_slid(primary_path, - ntohs(req_msg->primary_remote_lid)); + IBA_GET(CM_REQ_PRIMARY_REMOTE_PORT_LID, + req_msg)); } else { - lid = opa_get_lid_from_gid(&req_msg->primary_local_gid); + lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR( + CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg)); sa_path_set_dlid(primary_path, lid); - lid = opa_get_lid_from_gid(&req_msg->primary_remote_gid); + lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR( + CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg)); sa_path_set_slid(primary_path, lid); } @@ -1574,77 +1687,98 @@ static void cm_format_path_lid_from_req(struct cm_req_msg *req_msg, return; if (alt_path->rec_type != SA_PATH_REC_TYPE_OPA) { - sa_path_set_dlid(alt_path, ntohs(req_msg->alt_local_lid)); - sa_path_set_slid(alt_path, ntohs(req_msg->alt_remote_lid)); + sa_path_set_dlid(alt_path, + IBA_GET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, + req_msg)); + sa_path_set_slid(alt_path, + IBA_GET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, + req_msg)); } else { - lid = opa_get_lid_from_gid(&req_msg->alt_local_gid); + lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR( + CM_REQ_ALTERNATE_LOCAL_PORT_GID, req_msg)); sa_path_set_dlid(alt_path, lid); - lid = opa_get_lid_from_gid(&req_msg->alt_remote_gid); + lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR( + CM_REQ_ALTERNATE_REMOTE_PORT_GID, req_msg)); sa_path_set_slid(alt_path, 
lid); } } static void cm_format_paths_from_req(struct cm_req_msg *req_msg, struct sa_path_rec *primary_path, - struct sa_path_rec *alt_path) + struct sa_path_rec *alt_path, + struct ib_wc *wc) { - primary_path->dgid = req_msg->primary_local_gid; - primary_path->sgid = req_msg->primary_remote_gid; - primary_path->flow_label = cm_req_get_primary_flow_label(req_msg); - primary_path->hop_limit = req_msg->primary_hop_limit; - primary_path->traffic_class = req_msg->primary_traffic_class; + primary_path->dgid = + *IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg); + primary_path->sgid = + *IBA_GET_MEM_PTR(CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg); + primary_path->flow_label = + cpu_to_be32(IBA_GET(CM_REQ_PRIMARY_FLOW_LABEL, req_msg)); + primary_path->hop_limit = IBA_GET(CM_REQ_PRIMARY_HOP_LIMIT, req_msg); + primary_path->traffic_class = + IBA_GET(CM_REQ_PRIMARY_TRAFFIC_CLASS, req_msg); primary_path->reversible = 1; - primary_path->pkey = req_msg->pkey; - primary_path->sl = cm_req_get_primary_sl(req_msg); + primary_path->pkey = + cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg)); + primary_path->sl = IBA_GET(CM_REQ_PRIMARY_SL, req_msg); primary_path->mtu_selector = IB_SA_EQ; - primary_path->mtu = cm_req_get_path_mtu(req_msg); + primary_path->mtu = IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg); primary_path->rate_selector = IB_SA_EQ; - primary_path->rate = cm_req_get_primary_packet_rate(req_msg); + primary_path->rate = IBA_GET(CM_REQ_PRIMARY_PACKET_RATE, req_msg); primary_path->packet_life_time_selector = IB_SA_EQ; primary_path->packet_life_time = - cm_req_get_primary_local_ack_timeout(req_msg); + IBA_GET(CM_REQ_PRIMARY_LOCAL_ACK_TIMEOUT, req_msg); primary_path->packet_life_time -= (primary_path->packet_life_time > 0); - primary_path->service_id = req_msg->service_id; + primary_path->service_id = + cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg)); if (sa_path_is_roce(primary_path)) primary_path->roce.route_resolved = false; if (cm_req_has_alt_path(req_msg)) { - alt_path->dgid = req_msg->alt_local_gid; - alt_path->sgid = req_msg->alt_remote_gid; - alt_path->flow_label = cm_req_get_alt_flow_label(req_msg); - alt_path->hop_limit = req_msg->alt_hop_limit; - alt_path->traffic_class = req_msg->alt_traffic_class; + alt_path->dgid = *IBA_GET_MEM_PTR( + CM_REQ_ALTERNATE_LOCAL_PORT_GID, req_msg); + alt_path->sgid = *IBA_GET_MEM_PTR( + CM_REQ_ALTERNATE_REMOTE_PORT_GID, req_msg); + alt_path->flow_label = cpu_to_be32( + IBA_GET(CM_REQ_ALTERNATE_FLOW_LABEL, req_msg)); + alt_path->hop_limit = + IBA_GET(CM_REQ_ALTERNATE_HOP_LIMIT, req_msg); + alt_path->traffic_class = + IBA_GET(CM_REQ_ALTERNATE_TRAFFIC_CLASS, req_msg); alt_path->reversible = 1; - alt_path->pkey = req_msg->pkey; - alt_path->sl = cm_req_get_alt_sl(req_msg); + alt_path->pkey = + cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg)); + alt_path->sl = IBA_GET(CM_REQ_ALTERNATE_SL, req_msg); alt_path->mtu_selector = IB_SA_EQ; - alt_path->mtu = cm_req_get_path_mtu(req_msg); + alt_path->mtu = + IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg); alt_path->rate_selector = IB_SA_EQ; - alt_path->rate = cm_req_get_alt_packet_rate(req_msg); + alt_path->rate = IBA_GET(CM_REQ_ALTERNATE_PACKET_RATE, req_msg); alt_path->packet_life_time_selector = IB_SA_EQ; alt_path->packet_life_time = - cm_req_get_alt_local_ack_timeout(req_msg); + IBA_GET(CM_REQ_ALTERNATE_LOCAL_ACK_TIMEOUT, req_msg); alt_path->packet_life_time -= (alt_path->packet_life_time > 0); - alt_path->service_id = req_msg->service_id; + alt_path->service_id = + cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, 
req_msg)); if (sa_path_is_roce(alt_path)) alt_path->roce.route_resolved = false; } - cm_format_path_lid_from_req(req_msg, primary_path, alt_path); + cm_format_path_lid_from_req(req_msg, primary_path, alt_path, wc); } static u16 cm_get_bth_pkey(struct cm_work *work) { struct ib_device *ib_dev = work->port->cm_dev->ib_device; - u8 port_num = work->port->port_num; + u32 port_num = work->port->port_num; u16 pkey_index = work->mad_recv_wc->wc->pkey_index; u16 pkey; int ret; ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey); if (ret) { - dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %d, pkey index %d). %d\n", + dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %u, pkey index %u). %d\n", port_num, pkey_index, ret); return 0; } @@ -1653,7 +1787,7 @@ static u16 cm_get_bth_pkey(struct cm_work *work) } /** - * Convert OPA SGID to IB SGID + * cm_opa_to_ib_sgid - Convert OPA SGID to IB SGID * ULPs (such as IPoIB) do not understand OPA GIDs and will * reject them as the local_gid will not match the sgid. Therefore, * change the pathrec's SGID to an IB SGID. @@ -1665,7 +1799,7 @@ static void cm_opa_to_ib_sgid(struct cm_work *work, struct sa_path_rec *path) { struct ib_device *dev = work->port->cm_dev->ib_device; - u8 port_num = work->port->port_num; + u32 port_num = work->port->port_num; if (rdma_cap_opa_ah(dev, port_num) && (ib_is_opa_gid(&path->sgid))) { @@ -1701,23 +1835,28 @@ static void cm_format_req_event(struct cm_work *work, } else { param->alternate_path = NULL; } - param->remote_ca_guid = req_msg->local_ca_guid; - param->remote_qkey = be32_to_cpu(req_msg->local_qkey); - param->remote_qpn = be32_to_cpu(cm_req_get_local_qpn(req_msg)); + param->remote_ca_guid = + cpu_to_be64(IBA_GET(CM_REQ_LOCAL_CA_GUID, req_msg)); + param->remote_qkey = IBA_GET(CM_REQ_LOCAL_Q_KEY, req_msg); + param->remote_qpn = IBA_GET(CM_REQ_LOCAL_QPN, req_msg); param->qp_type = cm_req_get_qp_type(req_msg); - param->starting_psn = be32_to_cpu(cm_req_get_starting_psn(req_msg)); - param->responder_resources = cm_req_get_init_depth(req_msg); - param->initiator_depth = cm_req_get_resp_res(req_msg); + param->starting_psn = IBA_GET(CM_REQ_STARTING_PSN, req_msg); + param->responder_resources = IBA_GET(CM_REQ_INITIATOR_DEPTH, req_msg); + param->initiator_depth = IBA_GET(CM_REQ_RESPONDER_RESOURCES, req_msg); param->local_cm_response_timeout = - cm_req_get_remote_resp_timeout(req_msg); - param->flow_control = cm_req_get_flow_ctrl(req_msg); + IBA_GET(CM_REQ_REMOTE_CM_RESPONSE_TIMEOUT, req_msg); + param->flow_control = IBA_GET(CM_REQ_END_TO_END_FLOW_CONTROL, req_msg); param->remote_cm_response_timeout = - cm_req_get_local_resp_timeout(req_msg); - param->retry_count = cm_req_get_retry_count(req_msg); - param->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg); - param->srq = cm_req_get_srq(req_msg); + IBA_GET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg); + param->retry_count = IBA_GET(CM_REQ_RETRY_COUNT, req_msg); + param->rnr_retry_count = IBA_GET(CM_REQ_RNR_RETRY_COUNT, req_msg); + param->srq = IBA_GET(CM_REQ_SRQ, req_msg); param->ppath_sgid_attr = cm_id_priv->av.ah_attr.grh.sgid_attr; - work->cm_event.private_data = &req_msg->private_data; + param->ece.vendor_id = IBA_GET(CM_REQ_VENDOR_ID, req_msg); + param->ece.attr_mod = be32_to_cpu(req_msg->hdr.attr_mod); + + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_REQ_PRIVATE_DATA, req_msg); } static void cm_process_work(struct cm_id_private *cm_id_priv, @@ -1747,58 +1886,67 @@ static 
void cm_process_work(struct cm_id_private *cm_id_priv, static void cm_format_mra(struct cm_mra_msg *mra_msg, struct cm_id_private *cm_id_priv, - enum cm_msg_response msg_mraed, u8 service_timeout, + enum cm_msg_response msg_mraed, const void *private_data, u8 private_data_len) { cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid); - cm_mra_set_msg_mraed(mra_msg, msg_mraed); - mra_msg->local_comm_id = cm_id_priv->id.local_id; - mra_msg->remote_comm_id = cm_id_priv->id.remote_id; - cm_mra_set_service_timeout(mra_msg, service_timeout); + IBA_SET(CM_MRA_MESSAGE_MRAED, mra_msg, msg_mraed); + IBA_SET(CM_MRA_LOCAL_COMM_ID, mra_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_MRA_REMOTE_COMM_ID, mra_msg, + be32_to_cpu(cm_id_priv->id.remote_id)); + IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, CM_MRA_SETTING); if (private_data && private_data_len) - memcpy(mra_msg->private_data, private_data, private_data_len); + IBA_SET_MEM(CM_MRA_PRIVATE_DATA, mra_msg, private_data, + private_data_len); } static void cm_format_rej(struct cm_rej_msg *rej_msg, struct cm_id_private *cm_id_priv, - enum ib_cm_rej_reason reason, - void *ari, - u8 ari_length, - const void *private_data, - u8 private_data_len) + enum ib_cm_rej_reason reason, void *ari, + u8 ari_length, const void *private_data, + u8 private_data_len, enum ib_cm_state state) { + lockdep_assert_held(&cm_id_priv->lock); + cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, cm_id_priv->tid); - rej_msg->remote_comm_id = cm_id_priv->id.remote_id; + IBA_SET(CM_REJ_REMOTE_COMM_ID, rej_msg, + be32_to_cpu(cm_id_priv->id.remote_id)); - switch(cm_id_priv->id.state) { + switch (state) { case IB_CM_REQ_RCVD: - rej_msg->local_comm_id = 0; - cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ); + IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, be32_to_cpu(0)); + IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_REQ); break; case IB_CM_MRA_REQ_SENT: - rej_msg->local_comm_id = cm_id_priv->id.local_id; - cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ); + IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_REQ); break; case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: - rej_msg->local_comm_id = cm_id_priv->id.local_id; - cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REP); + IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_REP); break; default: - rej_msg->local_comm_id = cm_id_priv->id.local_id; - cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_OTHER); + IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, + CM_MSG_RESPONSE_OTHER); break; } - rej_msg->reason = cpu_to_be16(reason); + IBA_SET(CM_REJ_REASON, rej_msg, reason); if (ari && ari_length) { - cm_rej_set_reject_info_len(rej_msg, ari_length); - memcpy(rej_msg->ari, ari, ari_length); + IBA_SET(CM_REJ_REJECTED_INFO_LENGTH, rej_msg, ari_length); + IBA_SET_MEM(CM_REJ_ARI, rej_msg, ari, ari_length); } if (private_data && private_data_len) - memcpy(rej_msg->private_data, private_data, private_data_len); + IBA_SET_MEM(CM_REJ_PRIVATE_DATA, rej_msg, private_data, + private_data_len); } static void cm_dup_req_handler(struct cm_work *work, @@ -1807,14 +1955,18 @@ static void cm_dup_req_handler(struct cm_work *work, struct ib_mad_send_buf *msg = NULL; int ret; - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
- counter[CM_REQ_COUNTER]); + atomic_long_inc( + &work->port->counters[CM_RECV_DUPLICATES][CM_REQ_COUNTER]); /* Quick state check to discard duplicate REQs. */ - if (cm_id_priv->id.state == IB_CM_REQ_RCVD) + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state == IB_CM_REQ_RCVD) { + spin_unlock_irq(&cm_id_priv->lock); return; + } + spin_unlock_irq(&cm_id_priv->lock); - ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); + ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, true, &msg); if (ret) return; @@ -1822,19 +1974,21 @@ static void cm_dup_req_handler(struct cm_work *work, switch (cm_id_priv->id.state) { case IB_CM_MRA_REQ_SENT: cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, - CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout, + CM_MSG_RESPONSE_REQ, cm_id_priv->private_data, cm_id_priv->private_data_len); break; case IB_CM_TIMEWAIT: - cm_format_rej((struct cm_rej_msg *) msg->mad, cm_id_priv, - IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0); + cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv, + IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0, + IB_CM_TIMEWAIT); break; default: goto unlock; } spin_unlock_irq(&cm_id_priv->lock); + trace_icm_send_dup_req(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) goto free; @@ -1844,13 +1998,12 @@ unlock: spin_unlock_irq(&cm_id_priv->lock); free: cm_free_msg(msg); } -static struct cm_id_private * cm_match_req(struct cm_work *work, - struct cm_id_private *cm_id_priv) +static struct cm_id_private *cm_match_req(struct cm_work *work, + struct cm_id_private *cm_id_priv) { struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv; struct cm_timewait_info *timewait_info; struct cm_req_msg *req_msg; - struct ib_cm_id *cm_id; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; @@ -1858,7 +2011,7 @@ static struct cm_id_private * cm_match_req(struct cm_work *work, spin_lock_irq(&cm.lock); timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info); if (timewait_info) { - cur_cm_id_priv = cm_get_id(timewait_info->work.local_id, + cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id, timewait_info->work.remote_id); spin_unlock_irq(&cm.lock); if (cur_cm_id_priv) { @@ -1871,8 +2024,8 @@ static struct cm_id_private * cm_match_req(struct cm_work *work, /* Check for stale connections. */ timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); if (timewait_info) { - cm_cleanup_timewait(cm_id_priv->timewait_info); - cur_cm_id_priv = cm_get_id(timewait_info->work.local_id, + cm_remove_remote(cm_id_priv); + cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id, timewait_info->work.remote_id); spin_unlock_irq(&cm.lock); @@ -1880,30 +2033,25 @@ static struct cm_id_private * cm_match_req(struct cm_work *work, IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ, NULL, 0); if (cur_cm_id_priv) { - cm_id = &cur_cm_id_priv->id; - ib_send_cm_dreq(cm_id, NULL, 0); + ib_send_cm_dreq(&cur_cm_id_priv->id, NULL, 0); cm_deref_id(cur_cm_id_priv); } return NULL; } /* Find matching listen request. 
*/ - listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device, - req_msg->service_id); + listen_cm_id_priv = cm_find_listen( + cm_id_priv->id.device, + cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg))); if (!listen_cm_id_priv) { - cm_cleanup_timewait(cm_id_priv->timewait_info); + cm_remove_remote(cm_id_priv); spin_unlock_irq(&cm.lock); cm_issue_rej(work->port, work->mad_recv_wc, IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ, NULL, 0); - goto out; + return NULL; } - atomic_inc(&listen_cm_id_priv->refcount); - atomic_inc(&cm_id_priv->refcount); - cm_id_priv->id.state = IB_CM_REQ_RCVD; - atomic_inc(&cm_id_priv->work_count); spin_unlock_irq(&cm.lock); -out: return listen_cm_id_priv; } @@ -1914,30 +2062,37 @@ out: */ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) { - if (!cm_req_get_primary_subnet_local(req_msg)) { - if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) { - req_msg->primary_local_lid = ib_lid_be16(wc->slid); - cm_req_set_primary_sl(req_msg, wc->sl); + if (!IBA_GET(CM_REQ_PRIMARY_SUBNET_LOCAL, req_msg)) { + if (cpu_to_be16(IBA_GET(CM_REQ_PRIMARY_LOCAL_PORT_LID, + req_msg)) == IB_LID_PERMISSIVE) { + IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg, + be16_to_cpu(ib_lid_be16(wc->slid))); + IBA_SET(CM_REQ_PRIMARY_SL, req_msg, wc->sl); } - if (req_msg->primary_remote_lid == IB_LID_PERMISSIVE) - req_msg->primary_remote_lid = cpu_to_be16(wc->dlid_path_bits); + if (cpu_to_be16(IBA_GET(CM_REQ_PRIMARY_REMOTE_PORT_LID, + req_msg)) == IB_LID_PERMISSIVE) + IBA_SET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg, + wc->dlid_path_bits); } - if (!cm_req_get_alt_subnet_local(req_msg)) { - if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) { - req_msg->alt_local_lid = ib_lid_be16(wc->slid); - cm_req_set_alt_sl(req_msg, wc->sl); + if (!IBA_GET(CM_REQ_ALTERNATE_SUBNET_LOCAL, req_msg)) { + if (cpu_to_be16(IBA_GET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, + req_msg)) == IB_LID_PERMISSIVE) { + IBA_SET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg, + be16_to_cpu(ib_lid_be16(wc->slid))); + IBA_SET(CM_REQ_ALTERNATE_SL, req_msg, wc->sl); } - if (req_msg->alt_remote_lid == IB_LID_PERMISSIVE) - req_msg->alt_remote_lid = cpu_to_be16(wc->dlid_path_bits); + if (cpu_to_be16(IBA_GET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, + req_msg)) == IB_LID_PERMISSIVE) + IBA_SET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, req_msg, + wc->dlid_path_bits); } } static int cm_req_handler(struct cm_work *work) { - struct ib_cm_id *cm_id; struct cm_id_private *cm_id_priv, *listen_cm_id_priv; struct cm_req_msg *req_msg; const struct ib_global_route *grh; @@ -1946,12 +2101,32 @@ static int cm_req_handler(struct cm_work *work) req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; - cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL); - if (IS_ERR(cm_id)) - return PTR_ERR(cm_id); + cm_id_priv = + cm_alloc_id_priv(work->port->cm_dev->ib_device, NULL, NULL); + if (IS_ERR(cm_id_priv)) + return PTR_ERR(cm_id_priv); + + cm_id_priv->id.remote_id = + cpu_to_be32(IBA_GET(CM_REQ_LOCAL_COMM_ID, req_msg)); + cm_id_priv->id.service_id = + cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg)); + cm_id_priv->tid = req_msg->hdr.tid; + cm_id_priv->timeout_ms = cm_convert_to_ms( + IBA_GET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg)); + cm_id_priv->max_cm_retries = IBA_GET(CM_REQ_MAX_CM_RETRIES, req_msg); + cm_id_priv->remote_qpn = + cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg)); + cm_id_priv->initiator_depth = + IBA_GET(CM_REQ_RESPONDER_RESOURCES, req_msg); + cm_id_priv->responder_resources = + IBA_GET(CM_REQ_INITIATOR_DEPTH, req_msg); + 
cm_id_priv->path_mtu = IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg); + cm_id_priv->pkey = cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg)); + cm_id_priv->sq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg)); + cm_id_priv->retry_count = IBA_GET(CM_REQ_RETRY_COUNT, req_msg); + cm_id_priv->rnr_retry_count = IBA_GET(CM_REQ_RNR_RETRY_COUNT, req_msg); + cm_id_priv->qp_type = cm_req_get_qp_type(req_msg); - cm_id_priv = container_of(cm_id, struct cm_id_private, id); - cm_id_priv->id.remote_id = req_msg->local_comm_id; ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); @@ -1961,53 +2136,57 @@ static int cm_req_handler(struct cm_work *work) id.local_id); if (IS_ERR(cm_id_priv->timewait_info)) { ret = PTR_ERR(cm_id_priv->timewait_info); + cm_id_priv->timewait_info = NULL; goto destroy; } - cm_id_priv->timewait_info->work.remote_id = req_msg->local_comm_id; - cm_id_priv->timewait_info->remote_ca_guid = req_msg->local_ca_guid; - cm_id_priv->timewait_info->remote_qpn = cm_req_get_local_qpn(req_msg); + cm_id_priv->timewait_info->work.remote_id = cm_id_priv->id.remote_id; + cm_id_priv->timewait_info->remote_ca_guid = + cpu_to_be64(IBA_GET(CM_REQ_LOCAL_CA_GUID, req_msg)); + cm_id_priv->timewait_info->remote_qpn = cm_id_priv->remote_qpn; + + /* + * Note that the ID pointer is not in the xarray at this point, + * so this set is only visible to the local thread. + */ + cm_id_priv->id.state = IB_CM_REQ_RCVD; listen_cm_id_priv = cm_match_req(work, cm_id_priv); if (!listen_cm_id_priv) { - pr_debug("%s: local_id %d, no listen_cm_id_priv\n", __func__, - be32_to_cpu(cm_id->local_id)); + trace_icm_no_listener_err(&cm_id_priv->id); + cm_id_priv->id.state = IB_CM_IDLE; ret = -EINVAL; - goto free_timeinfo; + goto destroy; } - cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler; - cm_id_priv->id.context = listen_cm_id_priv->id.context; - cm_id_priv->id.service_id = req_msg->service_id; - cm_id_priv->id.service_mask = ~cpu_to_be64(0); - - cm_process_routed_req(req_msg, work->mad_recv_wc->wc); - memset(&work->path[0], 0, sizeof(work->path[0])); if (cm_req_has_alt_path(req_msg)) memset(&work->path[1], 0, sizeof(work->path[1])); grh = rdma_ah_read_grh(&cm_id_priv->av.ah_attr); gid_attr = grh->sgid_attr; - if (gid_attr && gid_attr->ndev) { + if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) { work->path[0].rec_type = sa_conv_gid_to_pathrec_type(gid_attr->gid_type); } else { - /* If no GID attribute or ndev is null, it is not RoCE. 
*/ - cm_path_set_rec_type(work->port->cm_dev->ib_device, - work->port->port_num, - &work->path[0], - &req_msg->primary_local_gid); + cm_process_routed_req(req_msg, work->mad_recv_wc->wc); + cm_path_set_rec_type( + work->port->cm_dev->ib_device, work->port->port_num, + &work->path[0], + IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, + req_msg)); } if (cm_req_has_alt_path(req_msg)) work->path[1].rec_type = work->path[0].rec_type; cm_format_paths_from_req(req_msg, &work->path[0], - &work->path[1]); + &work->path[1], work->mad_recv_wc->wc); if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) sa_path_set_dmac(&work->path[0], cm_id_priv->av.ah_attr.roce.dmac); work->path[0].hop_limit = grh->hop_limit; - ret = cm_init_av_by_path(&work->path[0], gid_attr, &cm_id_priv->av, - cm_id_priv); + + /* This destroy call is needed to pair with cm_init_av_for_response */ + cm_destroy_av(&cm_id_priv->av); + ret = cm_init_av_by_path(&work->path[0], gid_attr, &cm_id_priv->av); if (ret) { int err; @@ -2015,51 +2194,55 @@ static int cm_req_handler(struct cm_work *work) work->port->port_num, 0, &work->path[0].sgid); if (err) - ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, + ib_send_cm_rej(&cm_id_priv->id, IB_CM_REJ_INVALID_GID, NULL, 0, NULL, 0); else - ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, + ib_send_cm_rej(&cm_id_priv->id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof(work->path[0].sgid), NULL, 0); goto rejected; } + if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_IB) + cm_id_priv->av.dlid_datapath = + IBA_GET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg); + if (cm_req_has_alt_path(req_msg)) { ret = cm_init_av_by_path(&work->path[1], NULL, - &cm_id_priv->alt_av, cm_id_priv); + &cm_id_priv->alt_av); if (ret) { - ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID, + ib_send_cm_rej(&cm_id_priv->id, + IB_CM_REJ_INVALID_ALT_GID, &work->path[0].sgid, sizeof(work->path[0].sgid), NULL, 0); goto rejected; } } - cm_id_priv->tid = req_msg->hdr.tid; - cm_id_priv->timeout_ms = cm_convert_to_ms( - cm_req_get_local_resp_timeout(req_msg)); - cm_id_priv->max_cm_retries = cm_req_get_max_cm_retries(req_msg); - cm_id_priv->remote_qpn = cm_req_get_local_qpn(req_msg); - cm_id_priv->initiator_depth = cm_req_get_resp_res(req_msg); - cm_id_priv->responder_resources = cm_req_get_init_depth(req_msg); - cm_id_priv->path_mtu = cm_req_get_path_mtu(req_msg); - cm_id_priv->pkey = req_msg->pkey; - cm_id_priv->sq_psn = cm_req_get_starting_psn(req_msg); - cm_id_priv->retry_count = cm_req_get_retry_count(req_msg); - cm_id_priv->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg); - cm_id_priv->qp_type = cm_req_get_qp_type(req_msg); + cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler; + cm_id_priv->id.context = listen_cm_id_priv->id.context; cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id); - cm_process_work(cm_id_priv, work); + + /* Now MAD handlers can see the new ID */ + spin_lock_irq(&cm_id_priv->lock); + cm_finalize_id(cm_id_priv); + + /* Refcount belongs to the event, pairs with cm_process_work() */ + refcount_inc(&cm_id_priv->refcount); + cm_queue_work_unlock(cm_id_priv, work); + /* + * Since this ID was just created and was not made visible to other MAD + * handlers until the cm_finalize_id() above we know that the + * cm_process_work() will deliver the event and the listen_cm_id + * embedded in the event can be derefed here. 
+ */ cm_deref_id(listen_cm_id_priv); return 0; rejected: - atomic_dec(&cm_id_priv->refcount); cm_deref_id(listen_cm_id_priv); -free_timeinfo: - kfree(cm_id_priv->timewait_info); destroy: - ib_destroy_cm_id(cm_id); + ib_destroy_cm_id(&cm_id_priv->id); return ret; } @@ -2067,30 +2250,41 @@ static void cm_format_rep(struct cm_rep_msg *rep_msg, struct cm_id_private *cm_id_priv, struct ib_cm_rep_param *param) { - cm_format_mad_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid); - rep_msg->local_comm_id = cm_id_priv->id.local_id; - rep_msg->remote_comm_id = cm_id_priv->id.remote_id; - cm_rep_set_starting_psn(rep_msg, cpu_to_be32(param->starting_psn)); - rep_msg->resp_resources = param->responder_resources; - cm_rep_set_target_ack_delay(rep_msg, - cm_id_priv->av.port->cm_dev->ack_delay); - cm_rep_set_failover(rep_msg, param->failover_accepted); - cm_rep_set_rnr_retry_count(rep_msg, param->rnr_retry_count); - rep_msg->local_ca_guid = cm_id_priv->id.device->node_guid; + cm_format_mad_ece_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid, + param->ece.attr_mod); + IBA_SET(CM_REP_LOCAL_COMM_ID, rep_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_REP_REMOTE_COMM_ID, rep_msg, + be32_to_cpu(cm_id_priv->id.remote_id)); + IBA_SET(CM_REP_STARTING_PSN, rep_msg, param->starting_psn); + IBA_SET(CM_REP_RESPONDER_RESOURCES, rep_msg, + param->responder_resources); + IBA_SET(CM_REP_TARGET_ACK_DELAY, rep_msg, + cm_id_priv->av.port->cm_dev->ack_delay); + IBA_SET(CM_REP_FAILOVER_ACCEPTED, rep_msg, param->failover_accepted); + IBA_SET(CM_REP_RNR_RETRY_COUNT, rep_msg, param->rnr_retry_count); + IBA_SET(CM_REP_LOCAL_CA_GUID, rep_msg, + be64_to_cpu(cm_id_priv->id.device->node_guid)); if (cm_id_priv->qp_type != IB_QPT_XRC_TGT) { - rep_msg->initiator_depth = param->initiator_depth; - cm_rep_set_flow_ctrl(rep_msg, param->flow_control); - cm_rep_set_srq(rep_msg, param->srq); - cm_rep_set_local_qpn(rep_msg, cpu_to_be32(param->qp_num)); + IBA_SET(CM_REP_INITIATOR_DEPTH, rep_msg, + param->initiator_depth); + IBA_SET(CM_REP_END_TO_END_FLOW_CONTROL, rep_msg, + param->flow_control); + IBA_SET(CM_REP_SRQ, rep_msg, param->srq); + IBA_SET(CM_REP_LOCAL_QPN, rep_msg, param->qp_num); } else { - cm_rep_set_srq(rep_msg, 1); - cm_rep_set_local_eecn(rep_msg, cpu_to_be32(param->qp_num)); + IBA_SET(CM_REP_SRQ, rep_msg, 1); + IBA_SET(CM_REP_LOCAL_EE_CONTEXT_NUMBER, rep_msg, param->qp_num); } + IBA_SET(CM_REP_VENDOR_ID_L, rep_msg, param->ece.vendor_id); + IBA_SET(CM_REP_VENDOR_ID_M, rep_msg, param->ece.vendor_id >> 8); + IBA_SET(CM_REP_VENDOR_ID_H, rep_msg, param->ece.vendor_id >> 16); + if (param->private_data && param->private_data_len) - memcpy(rep_msg->private_data, param->private_data, - param->private_data_len); + IBA_SET_MEM(CM_REP_PRIVATE_DATA, rep_msg, param->private_data, + param->private_data_len); } int ib_send_cm_rep(struct ib_cm_id *cm_id, @@ -2110,36 +2304,40 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REQ_RCVD && cm_id->state != IB_CM_MRA_REQ_SENT) { - pr_debug("%s: local_comm_id %d, cm_id->state: %d\n", __func__, - be32_to_cpu(cm_id_priv->id.local_id), cm_id->state); + trace_icm_send_rep_err(cm_id_priv->id.local_id, cm_id->state); ret = -EINVAL; goto out; } - ret = cm_alloc_msg(cm_id_priv, &msg); - if (ret) + msg = cm_alloc_priv_msg_rep(cm_id_priv, IB_CM_REP_SENT, true); + if (IS_ERR(msg)) { + ret = PTR_ERR(msg); goto out; + } rep_msg = (struct cm_rep_msg *) msg->mad; cm_format_rep(rep_msg, cm_id_priv, param); - msg->timeout_ms = 
cm_id_priv->timeout_ms; - msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT; + trace_icm_send_rep(cm_id); ret = ib_post_send_mad(msg, NULL); - if (ret) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - cm_free_msg(msg); - return ret; - } + if (ret) + goto out_free; cm_id->state = IB_CM_REP_SENT; - cm_id_priv->msg = msg; cm_id_priv->initiator_depth = param->initiator_depth; cm_id_priv->responder_resources = param->responder_resources; - cm_id_priv->rq_psn = cm_rep_get_starting_psn(rep_msg); + cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REP_STARTING_PSN, rep_msg)); + WARN_ONCE(param->qp_num & 0xFF000000, + "IBTA declares QPN to be 24 bits, but it is 0x%X\n", + param->qp_num); cm_id_priv->local_qpn = cpu_to_be32(param->qp_num & 0xFFFFFF); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return 0; -out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); +out_free: + cm_free_priv_msg(msg); +out: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_rep); @@ -2150,11 +2348,14 @@ static void cm_format_rtu(struct cm_rtu_msg *rtu_msg, u8 private_data_len) { cm_format_mad_hdr(&rtu_msg->hdr, CM_RTU_ATTR_ID, cm_id_priv->tid); - rtu_msg->local_comm_id = cm_id_priv->id.local_id; - rtu_msg->remote_comm_id = cm_id_priv->id.remote_id; + IBA_SET(CM_RTU_LOCAL_COMM_ID, rtu_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_RTU_REMOTE_COMM_ID, rtu_msg, + be32_to_cpu(cm_id_priv->id.remote_id)); if (private_data && private_data_len) - memcpy(rtu_msg->private_data, private_data, private_data_len); + IBA_SET_MEM(CM_RTU_PRIVATE_DATA, rtu_msg, private_data, + private_data_len); } int ib_send_cm_rtu(struct ib_cm_id *cm_id, @@ -2178,19 +2379,21 @@ int ib_send_cm_rtu(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REP_RCVD && cm_id->state != IB_CM_MRA_REP_SENT) { - pr_debug("%s: local_id %d, cm_id->state %d\n", __func__, - be32_to_cpu(cm_id->local_id), cm_id->state); + trace_icm_send_cm_rtu_err(cm_id); ret = -EINVAL; goto error; } - ret = cm_alloc_msg(cm_id_priv, &msg); - if (ret) + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) { + ret = PTR_ERR(msg); goto error; + } cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv, private_data, private_data_len); + trace_icm_send_rtu(cm_id); ret = ib_post_send_mad(msg, NULL); if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -2217,18 +2420,25 @@ static void cm_format_rep_event(struct cm_work *work, enum ib_qp_type qp_type) rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.rep_rcvd; - param->remote_ca_guid = rep_msg->local_ca_guid; - param->remote_qkey = be32_to_cpu(rep_msg->local_qkey); + param->remote_ca_guid = + cpu_to_be64(IBA_GET(CM_REP_LOCAL_CA_GUID, rep_msg)); + param->remote_qkey = IBA_GET(CM_REP_LOCAL_Q_KEY, rep_msg); param->remote_qpn = be32_to_cpu(cm_rep_get_qpn(rep_msg, qp_type)); - param->starting_psn = be32_to_cpu(cm_rep_get_starting_psn(rep_msg)); - param->responder_resources = rep_msg->initiator_depth; - param->initiator_depth = rep_msg->resp_resources; - param->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg); - param->failover_accepted = cm_rep_get_failover(rep_msg); - param->flow_control = cm_rep_get_flow_ctrl(rep_msg); - param->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg); - param->srq = cm_rep_get_srq(rep_msg); - work->cm_event.private_data = &rep_msg->private_data; + param->starting_psn = IBA_GET(CM_REP_STARTING_PSN, rep_msg); + param->responder_resources = 
IBA_GET(CM_REP_INITIATOR_DEPTH, rep_msg); + param->initiator_depth = IBA_GET(CM_REP_RESPONDER_RESOURCES, rep_msg); + param->target_ack_delay = IBA_GET(CM_REP_TARGET_ACK_DELAY, rep_msg); + param->failover_accepted = IBA_GET(CM_REP_FAILOVER_ACCEPTED, rep_msg); + param->flow_control = IBA_GET(CM_REP_END_TO_END_FLOW_CONTROL, rep_msg); + param->rnr_retry_count = IBA_GET(CM_REP_RNR_RETRY_COUNT, rep_msg); + param->srq = IBA_GET(CM_REP_SRQ, rep_msg); + param->ece.vendor_id = IBA_GET(CM_REP_VENDOR_ID_H, rep_msg) << 16; + param->ece.vendor_id |= IBA_GET(CM_REP_VENDOR_ID_M, rep_msg) << 8; + param->ece.vendor_id |= IBA_GET(CM_REP_VENDOR_ID_L, rep_msg); + param->ece.attr_mod = be32_to_cpu(rep_msg->hdr.attr_mod); + + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_REP_PRIVATE_DATA, rep_msg); } static void cm_dup_rep_handler(struct cm_work *work) @@ -2239,14 +2449,15 @@ static void cm_dup_rep_handler(struct cm_work *work) int ret; rep_msg = (struct cm_rep_msg *) work->mad_recv_wc->recv_buf.mad; - cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, - rep_msg->local_comm_id); + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)), + cpu_to_be32(IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg))); if (!cm_id_priv) return; - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. - counter[CM_REP_COUNTER]); - ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); + atomic_long_inc( + &work->port->counters[CM_RECV_DUPLICATES][CM_REP_COUNTER]); + ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, true, &msg); if (ret) goto deref; @@ -2257,13 +2468,14 @@ static void cm_dup_rep_handler(struct cm_work *work) cm_id_priv->private_data_len); else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT) cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, - CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout, + CM_MSG_RESPONSE_REP, cm_id_priv->private_data, cm_id_priv->private_data_len); else goto unlock; spin_unlock_irq(&cm_id_priv->lock); + trace_icm_send_dup_rep(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) goto free; @@ -2280,15 +2492,15 @@ static int cm_rep_handler(struct cm_work *work) struct cm_rep_msg *rep_msg; int ret; struct cm_id_private *cur_cm_id_priv; - struct ib_cm_id *cm_id; struct cm_timewait_info *timewait_info; rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; - cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0); + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)), 0); if (!cm_id_priv) { cm_dup_rep_handler(work); - pr_debug("%s: remote_comm_id %d, no cm_id_priv\n", __func__, - be32_to_cpu(rep_msg->remote_comm_id)); + trace_icm_remote_no_priv_err( + IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); return -EINVAL; } @@ -2300,17 +2512,19 @@ static int cm_rep_handler(struct cm_work *work) case IB_CM_MRA_REQ_RCVD: break; default: - spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; - pr_debug("%s: cm_id_priv->id.state: %d, local_comm_id %d, remote_comm_id %d\n", - __func__, cm_id_priv->id.state, - be32_to_cpu(rep_msg->local_comm_id), - be32_to_cpu(rep_msg->remote_comm_id)); + trace_icm_rep_unknown_err( + IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), + IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg), + cm_id_priv->id.state); + spin_unlock_irq(&cm_id_priv->lock); goto error; } - cm_id_priv->timewait_info->work.remote_id = rep_msg->local_comm_id; - cm_id_priv->timewait_info->remote_ca_guid = rep_msg->local_ca_guid; + cm_id_priv->timewait_info->work.remote_id = + cpu_to_be32(IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg)); + 
cm_id_priv->timewait_info->remote_ca_guid = + cpu_to_be64(IBA_GET(CM_REP_LOCAL_CA_GUID, rep_msg)); cm_id_priv->timewait_info->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type); spin_lock(&cm.lock); @@ -2319,17 +2533,15 @@ static int cm_rep_handler(struct cm_work *work) spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; - pr_debug("%s: Failed to insert remote id %d\n", __func__, - be32_to_cpu(rep_msg->remote_comm_id)); + trace_icm_insert_failed_err( + IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); goto error; } /* Check for a stale connection. */ timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); if (timewait_info) { - rb_erase(&cm_id_priv->timewait_info->remote_id_node, - &cm.remote_id_table); - cm_id_priv->timewait_info->inserted_remote_id = 0; - cur_cm_id_priv = cm_get_id(timewait_info->work.local_id, + cm_remove_remote(cm_id_priv); + cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id, timewait_info->work.remote_id); spin_unlock(&cm.lock); @@ -2338,13 +2550,12 @@ static int cm_rep_handler(struct cm_work *work) IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, NULL, 0); ret = -EINVAL; - pr_debug("%s: Stale connection. local_comm_id %d, remote_comm_id %d\n", - __func__, be32_to_cpu(rep_msg->local_comm_id), - be32_to_cpu(rep_msg->remote_comm_id)); + trace_icm_staleconn_err( + IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), + IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); if (cur_cm_id_priv) { - cm_id = &cur_cm_id_priv->id; - ib_send_cm_dreq(cm_id, NULL, 0); + ib_send_cm_dreq(&cur_cm_id_priv->id, NULL, 0); cm_deref_id(cur_cm_id_priv); } @@ -2353,13 +2564,17 @@ static int cm_rep_handler(struct cm_work *work) spin_unlock(&cm.lock); cm_id_priv->id.state = IB_CM_REP_RCVD; - cm_id_priv->id.remote_id = rep_msg->local_comm_id; + cm_id_priv->id.remote_id = + cpu_to_be32(IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg)); cm_id_priv->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type); - cm_id_priv->initiator_depth = rep_msg->resp_resources; - cm_id_priv->responder_resources = rep_msg->initiator_depth; - cm_id_priv->sq_psn = cm_rep_get_starting_psn(rep_msg); - cm_id_priv->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg); - cm_id_priv->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg); + cm_id_priv->initiator_depth = + IBA_GET(CM_REP_RESPONDER_RESOURCES, rep_msg); + cm_id_priv->responder_resources = + IBA_GET(CM_REP_INITIATOR_DEPTH, rep_msg); + cm_id_priv->sq_psn = cpu_to_be32(IBA_GET(CM_REP_STARTING_PSN, rep_msg)); + cm_id_priv->rnr_retry_count = IBA_GET(CM_REP_RNR_RETRY_COUNT, rep_msg); + cm_id_priv->target_ack_delay = + IBA_GET(CM_REP_TARGET_ACK_DELAY, rep_msg); cm_id_priv->av.timeout = cm_ack_timeout(cm_id_priv->target_ack_delay, cm_id_priv->av.timeout - 1); @@ -2367,18 +2582,8 @@ static int cm_rep_handler(struct cm_work *work) cm_ack_timeout(cm_id_priv->target_ack_delay, cm_id_priv->alt_av.timeout - 1); - /* todo: handle peer_to_peer */ - - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + ib_cancel_mad(cm_id_priv->msg); + cm_queue_work_unlock(cm_id_priv, work); return 0; error: @@ -2389,7 +2594,6 @@ error: static int cm_establish_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; - int ret; /* See comment in cm_establish about lookup. 
*/ cm_id_priv = cm_acquire_id(work->local_id, work->remote_id); @@ -2402,16 +2606,8 @@ static int cm_establish_handler(struct cm_work *work) goto out; } - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + ib_cancel_mad(cm_id_priv->msg); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); @@ -2422,36 +2618,29 @@ static int cm_rtu_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_rtu_msg *rtu_msg; - int ret; rtu_msg = (struct cm_rtu_msg *)work->mad_recv_wc->recv_buf.mad; - cm_id_priv = cm_acquire_id(rtu_msg->remote_comm_id, - rtu_msg->local_comm_id); + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_RTU_REMOTE_COMM_ID, rtu_msg)), + cpu_to_be32(IBA_GET(CM_RTU_LOCAL_COMM_ID, rtu_msg))); if (!cm_id_priv) return -EINVAL; - work->cm_event.private_data = &rtu_msg->private_data; + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_RTU_PRIVATE_DATA, rtu_msg); spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_REP_SENT && cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) { spin_unlock_irq(&cm_id_priv->lock); - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. - counter[CM_RTU_COUNTER]); + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_RTU_COUNTER]); goto out; } cm_id_priv->id.state = IB_CM_ESTABLISHED; - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + ib_cancel_mad(cm_id_priv->msg); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); @@ -2465,19 +2654,42 @@ static void cm_format_dreq(struct cm_dreq_msg *dreq_msg, { cm_format_mad_hdr(&dreq_msg->hdr, CM_DREQ_ATTR_ID, cm_form_tid(cm_id_priv)); - dreq_msg->local_comm_id = cm_id_priv->id.local_id; - dreq_msg->remote_comm_id = cm_id_priv->id.remote_id; - cm_dreq_set_remote_qpn(dreq_msg, cm_id_priv->remote_qpn); + IBA_SET(CM_DREQ_LOCAL_COMM_ID, dreq_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_DREQ_REMOTE_COMM_ID, dreq_msg, + be32_to_cpu(cm_id_priv->id.remote_id)); + IBA_SET(CM_DREQ_REMOTE_QPN_EECN, dreq_msg, + be32_to_cpu(cm_id_priv->remote_qpn)); if (private_data && private_data_len) - memcpy(dreq_msg->private_data, private_data, private_data_len); + IBA_SET_MEM(CM_DREQ_PRIVATE_DATA, dreq_msg, private_data, + private_data_len); +} + +static void cm_issue_dreq(struct cm_id_private *cm_id_priv) +{ + struct ib_mad_send_buf *msg; + int ret; + + lockdep_assert_held(&cm_id_priv->lock); + + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) + return; + + cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv, NULL, 0); + + trace_icm_send_dreq(&cm_id_priv->id); + ret = ib_post_send_mad(msg, NULL); + if (ret) + cm_free_msg(msg); } -int ib_send_cm_dreq(struct ib_cm_id *cm_id, - const void *private_data, +int ib_send_cm_dreq(struct ib_cm_id *cm_id, const void *private_data, u8 private_data_len) { - struct cm_id_private *cm_id_priv; + struct cm_id_private *cm_id_priv = + container_of(cm_id, struct cm_id_private, id); struct ib_mad_send_buf *msg; unsigned long flags; int ret; @@ -2485,41 +2697,38 @@ int 
ib_send_cm_dreq(struct ib_cm_id *cm_id, if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE) return -EINVAL; - cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); - if (cm_id->state != IB_CM_ESTABLISHED) { - pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, - be32_to_cpu(cm_id->local_id), cm_id->state); + if (cm_id_priv->id.state != IB_CM_ESTABLISHED) { + trace_icm_dreq_skipped(&cm_id_priv->id); ret = -EINVAL; - goto out; + goto unlock; } - if (cm_id->lap_state == IB_CM_LAP_SENT || - cm_id->lap_state == IB_CM_MRA_LAP_RCVD) - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT || + cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) + ib_cancel_mad(cm_id_priv->msg); - ret = cm_alloc_msg(cm_id_priv, &msg); - if (ret) { + msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_DREQ_SENT); + if (IS_ERR(msg)) { cm_enter_timewait(cm_id_priv); - goto out; + ret = PTR_ERR(msg); + goto unlock; } cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv, private_data, private_data_len); - msg->timeout_ms = cm_id_priv->timeout_ms; - msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT; + trace_icm_send_dreq(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) { cm_enter_timewait(cm_id_priv); - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - cm_free_msg(msg); - return ret; + cm_free_priv_msg(msg); + goto unlock; } - cm_id->state = IB_CM_DREQ_SENT; - cm_id_priv->msg = msg; -out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_id_priv->id.state = IB_CM_DREQ_SENT; +unlock: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_dreq); @@ -2530,58 +2739,68 @@ static void cm_format_drep(struct cm_drep_msg *drep_msg, u8 private_data_len) { cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, cm_id_priv->tid); - drep_msg->local_comm_id = cm_id_priv->id.local_id; - drep_msg->remote_comm_id = cm_id_priv->id.remote_id; + IBA_SET(CM_DREP_LOCAL_COMM_ID, drep_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_DREP_REMOTE_COMM_ID, drep_msg, + be32_to_cpu(cm_id_priv->id.remote_id)); if (private_data && private_data_len) - memcpy(drep_msg->private_data, private_data, private_data_len); + IBA_SET_MEM(CM_DREP_PRIVATE_DATA, drep_msg, private_data, + private_data_len); } -int ib_send_cm_drep(struct ib_cm_id *cm_id, - const void *private_data, - u8 private_data_len) +static int cm_send_drep_locked(struct cm_id_private *cm_id_priv, + void *private_data, u8 private_data_len) { - struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; - unsigned long flags; - void *data; int ret; + lockdep_assert_held(&cm_id_priv->lock); + if (private_data && private_data_len > IB_CM_DREP_PRIVATE_DATA_SIZE) return -EINVAL; - data = cm_copy_private_data(private_data, private_data_len); - if (IS_ERR(data)) - return PTR_ERR(data); - - cm_id_priv = container_of(cm_id, struct cm_id_private, id); - spin_lock_irqsave(&cm_id_priv->lock, flags); - if (cm_id->state != IB_CM_DREQ_RCVD) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - kfree(data); - pr_debug("%s: local_id %d, cm_idcm_id->state(%d) != IB_CM_DREQ_RCVD\n", - __func__, be32_to_cpu(cm_id->local_id), cm_id->state); + if (cm_id_priv->id.state != IB_CM_DREQ_RCVD) { + trace_icm_send_drep_err(&cm_id_priv->id); + kfree(private_data); return -EINVAL; } - cm_set_private_data(cm_id_priv, data, private_data_len); + cm_set_private_data(cm_id_priv, private_data, private_data_len); 
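The hunk above splits ib_send_cm_drep() into cm_send_drep_locked(), which asserts and relies on cm_id_priv->lock being held, plus a thin exported wrapper that copies the private data, takes the lock, and delegates. The same locked-helper split is applied to the DREQ and REJ send paths elsewhere in this diff. A minimal, self-contained sketch of the pattern follows; the names (example_obj, example_send, example_send_locked) are hypothetical and only illustrate the shape of the refactor, not the cm.c code itself:

/* Sketch of the locked-helper split; illustrative names only. */
#include <linux/spinlock.h>
#include <linux/lockdep.h>
#include <linux/errno.h>
#include <linux/types.h>

struct example_obj {
	spinlock_t lock;
	int state;			/* protected by lock */
};

/* Core logic: caller must already hold obj->lock. */
static int example_send_locked(struct example_obj *obj,
			       const void *data, u8 len)
{
	lockdep_assert_held(&obj->lock);

	if (obj->state != 1)		/* reject sends from the wrong state */
		return -EINVAL;

	/* ... format and post the message while the state cannot change ... */
	return 0;
}

/* Exported entry point: only takes the lock and delegates. */
int example_send(struct example_obj *obj, const void *data, u8 len)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&obj->lock, flags);
	ret = example_send_locked(obj, data, len);
	spin_unlock_irqrestore(&obj->lock, flags);
	return ret;
}

Keeping the state check and the message formatting inside one critical section is what lets the exported wrapper stay trivial and lets internal callers that already hold the lock reuse the same helper.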
cm_enter_timewait(cm_id_priv); - ret = cm_alloc_msg(cm_id_priv, &msg); - if (ret) - goto out; + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) + return PTR_ERR(msg); cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, private_data, private_data_len); + trace_icm_send_drep(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); return ret; } + return 0; +} -out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); +int ib_send_cm_drep(struct ib_cm_id *cm_id, const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv = + container_of(cm_id, struct cm_id_private, id); + unsigned long flags; + void *data; + int ret; + + data = cm_copy_private_data(private_data, private_data_len); + if (IS_ERR(data)) + return PTR_ERR(data); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + ret = cm_send_drep_locked(cm_id_priv, data, private_data_len); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_drep); @@ -2594,7 +2813,7 @@ static int cm_issue_drep(struct cm_port *port, struct cm_drep_msg *drep_msg; int ret; - ret = cm_alloc_response_msg(port, mad_recv_wc, &msg); + ret = cm_alloc_response_msg(port, mad_recv_wc, true, &msg); if (ret) return ret; @@ -2602,9 +2821,14 @@ static int cm_issue_drep(struct cm_port *port, drep_msg = (struct cm_drep_msg *) msg->mad; cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, dreq_msg->hdr.tid); - drep_msg->remote_comm_id = dreq_msg->local_comm_id; - drep_msg->local_comm_id = dreq_msg->remote_comm_id; - + IBA_SET(CM_DREP_REMOTE_COMM_ID, drep_msg, + IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg)); + IBA_SET(CM_DREP_LOCAL_COMM_ID, drep_msg, + IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); + + trace_icm_issue_drep( + IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg), + IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); ret = ib_post_send_mad(msg, NULL); if (ret) cm_free_msg(msg); @@ -2617,43 +2841,45 @@ static int cm_dreq_handler(struct cm_work *work) struct cm_id_private *cm_id_priv; struct cm_dreq_msg *dreq_msg; struct ib_mad_send_buf *msg = NULL; - int ret; dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad; - cm_id_priv = cm_acquire_id(dreq_msg->remote_comm_id, - dreq_msg->local_comm_id); + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)), + cpu_to_be32(IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg))); if (!cm_id_priv) { - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
- counter[CM_DREQ_COUNTER]); + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_DREQ_COUNTER]); cm_issue_drep(work->port, work->mad_recv_wc); - pr_debug("%s: no cm_id_priv, local_comm_id %d, remote_comm_id %d\n", - __func__, be32_to_cpu(dreq_msg->local_comm_id), - be32_to_cpu(dreq_msg->remote_comm_id)); + trace_icm_no_priv_err( + IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg), + IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); return -EINVAL; } - work->cm_event.private_data = &dreq_msg->private_data; + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_DREQ_PRIVATE_DATA, dreq_msg); spin_lock_irq(&cm_id_priv->lock); - if (cm_id_priv->local_qpn != cm_dreq_get_remote_qpn(dreq_msg)) + if (cm_id_priv->local_qpn != + cpu_to_be32(IBA_GET(CM_DREQ_REMOTE_QPN_EECN, dreq_msg))) goto unlock; switch (cm_id_priv->id.state) { case IB_CM_REP_SENT: case IB_CM_DREQ_SENT: - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + case IB_CM_MRA_REP_RCVD: + ib_cancel_mad(cm_id_priv->msg); break; case IB_CM_ESTABLISHED: if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT || cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - break; - case IB_CM_MRA_REP_RCVD: + ib_cancel_mad(cm_id_priv->msg); break; case IB_CM_TIMEWAIT: - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. - counter[CM_DREQ_COUNTER]); - msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_DREQ_COUNTER]); + msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc, + true); if (IS_ERR(msg)) goto unlock; @@ -2667,26 +2893,16 @@ static int cm_dreq_handler(struct cm_work *work) cm_free_msg(msg); goto deref; case IB_CM_DREQ_RCVD: - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
- counter[CM_DREQ_COUNTER]); + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_DREQ_COUNTER]); goto unlock; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_dreq_unknown_err(&cm_id_priv->id); goto unlock; } cm_id_priv->id.state = IB_CM_DREQ_RCVD; cm_id_priv->tid = dreq_msg->hdr.tid; - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; unlock: spin_unlock_irq(&cm_id_priv->lock); @@ -2698,15 +2914,16 @@ static int cm_drep_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_drep_msg *drep_msg; - int ret; drep_msg = (struct cm_drep_msg *)work->mad_recv_wc->recv_buf.mad; - cm_id_priv = cm_acquire_id(drep_msg->remote_comm_id, - drep_msg->local_comm_id); + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_DREP_REMOTE_COMM_ID, drep_msg)), + cpu_to_be32(IBA_GET(CM_DREP_LOCAL_COMM_ID, drep_msg))); if (!cm_id_priv) return -EINVAL; - work->cm_event.private_data = &drep_msg->private_data; + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_DREP_PRIVATE_DATA, drep_msg); spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_DREQ_SENT && @@ -2716,81 +2933,83 @@ static int cm_drep_handler(struct cm_work *work) } cm_enter_timewait(cm_id_priv); - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + ib_cancel_mad(cm_id_priv->msg); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } -int ib_send_cm_rej(struct ib_cm_id *cm_id, - enum ib_cm_rej_reason reason, - void *ari, - u8 ari_length, - const void *private_data, - u8 private_data_len) +static int cm_send_rej_locked(struct cm_id_private *cm_id_priv, + enum ib_cm_rej_reason reason, void *ari, + u8 ari_length, const void *private_data, + u8 private_data_len) { - struct cm_id_private *cm_id_priv; + enum ib_cm_state state = cm_id_priv->id.state; struct ib_mad_send_buf *msg; - unsigned long flags; int ret; + lockdep_assert_held(&cm_id_priv->lock); + if ((private_data && private_data_len > IB_CM_REJ_PRIVATE_DATA_SIZE) || (ari && ari_length > IB_CM_REJ_ARI_LENGTH)) return -EINVAL; - cm_id_priv = container_of(cm_id, struct cm_id_private, id); + trace_icm_send_rej(&cm_id_priv->id, reason); - spin_lock_irqsave(&cm_id_priv->lock, flags); - switch (cm_id->state) { + switch (state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: case IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: - ret = cm_alloc_msg(cm_id_priv, &msg); - if (!ret) - cm_format_rej((struct cm_rej_msg *) msg->mad, - cm_id_priv, reason, ari, ari_length, - private_data, private_data_len); - cm_reset_to_idle(cm_id_priv); + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) + return PTR_ERR(msg); + cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv, reason, + ari, ari_length, private_data, private_data_len, + state); break; case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: - ret = cm_alloc_msg(cm_id_priv, &msg); - if (!ret) - cm_format_rej((struct cm_rej_msg *) msg->mad, - 
cm_id_priv, reason, ari, ari_length, - private_data, private_data_len); - cm_enter_timewait(cm_id_priv); + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) + return PTR_ERR(msg); + cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv, reason, + ari, ari_length, private_data, private_data_len, + state); break; default: - pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, - be32_to_cpu(cm_id_priv->id.local_id), cm_id->state); - ret = -EINVAL; - goto out; + trace_icm_send_unknown_rej_err(&cm_id_priv->id); + return -EINVAL; } - if (ret) - goto out; - ret = ib_post_send_mad(msg, NULL); - if (ret) + if (ret) { cm_free_msg(msg); + return ret; + } -out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return 0; +} + +int ib_send_cm_rej(struct ib_cm_id *cm_id, enum ib_cm_rej_reason reason, + void *ari, u8 ari_length, const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv = + container_of(cm_id, struct cm_id_private, id); + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + ret = cm_send_rej_locked(cm_id_priv, reason, ari, ari_length, + private_data, private_data_len); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_rej); @@ -2802,42 +3021,33 @@ static void cm_format_rej_event(struct cm_work *work) rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.rej_rcvd; - param->ari = rej_msg->ari; - param->ari_length = cm_rej_get_reject_info_len(rej_msg); - param->reason = __be16_to_cpu(rej_msg->reason); - work->cm_event.private_data = &rej_msg->private_data; + param->ari = IBA_GET_MEM_PTR(CM_REJ_ARI, rej_msg); + param->ari_length = IBA_GET(CM_REJ_REJECTED_INFO_LENGTH, rej_msg); + param->reason = IBA_GET(CM_REJ_REASON, rej_msg); + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_REJ_PRIVATE_DATA, rej_msg); } -static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg) +static struct cm_id_private *cm_acquire_rejected_id(struct cm_rej_msg *rej_msg) { - struct cm_timewait_info *timewait_info; struct cm_id_private *cm_id_priv; __be32 remote_id; - remote_id = rej_msg->local_comm_id; - - if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_TIMEOUT) { - spin_lock_irq(&cm.lock); - timewait_info = cm_find_remote_id( *((__be64 *) rej_msg->ari), - remote_id); - if (!timewait_info) { - spin_unlock_irq(&cm.lock); - return NULL; - } - cm_id_priv = idr_find(&cm.local_id_table, (__force int) - (timewait_info->work.local_id ^ - cm.random_id_operand)); - if (cm_id_priv) { - if (cm_id_priv->id.remote_id == remote_id) - atomic_inc(&cm_id_priv->refcount); - else - cm_id_priv = NULL; - } - spin_unlock_irq(&cm.lock); - } else if (cm_rej_get_msg_rejected(rej_msg) == CM_MSG_RESPONSE_REQ) - cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, 0); + remote_id = cpu_to_be32(IBA_GET(CM_REJ_LOCAL_COMM_ID, rej_msg)); + + if (IBA_GET(CM_REJ_REASON, rej_msg) == IB_CM_REJ_TIMEOUT) { + cm_id_priv = cm_find_remote_id( + *((__be64 *)IBA_GET_MEM_PTR(CM_REJ_ARI, rej_msg)), + remote_id); + } else if (IBA_GET(CM_REJ_MESSAGE_REJECTED, rej_msg) == + CM_MSG_RESPONSE_REQ) + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_REJ_REMOTE_COMM_ID, rej_msg)), + 0); else - cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, remote_id); + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_REJ_REMOTE_COMM_ID, rej_msg)), + remote_id); return cm_id_priv; } @@ -2846,7 +3056,6 @@ static int cm_rej_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_rej_msg 
*rej_msg; - int ret; rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_rejected_id(rej_msg); @@ -2861,18 +3070,18 @@ static int cm_rej_handler(struct cm_work *work) case IB_CM_MRA_REQ_RCVD: case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - /* fall through */ + ib_cancel_mad(cm_id_priv->msg); + fallthrough; case IB_CM_REQ_RCVD: case IB_CM_MRA_REQ_SENT: - if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_STALE_CONN) + if (IBA_GET(CM_REJ_REASON, rej_msg) == IB_CM_REJ_STALE_CONN) cm_enter_timewait(cm_id_priv); else cm_reset_to_idle(cm_id_priv); break; case IB_CM_DREQ_SENT: - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - /* fall through */ + ib_cancel_mad(cm_id_priv->msg); + fallthrough; case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: cm_enter_timewait(cm_id_priv); @@ -2881,127 +3090,79 @@ static int cm_rej_handler(struct cm_work *work) if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT || cm_id_priv->id.lap_state == IB_CM_LAP_SENT) { if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT) - ib_cancel_mad(cm_id_priv->av.port->mad_agent, - cm_id_priv->msg); + ib_cancel_mad(cm_id_priv->msg); cm_enter_timewait(cm_id_priv); break; } - /* fall through */ + fallthrough; default: + trace_icm_rej_unknown_err(&cm_id_priv->id); spin_unlock_irq(&cm_id_priv->lock); - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); - ret = -EINVAL; goto out; } - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); return -EINVAL; } -int ib_send_cm_mra(struct ib_cm_id *cm_id, - u8 service_timeout, - const void *private_data, - u8 private_data_len) +int ib_prepare_cm_mra(struct ib_cm_id *cm_id) { struct cm_id_private *cm_id_priv; - struct ib_mad_send_buf *msg; enum ib_cm_state cm_state; enum ib_cm_lap_state lap_state; - enum cm_msg_response msg_response; - void *data; unsigned long flags; - int ret; - - if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE) - return -EINVAL; - - data = cm_copy_private_data(private_data, private_data_len); - if (IS_ERR(data)) - return PTR_ERR(data); + int ret = 0; cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); - switch(cm_id_priv->id.state) { + switch (cm_id_priv->id.state) { case IB_CM_REQ_RCVD: cm_state = IB_CM_MRA_REQ_SENT; lap_state = cm_id->lap_state; - msg_response = CM_MSG_RESPONSE_REQ; break; case IB_CM_REP_RCVD: cm_state = IB_CM_MRA_REP_SENT; lap_state = cm_id->lap_state; - msg_response = CM_MSG_RESPONSE_REP; break; case IB_CM_ESTABLISHED: if (cm_id->lap_state == IB_CM_LAP_RCVD) { cm_state = cm_id->state; lap_state = IB_CM_MRA_LAP_SENT; - msg_response = CM_MSG_RESPONSE_OTHER; break; } - /* fall through */ + fallthrough; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_prepare_mra_unknown_err(&cm_id_priv->id); ret = -EINVAL; - goto error1; - } - - if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) { - ret = cm_alloc_msg(cm_id_priv, &msg); - if (ret) - goto error1; - - cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, - msg_response, service_timeout, - 
private_data, private_data_len); - ret = ib_post_send_mad(msg, NULL); - if (ret) - goto error2; + goto error_unlock; } cm_id->state = cm_state; cm_id->lap_state = lap_state; - cm_id_priv->service_timeout = service_timeout; - cm_set_private_data(cm_id_priv, data, private_data_len); - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - return 0; - -error1: spin_unlock_irqrestore(&cm_id_priv->lock, flags); - kfree(data); - return ret; + cm_set_private_data(cm_id_priv, NULL, 0); -error2: spin_unlock_irqrestore(&cm_id_priv->lock, flags); - kfree(data); - cm_free_msg(msg); +error_unlock: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } -EXPORT_SYMBOL(ib_send_cm_mra); +EXPORT_SYMBOL(ib_prepare_cm_mra); -static struct cm_id_private * cm_acquire_mraed_id(struct cm_mra_msg *mra_msg) +static struct cm_id_private *cm_acquire_mraed_id(struct cm_mra_msg *mra_msg) { - switch (cm_mra_get_msg_mraed(mra_msg)) { + switch (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg)) { case CM_MSG_RESPONSE_REQ: - return cm_acquire_id(mra_msg->remote_comm_id, 0); + return cm_acquire_id( + cpu_to_be32(IBA_GET(CM_MRA_REMOTE_COMM_ID, mra_msg)), + 0); case CM_MSG_RESPONSE_REP: case CM_MSG_RESPONSE_OTHER: - return cm_acquire_id(mra_msg->remote_comm_id, - mra_msg->local_comm_id); + return cm_acquire_id( + cpu_to_be32(IBA_GET(CM_MRA_REMOTE_COMM_ID, mra_msg)), + cpu_to_be32(IBA_GET(CM_MRA_LOCAL_COMM_ID, mra_msg))); default: return NULL; } @@ -3011,71 +3172,62 @@ static int cm_mra_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_mra_msg *mra_msg; - int timeout, ret; + int timeout; mra_msg = (struct cm_mra_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_mraed_id(mra_msg); if (!cm_id_priv) return -EINVAL; - work->cm_event.private_data = &mra_msg->private_data; + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_MRA_PRIVATE_DATA, mra_msg); work->cm_event.param.mra_rcvd.service_timeout = - cm_mra_get_service_timeout(mra_msg); - timeout = cm_convert_to_ms(cm_mra_get_service_timeout(mra_msg)) + + IBA_GET(CM_MRA_SERVICE_TIMEOUT, mra_msg); + timeout = cm_convert_to_ms(IBA_GET(CM_MRA_SERVICE_TIMEOUT, mra_msg)) + cm_convert_to_ms(cm_id_priv->av.timeout); spin_lock_irq(&cm_id_priv->lock); switch (cm_id_priv->id.state) { case IB_CM_REQ_SENT: - if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REQ || - ib_modify_mad(cm_id_priv->av.port->mad_agent, - cm_id_priv->msg, timeout)) + if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) != + CM_MSG_RESPONSE_REQ || + ib_modify_mad(cm_id_priv->msg, timeout)) goto out; cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD; break; case IB_CM_REP_SENT: - if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REP || - ib_modify_mad(cm_id_priv->av.port->mad_agent, - cm_id_priv->msg, timeout)) + if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) != + CM_MSG_RESPONSE_REP || + ib_modify_mad(cm_id_priv->msg, timeout)) goto out; cm_id_priv->id.state = IB_CM_MRA_REP_RCVD; break; case IB_CM_ESTABLISHED: - if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_OTHER || + if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) != + CM_MSG_RESPONSE_OTHER || cm_id_priv->id.lap_state != IB_CM_LAP_SENT || - ib_modify_mad(cm_id_priv->av.port->mad_agent, - cm_id_priv->msg, timeout)) { + ib_modify_mad(cm_id_priv->msg, timeout)) { if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) - atomic_long_inc(&work->port-> - counter_group[CM_RECV_DUPLICATES]. 
- counter[CM_MRA_COUNTER]); + atomic_long_inc( + &work->port->counters[CM_RECV_DUPLICATES] + [CM_MRA_COUNTER]); goto out; } cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD; break; case IB_CM_MRA_REQ_RCVD: case IB_CM_MRA_REP_RCVD: - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. - counter[CM_MRA_COUNTER]); - /* fall through */ + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_MRA_COUNTER]); + fallthrough; default: - pr_debug("%s local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_mra_unknown_err(&cm_id_priv->id); goto out; } cm_id_priv->msg->context[1] = (void *) (unsigned long) cm_id_priv->id.state; - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: spin_unlock_irq(&cm_id_priv->lock); @@ -3083,117 +3235,23 @@ out: return -EINVAL; } -static void cm_format_lap(struct cm_lap_msg *lap_msg, - struct cm_id_private *cm_id_priv, - struct sa_path_rec *alternate_path, - const void *private_data, - u8 private_data_len) -{ - bool alt_ext = false; - - if (alternate_path->rec_type == SA_PATH_REC_TYPE_OPA) - alt_ext = opa_is_extended_lid(alternate_path->opa.dlid, - alternate_path->opa.slid); - cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID, - cm_form_tid(cm_id_priv)); - lap_msg->local_comm_id = cm_id_priv->id.local_id; - lap_msg->remote_comm_id = cm_id_priv->id.remote_id; - cm_lap_set_remote_qpn(lap_msg, cm_id_priv->remote_qpn); - /* todo: need remote CM response timeout */ - cm_lap_set_remote_resp_timeout(lap_msg, 0x1F); - lap_msg->alt_local_lid = - htons(ntohl(sa_path_get_slid(alternate_path))); - lap_msg->alt_remote_lid = - htons(ntohl(sa_path_get_dlid(alternate_path))); - lap_msg->alt_local_gid = alternate_path->sgid; - lap_msg->alt_remote_gid = alternate_path->dgid; - if (alt_ext) { - lap_msg->alt_local_gid.global.interface_id - = OPA_MAKE_ID(be32_to_cpu(alternate_path->opa.slid)); - lap_msg->alt_remote_gid.global.interface_id - = OPA_MAKE_ID(be32_to_cpu(alternate_path->opa.dlid)); - } - cm_lap_set_flow_label(lap_msg, alternate_path->flow_label); - cm_lap_set_traffic_class(lap_msg, alternate_path->traffic_class); - lap_msg->alt_hop_limit = alternate_path->hop_limit; - cm_lap_set_packet_rate(lap_msg, alternate_path->rate); - cm_lap_set_sl(lap_msg, alternate_path->sl); - cm_lap_set_subnet_local(lap_msg, 1); /* local only... 
*/ - cm_lap_set_local_ack_timeout(lap_msg, - cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, - alternate_path->packet_life_time)); - - if (private_data && private_data_len) - memcpy(lap_msg->private_data, private_data, private_data_len); -} - -int ib_send_cm_lap(struct ib_cm_id *cm_id, - struct sa_path_rec *alternate_path, - const void *private_data, - u8 private_data_len) -{ - struct cm_id_private *cm_id_priv; - struct ib_mad_send_buf *msg; - unsigned long flags; - int ret; - - if (private_data && private_data_len > IB_CM_LAP_PRIVATE_DATA_SIZE) - return -EINVAL; - - cm_id_priv = container_of(cm_id, struct cm_id_private, id); - spin_lock_irqsave(&cm_id_priv->lock, flags); - if (cm_id->state != IB_CM_ESTABLISHED || - (cm_id->lap_state != IB_CM_LAP_UNINIT && - cm_id->lap_state != IB_CM_LAP_IDLE)) { - ret = -EINVAL; - goto out; - } - - ret = cm_init_av_by_path(alternate_path, NULL, &cm_id_priv->alt_av, - cm_id_priv); - if (ret) - goto out; - cm_id_priv->alt_av.timeout = - cm_ack_timeout(cm_id_priv->target_ack_delay, - cm_id_priv->alt_av.timeout - 1); - - ret = cm_alloc_msg(cm_id_priv, &msg); - if (ret) - goto out; - - cm_format_lap((struct cm_lap_msg *) msg->mad, cm_id_priv, - alternate_path, private_data, private_data_len); - msg->timeout_ms = cm_id_priv->timeout_ms; - msg->context[1] = (void *) (unsigned long) IB_CM_ESTABLISHED; - - ret = ib_post_send_mad(msg, NULL); - if (ret) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - cm_free_msg(msg); - return ret; - } - - cm_id->lap_state = IB_CM_LAP_SENT; - cm_id_priv->msg = msg; - -out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); - return ret; -} -EXPORT_SYMBOL(ib_send_cm_lap); - static void cm_format_path_lid_from_lap(struct cm_lap_msg *lap_msg, struct sa_path_rec *path) { u32 lid; if (path->rec_type != SA_PATH_REC_TYPE_OPA) { - sa_path_set_dlid(path, ntohs(lap_msg->alt_local_lid)); - sa_path_set_slid(path, ntohs(lap_msg->alt_remote_lid)); + sa_path_set_dlid(path, IBA_GET(CM_LAP_ALTERNATE_LOCAL_PORT_LID, + lap_msg)); + sa_path_set_slid(path, IBA_GET(CM_LAP_ALTERNATE_REMOTE_PORT_LID, + lap_msg)); } else { - lid = opa_get_lid_from_gid(&lap_msg->alt_local_gid); + lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR( + CM_LAP_ALTERNATE_LOCAL_PORT_GID, lap_msg)); sa_path_set_dlid(path, lid); - lid = opa_get_lid_from_gid(&lap_msg->alt_remote_gid); + lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR( + CM_LAP_ALTERNATE_REMOTE_PORT_GID, lap_msg)); sa_path_set_slid(path, lid); } } @@ -3202,20 +3260,23 @@ static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, struct sa_path_rec *path, struct cm_lap_msg *lap_msg) { - path->dgid = lap_msg->alt_local_gid; - path->sgid = lap_msg->alt_remote_gid; - path->flow_label = cm_lap_get_flow_label(lap_msg); - path->hop_limit = lap_msg->alt_hop_limit; - path->traffic_class = cm_lap_get_traffic_class(lap_msg); + path->dgid = *IBA_GET_MEM_PTR(CM_LAP_ALTERNATE_LOCAL_PORT_GID, lap_msg); + path->sgid = + *IBA_GET_MEM_PTR(CM_LAP_ALTERNATE_REMOTE_PORT_GID, lap_msg); + path->flow_label = + cpu_to_be32(IBA_GET(CM_LAP_ALTERNATE_FLOW_LABEL, lap_msg)); + path->hop_limit = IBA_GET(CM_LAP_ALTERNATE_HOP_LIMIT, lap_msg); + path->traffic_class = IBA_GET(CM_LAP_ALTERNATE_TRAFFIC_CLASS, lap_msg); path->reversible = 1; path->pkey = cm_id_priv->pkey; - path->sl = cm_lap_get_sl(lap_msg); + path->sl = IBA_GET(CM_LAP_ALTERNATE_SL, lap_msg); path->mtu_selector = IB_SA_EQ; path->mtu = cm_id_priv->path_mtu; path->rate_selector = IB_SA_EQ; - path->rate = cm_lap_get_packet_rate(lap_msg); + path->rate = 
IBA_GET(CM_LAP_ALTERNATE_PACKET_RATE, lap_msg); path->packet_life_time_selector = IB_SA_EQ; - path->packet_life_time = cm_lap_get_local_ack_timeout(lap_msg); + path->packet_life_time = + IBA_GET(CM_LAP_ALTERNATE_LOCAL_ACK_TIMEOUT, lap_msg); path->packet_life_time -= (path->packet_life_time > 0); cm_format_path_lid_from_lap(lap_msg, path); } @@ -3226,6 +3287,8 @@ static int cm_lap_handler(struct cm_work *work) struct cm_lap_msg *lap_msg; struct ib_cm_lap_event_param *param; struct ib_mad_send_buf *msg = NULL; + struct rdma_ah_attr ah_attr; + struct cm_av alt_av = {}; int ret; /* Currently Alternate path messages are not supported for @@ -3237,22 +3300,42 @@ static int cm_lap_handler(struct cm_work *work) /* todo: verify LAP request and send reject APR if invalid. */ lap_msg = (struct cm_lap_msg *)work->mad_recv_wc->recv_buf.mad; - cm_id_priv = cm_acquire_id(lap_msg->remote_comm_id, - lap_msg->local_comm_id); + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_LAP_REMOTE_COMM_ID, lap_msg)), + cpu_to_be32(IBA_GET(CM_LAP_LOCAL_COMM_ID, lap_msg))); if (!cm_id_priv) return -EINVAL; param = &work->cm_event.param.lap_rcvd; memset(&work->path[0], 0, sizeof(work->path[1])); cm_path_set_rec_type(work->port->cm_dev->ib_device, - work->port->port_num, - &work->path[0], - &lap_msg->alt_local_gid); + work->port->port_num, &work->path[0], + IBA_GET_MEM_PTR(CM_LAP_ALTERNATE_LOCAL_PORT_GID, + lap_msg)); param->alternate_path = &work->path[0]; cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg); - work->cm_event.private_data = &lap_msg->private_data; + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_LAP_PRIVATE_DATA, lap_msg); + + ret = ib_init_ah_attr_from_wc(work->port->cm_dev->ib_device, + work->port->port_num, + work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &ah_attr); + if (ret) + goto deref; + + ret = cm_init_av_by_path(param->alternate_path, NULL, &alt_av); + if (ret) { + rdma_destroy_ah_attr(&ah_attr); + goto deref; + } spin_lock_irq(&cm_id_priv->lock); + cm_init_av_for_lap(work->port, work->mad_recv_wc->wc, + &ah_attr, &cm_id_priv->av); + cm_move_av_from_path(&cm_id_priv->alt_av, &alt_av); + if (cm_id_priv->id.state != IB_CM_ESTABLISHED) goto unlock; @@ -3261,15 +3344,15 @@ static int cm_lap_handler(struct cm_work *work) case IB_CM_LAP_IDLE: break; case IB_CM_MRA_LAP_SENT: - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. - counter[CM_LAP_COUNTER]); - msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_LAP_COUNTER]); + msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc, + true); if (IS_ERR(msg)) goto unlock; cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, CM_MSG_RESPONSE_OTHER, - cm_id_priv->service_timeout, cm_id_priv->private_data, cm_id_priv->private_data_len); spin_unlock_irq(&cm_id_priv->lock); @@ -3279,35 +3362,16 @@ static int cm_lap_handler(struct cm_work *work) cm_free_msg(msg); goto deref; case IB_CM_LAP_RCVD: - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
- counter[CM_LAP_COUNTER]); + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_LAP_COUNTER]); goto unlock; default: goto unlock; } - ret = cm_init_av_for_lap(work->port, work->mad_recv_wc->wc, - work->mad_recv_wc->recv_buf.grh, - &cm_id_priv->av); - if (ret) - goto unlock; - - ret = cm_init_av_by_path(param->alternate_path, NULL, - &cm_id_priv->alt_av, cm_id_priv); - if (ret) - goto unlock; - cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; cm_id_priv->tid = lap_msg->hdr.tid; - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; unlock: spin_unlock_irq(&cm_id_priv->lock); @@ -3315,77 +3379,10 @@ deref: cm_deref_id(cm_id_priv); return -EINVAL; } -static void cm_format_apr(struct cm_apr_msg *apr_msg, - struct cm_id_private *cm_id_priv, - enum ib_cm_apr_status status, - void *info, - u8 info_length, - const void *private_data, - u8 private_data_len) -{ - cm_format_mad_hdr(&apr_msg->hdr, CM_APR_ATTR_ID, cm_id_priv->tid); - apr_msg->local_comm_id = cm_id_priv->id.local_id; - apr_msg->remote_comm_id = cm_id_priv->id.remote_id; - apr_msg->ap_status = (u8) status; - - if (info && info_length) { - apr_msg->info_length = info_length; - memcpy(apr_msg->info, info, info_length); - } - - if (private_data && private_data_len) - memcpy(apr_msg->private_data, private_data, private_data_len); -} - -int ib_send_cm_apr(struct ib_cm_id *cm_id, - enum ib_cm_apr_status status, - void *info, - u8 info_length, - const void *private_data, - u8 private_data_len) -{ - struct cm_id_private *cm_id_priv; - struct ib_mad_send_buf *msg; - unsigned long flags; - int ret; - - if ((private_data && private_data_len > IB_CM_APR_PRIVATE_DATA_SIZE) || - (info && info_length > IB_CM_APR_INFO_LENGTH)) - return -EINVAL; - - cm_id_priv = container_of(cm_id, struct cm_id_private, id); - spin_lock_irqsave(&cm_id_priv->lock, flags); - if (cm_id->state != IB_CM_ESTABLISHED || - (cm_id->lap_state != IB_CM_LAP_RCVD && - cm_id->lap_state != IB_CM_MRA_LAP_SENT)) { - ret = -EINVAL; - goto out; - } - - ret = cm_alloc_msg(cm_id_priv, &msg); - if (ret) - goto out; - - cm_format_apr((struct cm_apr_msg *) msg->mad, cm_id_priv, status, - info, info_length, private_data, private_data_len); - ret = ib_post_send_mad(msg, NULL); - if (ret) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - cm_free_msg(msg); - return ret; - } - - cm_id->lap_state = IB_CM_LAP_IDLE; -out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); - return ret; -} -EXPORT_SYMBOL(ib_send_cm_apr); - static int cm_apr_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_apr_msg *apr_msg; - int ret; /* Currently Alternate path messages are not supported for * RoCE link layer. @@ -3395,15 +3392,20 @@ static int cm_apr_handler(struct cm_work *work) return -EINVAL; apr_msg = (struct cm_apr_msg *)work->mad_recv_wc->recv_buf.mad; - cm_id_priv = cm_acquire_id(apr_msg->remote_comm_id, - apr_msg->local_comm_id); + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_APR_REMOTE_COMM_ID, apr_msg)), + cpu_to_be32(IBA_GET(CM_APR_LOCAL_COMM_ID, apr_msg))); if (!cm_id_priv) return -EINVAL; /* Unmatched reply. 
*/ - work->cm_event.param.apr_rcvd.ap_status = apr_msg->ap_status; - work->cm_event.param.apr_rcvd.apr_info = &apr_msg->info; - work->cm_event.param.apr_rcvd.info_len = apr_msg->info_length; - work->cm_event.private_data = &apr_msg->private_data; + work->cm_event.param.apr_rcvd.ap_status = + IBA_GET(CM_APR_AR_STATUS, apr_msg); + work->cm_event.param.apr_rcvd.apr_info = + IBA_GET_MEM_PTR(CM_APR_ADDITIONAL_INFORMATION, apr_msg); + work->cm_event.param.apr_rcvd.info_len = + IBA_GET(CM_APR_ADDITIONAL_INFORMATION_LENGTH, apr_msg); + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_APR_PRIVATE_DATA, apr_msg); spin_lock_irq(&cm_id_priv->lock); if (cm_id_priv->id.state != IB_CM_ESTABLISHED || @@ -3413,18 +3415,8 @@ static int cm_apr_handler(struct cm_work *work) goto out; } cm_id_priv->id.lap_state = IB_CM_LAP_IDLE; - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - cm_id_priv->msg = NULL; - - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + ib_cancel_mad(cm_id_priv->msg); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); @@ -3435,9 +3427,8 @@ static int cm_timewait_handler(struct cm_work *work) { struct cm_timewait_info *timewait_info; struct cm_id_private *cm_id_priv; - int ret; - timewait_info = (struct cm_timewait_info *)work; + timewait_info = container_of(work, struct cm_timewait_info, work); spin_lock_irq(&cm.lock); list_del(&timewait_info->list); spin_unlock_irq(&cm.lock); @@ -3454,15 +3445,7 @@ static int cm_timewait_handler(struct cm_work *work) goto out; } cm_id_priv->id.state = IB_CM_IDLE; - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); @@ -3475,13 +3458,16 @@ static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg, { cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID, cm_form_tid(cm_id_priv)); - sidr_req_msg->request_id = cm_id_priv->id.local_id; - sidr_req_msg->pkey = param->path->pkey; - sidr_req_msg->service_id = param->service_id; + IBA_SET(CM_SIDR_REQ_REQUESTID, sidr_req_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_SIDR_REQ_PARTITION_KEY, sidr_req_msg, + be16_to_cpu(param->path->pkey)); + IBA_SET(CM_SIDR_REQ_SERVICEID, sidr_req_msg, + be64_to_cpu(param->service_id)); if (param->private_data && param->private_data_len) - memcpy(sidr_req_msg->private_data, param->private_data, - param->private_data_len); + IBA_SET_MEM(CM_SIDR_REQ_PRIVATE_DATA, sidr_req_msg, + param->private_data, param->private_data_len); } int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, @@ -3489,6 +3475,7 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, { struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; + struct cm_av av = {}; unsigned long flags; int ret; @@ -3497,40 +3484,40 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); - ret = cm_init_av_by_path(param->path, param->sgid_attr, - &cm_id_priv->av, - cm_id_priv); + ret = cm_init_av_by_path(param->path, param->sgid_attr, &av); if (ret) - goto out; + return ret; + spin_lock_irqsave(&cm_id_priv->lock, flags); + 
cm_move_av_from_path(&cm_id_priv->av, &av); cm_id->service_id = param->service_id; - cm_id->service_mask = ~cpu_to_be64(0); cm_id_priv->timeout_ms = param->timeout_ms; cm_id_priv->max_cm_retries = param->max_cm_retries; - ret = cm_alloc_msg(cm_id_priv, &msg); - if (ret) - goto out; - - cm_format_sidr_req((struct cm_sidr_req_msg *) msg->mad, cm_id_priv, - param); - msg->timeout_ms = cm_id_priv->timeout_ms; - msg->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT; - - spin_lock_irqsave(&cm_id_priv->lock, flags); - if (cm_id->state == IB_CM_IDLE) - ret = ib_post_send_mad(msg, NULL); - else + if (cm_id->state != IB_CM_IDLE) { ret = -EINVAL; + goto out_unlock; + } - if (ret) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - cm_free_msg(msg); - goto out; + msg = cm_alloc_priv_msg(cm_id_priv, IB_CM_SIDR_REQ_SENT); + if (IS_ERR(msg)) { + ret = PTR_ERR(msg); + goto out_unlock; } + + cm_format_sidr_req((struct cm_sidr_req_msg *)msg->mad, cm_id_priv, + param); + + trace_icm_send_sidr_req(&cm_id_priv->id); + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto out_free; cm_id->state = IB_CM_SIDR_REQ_SENT; - cm_id_priv->msg = msg; spin_unlock_irqrestore(&cm_id_priv->lock, flags); -out: + return 0; +out_free: + cm_free_priv_msg(msg); +out_unlock: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_sidr_req); @@ -3545,72 +3532,86 @@ static void cm_format_sidr_req_event(struct cm_work *work, sidr_req_msg = (struct cm_sidr_req_msg *) work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.sidr_req_rcvd; - param->pkey = __be16_to_cpu(sidr_req_msg->pkey); + param->pkey = IBA_GET(CM_SIDR_REQ_PARTITION_KEY, sidr_req_msg); param->listen_id = listen_id; - param->service_id = sidr_req_msg->service_id; + param->service_id = + cpu_to_be64(IBA_GET(CM_SIDR_REQ_SERVICEID, sidr_req_msg)); param->bth_pkey = cm_get_bth_pkey(work); param->port = work->port->port_num; param->sgid_attr = rx_cm_id->av.ah_attr.grh.sgid_attr; - work->cm_event.private_data = &sidr_req_msg->private_data; + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_SIDR_REQ_PRIVATE_DATA, sidr_req_msg); } static int cm_sidr_req_handler(struct cm_work *work) { - struct ib_cm_id *cm_id; - struct cm_id_private *cm_id_priv, *cur_cm_id_priv; + struct cm_id_private *cm_id_priv, *listen_cm_id_priv; struct cm_sidr_req_msg *sidr_req_msg; struct ib_wc *wc; int ret; - cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL); - if (IS_ERR(cm_id)) - return PTR_ERR(cm_id); - cm_id_priv = container_of(cm_id, struct cm_id_private, id); + cm_id_priv = + cm_alloc_id_priv(work->port->cm_dev->ib_device, NULL, NULL); + if (IS_ERR(cm_id_priv)) + return PTR_ERR(cm_id_priv); /* Record SGID/SLID and request ID for lookup. 
*/ sidr_req_msg = (struct cm_sidr_req_msg *) work->mad_recv_wc->recv_buf.mad; + + cm_id_priv->id.remote_id = + cpu_to_be32(IBA_GET(CM_SIDR_REQ_REQUESTID, sidr_req_msg)); + cm_id_priv->id.service_id = + cpu_to_be64(IBA_GET(CM_SIDR_REQ_SERVICEID, sidr_req_msg)); + cm_id_priv->tid = sidr_req_msg->hdr.tid; + wc = work->mad_recv_wc->wc; - cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid); - cm_id_priv->av.dgid.global.interface_id = 0; + cm_id_priv->sidr_slid = wc->slid; ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); if (ret) goto out; - cm_id_priv->id.remote_id = sidr_req_msg->request_id; - cm_id_priv->tid = sidr_req_msg->hdr.tid; - atomic_inc(&cm_id_priv->work_count); - spin_lock_irq(&cm.lock); - cur_cm_id_priv = cm_insert_remote_sidr(cm_id_priv); - if (cur_cm_id_priv) { + listen_cm_id_priv = cm_insert_remote_sidr(cm_id_priv); + if (listen_cm_id_priv) { spin_unlock_irq(&cm.lock); - atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. - counter[CM_SIDR_REQ_COUNTER]); + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_SIDR_REQ_COUNTER]); goto out; /* Duplicate message. */ } cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD; - cur_cm_id_priv = cm_find_listen(cm_id->device, - sidr_req_msg->service_id); - if (!cur_cm_id_priv) { + listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device, + cm_id_priv->id.service_id); + if (!listen_cm_id_priv) { spin_unlock_irq(&cm.lock); - cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED); + ib_send_cm_sidr_rep(&cm_id_priv->id, + &(struct ib_cm_sidr_rep_param){ + .status = IB_SIDR_UNSUPPORTED }); goto out; /* No match. */ } - atomic_inc(&cur_cm_id_priv->refcount); - atomic_inc(&cm_id_priv->refcount); spin_unlock_irq(&cm.lock); - cm_id_priv->id.cm_handler = cur_cm_id_priv->id.cm_handler; - cm_id_priv->id.context = cur_cm_id_priv->id.context; - cm_id_priv->id.service_id = sidr_req_msg->service_id; - cm_id_priv->id.service_mask = ~cpu_to_be64(0); + cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler; + cm_id_priv->id.context = listen_cm_id_priv->id.context; - cm_format_sidr_req_event(work, cm_id_priv, &cur_cm_id_priv->id); - cm_process_work(cm_id_priv, work); - cm_deref_id(cur_cm_id_priv); + /* + * A SIDR ID does not need to be in the xarray since it does not receive + * mads, is not placed in the remote_id or remote_qpn rbtree, and does + * not enter timewait. + */ + + cm_format_sidr_req_event(work, cm_id_priv, &listen_cm_id_priv->id); + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event); + cm_free_work(work); + /* + * A pointer to the listen_cm_id is held in the event, so this deref + * must be after the event is delivered above. 
+ */ + cm_deref_id(listen_cm_id_priv); + if (ret) + cm_destroy_id(&cm_id_priv->id, ret); return 0; out: ib_destroy_cm_id(&cm_id_priv->id); @@ -3621,57 +3622,59 @@ static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg, struct cm_id_private *cm_id_priv, struct ib_cm_sidr_rep_param *param) { - cm_format_mad_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID, - cm_id_priv->tid); - sidr_rep_msg->request_id = cm_id_priv->id.remote_id; - sidr_rep_msg->status = param->status; - cm_sidr_rep_set_qpn(sidr_rep_msg, cpu_to_be32(param->qp_num)); - sidr_rep_msg->service_id = cm_id_priv->id.service_id; - sidr_rep_msg->qkey = cpu_to_be32(param->qkey); + cm_format_mad_ece_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID, + cm_id_priv->tid, param->ece.attr_mod); + IBA_SET(CM_SIDR_REP_REQUESTID, sidr_rep_msg, + be32_to_cpu(cm_id_priv->id.remote_id)); + IBA_SET(CM_SIDR_REP_STATUS, sidr_rep_msg, param->status); + IBA_SET(CM_SIDR_REP_QPN, sidr_rep_msg, param->qp_num); + IBA_SET(CM_SIDR_REP_SERVICEID, sidr_rep_msg, + be64_to_cpu(cm_id_priv->id.service_id)); + IBA_SET(CM_SIDR_REP_Q_KEY, sidr_rep_msg, param->qkey); + IBA_SET(CM_SIDR_REP_VENDOR_ID_L, sidr_rep_msg, + param->ece.vendor_id & 0xFF); + IBA_SET(CM_SIDR_REP_VENDOR_ID_H, sidr_rep_msg, + (param->ece.vendor_id >> 8) & 0xFF); if (param->info && param->info_length) - memcpy(sidr_rep_msg->info, param->info, param->info_length); + IBA_SET_MEM(CM_SIDR_REP_ADDITIONAL_INFORMATION, sidr_rep_msg, + param->info, param->info_length); if (param->private_data && param->private_data_len) - memcpy(sidr_rep_msg->private_data, param->private_data, - param->private_data_len); + IBA_SET_MEM(CM_SIDR_REP_PRIVATE_DATA, sidr_rep_msg, + param->private_data, param->private_data_len); } -int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, - struct ib_cm_sidr_rep_param *param) +static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv, + struct ib_cm_sidr_rep_param *param) { - struct cm_id_private *cm_id_priv; struct ib_mad_send_buf *msg; unsigned long flags; int ret; + lockdep_assert_held(&cm_id_priv->lock); + if ((param->info && param->info_length > IB_CM_SIDR_REP_INFO_LENGTH) || (param->private_data && param->private_data_len > IB_CM_SIDR_REP_PRIVATE_DATA_SIZE)) return -EINVAL; - cm_id_priv = container_of(cm_id, struct cm_id_private, id); - spin_lock_irqsave(&cm_id_priv->lock, flags); - if (cm_id->state != IB_CM_SIDR_REQ_RCVD) { - ret = -EINVAL; - goto error; - } + if (cm_id_priv->id.state != IB_CM_SIDR_REQ_RCVD) + return -EINVAL; - ret = cm_alloc_msg(cm_id_priv, &msg); - if (ret) - goto error; + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) + return PTR_ERR(msg); cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv, param); + trace_icm_send_sidr_rep(&cm_id_priv->id); ret = ib_post_send_mad(msg, NULL); if (ret) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); return ret; } - cm_id->state = IB_CM_IDLE; - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - + cm_id_priv->id.state = IB_CM_IDLE; spin_lock_irqsave(&cm.lock, flags); if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) { rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); @@ -3679,8 +3682,19 @@ int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, } spin_unlock_irqrestore(&cm.lock, flags); return 0; +} -error: spin_unlock_irqrestore(&cm_id_priv->lock, flags); +int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, + struct ib_cm_sidr_rep_param *param) +{ + struct cm_id_private *cm_id_priv = + container_of(cm_id, struct cm_id_private, id); + unsigned long flags; + int ret; + + 
spin_lock_irqsave(&cm_id_priv->lock, flags); + ret = cm_send_sidr_rep_locked(cm_id_priv, param); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_sidr_rep); @@ -3694,13 +3708,16 @@ static void cm_format_sidr_rep_event(struct cm_work *work, sidr_rep_msg = (struct cm_sidr_rep_msg *) work->mad_recv_wc->recv_buf.mad; param = &work->cm_event.param.sidr_rep_rcvd; - param->status = sidr_rep_msg->status; - param->qkey = be32_to_cpu(sidr_rep_msg->qkey); - param->qpn = be32_to_cpu(cm_sidr_rep_get_qpn(sidr_rep_msg)); - param->info = &sidr_rep_msg->info; - param->info_len = sidr_rep_msg->info_length; + param->status = IBA_GET(CM_SIDR_REP_STATUS, sidr_rep_msg); + param->qkey = IBA_GET(CM_SIDR_REP_Q_KEY, sidr_rep_msg); + param->qpn = IBA_GET(CM_SIDR_REP_QPN, sidr_rep_msg); + param->info = IBA_GET_MEM_PTR(CM_SIDR_REP_ADDITIONAL_INFORMATION, + sidr_rep_msg); + param->info_len = IBA_GET(CM_SIDR_REP_ADDITIONAL_INFORMATION_LENGTH, + sidr_rep_msg); param->sgid_attr = cm_id_priv->av.ah_attr.grh.sgid_attr; - work->cm_event.private_data = &sidr_rep_msg->private_data; + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_SIDR_REP_PRIVATE_DATA, sidr_rep_msg); } static int cm_sidr_rep_handler(struct cm_work *work) @@ -3710,7 +3727,8 @@ static int cm_sidr_rep_handler(struct cm_work *work) sidr_rep_msg = (struct cm_sidr_rep_msg *) work->mad_recv_wc->recv_buf.mad; - cm_id_priv = cm_acquire_id(sidr_rep_msg->request_id, 0); + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_SIDR_REP_REQUESTID, sidr_rep_msg)), 0); if (!cm_id_priv) return -EINVAL; /* Unmatched reply. */ @@ -3720,7 +3738,7 @@ static int cm_sidr_rep_handler(struct cm_work *work) goto out; } cm_id_priv->id.state = IB_CM_IDLE; - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); + ib_cancel_mad(cm_id_priv->msg); spin_unlock_irq(&cm_id_priv->lock); cm_format_sidr_rep_event(work, cm_id_priv); @@ -3731,25 +3749,29 @@ out: return -EINVAL; } -static void cm_process_send_error(struct ib_mad_send_buf *msg, +static void cm_process_send_error(struct cm_id_private *cm_id_priv, + struct ib_mad_send_buf *msg, enum ib_wc_status wc_status) { - struct cm_id_private *cm_id_priv; - struct ib_cm_event cm_event; - enum ib_cm_state state; + enum ib_cm_state state = (unsigned long) msg->context[1]; + struct ib_cm_event cm_event = {}; int ret; - memset(&cm_event, 0, sizeof cm_event); - cm_id_priv = msg->context[0]; - - /* Discard old sends or ones without a response. */ + /* Discard old sends. */ spin_lock_irq(&cm_id_priv->lock); - state = (enum ib_cm_state) (unsigned long) msg->context[1]; - if (msg != cm_id_priv->msg || state != cm_id_priv->id.state) - goto discard; + if (msg != cm_id_priv->msg) { + spin_unlock_irq(&cm_id_priv->lock); + cm_free_msg(msg); + cm_deref_id(cm_id_priv); + return; + } + cm_free_priv_msg(msg); + + if (state != cm_id_priv->id.state || wc_status == IB_WC_SUCCESS || + wc_status == IB_WC_WR_FLUSH_ERR) + goto out_unlock; - pr_debug_ratelimited("CM: failed sending MAD in state %d. (%s)\n", - state, ib_wc_status_msg(wc_status)); + trace_icm_mad_send_err(state, wc_status); switch (state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: @@ -3770,26 +3792,25 @@ static void cm_process_send_error(struct ib_mad_send_buf *msg, cm_event.event = IB_CM_SIDR_REQ_ERROR; break; default: - goto discard; + goto out_unlock; } spin_unlock_irq(&cm_id_priv->lock); cm_event.param.send_status = wc_status; /* No other events can occur on the cm_id at this point. 
*/ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event); - cm_free_msg(msg); if (ret) ib_destroy_cm_id(&cm_id_priv->id); return; -discard: +out_unlock: spin_unlock_irq(&cm_id_priv->lock); - cm_free_msg(msg); } static void cm_send_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_wc *mad_send_wc) { struct ib_mad_send_buf *msg = mad_send_wc->send_buf; + struct cm_id_private *cm_id_priv; struct cm_port *port; u16 attr_index; @@ -3797,33 +3818,22 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent, attr_index = be16_to_cpu(((struct ib_mad_hdr *) msg->mad)->attr_id) - CM_ATTR_ID_OFFSET; - /* - * If the send was in response to a received message (context[0] is not - * set to a cm_id), and is not a REJ, then it is a send that was - * manually retried. - */ - if (!msg->context[0] && (attr_index != CM_REJ_COUNTER)) + if (msg->context[0] == CM_DIRECT_RETRY_CTX) { msg->retries = 1; + cm_id_priv = NULL; + } else { + cm_id_priv = msg->context[0]; + } - atomic_long_add(1 + msg->retries, - &port->counter_group[CM_XMIT].counter[attr_index]); + atomic_long_add(1 + msg->retries, &port->counters[CM_XMIT][attr_index]); if (msg->retries) atomic_long_add(msg->retries, - &port->counter_group[CM_XMIT_RETRIES]. - counter[attr_index]); + &port->counters[CM_XMIT_RETRIES][attr_index]); - switch (mad_send_wc->status) { - case IB_WC_SUCCESS: - case IB_WC_WR_FLUSH_ERR: + if (cm_id_priv) + cm_process_send_error(cm_id_priv, msg, mad_send_wc->status); + else cm_free_msg(msg); - break; - default: - if (msg->context[0] && msg->context[1]) - cm_process_send_error(msg, mad_send_wc->status); - else - cm_free_msg(msg); - break; - } } static void cm_work_handler(struct work_struct *_work) @@ -3872,7 +3882,7 @@ static void cm_work_handler(struct work_struct *_work) ret = cm_timewait_handler(work); break; default: - pr_debug("cm_event.event: 0x%x\n", work->cm_event.event); + trace_icm_handler_err(work->cm_event.event); ret = -EINVAL; break; } @@ -3898,8 +3908,7 @@ static int cm_establish(struct ib_cm_id *cm_id) cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); - switch (cm_id->state) - { + switch (cm_id->state) { case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: cm_id->state = IB_CM_ESTABLISHED; @@ -3908,8 +3917,7 @@ static int cm_establish(struct ib_cm_id *cm_id) ret = -EISCONN; break; default: - pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, - be32_to_cpu(cm_id->local_id), cm_id->state); + trace_icm_establish_err(cm_id); ret = -EINVAL; break; } @@ -3949,9 +3957,7 @@ out: static int cm_migrate(struct ib_cm_id *cm_id) { struct cm_id_private *cm_id_priv; - struct cm_av tmp_av; unsigned long flags; - int tmp_send_port_not_ready; int ret = 0; cm_id_priv = container_of(cm_id, struct cm_id_private, id); @@ -3960,14 +3966,7 @@ static int cm_migrate(struct ib_cm_id *cm_id) (cm_id->lap_state == IB_CM_LAP_UNINIT || cm_id->lap_state == IB_CM_LAP_IDLE)) { cm_id->lap_state = IB_CM_LAP_IDLE; - /* Swap address vector */ - tmp_av = cm_id_priv->av; cm_id_priv->av = cm_id_priv->alt_av; - cm_id_priv->alt_av = tmp_av; - /* Swap port send ready state */ - tmp_send_port_not_ready = cm_id_priv->prim_send_port_not_ready; - cm_id_priv->prim_send_port_not_ready = cm_id_priv->altr_send_port_not_ready; - cm_id_priv->altr_send_port_not_ready = tmp_send_port_not_ready; } else ret = -EINVAL; spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -4049,11 +4048,9 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent, } attr_id = 
be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id); - atomic_long_inc(&port->counter_group[CM_RECV]. - counter[attr_id - CM_ATTR_ID_OFFSET]); + atomic_long_inc(&port->counters[CM_RECV][attr_id - CM_ATTR_ID_OFFSET]); - work = kmalloc(sizeof(*work) + sizeof(struct sa_path_rec) * paths, - GFP_KERNEL); + work = kmalloc(struct_size(work, path, paths), GFP_KERNEL); if (!work) { ib_free_recv_mad(mad_recv_wc); return; @@ -4099,17 +4096,25 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE; - if (cm_id_priv->responder_resources) + if (cm_id_priv->responder_resources) { + struct ib_device *ib_dev = cm_id_priv->id.device; + u64 support_flush = ib_dev->attrs.device_cap_flags & + (IB_DEVICE_FLUSH_GLOBAL | IB_DEVICE_FLUSH_PERSISTENT); + u32 flushable = support_flush ? + (IB_ACCESS_FLUSH_GLOBAL | + IB_ACCESS_FLUSH_PERSISTENT) : 0; + qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_ATOMIC; + IB_ACCESS_REMOTE_ATOMIC | + flushable; + } qp_attr->pkey_index = cm_id_priv->av.pkey_index; - qp_attr->port_num = cm_id_priv->av.port->port_num; + if (cm_id_priv->av.port) + qp_attr->port_num = cm_id_priv->av.port->port_num; ret = 0; break; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_qp_init_err(&cm_id_priv->id); ret = -EINVAL; break; } @@ -4136,6 +4141,10 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, *qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN; qp_attr->ah_attr = cm_id_priv->av.ah_attr; + if ((qp_attr->ah_attr.type == RDMA_AH_ATTR_TYPE_IB) && + cm_id_priv->av.dlid_datapath && + (cm_id_priv->av.dlid_datapath != 0xffff)) + qp_attr->ah_attr.ib.dlid = cm_id_priv->av.dlid_datapath; qp_attr->path_mtu = cm_id_priv->path_mtu; qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn); qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn); @@ -4147,7 +4156,8 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, cm_id_priv->responder_resources; qp_attr->min_rnr_timer = 0; } - if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr)) { + if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr) && + cm_id_priv->alt_av.port) { *qp_attr_mask |= IB_QP_ALT_PATH; qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num; qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index; @@ -4157,9 +4167,7 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_qp_rtr_err(&cm_id_priv->id); ret = -EINVAL; break; } @@ -4196,7 +4204,7 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, qp_attr->retry_cnt = cm_id_priv->retry_count; qp_attr->rnr_retry = cm_id_priv->rnr_retry_count; qp_attr->max_rd_atomic = cm_id_priv->initiator_depth; - /* fall through */ + fallthrough; case IB_QPT_XRC_TGT: *qp_attr_mask |= IB_QP_TIMEOUT; qp_attr->timeout = cm_id_priv->av.timeout; @@ -4210,7 +4218,9 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, } } else { *qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE; - qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num; + if (cm_id_priv->alt_av.port) + qp_attr->alt_port_num = + cm_id_priv->alt_av.port->port_num; qp_attr->alt_pkey_index = 
cm_id_priv->alt_av.pkey_index; qp_attr->alt_timeout = cm_id_priv->alt_av.timeout; qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr; @@ -4219,9 +4229,7 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: - pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", - __func__, be32_to_cpu(cm_id_priv->id.local_id), - cm_id_priv->id.state); + trace_icm_qp_rts_err(&cm_id_priv->id); ret = -EINVAL; break; } @@ -4255,96 +4263,76 @@ int ib_cm_init_qp_attr(struct ib_cm_id *cm_id, } EXPORT_SYMBOL(ib_cm_init_qp_attr); -static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr, - char *buf) +static ssize_t cm_show_counter(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) { - struct cm_counter_group *group; - struct cm_counter_attribute *cm_attr; - - group = container_of(obj, struct cm_counter_group, obj); - cm_attr = container_of(attr, struct cm_counter_attribute, attr); + struct cm_counter_attribute *cm_attr = + container_of(attr, struct cm_counter_attribute, attr); + struct cm_device *cm_dev = ib_get_client_data(ibdev, &cm_client); - return sprintf(buf, "%ld\n", - atomic_long_read(&group->counter[cm_attr->index])); -} - -static const struct sysfs_ops cm_counter_ops = { - .show = cm_show_counter -}; - -static struct kobj_type cm_counter_obj_type = { - .sysfs_ops = &cm_counter_ops, - .default_attrs = cm_counter_default_attrs -}; - -static void cm_release_port_obj(struct kobject *obj) -{ - struct cm_port *cm_port; - - cm_port = container_of(obj, struct cm_port, port_obj); - kfree(cm_port); -} - -static struct kobj_type cm_port_obj_type = { - .release = cm_release_port_obj -}; + if (WARN_ON(!cm_dev)) + return -EINVAL; -static char *cm_devnode(struct device *dev, umode_t *mode) -{ - if (mode) - *mode = 0666; - return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); + return sysfs_emit( + buf, "%ld\n", + atomic_long_read( + &cm_dev->port[port_num - 1] + ->counters[cm_attr->group][cm_attr->index])); } -struct class cm_class = { - .owner = THIS_MODULE, - .name = "infiniband_cm", - .devnode = cm_devnode, -}; -EXPORT_SYMBOL(cm_class); - -static int cm_create_port_fs(struct cm_port *port) -{ - int i, ret; - - ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type, - &port->cm_dev->device->kobj, - "%d", port->port_num); - if (ret) { - kfree(port); - return ret; - } - - for (i = 0; i < CM_COUNTER_GROUPS; i++) { - ret = kobject_init_and_add(&port->counter_group[i].obj, - &cm_counter_obj_type, - &port->port_obj, - "%s", counter_group_names[i]); - if (ret) - goto error; +#define CM_COUNTER_ATTR(_name, _group, _index) \ + { \ + .attr = __ATTR(_name, 0444, cm_show_counter, NULL), \ + .group = _group, .index = _index \ } - return 0; - -error: - while (i--) - kobject_put(&port->counter_group[i].obj); - kobject_put(&port->port_obj); - return ret; - -} - -static void cm_remove_port_fs(struct cm_port *port) -{ - int i; - - for (i = 0; i < CM_COUNTER_GROUPS; i++) - kobject_put(&port->counter_group[i].obj); +#define CM_COUNTER_GROUP(_group, _name) \ + static struct cm_counter_attribute cm_counter_attr_##_group[] = { \ + CM_COUNTER_ATTR(req, _group, CM_REQ_COUNTER), \ + CM_COUNTER_ATTR(mra, _group, CM_MRA_COUNTER), \ + CM_COUNTER_ATTR(rej, _group, CM_REJ_COUNTER), \ + CM_COUNTER_ATTR(rep, _group, CM_REP_COUNTER), \ + CM_COUNTER_ATTR(rtu, _group, CM_RTU_COUNTER), \ + CM_COUNTER_ATTR(dreq, _group, CM_DREQ_COUNTER), \ + CM_COUNTER_ATTR(drep, _group, CM_DREP_COUNTER), \ + CM_COUNTER_ATTR(sidr_req, _group, CM_SIDR_REQ_COUNTER), \ 
+ CM_COUNTER_ATTR(sidr_rep, _group, CM_SIDR_REP_COUNTER), \ + CM_COUNTER_ATTR(lap, _group, CM_LAP_COUNTER), \ + CM_COUNTER_ATTR(apr, _group, CM_APR_COUNTER), \ + }; \ + static struct attribute *cm_counter_attrs_##_group[] = { \ + &cm_counter_attr_##_group[0].attr.attr, \ + &cm_counter_attr_##_group[1].attr.attr, \ + &cm_counter_attr_##_group[2].attr.attr, \ + &cm_counter_attr_##_group[3].attr.attr, \ + &cm_counter_attr_##_group[4].attr.attr, \ + &cm_counter_attr_##_group[5].attr.attr, \ + &cm_counter_attr_##_group[6].attr.attr, \ + &cm_counter_attr_##_group[7].attr.attr, \ + &cm_counter_attr_##_group[8].attr.attr, \ + &cm_counter_attr_##_group[9].attr.attr, \ + &cm_counter_attr_##_group[10].attr.attr, \ + NULL, \ + }; \ + static const struct attribute_group cm_counter_group_##_group = { \ + .name = _name, \ + .attrs = cm_counter_attrs_##_group, \ + }; - kobject_put(&port->port_obj); -} +CM_COUNTER_GROUP(CM_XMIT, "cm_tx_msgs") +CM_COUNTER_GROUP(CM_XMIT_RETRIES, "cm_tx_retries") +CM_COUNTER_GROUP(CM_RECV, "cm_rx_msgs") +CM_COUNTER_GROUP(CM_RECV_DUPLICATES, "cm_rx_duplicates") + +static const struct attribute_group *cm_counter_groups[] = { + &cm_counter_group_CM_XMIT, + &cm_counter_group_CM_XMIT_RETRIES, + &cm_counter_group_CM_RECV, + &cm_counter_group_CM_RECV_DUPLICATES, + NULL, +}; -static void cm_add_one(struct ib_device *ib_device) +static int cm_add_one(struct ib_device *ib_device) { struct cm_device *cm_dev; struct cm_port *port; @@ -4358,41 +4346,38 @@ static void cm_add_one(struct ib_device *ib_device) unsigned long flags; int ret; int count = 0; - u8 i; + u32 i; cm_dev = kzalloc(struct_size(cm_dev, port, ib_device->phys_port_cnt), GFP_KERNEL); if (!cm_dev) - return; + return -ENOMEM; + kref_init(&cm_dev->kref); + rwlock_init(&cm_dev->mad_agent_lock); cm_dev->ib_device = ib_device; cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay; cm_dev->going_down = 0; - cm_dev->device = device_create(&cm_class, &ib_device->dev, - MKDEV(0, 0), NULL, - "%s", dev_name(&ib_device->dev)); - if (IS_ERR(cm_dev->device)) { - kfree(cm_dev); - return; - } + + ib_set_client_data(ib_device, &cm_client, cm_dev); set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); - for (i = 1; i <= ib_device->phys_port_cnt; i++) { + rdma_for_each_port (ib_device, i) { if (!rdma_cap_ib_cm(ib_device, i)) continue; port = kzalloc(sizeof *port, GFP_KERNEL); - if (!port) + if (!port) { + ret = -ENOMEM; goto error1; + } cm_dev->port[i-1] = port; port->cm_dev = cm_dev; port->port_num = i; - INIT_LIST_HEAD(&port->cm_priv_prim_list); - INIT_LIST_HEAD(&port->cm_priv_altr_list); - - ret = cm_create_port_fs(port); + ret = ib_port_register_client_groups(ib_device, i, + cm_counter_groups); if (ret) goto error1; @@ -4404,30 +4389,47 @@ static void cm_add_one(struct ib_device *ib_device) cm_recv_handler, port, 0); - if (IS_ERR(port->mad_agent)) + if (IS_ERR(port->mad_agent)) { + ret = PTR_ERR(port->mad_agent); goto error2; + } + + port->rep_agent = ib_register_mad_agent(ib_device, i, + IB_QPT_GSI, + NULL, + 0, + cm_send_handler, + NULL, + port, + 0); + if (IS_ERR(port->rep_agent)) { + ret = PTR_ERR(port->rep_agent); + goto error3; + } ret = ib_modify_port(ib_device, i, 0, &port_modify); if (ret) - goto error3; + goto error4; count++; } - if (!count) + if (!count) { + ret = -EOPNOTSUPP; goto free; - - ib_set_client_data(ib_device, &cm_client, cm_dev); + } write_lock_irqsave(&cm.device_lock, flags); list_add_tail(&cm_dev->list, &cm.device_list); write_unlock_irqrestore(&cm.device_lock, flags); - return; + return 0; +error4: + 
ib_unregister_mad_agent(port->rep_agent); error3: ib_unregister_mad_agent(port->mad_agent); error2: - cm_remove_port_fs(port); + ib_port_unregister_client_groups(ib_device, i, cm_counter_groups); error1: port_modify.set_port_cap_mask = 0; port_modify.clr_port_cap_mask = IB_PORT_CM_SUP; @@ -4437,28 +4439,25 @@ error1: port = cm_dev->port[i-1]; ib_modify_port(ib_device, port->port_num, 0, &port_modify); + ib_unregister_mad_agent(port->rep_agent); ib_unregister_mad_agent(port->mad_agent); - cm_remove_port_fs(port); + ib_port_unregister_client_groups(ib_device, i, + cm_counter_groups); } free: - device_unregister(cm_dev->device); - kfree(cm_dev); + cm_device_put(cm_dev); + return ret; } static void cm_remove_one(struct ib_device *ib_device, void *client_data) { struct cm_device *cm_dev = client_data; struct cm_port *port; - struct cm_id_private *cm_id_priv; - struct ib_mad_agent *cur_mad_agent; struct ib_port_modify port_modify = { .clr_port_cap_mask = IB_PORT_CM_SUP }; unsigned long flags; - int i; - - if (!cm_dev) - return; + u32 i; write_lock_irqsave(&cm.device_lock, flags); list_del(&cm_dev->list); @@ -4468,62 +4467,57 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) cm_dev->going_down = 1; spin_unlock_irq(&cm.lock); - for (i = 1; i <= ib_device->phys_port_cnt; i++) { + rdma_for_each_port (ib_device, i) { + struct ib_mad_agent *mad_agent; + struct ib_mad_agent *rep_agent; + if (!rdma_cap_ib_cm(ib_device, i)) continue; port = cm_dev->port[i-1]; + mad_agent = port->mad_agent; + rep_agent = port->rep_agent; ib_modify_port(ib_device, port->port_num, 0, &port_modify); - /* Mark all the cm_id's as not valid */ - spin_lock_irq(&cm.lock); - list_for_each_entry(cm_id_priv, &port->cm_priv_altr_list, altr_list) - cm_id_priv->altr_send_port_not_ready = 1; - list_for_each_entry(cm_id_priv, &port->cm_priv_prim_list, prim_list) - cm_id_priv->prim_send_port_not_ready = 1; - spin_unlock_irq(&cm.lock); /* * We flush the queue here after the going_down set, this * verify that no new works will be queued in the recv handler, * after that we can call the unregister_mad_agent */ flush_workqueue(cm.wq); - spin_lock_irq(&cm.state_lock); - cur_mad_agent = port->mad_agent; + /* + * The above ensures no call paths from the work are running, + * the remaining paths all take the mad_agent_lock. 
+ */ + write_lock(&cm_dev->mad_agent_lock); port->mad_agent = NULL; - spin_unlock_irq(&cm.state_lock); - ib_unregister_mad_agent(cur_mad_agent); - cm_remove_port_fs(port); + port->rep_agent = NULL; + write_unlock(&cm_dev->mad_agent_lock); + ib_unregister_mad_agent(mad_agent); + ib_unregister_mad_agent(rep_agent); + ib_port_unregister_client_groups(ib_device, i, + cm_counter_groups); } - device_unregister(cm_dev->device); - kfree(cm_dev); + cm_device_put(cm_dev); } static int __init ib_cm_init(void) { int ret; - memset(&cm, 0, sizeof cm); INIT_LIST_HEAD(&cm.device_list); rwlock_init(&cm.device_lock); spin_lock_init(&cm.lock); - spin_lock_init(&cm.state_lock); cm.listen_service_table = RB_ROOT; cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID); cm.remote_id_table = RB_ROOT; cm.remote_qp_table = RB_ROOT; cm.remote_sidr_table = RB_ROOT; - idr_init(&cm.local_id_table); + xa_init_flags(&cm.local_id_table, XA_FLAGS_ALLOC); get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand); INIT_LIST_HEAD(&cm.timewait_list); - ret = class_register(&cm_class); - if (ret) { - ret = -ENOMEM; - goto error1; - } - - cm.wq = alloc_workqueue("ib_cm", 0, 1); + cm.wq = alloc_workqueue("ib_cm", WQ_PERCPU, 1); if (!cm.wq) { ret = -ENOMEM; goto error2; @@ -4537,9 +4531,6 @@ static int __init ib_cm_init(void) error3: destroy_workqueue(cm.wq); error2: - class_unregister(&cm_class); -error1: - idr_destroy(&cm.local_id_table); return ret; } @@ -4560,10 +4551,8 @@ static void __exit ib_cm_cleanup(void) kfree(timewait_info); } - class_unregister(&cm_class); - idr_destroy(&cm.local_id_table); + WARN_ON(!xa_empty(&cm.local_id_table)); } module_init(ib_cm_init); module_exit(ib_cm_cleanup); - diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h index 476d4309576d..8462de7ca26e 100644 --- a/drivers/infiniband/core/cm_msgs.h +++ b/drivers/infiniband/core/cm_msgs.h @@ -1,39 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2004, 2011 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING the madirectory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use source and binary forms, with or - * withmodification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retathe above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHWARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS THE - * SOFTWARE. + * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. 
*/ -#if !defined(CM_MSGS_H) +#ifndef CM_MSGS_H #define CM_MSGS_H +#include <rdma/ibta_vol1_c12.h> #include <rdma/ib_mad.h> #include <rdma/ib_cm.h> @@ -44,120 +19,14 @@ #define IB_CM_CLASS_VERSION 2 /* IB specification 1.2 */ -struct cm_req_msg { - struct ib_mad_hdr hdr; - - __be32 local_comm_id; - __be32 rsvd4; - __be64 service_id; - __be64 local_ca_guid; - __be32 rsvd24; - __be32 local_qkey; - /* local QPN:24, responder resources:8 */ - __be32 offset32; - /* local EECN:24, initiator depth:8 */ - __be32 offset36; - /* - * remote EECN:24, remote CM response timeout:5, - * transport service type:2, end-to-end flow control:1 - */ - __be32 offset40; - /* starting PSN:24, local CM response timeout:5, retry count:3 */ - __be32 offset44; - __be16 pkey; - /* path MTU:4, RDC exists:1, RNR retry count:3. */ - u8 offset50; - /* max CM Retries:4, SRQ:1, extended transport type:3 */ - u8 offset51; - - __be16 primary_local_lid; - __be16 primary_remote_lid; - union ib_gid primary_local_gid; - union ib_gid primary_remote_gid; - /* flow label:20, rsvd:6, packet rate:6 */ - __be32 primary_offset88; - u8 primary_traffic_class; - u8 primary_hop_limit; - /* SL:4, subnet local:1, rsvd:3 */ - u8 primary_offset94; - /* local ACK timeout:5, rsvd:3 */ - u8 primary_offset95; - - __be16 alt_local_lid; - __be16 alt_remote_lid; - union ib_gid alt_local_gid; - union ib_gid alt_remote_gid; - /* flow label:20, rsvd:6, packet rate:6 */ - __be32 alt_offset132; - u8 alt_traffic_class; - u8 alt_hop_limit; - /* SL:4, subnet local:1, rsvd:3 */ - u8 alt_offset138; - /* local ACK timeout:5, rsvd:3 */ - u8 alt_offset139; - - u32 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE / sizeof(u32)]; - -} __attribute__ ((packed)); - -static inline __be32 cm_req_get_local_qpn(struct cm_req_msg *req_msg) -{ - return cpu_to_be32(be32_to_cpu(req_msg->offset32) >> 8); -} - -static inline void cm_req_set_local_qpn(struct cm_req_msg *req_msg, __be32 qpn) -{ - req_msg->offset32 = cpu_to_be32((be32_to_cpu(qpn) << 8) | - (be32_to_cpu(req_msg->offset32) & - 0x000000FF)); -} - -static inline u8 cm_req_get_resp_res(struct cm_req_msg *req_msg) -{ - return (u8) be32_to_cpu(req_msg->offset32); -} - -static inline void cm_req_set_resp_res(struct cm_req_msg *req_msg, u8 resp_res) -{ - req_msg->offset32 = cpu_to_be32(resp_res | - (be32_to_cpu(req_msg->offset32) & - 0xFFFFFF00)); -} - -static inline u8 cm_req_get_init_depth(struct cm_req_msg *req_msg) -{ - return (u8) be32_to_cpu(req_msg->offset36); -} - -static inline void cm_req_set_init_depth(struct cm_req_msg *req_msg, - u8 init_depth) -{ - req_msg->offset36 = cpu_to_be32(init_depth | - (be32_to_cpu(req_msg->offset36) & - 0xFFFFFF00)); -} - -static inline u8 cm_req_get_remote_resp_timeout(struct cm_req_msg *req_msg) -{ - return (u8) ((be32_to_cpu(req_msg->offset40) & 0xF8) >> 3); -} - -static inline void cm_req_set_remote_resp_timeout(struct cm_req_msg *req_msg, - u8 resp_timeout) -{ - req_msg->offset40 = cpu_to_be32((resp_timeout << 3) | - (be32_to_cpu(req_msg->offset40) & - 0xFFFFFF07)); -} - static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg) { - u8 transport_type = (u8) (be32_to_cpu(req_msg->offset40) & 0x06) >> 1; - switch(transport_type) { + u8 transport_type = IBA_GET(CM_REQ_TRANSPORT_SERVICE_TYPE, req_msg); + switch (transport_type) { case 0: return IB_QPT_RC; case 1: return IB_QPT_UC; case 3: - switch (req_msg->offset51 & 0x7) { + switch (IBA_GET(CM_REQ_EXTENDED_TRANSPORT_TYPE, req_msg)) { case 1: return IB_QPT_XRC_TGT; default: return 0; } @@ -168,242 +37,19 @@ static inline 
enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg) static inline void cm_req_set_qp_type(struct cm_req_msg *req_msg, enum ib_qp_type qp_type) { - switch(qp_type) { + switch (qp_type) { case IB_QPT_UC: - req_msg->offset40 = cpu_to_be32((be32_to_cpu( - req_msg->offset40) & - 0xFFFFFFF9) | 0x2); + IBA_SET(CM_REQ_TRANSPORT_SERVICE_TYPE, req_msg, 1); break; case IB_QPT_XRC_INI: - req_msg->offset40 = cpu_to_be32((be32_to_cpu( - req_msg->offset40) & - 0xFFFFFFF9) | 0x6); - req_msg->offset51 = (req_msg->offset51 & 0xF8) | 1; + IBA_SET(CM_REQ_TRANSPORT_SERVICE_TYPE, req_msg, 3); + IBA_SET(CM_REQ_EXTENDED_TRANSPORT_TYPE, req_msg, 1); break; default: - req_msg->offset40 = cpu_to_be32(be32_to_cpu( - req_msg->offset40) & - 0xFFFFFFF9); + IBA_SET(CM_REQ_TRANSPORT_SERVICE_TYPE, req_msg, 0); } } -static inline u8 cm_req_get_flow_ctrl(struct cm_req_msg *req_msg) -{ - return be32_to_cpu(req_msg->offset40) & 0x1; -} - -static inline void cm_req_set_flow_ctrl(struct cm_req_msg *req_msg, - u8 flow_ctrl) -{ - req_msg->offset40 = cpu_to_be32((flow_ctrl & 0x1) | - (be32_to_cpu(req_msg->offset40) & - 0xFFFFFFFE)); -} - -static inline __be32 cm_req_get_starting_psn(struct cm_req_msg *req_msg) -{ - return cpu_to_be32(be32_to_cpu(req_msg->offset44) >> 8); -} - -static inline void cm_req_set_starting_psn(struct cm_req_msg *req_msg, - __be32 starting_psn) -{ - req_msg->offset44 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) | - (be32_to_cpu(req_msg->offset44) & 0x000000FF)); -} - -static inline u8 cm_req_get_local_resp_timeout(struct cm_req_msg *req_msg) -{ - return (u8) ((be32_to_cpu(req_msg->offset44) & 0xF8) >> 3); -} - -static inline void cm_req_set_local_resp_timeout(struct cm_req_msg *req_msg, - u8 resp_timeout) -{ - req_msg->offset44 = cpu_to_be32((resp_timeout << 3) | - (be32_to_cpu(req_msg->offset44) & 0xFFFFFF07)); -} - -static inline u8 cm_req_get_retry_count(struct cm_req_msg *req_msg) -{ - return (u8) (be32_to_cpu(req_msg->offset44) & 0x7); -} - -static inline void cm_req_set_retry_count(struct cm_req_msg *req_msg, - u8 retry_count) -{ - req_msg->offset44 = cpu_to_be32((retry_count & 0x7) | - (be32_to_cpu(req_msg->offset44) & 0xFFFFFFF8)); -} - -static inline u8 cm_req_get_path_mtu(struct cm_req_msg *req_msg) -{ - return req_msg->offset50 >> 4; -} - -static inline void cm_req_set_path_mtu(struct cm_req_msg *req_msg, u8 path_mtu) -{ - req_msg->offset50 = (u8) ((req_msg->offset50 & 0xF) | (path_mtu << 4)); -} - -static inline u8 cm_req_get_rnr_retry_count(struct cm_req_msg *req_msg) -{ - return req_msg->offset50 & 0x7; -} - -static inline void cm_req_set_rnr_retry_count(struct cm_req_msg *req_msg, - u8 rnr_retry_count) -{ - req_msg->offset50 = (u8) ((req_msg->offset50 & 0xF8) | - (rnr_retry_count & 0x7)); -} - -static inline u8 cm_req_get_max_cm_retries(struct cm_req_msg *req_msg) -{ - return req_msg->offset51 >> 4; -} - -static inline void cm_req_set_max_cm_retries(struct cm_req_msg *req_msg, - u8 retries) -{ - req_msg->offset51 = (u8) ((req_msg->offset51 & 0xF) | (retries << 4)); -} - -static inline u8 cm_req_get_srq(struct cm_req_msg *req_msg) -{ - return (req_msg->offset51 & 0x8) >> 3; -} - -static inline void cm_req_set_srq(struct cm_req_msg *req_msg, u8 srq) -{ - req_msg->offset51 = (u8) ((req_msg->offset51 & 0xF7) | - ((srq & 0x1) << 3)); -} - -static inline __be32 cm_req_get_primary_flow_label(struct cm_req_msg *req_msg) -{ - return cpu_to_be32(be32_to_cpu(req_msg->primary_offset88) >> 12); -} - -static inline void cm_req_set_primary_flow_label(struct cm_req_msg *req_msg, - __be32 
flow_label) -{ - req_msg->primary_offset88 = cpu_to_be32( - (be32_to_cpu(req_msg->primary_offset88) & - 0x00000FFF) | - (be32_to_cpu(flow_label) << 12)); -} - -static inline u8 cm_req_get_primary_packet_rate(struct cm_req_msg *req_msg) -{ - return (u8) (be32_to_cpu(req_msg->primary_offset88) & 0x3F); -} - -static inline void cm_req_set_primary_packet_rate(struct cm_req_msg *req_msg, - u8 rate) -{ - req_msg->primary_offset88 = cpu_to_be32( - (be32_to_cpu(req_msg->primary_offset88) & - 0xFFFFFFC0) | (rate & 0x3F)); -} - -static inline u8 cm_req_get_primary_sl(struct cm_req_msg *req_msg) -{ - return (u8) (req_msg->primary_offset94 >> 4); -} - -static inline void cm_req_set_primary_sl(struct cm_req_msg *req_msg, u8 sl) -{ - req_msg->primary_offset94 = (u8) ((req_msg->primary_offset94 & 0x0F) | - (sl << 4)); -} - -static inline u8 cm_req_get_primary_subnet_local(struct cm_req_msg *req_msg) -{ - return (u8) ((req_msg->primary_offset94 & 0x08) >> 3); -} - -static inline void cm_req_set_primary_subnet_local(struct cm_req_msg *req_msg, - u8 subnet_local) -{ - req_msg->primary_offset94 = (u8) ((req_msg->primary_offset94 & 0xF7) | - ((subnet_local & 0x1) << 3)); -} - -static inline u8 cm_req_get_primary_local_ack_timeout(struct cm_req_msg *req_msg) -{ - return (u8) (req_msg->primary_offset95 >> 3); -} - -static inline void cm_req_set_primary_local_ack_timeout(struct cm_req_msg *req_msg, - u8 local_ack_timeout) -{ - req_msg->primary_offset95 = (u8) ((req_msg->primary_offset95 & 0x07) | - (local_ack_timeout << 3)); -} - -static inline __be32 cm_req_get_alt_flow_label(struct cm_req_msg *req_msg) -{ - return cpu_to_be32(be32_to_cpu(req_msg->alt_offset132) >> 12); -} - -static inline void cm_req_set_alt_flow_label(struct cm_req_msg *req_msg, - __be32 flow_label) -{ - req_msg->alt_offset132 = cpu_to_be32( - (be32_to_cpu(req_msg->alt_offset132) & - 0x00000FFF) | - (be32_to_cpu(flow_label) << 12)); -} - -static inline u8 cm_req_get_alt_packet_rate(struct cm_req_msg *req_msg) -{ - return (u8) (be32_to_cpu(req_msg->alt_offset132) & 0x3F); -} - -static inline void cm_req_set_alt_packet_rate(struct cm_req_msg *req_msg, - u8 rate) -{ - req_msg->alt_offset132 = cpu_to_be32( - (be32_to_cpu(req_msg->alt_offset132) & - 0xFFFFFFC0) | (rate & 0x3F)); -} - -static inline u8 cm_req_get_alt_sl(struct cm_req_msg *req_msg) -{ - return (u8) (req_msg->alt_offset138 >> 4); -} - -static inline void cm_req_set_alt_sl(struct cm_req_msg *req_msg, u8 sl) -{ - req_msg->alt_offset138 = (u8) ((req_msg->alt_offset138 & 0x0F) | - (sl << 4)); -} - -static inline u8 cm_req_get_alt_subnet_local(struct cm_req_msg *req_msg) -{ - return (u8) ((req_msg->alt_offset138 & 0x08) >> 3); -} - -static inline void cm_req_set_alt_subnet_local(struct cm_req_msg *req_msg, - u8 subnet_local) -{ - req_msg->alt_offset138 = (u8) ((req_msg->alt_offset138 & 0xF7) | - ((subnet_local & 0x1) << 3)); -} - -static inline u8 cm_req_get_alt_local_ack_timeout(struct cm_req_msg *req_msg) -{ - return (u8) (req_msg->alt_offset139 >> 3); -} - -static inline void cm_req_set_alt_local_ack_timeout(struct cm_req_msg *req_msg, - u8 local_ack_timeout) -{ - req_msg->alt_offset139 = (u8) ((req_msg->alt_offset139 & 0x07) | - (local_ack_timeout << 3)); -} - /* Message REJected or MRAed */ enum cm_msg_response { CM_MSG_RESPONSE_REQ = 0x0, @@ -411,419 +57,12 @@ enum cm_msg_response { CM_MSG_RESPONSE_OTHER = 0x2 }; - struct cm_mra_msg { - struct ib_mad_hdr hdr; - - __be32 local_comm_id; - __be32 remote_comm_id; - /* message MRAed:2, rsvd:6 */ - u8 offset8; - /* service timeout:5, 
rsvd:3 */ - u8 offset9; - - u8 private_data[IB_CM_MRA_PRIVATE_DATA_SIZE]; - -} __attribute__ ((packed)); - -static inline u8 cm_mra_get_msg_mraed(struct cm_mra_msg *mra_msg) -{ - return (u8) (mra_msg->offset8 >> 6); -} - -static inline void cm_mra_set_msg_mraed(struct cm_mra_msg *mra_msg, u8 msg) -{ - mra_msg->offset8 = (u8) ((mra_msg->offset8 & 0x3F) | (msg << 6)); -} - -static inline u8 cm_mra_get_service_timeout(struct cm_mra_msg *mra_msg) -{ - return (u8) (mra_msg->offset9 >> 3); -} - -static inline void cm_mra_set_service_timeout(struct cm_mra_msg *mra_msg, - u8 service_timeout) -{ - mra_msg->offset9 = (u8) ((mra_msg->offset9 & 0x07) | - (service_timeout << 3)); -} - -struct cm_rej_msg { - struct ib_mad_hdr hdr; - - __be32 local_comm_id; - __be32 remote_comm_id; - /* message REJected:2, rsvd:6 */ - u8 offset8; - /* reject info length:7, rsvd:1. */ - u8 offset9; - __be16 reason; - u8 ari[IB_CM_REJ_ARI_LENGTH]; - - u8 private_data[IB_CM_REJ_PRIVATE_DATA_SIZE]; - -} __attribute__ ((packed)); - -static inline u8 cm_rej_get_msg_rejected(struct cm_rej_msg *rej_msg) -{ - return (u8) (rej_msg->offset8 >> 6); -} - -static inline void cm_rej_set_msg_rejected(struct cm_rej_msg *rej_msg, u8 msg) -{ - rej_msg->offset8 = (u8) ((rej_msg->offset8 & 0x3F) | (msg << 6)); -} - -static inline u8 cm_rej_get_reject_info_len(struct cm_rej_msg *rej_msg) -{ - return (u8) (rej_msg->offset9 >> 1); -} - -static inline void cm_rej_set_reject_info_len(struct cm_rej_msg *rej_msg, - u8 len) -{ - rej_msg->offset9 = (u8) ((rej_msg->offset9 & 0x1) | (len << 1)); -} - -struct cm_rep_msg { - struct ib_mad_hdr hdr; - - __be32 local_comm_id; - __be32 remote_comm_id; - __be32 local_qkey; - /* local QPN:24, rsvd:8 */ - __be32 offset12; - /* local EECN:24, rsvd:8 */ - __be32 offset16; - /* starting PSN:24 rsvd:8 */ - __be32 offset20; - u8 resp_resources; - u8 initiator_depth; - /* target ACK delay:5, failover accepted:2, end-to-end flow control:1 */ - u8 offset26; - /* RNR retry count:3, SRQ:1, rsvd:5 */ - u8 offset27; - __be64 local_ca_guid; - - u8 private_data[IB_CM_REP_PRIVATE_DATA_SIZE]; - -} __attribute__ ((packed)); - -static inline __be32 cm_rep_get_local_qpn(struct cm_rep_msg *rep_msg) -{ - return cpu_to_be32(be32_to_cpu(rep_msg->offset12) >> 8); -} - -static inline void cm_rep_set_local_qpn(struct cm_rep_msg *rep_msg, __be32 qpn) -{ - rep_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) | - (be32_to_cpu(rep_msg->offset12) & 0x000000FF)); -} - -static inline __be32 cm_rep_get_local_eecn(struct cm_rep_msg *rep_msg) -{ - return cpu_to_be32(be32_to_cpu(rep_msg->offset16) >> 8); -} - -static inline void cm_rep_set_local_eecn(struct cm_rep_msg *rep_msg, __be32 eecn) -{ - rep_msg->offset16 = cpu_to_be32((be32_to_cpu(eecn) << 8) | - (be32_to_cpu(rep_msg->offset16) & 0x000000FF)); -} - static inline __be32 cm_rep_get_qpn(struct cm_rep_msg *rep_msg, enum ib_qp_type qp_type) { return (qp_type == IB_QPT_XRC_INI) ? 
- cm_rep_get_local_eecn(rep_msg) : cm_rep_get_local_qpn(rep_msg); -} - -static inline __be32 cm_rep_get_starting_psn(struct cm_rep_msg *rep_msg) -{ - return cpu_to_be32(be32_to_cpu(rep_msg->offset20) >> 8); -} - -static inline void cm_rep_set_starting_psn(struct cm_rep_msg *rep_msg, - __be32 starting_psn) -{ - rep_msg->offset20 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) | - (be32_to_cpu(rep_msg->offset20) & 0x000000FF)); -} - -static inline u8 cm_rep_get_target_ack_delay(struct cm_rep_msg *rep_msg) -{ - return (u8) (rep_msg->offset26 >> 3); -} - -static inline void cm_rep_set_target_ack_delay(struct cm_rep_msg *rep_msg, - u8 target_ack_delay) -{ - rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0x07) | - (target_ack_delay << 3)); -} - -static inline u8 cm_rep_get_failover(struct cm_rep_msg *rep_msg) -{ - return (u8) ((rep_msg->offset26 & 0x06) >> 1); -} - -static inline void cm_rep_set_failover(struct cm_rep_msg *rep_msg, u8 failover) -{ - rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0xF9) | - ((failover & 0x3) << 1)); -} - -static inline u8 cm_rep_get_flow_ctrl(struct cm_rep_msg *rep_msg) -{ - return (u8) (rep_msg->offset26 & 0x01); -} - -static inline void cm_rep_set_flow_ctrl(struct cm_rep_msg *rep_msg, - u8 flow_ctrl) -{ - rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0xFE) | - (flow_ctrl & 0x1)); -} - -static inline u8 cm_rep_get_rnr_retry_count(struct cm_rep_msg *rep_msg) -{ - return (u8) (rep_msg->offset27 >> 5); -} - -static inline void cm_rep_set_rnr_retry_count(struct cm_rep_msg *rep_msg, - u8 rnr_retry_count) -{ - rep_msg->offset27 = (u8) ((rep_msg->offset27 & 0x1F) | - (rnr_retry_count << 5)); -} - -static inline u8 cm_rep_get_srq(struct cm_rep_msg *rep_msg) -{ - return (u8) ((rep_msg->offset27 >> 4) & 0x1); -} - -static inline void cm_rep_set_srq(struct cm_rep_msg *rep_msg, u8 srq) -{ - rep_msg->offset27 = (u8) ((rep_msg->offset27 & 0xEF) | - ((srq & 0x1) << 4)); -} - -struct cm_rtu_msg { - struct ib_mad_hdr hdr; - - __be32 local_comm_id; - __be32 remote_comm_id; - - u8 private_data[IB_CM_RTU_PRIVATE_DATA_SIZE]; - -} __attribute__ ((packed)); - -struct cm_dreq_msg { - struct ib_mad_hdr hdr; - - __be32 local_comm_id; - __be32 remote_comm_id; - /* remote QPN/EECN:24, rsvd:8 */ - __be32 offset8; - - u8 private_data[IB_CM_DREQ_PRIVATE_DATA_SIZE]; - -} __attribute__ ((packed)); - -static inline __be32 cm_dreq_get_remote_qpn(struct cm_dreq_msg *dreq_msg) -{ - return cpu_to_be32(be32_to_cpu(dreq_msg->offset8) >> 8); -} - -static inline void cm_dreq_set_remote_qpn(struct cm_dreq_msg *dreq_msg, __be32 qpn) -{ - dreq_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) | - (be32_to_cpu(dreq_msg->offset8) & 0x000000FF)); -} - -struct cm_drep_msg { - struct ib_mad_hdr hdr; - - __be32 local_comm_id; - __be32 remote_comm_id; - - u8 private_data[IB_CM_DREP_PRIVATE_DATA_SIZE]; - -} __attribute__ ((packed)); - -struct cm_lap_msg { - struct ib_mad_hdr hdr; - - __be32 local_comm_id; - __be32 remote_comm_id; - - __be32 rsvd8; - /* remote QPN/EECN:24, remote CM response timeout:5, rsvd:3 */ - __be32 offset12; - __be32 rsvd16; - - __be16 alt_local_lid; - __be16 alt_remote_lid; - union ib_gid alt_local_gid; - union ib_gid alt_remote_gid; - /* flow label:20, rsvd:4, traffic class:8 */ - __be32 offset56; - u8 alt_hop_limit; - /* rsvd:2, packet rate:6 */ - u8 offset61; - /* SL:4, subnet local:1, rsvd:3 */ - u8 offset62; - /* local ACK timeout:5, rsvd:3 */ - u8 offset63; - - u8 private_data[IB_CM_LAP_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); - -static inline __be32 
cm_lap_get_remote_qpn(struct cm_lap_msg *lap_msg) -{ - return cpu_to_be32(be32_to_cpu(lap_msg->offset12) >> 8); -} - -static inline void cm_lap_set_remote_qpn(struct cm_lap_msg *lap_msg, __be32 qpn) -{ - lap_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) | - (be32_to_cpu(lap_msg->offset12) & - 0x000000FF)); -} - -static inline u8 cm_lap_get_remote_resp_timeout(struct cm_lap_msg *lap_msg) -{ - return (u8) ((be32_to_cpu(lap_msg->offset12) & 0xF8) >> 3); -} - -static inline void cm_lap_set_remote_resp_timeout(struct cm_lap_msg *lap_msg, - u8 resp_timeout) -{ - lap_msg->offset12 = cpu_to_be32((resp_timeout << 3) | - (be32_to_cpu(lap_msg->offset12) & - 0xFFFFFF07)); -} - -static inline __be32 cm_lap_get_flow_label(struct cm_lap_msg *lap_msg) -{ - return cpu_to_be32(be32_to_cpu(lap_msg->offset56) >> 12); -} - -static inline void cm_lap_set_flow_label(struct cm_lap_msg *lap_msg, - __be32 flow_label) -{ - lap_msg->offset56 = cpu_to_be32( - (be32_to_cpu(lap_msg->offset56) & 0x00000FFF) | - (be32_to_cpu(flow_label) << 12)); -} - -static inline u8 cm_lap_get_traffic_class(struct cm_lap_msg *lap_msg) -{ - return (u8) be32_to_cpu(lap_msg->offset56); -} - -static inline void cm_lap_set_traffic_class(struct cm_lap_msg *lap_msg, - u8 traffic_class) -{ - lap_msg->offset56 = cpu_to_be32(traffic_class | - (be32_to_cpu(lap_msg->offset56) & - 0xFFFFFF00)); -} - -static inline u8 cm_lap_get_packet_rate(struct cm_lap_msg *lap_msg) -{ - return lap_msg->offset61 & 0x3F; -} - -static inline void cm_lap_set_packet_rate(struct cm_lap_msg *lap_msg, - u8 packet_rate) -{ - lap_msg->offset61 = (packet_rate & 0x3F) | (lap_msg->offset61 & 0xC0); -} - -static inline u8 cm_lap_get_sl(struct cm_lap_msg *lap_msg) -{ - return lap_msg->offset62 >> 4; -} - -static inline void cm_lap_set_sl(struct cm_lap_msg *lap_msg, u8 sl) -{ - lap_msg->offset62 = (sl << 4) | (lap_msg->offset62 & 0x0F); -} - -static inline u8 cm_lap_get_subnet_local(struct cm_lap_msg *lap_msg) -{ - return (lap_msg->offset62 >> 3) & 0x1; -} - -static inline void cm_lap_set_subnet_local(struct cm_lap_msg *lap_msg, - u8 subnet_local) -{ - lap_msg->offset62 = ((subnet_local & 0x1) << 3) | - (lap_msg->offset61 & 0xF7); -} -static inline u8 cm_lap_get_local_ack_timeout(struct cm_lap_msg *lap_msg) -{ - return lap_msg->offset63 >> 3; -} - -static inline void cm_lap_set_local_ack_timeout(struct cm_lap_msg *lap_msg, - u8 local_ack_timeout) -{ - lap_msg->offset63 = (local_ack_timeout << 3) | - (lap_msg->offset63 & 0x07); -} - -struct cm_apr_msg { - struct ib_mad_hdr hdr; - - __be32 local_comm_id; - __be32 remote_comm_id; - - u8 info_length; - u8 ap_status; - __be16 rsvd; - u8 info[IB_CM_APR_INFO_LENGTH]; - - u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); - -struct cm_sidr_req_msg { - struct ib_mad_hdr hdr; - - __be32 request_id; - __be16 pkey; - __be16 rsvd; - __be64 service_id; - - u32 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE / sizeof(u32)]; -} __attribute__ ((packed)); - -struct cm_sidr_rep_msg { - struct ib_mad_hdr hdr; - - __be32 request_id; - u8 status; - u8 info_length; - __be16 rsvd; - /* QPN:24, rsvd:8 */ - __be32 offset8; - __be64 service_id; - __be32 qkey; - u8 info[IB_CM_SIDR_REP_INFO_LENGTH]; - - u8 private_data[IB_CM_SIDR_REP_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); - -static inline __be32 cm_sidr_rep_get_qpn(struct cm_sidr_rep_msg *sidr_rep_msg) -{ - return cpu_to_be32(be32_to_cpu(sidr_rep_msg->offset8) >> 8); -} - -static inline void cm_sidr_rep_set_qpn(struct cm_sidr_rep_msg *sidr_rep_msg, - __be32 qpn) -{ - 
sidr_rep_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) | - (be32_to_cpu(sidr_rep_msg->offset8) & - 0x000000FF)); + cpu_to_be32(IBA_GET(CM_REP_LOCAL_EE_CONTEXT_NUMBER, + rep_msg)) : + cpu_to_be32(IBA_GET(CM_REP_LOCAL_QPN, rep_msg)); } #endif /* CM_MSGS_H */ diff --git a/drivers/infiniband/core/cm_trace.c b/drivers/infiniband/core/cm_trace.c new file mode 100644 index 000000000000..8f3482f66338 --- /dev/null +++ b/drivers/infiniband/core/cm_trace.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Trace points for the IB Connection Manager. + * + * Author: Chuck Lever <chuck.lever@oracle.com> + * + * Copyright (c) 2020, Oracle and/or its affiliates. + */ + +#include <rdma/rdma_cm.h> +#include "cma_priv.h" + +#define CREATE_TRACE_POINTS + +#include "cm_trace.h" diff --git a/drivers/infiniband/core/cm_trace.h b/drivers/infiniband/core/cm_trace.h new file mode 100644 index 000000000000..4a4987da69d4 --- /dev/null +++ b/drivers/infiniband/core/cm_trace.h @@ -0,0 +1,414 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Trace point definitions for the RDMA Connect Manager. + * + * Author: Chuck Lever <chuck.lever@oracle.com> + * + * Copyright (c) 2020 Oracle and/or its affiliates. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ib_cma + +#if !defined(_TRACE_IB_CMA_H) || defined(TRACE_HEADER_MULTI_READ) + +#define _TRACE_IB_CMA_H + +#include <linux/tracepoint.h> +#include <rdma/ib_cm.h> +#include <trace/misc/rdma.h> + +/* + * enum ib_cm_state, from include/rdma/ib_cm.h + */ +#define IB_CM_STATE_LIST \ + ib_cm_state(IDLE) \ + ib_cm_state(LISTEN) \ + ib_cm_state(REQ_SENT) \ + ib_cm_state(REQ_RCVD) \ + ib_cm_state(MRA_REQ_SENT) \ + ib_cm_state(MRA_REQ_RCVD) \ + ib_cm_state(REP_SENT) \ + ib_cm_state(REP_RCVD) \ + ib_cm_state(MRA_REP_SENT) \ + ib_cm_state(MRA_REP_RCVD) \ + ib_cm_state(ESTABLISHED) \ + ib_cm_state(DREQ_SENT) \ + ib_cm_state(DREQ_RCVD) \ + ib_cm_state(TIMEWAIT) \ + ib_cm_state(SIDR_REQ_SENT) \ + ib_cm_state_end(SIDR_REQ_RCVD) + +#undef ib_cm_state +#undef ib_cm_state_end +#define ib_cm_state(x) TRACE_DEFINE_ENUM(IB_CM_##x); +#define ib_cm_state_end(x) TRACE_DEFINE_ENUM(IB_CM_##x); + +IB_CM_STATE_LIST + +#undef ib_cm_state +#undef ib_cm_state_end +#define ib_cm_state(x) { IB_CM_##x, #x }, +#define ib_cm_state_end(x) { IB_CM_##x, #x } + +#define show_ib_cm_state(x) \ + __print_symbolic(x, IB_CM_STATE_LIST) + +/* + * enum ib_cm_lap_state, from include/rdma/ib_cm.h + */ +#define IB_CM_LAP_STATE_LIST \ + ib_cm_lap_state(LAP_UNINIT) \ + ib_cm_lap_state(LAP_IDLE) \ + ib_cm_lap_state(LAP_SENT) \ + ib_cm_lap_state(LAP_RCVD) \ + ib_cm_lap_state(MRA_LAP_SENT) \ + ib_cm_lap_state_end(MRA_LAP_RCVD) + +#undef ib_cm_lap_state +#undef ib_cm_lap_state_end +#define ib_cm_lap_state(x) TRACE_DEFINE_ENUM(IB_CM_##x); +#define ib_cm_lap_state_end(x) TRACE_DEFINE_ENUM(IB_CM_##x); + +IB_CM_LAP_STATE_LIST + +#undef ib_cm_lap_state +#undef ib_cm_lap_state_end +#define ib_cm_lap_state(x) { IB_CM_##x, #x }, +#define ib_cm_lap_state_end(x) { IB_CM_##x, #x } + +#define show_ib_cm_lap_state(x) \ + __print_symbolic(x, IB_CM_LAP_STATE_LIST) + +/* + * enum ib_cm_rej_reason, from include/rdma/ib_cm.h + */ +#define IB_CM_REJ_REASON_LIST \ + ib_cm_rej_reason(REJ_NO_QP) \ + ib_cm_rej_reason(REJ_NO_EEC) \ + ib_cm_rej_reason(REJ_NO_RESOURCES) \ + ib_cm_rej_reason(REJ_TIMEOUT) \ + ib_cm_rej_reason(REJ_UNSUPPORTED) \ + ib_cm_rej_reason(REJ_INVALID_COMM_ID) \ + ib_cm_rej_reason(REJ_INVALID_COMM_INSTANCE) \ + ib_cm_rej_reason(REJ_INVALID_SERVICE_ID) \ + ib_cm_rej_reason(REJ_INVALID_TRANSPORT_TYPE) \ + 
ib_cm_rej_reason(REJ_STALE_CONN) \ + ib_cm_rej_reason(REJ_RDC_NOT_EXIST) \ + ib_cm_rej_reason(REJ_INVALID_GID) \ + ib_cm_rej_reason(REJ_INVALID_LID) \ + ib_cm_rej_reason(REJ_INVALID_SL) \ + ib_cm_rej_reason(REJ_INVALID_TRAFFIC_CLASS) \ + ib_cm_rej_reason(REJ_INVALID_HOP_LIMIT) \ + ib_cm_rej_reason(REJ_INVALID_PACKET_RATE) \ + ib_cm_rej_reason(REJ_INVALID_ALT_GID) \ + ib_cm_rej_reason(REJ_INVALID_ALT_LID) \ + ib_cm_rej_reason(REJ_INVALID_ALT_SL) \ + ib_cm_rej_reason(REJ_INVALID_ALT_TRAFFIC_CLASS) \ + ib_cm_rej_reason(REJ_INVALID_ALT_HOP_LIMIT) \ + ib_cm_rej_reason(REJ_INVALID_ALT_PACKET_RATE) \ + ib_cm_rej_reason(REJ_PORT_CM_REDIRECT) \ + ib_cm_rej_reason(REJ_PORT_REDIRECT) \ + ib_cm_rej_reason(REJ_INVALID_MTU) \ + ib_cm_rej_reason(REJ_INSUFFICIENT_RESP_RESOURCES) \ + ib_cm_rej_reason(REJ_CONSUMER_DEFINED) \ + ib_cm_rej_reason(REJ_INVALID_RNR_RETRY) \ + ib_cm_rej_reason(REJ_DUPLICATE_LOCAL_COMM_ID) \ + ib_cm_rej_reason(REJ_INVALID_CLASS_VERSION) \ + ib_cm_rej_reason(REJ_INVALID_FLOW_LABEL) \ + ib_cm_rej_reason(REJ_INVALID_ALT_FLOW_LABEL) \ + ib_cm_rej_reason_end(REJ_VENDOR_OPTION_NOT_SUPPORTED) + +#undef ib_cm_rej_reason +#undef ib_cm_rej_reason_end +#define ib_cm_rej_reason(x) TRACE_DEFINE_ENUM(IB_CM_##x); +#define ib_cm_rej_reason_end(x) TRACE_DEFINE_ENUM(IB_CM_##x); + +IB_CM_REJ_REASON_LIST + +#undef ib_cm_rej_reason +#undef ib_cm_rej_reason_end +#define ib_cm_rej_reason(x) { IB_CM_##x, #x }, +#define ib_cm_rej_reason_end(x) { IB_CM_##x, #x } + +#define show_ib_cm_rej_reason(x) \ + __print_symbolic(x, IB_CM_REJ_REASON_LIST) + +DECLARE_EVENT_CLASS(icm_id_class, + TP_PROTO( + const struct ib_cm_id *cm_id + ), + + TP_ARGS(cm_id), + + TP_STRUCT__entry( + __field(const void *, cm_id) /* for eBPF scripts */ + __field(unsigned int, local_id) + __field(unsigned int, remote_id) + __field(unsigned long, state) + __field(unsigned long, lap_state) + ), + + TP_fast_assign( + __entry->cm_id = cm_id; + __entry->local_id = be32_to_cpu(cm_id->local_id); + __entry->remote_id = be32_to_cpu(cm_id->remote_id); + __entry->state = cm_id->state; + __entry->lap_state = cm_id->lap_state; + ), + + TP_printk("local_id=%u remote_id=%u state=%s lap_state=%s", + __entry->local_id, __entry->remote_id, + show_ib_cm_state(__entry->state), + show_ib_cm_lap_state(__entry->lap_state) + ) +); + +#define DEFINE_CM_SEND_EVENT(name) \ + DEFINE_EVENT(icm_id_class, \ + icm_send_##name, \ + TP_PROTO( \ + const struct ib_cm_id *cm_id \ + ), \ + TP_ARGS(cm_id)) + +DEFINE_CM_SEND_EVENT(req); +DEFINE_CM_SEND_EVENT(rep); +DEFINE_CM_SEND_EVENT(dup_req); +DEFINE_CM_SEND_EVENT(dup_rep); +DEFINE_CM_SEND_EVENT(rtu); +DEFINE_CM_SEND_EVENT(mra); +DEFINE_CM_SEND_EVENT(sidr_req); +DEFINE_CM_SEND_EVENT(sidr_rep); +DEFINE_CM_SEND_EVENT(dreq); +DEFINE_CM_SEND_EVENT(drep); + +TRACE_EVENT(icm_send_rej, + TP_PROTO( + const struct ib_cm_id *cm_id, + enum ib_cm_rej_reason reason + ), + + TP_ARGS(cm_id, reason), + + TP_STRUCT__entry( + __field(const void *, cm_id) + __field(u32, local_id) + __field(u32, remote_id) + __field(unsigned long, state) + __field(unsigned long, reason) + ), + + TP_fast_assign( + __entry->cm_id = cm_id; + __entry->local_id = be32_to_cpu(cm_id->local_id); + __entry->remote_id = be32_to_cpu(cm_id->remote_id); + __entry->state = cm_id->state; + __entry->reason = reason; + ), + + TP_printk("local_id=%u remote_id=%u state=%s reason=%s", + __entry->local_id, __entry->remote_id, + show_ib_cm_state(__entry->state), + show_ib_cm_rej_reason(__entry->reason) + ) +); + +#define DEFINE_CM_ERR_EVENT(name) \ + DEFINE_EVENT(icm_id_class, \ + 
icm_##name##_err, \ + TP_PROTO( \ + const struct ib_cm_id *cm_id \ + ), \ + TP_ARGS(cm_id)) + +DEFINE_CM_ERR_EVENT(send_cm_rtu); +DEFINE_CM_ERR_EVENT(establish); +DEFINE_CM_ERR_EVENT(no_listener); +DEFINE_CM_ERR_EVENT(send_drep); +DEFINE_CM_ERR_EVENT(dreq_unknown); +DEFINE_CM_ERR_EVENT(send_unknown_rej); +DEFINE_CM_ERR_EVENT(rej_unknown); +DEFINE_CM_ERR_EVENT(prepare_mra_unknown); +DEFINE_CM_ERR_EVENT(mra_unknown); +DEFINE_CM_ERR_EVENT(qp_init); +DEFINE_CM_ERR_EVENT(qp_rtr); +DEFINE_CM_ERR_EVENT(qp_rts); + +DEFINE_EVENT(icm_id_class, \ + icm_dreq_skipped, \ + TP_PROTO( \ + const struct ib_cm_id *cm_id \ + ), \ + TP_ARGS(cm_id) \ +); + +DECLARE_EVENT_CLASS(icm_local_class, + TP_PROTO( + unsigned int local_id, + unsigned int remote_id + ), + + TP_ARGS(local_id, remote_id), + + TP_STRUCT__entry( + __field(unsigned int, local_id) + __field(unsigned int, remote_id) + ), + + TP_fast_assign( + __entry->local_id = local_id; + __entry->remote_id = remote_id; + ), + + TP_printk("local_id=%u remote_id=%u", + __entry->local_id, __entry->remote_id + ) +); + +#define DEFINE_CM_LOCAL_EVENT(name) \ + DEFINE_EVENT(icm_local_class, \ + icm_##name, \ + TP_PROTO( \ + unsigned int local_id, \ + unsigned int remote_id \ + ), \ + TP_ARGS(local_id, remote_id)) + +DEFINE_CM_LOCAL_EVENT(issue_rej); +DEFINE_CM_LOCAL_EVENT(issue_drep); +DEFINE_CM_LOCAL_EVENT(staleconn_err); +DEFINE_CM_LOCAL_EVENT(no_priv_err); + +DECLARE_EVENT_CLASS(icm_remote_class, + TP_PROTO( + u32 remote_id + ), + + TP_ARGS(remote_id), + + TP_STRUCT__entry( + __field(u32, remote_id) + ), + + TP_fast_assign( + __entry->remote_id = remote_id; + ), + + TP_printk("remote_id=%u", + __entry->remote_id + ) +); + +#define DEFINE_CM_REMOTE_EVENT(name) \ + DEFINE_EVENT(icm_remote_class, \ + icm_##name, \ + TP_PROTO( \ + u32 remote_id \ + ), \ + TP_ARGS(remote_id)) + +DEFINE_CM_REMOTE_EVENT(remote_no_priv_err); +DEFINE_CM_REMOTE_EVENT(insert_failed_err); + +TRACE_EVENT(icm_send_rep_err, + TP_PROTO( + __be32 local_id, + enum ib_cm_state state + ), + + TP_ARGS(local_id, state), + + TP_STRUCT__entry( + __field(unsigned int, local_id) + __field(unsigned long, state) + ), + + TP_fast_assign( + __entry->local_id = be32_to_cpu(local_id); + __entry->state = state; + ), + + TP_printk("local_id=%u state=%s", + __entry->local_id, show_ib_cm_state(__entry->state) + ) +); + +TRACE_EVENT(icm_rep_unknown_err, + TP_PROTO( + unsigned int local_id, + unsigned int remote_id, + enum ib_cm_state state + ), + + TP_ARGS(local_id, remote_id, state), + + TP_STRUCT__entry( + __field(unsigned int, local_id) + __field(unsigned int, remote_id) + __field(unsigned long, state) + ), + + TP_fast_assign( + __entry->local_id = local_id; + __entry->remote_id = remote_id; + __entry->state = state; + ), + + TP_printk("local_id=%u remote_id=%u state=%s", + __entry->local_id, __entry->remote_id, + show_ib_cm_state(__entry->state) + ) +); + +TRACE_EVENT(icm_handler_err, + TP_PROTO( + enum ib_cm_event_type event + ), + + TP_ARGS(event), + + TP_STRUCT__entry( + __field(unsigned long, event) + ), + + TP_fast_assign( + __entry->event = event; + ), + + TP_printk("unhandled event=%s", + rdma_show_ib_cm_event(__entry->event) + ) +); + +TRACE_EVENT(icm_mad_send_err, + TP_PROTO( + enum ib_cm_state state, + enum ib_wc_status wc_status + ), + + TP_ARGS(state, wc_status), + + TP_STRUCT__entry( + __field(unsigned long, state) + __field(unsigned long, wc_status) + ), + + TP_fast_assign( + __entry->state = state; + __entry->wc_status = wc_status; + ), + + TP_printk("state=%s completion status=%s", + 
show_ib_cm_state(__entry->state), + rdma_show_wc_status(__entry->wc_status) + ) +); + +#endif /* _TRACE_IB_CMA_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../drivers/infiniband/core +#define TRACE_INCLUDE_FILE cm_trace + +#include <trace/define_trace.h> diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 84f077b2b90a..95e89f5c147c 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1,36 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. - * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 1999-2019, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
*/ #include <linux/completion.h> @@ -38,8 +11,9 @@ #include <linux/in6.h> #include <linux/mutex.h> #include <linux/random.h> +#include <linux/rbtree.h> #include <linux/igmp.h> -#include <linux/idr.h> +#include <linux/xarray.h> #include <linux/inetdevice.h> #include <linux/slab.h> #include <linux/module.h> @@ -47,6 +21,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netevent.h> #include <net/tcp.h> #include <net/ipv6.h> #include <net/ip_fib.h> @@ -63,16 +38,15 @@ #include "core_priv.h" #include "cma_priv.h" +#include "cma_trace.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 -#define CMA_QUERY_CLASSPORT_INFO_TIMEOUT 3000 #define CMA_MAX_CM_RETRIES 15 -#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) -#define CMA_IBOE_PACKET_LIFETIME 18 +#define CMA_IBOE_PACKET_LIFETIME 16 #define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP static const char * const cma_events[] = { @@ -94,6 +68,11 @@ static const char * const cma_events[] = { [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit", }; +static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, + enum ib_gid_type gid_type); + +static void cma_netevent_work_handler(struct work_struct *_work); + const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) { size_t index = event; @@ -117,7 +96,13 @@ const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id, } EXPORT_SYMBOL(rdma_reject_msg); -bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) +/** + * rdma_is_consumer_reject - return true if the consumer rejected the connect + * request. + * @id: Communication identifier that received the REJECT event. + * @reason: Value returned in the REJECT event status field. + */ +static bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) { if (rdma_ib_or_roce(id->device, id->port_num)) return reason == IB_CM_REJ_CONSUMER_DEFINED; @@ -128,7 +113,6 @@ bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) WARN_ON_ONCE(1); return false; } -EXPORT_SYMBOL(rdma_is_consumer_reject); const void *rdma_consumer_reject_data(struct rdma_cm_id *id, struct rdma_cm_event *ev, u8 *data_len) @@ -161,20 +145,7 @@ struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id) } EXPORT_SYMBOL(rdma_iw_cm_id); -/** - * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack. 
- * @res: rdma resource tracking entry pointer - */ -struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res) -{ - struct rdma_id_private *id_priv = - container_of(res, struct rdma_id_private, res); - - return &id_priv->id; -} -EXPORT_SYMBOL(rdma_res_to_id); - -static void cma_add_one(struct ib_device *device); +static int cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device, void *client_data); static struct ib_client cma_client = { @@ -187,14 +158,17 @@ static struct ib_sa_client sa_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); +static struct rb_root id_table = RB_ROOT; +/* Serialize operations of id_table tree */ +static DEFINE_SPINLOCK(id_table_lock); static struct workqueue_struct *cma_wq; static unsigned int cma_pernet_id; struct cma_pernet { - struct idr tcp_ps; - struct idr udp_ps; - struct idr ipoib_ps; - struct idr ib_ps; + struct xarray tcp_ps; + struct xarray udp_ps; + struct xarray ipoib_ps; + struct xarray ib_ps; }; static struct cma_pernet *cma_pernet(struct net *net) @@ -202,7 +176,8 @@ static struct cma_pernet *cma_pernet(struct net *net) return net_generic(net, cma_pernet_id); } -static struct idr *cma_pernet_idr(struct net *net, enum rdma_ucm_port_space ps) +static +struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps) { struct cma_pernet *pernet = cma_pernet(net); @@ -220,11 +195,16 @@ static struct idr *cma_pernet_idr(struct net *net, enum rdma_ucm_port_space ps) } } +struct id_table_entry { + struct list_head id_list; + struct rb_node rb_node; +}; + struct cma_device { struct list_head list; struct ib_device *device; struct completion comp; - atomic_t refcount; + refcount_t refcount; struct list_head id_list; enum ib_gid_type *default_gid_type; u8 *default_roce_tos; @@ -236,45 +216,43 @@ struct rdma_bind_list { unsigned short port; }; -struct class_port_info_context { - struct ib_class_port_info *class_port_info; - struct ib_device *device; - struct completion done; - struct ib_sa_query *sa_query; - u8 port_num; -}; - static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps, struct rdma_bind_list *bind_list, int snum) { - struct idr *idr = cma_pernet_idr(net, ps); + struct xarray *xa = cma_pernet_xa(net, ps); - return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL); + return xa_insert(xa, snum, bind_list, GFP_KERNEL); } static struct rdma_bind_list *cma_ps_find(struct net *net, enum rdma_ucm_port_space ps, int snum) { - struct idr *idr = cma_pernet_idr(net, ps); + struct xarray *xa = cma_pernet_xa(net, ps); - return idr_find(idr, snum); + return xa_load(xa, snum); } static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps, int snum) { - struct idr *idr = cma_pernet_idr(net, ps); + struct xarray *xa = cma_pernet_xa(net, ps); - idr_remove(idr, snum); + xa_erase(xa, snum); } enum { CMA_OPTION_AFONLY, }; -void cma_ref_dev(struct cma_device *cma_dev) +void cma_dev_get(struct cma_device *cma_dev) { - atomic_inc(&cma_dev->refcount); + refcount_inc(&cma_dev->refcount); +} + +void cma_dev_put(struct cma_device *cma_dev) +{ + if (refcount_dec_and_test(&cma_dev->refcount)) + complete(&cma_dev->comp); } struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, @@ -292,13 +270,13 @@ struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, } if (found_cma_dev) - cma_ref_dev(found_cma_dev); + cma_dev_get(found_cma_dev); mutex_unlock(&lock); return found_cma_dev; } int cma_get_default_gid_type(struct 
cma_device *cma_dev, - unsigned int port) + u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; @@ -307,7 +285,7 @@ int cma_get_default_gid_type(struct cma_device *cma_dev, } int cma_set_default_gid_type(struct cma_device *cma_dev, - unsigned int port, + u32 port, enum ib_gid_type default_gid_type) { unsigned long supported_gids; @@ -315,6 +293,10 @@ int cma_set_default_gid_type(struct cma_device *cma_dev, if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; + if (default_gid_type == IB_GID_TYPE_IB && + rdma_protocol_roce_eth_encap(cma_dev->device, port)) + default_gid_type = IB_GID_TYPE_ROCE; + supported_gids = roce_gid_type_mask_support(cma_dev->device, port); if (!(supported_gids & 1 << default_gid_type)) @@ -326,7 +308,7 @@ int cma_set_default_gid_type(struct cma_device *cma_dev, return 0; } -int cma_get_default_roce_tos(struct cma_device *cma_dev, unsigned int port) +int cma_get_default_roce_tos(struct cma_device *cma_dev, u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; @@ -334,7 +316,7 @@ int cma_get_default_roce_tos(struct cma_device *cma_dev, unsigned int port) return cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)]; } -int cma_set_default_roce_tos(struct cma_device *cma_dev, unsigned int port, +int cma_set_default_roce_tos(struct cma_device *cma_dev, u32 port, u8 default_roce_tos) { if (!rdma_is_port_valid(cma_dev->device, port)) @@ -360,12 +342,15 @@ struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) struct cma_multicast { struct rdma_id_private *id_priv; union { - struct ib_sa_multicast *ib; - } multicast; + struct ib_sa_multicast *sa_mc; + struct { + struct work_struct work; + struct rdma_cm_event event; + } iboe_join; + }; struct list_head list; void *context; struct sockaddr_storage addr; - struct kref mcref; u8 join_state; }; @@ -377,18 +362,6 @@ struct cma_work { struct rdma_cm_event event; }; -struct cma_ndev_work { - struct work_struct work; - struct rdma_id_private *id; - struct rdma_cm_event event; -}; - -struct iboe_mcast_work { - struct work_struct work; - struct rdma_id_private *id; - struct cma_multicast *mc; -}; - union cma_ip_addr { struct in6_addr ip6; struct { @@ -418,23 +391,21 @@ struct cma_req_info { u16 pkey; }; -static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&id_priv->lock, flags); - ret = (id_priv->state == comp); - spin_unlock_irqrestore(&id_priv->lock, flags); - return ret; -} - static int cma_comp_exch(struct rdma_id_private *id_priv, enum rdma_cm_state comp, enum rdma_cm_state exch) { unsigned long flags; int ret; + /* + * The FSM uses a funny double locking where state is protected by both + * the handler_mutex and the spinlock. State is not allowed to change + * to/from a handler_mutex protected value without also holding + * handler_mutex. 
+ */ + if (comp == RDMA_CM_CONNECT || exch == RDMA_CM_CONNECT) + lockdep_assert_held(&id_priv->handler_mutex); + spin_lock_irqsave(&id_priv->lock, flags); if ((ret = (id_priv->state == comp))) id_priv->state = exch; @@ -442,27 +413,24 @@ static int cma_comp_exch(struct rdma_id_private *id_priv, return ret; } -static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv, - enum rdma_cm_state exch) +static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) { - unsigned long flags; - enum rdma_cm_state old; + return hdr->ip_version >> 4; +} - spin_lock_irqsave(&id_priv->lock, flags); - old = id_priv->state; - id_priv->state = exch; - spin_unlock_irqrestore(&id_priv->lock, flags); - return old; +static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) +{ + hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } -static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) +static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) { - return hdr->ip_version >> 4; + return (struct sockaddr *)&id_priv->id.route.addr.src_addr; } -static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) +static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) { - hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); + return (struct sockaddr *)&id_priv->id.route.addr.dst_addr; } static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) @@ -485,19 +453,135 @@ static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) return (in_dev) ? 0 : -ENODEV; } +static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa, + struct id_table_entry *entry_b) +{ + struct rdma_id_private *id_priv = list_first_entry( + &entry_b->id_list, struct rdma_id_private, id_list_entry); + int ifindex_b = id_priv->id.route.addr.dev_addr.bound_dev_if; + struct sockaddr *sb = cma_dst_addr(id_priv); + + if (ifindex_a != ifindex_b) + return (ifindex_a > ifindex_b) ? 
1 : -1; + + if (sa->sa_family != sb->sa_family) + return sa->sa_family - sb->sa_family; + + if (sa->sa_family == AF_INET && + __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in)) { + return memcmp(&((struct sockaddr_in *)sa)->sin_addr, + &((struct sockaddr_in *)sb)->sin_addr, + sizeof(((struct sockaddr_in *)sa)->sin_addr)); + } + + if (sa->sa_family == AF_INET6 && + __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in6)) { + return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr, + &((struct sockaddr_in6 *)sb)->sin6_addr); + } + + return -1; +} + +static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv) +{ + struct rb_node **new, *parent = NULL; + struct id_table_entry *this, *node; + unsigned long flags; + int result; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + spin_lock_irqsave(&id_table_lock, flags); + new = &id_table.rb_node; + while (*new) { + this = container_of(*new, struct id_table_entry, rb_node); + result = compare_netdev_and_ip( + node_id_priv->id.route.addr.dev_addr.bound_dev_if, + cma_dst_addr(node_id_priv), this); + + parent = *new; + if (result < 0) + new = &((*new)->rb_left); + else if (result > 0) + new = &((*new)->rb_right); + else { + list_add_tail(&node_id_priv->id_list_entry, + &this->id_list); + kfree(node); + goto unlock; + } + } + + INIT_LIST_HEAD(&node->id_list); + list_add_tail(&node_id_priv->id_list_entry, &node->id_list); + + rb_link_node(&node->rb_node, parent, new); + rb_insert_color(&node->rb_node, &id_table); + +unlock: + spin_unlock_irqrestore(&id_table_lock, flags); + return 0; +} + +static struct id_table_entry * +node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa) +{ + struct rb_node *node = root->rb_node; + struct id_table_entry *data; + int result; + + while (node) { + data = container_of(node, struct id_table_entry, rb_node); + result = compare_netdev_and_ip(ifindex, sa, data); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return data; + } + + return NULL; +} + +static void cma_remove_id_from_tree(struct rdma_id_private *id_priv) +{ + struct id_table_entry *data; + unsigned long flags; + + spin_lock_irqsave(&id_table_lock, flags); + if (list_empty(&id_priv->id_list_entry)) + goto out; + + data = node_from_ndev_ip(&id_table, + id_priv->id.route.addr.dev_addr.bound_dev_if, + cma_dst_addr(id_priv)); + if (!data) + goto out; + + list_del_init(&id_priv->id_list_entry); + if (list_empty(&data->id_list)) { + rb_erase(&data->rb_node, &id_table); + kfree(data); + } +out: + spin_unlock_irqrestore(&id_table_lock, flags); +} + static void _cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { - cma_ref_dev(cma_dev); + cma_dev_get(cma_dev); id_priv->cma_dev = cma_dev; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); - list_add_tail(&id_priv->list, &cma_dev->id_list); - if (id_priv->res.kern_name) - rdma_restrack_kadd(&id_priv->res); - else - rdma_restrack_uadd(&id_priv->res); + list_add_tail(&id_priv->device_item, &cma_dev->id_list); + + trace_cm_id_attach(id_priv, cma_dev->device); } static void cma_attach_to_dev(struct rdma_id_private *id_priv, @@ -509,60 +593,30 @@ static void cma_attach_to_dev(struct rdma_id_private *id_priv, rdma_start_port(cma_dev->device)]; } -void cma_deref_dev(struct cma_device *cma_dev) -{ - if (atomic_dec_and_test(&cma_dev->refcount)) - complete(&cma_dev->comp); -} - -static inline void 
release_mc(struct kref *kref) -{ - struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref); - - kfree(mc->multicast.ib); - kfree(mc); -} - static void cma_release_dev(struct rdma_id_private *id_priv) { mutex_lock(&lock); - list_del(&id_priv->list); - cma_deref_dev(id_priv->cma_dev); + list_del_init(&id_priv->device_item); + cma_dev_put(id_priv->cma_dev); id_priv->cma_dev = NULL; + id_priv->id.device = NULL; + if (id_priv->id.route.addr.dev_addr.sgid_attr) { + rdma_put_gid_attr(id_priv->id.route.addr.dev_addr.sgid_attr); + id_priv->id.route.addr.dev_addr.sgid_attr = NULL; + } mutex_unlock(&lock); } -static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) -{ - return (struct sockaddr *) &id_priv->id.route.addr.src_addr; -} - -static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) -{ - return (struct sockaddr *) &id_priv->id.route.addr.dst_addr; -} - static inline unsigned short cma_family(struct rdma_id_private *id_priv) { return id_priv->id.route.addr.src_addr.ss_family; } -static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) +static int cma_set_default_qkey(struct rdma_id_private *id_priv) { struct ib_sa_mcmember_rec rec; int ret = 0; - if (id_priv->qkey) { - if (qkey && id_priv->qkey != qkey) - return -EINVAL; - return 0; - } - - if (qkey) { - id_priv->qkey = qkey; - return 0; - } - switch (id_priv->id.ps) { case RDMA_PS_UDP: case RDMA_PS_IB: @@ -582,6 +636,16 @@ static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) return ret; } +static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) +{ + if (!qkey || + (id_priv->qkey && (id_priv->qkey != qkey))) + return -EINVAL; + + id_priv->qkey = qkey; + return 0; +} + static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) { dev_addr->dev_type = ARPHRD_INFINIBAND; @@ -604,34 +668,90 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a } static const struct ib_gid_attr * -cma_validate_port(struct ib_device *device, u8 port, +cma_validate_port(struct ib_device *device, u32 port, enum ib_gid_type gid_type, union ib_gid *gid, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + const struct ib_gid_attr *sgid_attr = ERR_PTR(-ENODEV); int bound_if_index = dev_addr->bound_dev_if; - const struct ib_gid_attr *sgid_attr; int dev_type = dev_addr->dev_type; struct net_device *ndev = NULL; + struct net_device *pdev = NULL; + + if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net)) + goto out; if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) - return ERR_PTR(-ENODEV); + goto out; if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) - return ERR_PTR(-ENODEV); + goto out; + + /* + * For drivers that do not associate more than one net device with + * their gid tables, such as iWARP drivers, it is sufficient to + * return the first table entry. + * + * Other driver classes might be included in the future. 
+ */ + if (rdma_protocol_iwarp(device, port)) { + sgid_attr = rdma_get_gid_attr(device, port, 0); + if (IS_ERR(sgid_attr)) + goto out; - if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { - ndev = dev_get_by_index(dev_addr->net, bound_if_index); - if (!ndev) - return ERR_PTR(-ENODEV); + rcu_read_lock(); + ndev = rcu_dereference(sgid_attr->ndev); + if (ndev->ifindex != bound_if_index) { + pdev = dev_get_by_index_rcu(dev_addr->net, bound_if_index); + if (pdev) { + if (is_vlan_dev(pdev)) { + pdev = vlan_dev_real_dev(pdev); + if (ndev->ifindex == pdev->ifindex) + bound_if_index = pdev->ifindex; + } + if (is_vlan_dev(ndev)) { + pdev = vlan_dev_real_dev(ndev); + if (bound_if_index == pdev->ifindex) + bound_if_index = ndev->ifindex; + } + } + } + if (!net_eq(dev_net(ndev), dev_addr->net) || + ndev->ifindex != bound_if_index) { + rdma_put_gid_attr(sgid_attr); + sgid_attr = ERR_PTR(-ENODEV); + } + rcu_read_unlock(); + goto out; + } + + /* + * For a RXE device, it should work with TUN device and normal ethernet + * devices. Use driver_id to check if a device is a RXE device or not. + * ARPHRD_NONE means a TUN device. + */ + if (device->ops.driver_id == RDMA_DRIVER_RXE) { + if ((dev_type == ARPHRD_NONE || dev_type == ARPHRD_ETHER) + && rdma_protocol_roce(device, port)) { + ndev = dev_get_by_index(dev_addr->net, bound_if_index); + if (!ndev) + goto out; + } } else { - gid_type = IB_GID_TYPE_IB; + if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { + ndev = dev_get_by_index(dev_addr->net, bound_if_index); + if (!ndev) + goto out; + } else { + gid_type = IB_GID_TYPE_IB; + } } sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev); - if (ndev) - dev_put(ndev); + dev_put(ndev); +out: return sgid_attr; } @@ -659,7 +779,7 @@ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; - u8 port; + u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) @@ -673,8 +793,7 @@ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) { - for (port = rdma_start_port(cma_dev->device); - port <= rdma_end_port(cma_dev->device); port++) { + rdma_for_each_port (cma_dev->device, port) { gidp = rdma_protocol_roce(cma_dev->device, port) ? 
&iboe_gid : &gid; gid_type = cma_dev->default_gid_type[port - 1]; @@ -740,6 +859,7 @@ static int cma_ib_acquire_dev(struct rdma_id_private *id_priv, mutex_lock(&lock); cma_attach_to_dev(id_priv, listen_id_priv->cma_dev); mutex_unlock(&lock); + rdma_restrack_add(&id_priv->res); return 0; } @@ -752,7 +872,7 @@ static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, enum ib_gid_type gid_type; int ret = -ENODEV; union ib_gid gid; - u8 port; + u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) @@ -776,7 +896,7 @@ static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, } list_for_each_entry(cma_dev, &dev_list, list) { - for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { + rdma_for_each_port (cma_dev->device, port) { if (listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; @@ -794,8 +914,10 @@ static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, } out: - if (!ret) + if (!ret) { cma_attach_to_dev(id_priv, cma_dev); + rdma_restrack_add(&id_priv->res); + } mutex_unlock(&lock); return ret; @@ -809,9 +931,10 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) struct cma_device *cma_dev, *cur_dev; struct sockaddr_ib *addr; union ib_gid gid, sgid, *dgid; + unsigned int p; u16 pkey, index; - u8 p; enum ib_port_state port_state; + int ret; int i; cma_dev = NULL; @@ -821,7 +944,7 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { - for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { + rdma_for_each_port (cur_dev->device, p) { if (!rdma_cap_af_ib(cur_dev->device, p)) continue; @@ -830,9 +953,14 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) if (ib_get_cached_port_state(cur_dev->device, p, &port_state)) continue; - for (i = 0; !rdma_query_gid(cur_dev->device, - p, i, &gid); - i++) { + + for (i = 0; i < cur_dev->device->port_data[p].immutable.gid_tbl_len; + ++i) { + ret = rdma_query_gid(cur_dev->device, p, i, + &gid); + if (ret) + continue; + if (!memcmp(&gid, dgid, sizeof(gid))) { cma_dev = cur_dev; sgid = gid; @@ -856,6 +984,7 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) found: cma_attach_to_dev(id_priv, cma_dev); + rdma_restrack_add(&id_priv->res); mutex_unlock(&lock); addr = (struct sockaddr_ib *)cma_src_addr(id_priv); memcpy(&addr->sib_addr, &sgid, sizeof(sgid)); @@ -863,16 +992,21 @@ found: return 0; } -static void cma_deref_id(struct rdma_id_private *id_priv) +static void cma_id_get(struct rdma_id_private *id_priv) { - if (atomic_dec_and_test(&id_priv->refcount)) + refcount_inc(&id_priv->refcount); +} + +static void cma_id_put(struct rdma_id_private *id_priv) +{ + if (refcount_dec_and_test(&id_priv->refcount)) complete(&id_priv->comp); } -struct rdma_cm_id *__rdma_create_id(struct net *net, - rdma_cm_event_handler event_handler, - void *context, enum rdma_ucm_port_space ps, - enum ib_qp_type qp_type, const char *caller) +static struct rdma_id_private * +__rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, + void *context, enum rdma_ucm_port_space ps, + enum ib_qp_type qp_type, const struct rdma_id_private *parent) { struct rdma_id_private *id_priv; @@ -880,29 +1014,68 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, if (!id_priv) return ERR_PTR(-ENOMEM); - rdma_restrack_set_task(&id_priv->res, caller); - id_priv->res.type = RDMA_RESTRACK_CM_ID; id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = 
event_handler; id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; id_priv->tos_set = false; + id_priv->timeout_set = false; + id_priv->min_rnr_timer_set = false; id_priv->gid_type = IB_GID_TYPE_IB; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); - atomic_set(&id_priv->refcount, 1); + refcount_set(&id_priv->refcount, 1); mutex_init(&id_priv->handler_mutex); + INIT_LIST_HEAD(&id_priv->device_item); + INIT_LIST_HEAD(&id_priv->id_list_entry); INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); id_priv->id.route.addr.dev_addr.net = get_net(net); id_priv->seq_num &= 0x00ffffff; + INIT_WORK(&id_priv->id.net_work, cma_netevent_work_handler); + + rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID); + if (parent) + rdma_restrack_parent_name(&id_priv->res, &parent->res); + + return id_priv; +} + +struct rdma_cm_id * +__rdma_create_kernel_id(struct net *net, rdma_cm_event_handler event_handler, + void *context, enum rdma_ucm_port_space ps, + enum ib_qp_type qp_type, const char *caller) +{ + struct rdma_id_private *ret; + + ret = __rdma_create_id(net, event_handler, context, ps, qp_type, NULL); + if (IS_ERR(ret)) + return ERR_CAST(ret); + + rdma_restrack_set_name(&ret->res, caller); + return &ret->id; +} +EXPORT_SYMBOL(__rdma_create_kernel_id); + +struct rdma_cm_id *rdma_create_user_id(rdma_cm_event_handler event_handler, + void *context, + enum rdma_ucm_port_space ps, + enum ib_qp_type qp_type) +{ + struct rdma_id_private *ret; + + ret = __rdma_create_id(current->nsproxy->net_ns, event_handler, context, + ps, qp_type, NULL); + if (IS_ERR(ret)) + return ERR_CAST(ret); - return &id_priv->id; + rdma_restrack_set_name(&ret->res, NULL); + return &ret->id; } -EXPORT_SYMBOL(__rdma_create_id); +EXPORT_SYMBOL(rdma_create_user_id); static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { @@ -951,27 +1124,34 @@ int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (id->device != pd->device) - return -EINVAL; + if (id->device != pd->device) { + ret = -EINVAL; + goto out_err; + } qp_init_attr->port_num = id->port_num; qp = ib_create_qp(pd, qp_init_attr); - if (IS_ERR(qp)) - return PTR_ERR(qp); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto out_err; + } if (id->qp_type == IB_QPT_UD) ret = cma_init_ud_qp(id_priv, qp); else ret = cma_init_conn_qp(id_priv, qp); if (ret) - goto err; + goto out_destroy; id->qp = qp; id_priv->qp_num = qp->qp_num; id_priv->srq = (qp->srq != NULL); + trace_cm_qp_create(id_priv, pd, qp_init_attr, 0); return 0; -err: +out_destroy: ib_destroy_qp(qp); +out_err: + trace_cm_qp_create(id_priv, pd, qp_init_attr, ret); return ret; } EXPORT_SYMBOL(rdma_create_qp); @@ -981,6 +1161,7 @@ void rdma_destroy_qp(struct rdma_cm_id *id) struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); + trace_cm_qp_destroy(id_priv); mutex_lock(&id_priv->qp_mutex); ib_destroy_qp(id_priv->id.qp); id_priv->id.qp = NULL; @@ -1089,7 +1270,7 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (id_priv->id.qp_type == IB_QPT_UD) { - ret = cma_set_qkey(id_priv, 0); + ret = cma_set_default_qkey(id_priv); if (ret) return ret; @@ -1127,8 +1308,15 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, qp_attr_mask); qp_attr->port_num = id_priv->id.port_num; 
*qp_attr_mask |= IB_QP_PORT; - } else + } else { ret = -ENOSYS; + } + + if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set) + qp_attr->timeout = id_priv->timeout; + + if ((*qp_attr_mask & IB_QP_MIN_RNR_TIMER) && id_priv->min_rnr_timer_set) + qp_attr->min_rnr_timer = id_priv->min_rnr_timer; return ret; } @@ -1170,18 +1358,31 @@ static inline bool cma_any_addr(const struct sockaddr *addr) return cma_zero_addr(addr) || cma_loopback_addr(addr); } -static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst) +static int cma_addr_cmp(const struct sockaddr *src, const struct sockaddr *dst) { if (src->sa_family != dst->sa_family) return -1; switch (src->sa_family) { case AF_INET: - return ((struct sockaddr_in *) src)->sin_addr.s_addr != - ((struct sockaddr_in *) dst)->sin_addr.s_addr; - case AF_INET6: - return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr, - &((struct sockaddr_in6 *) dst)->sin6_addr); + return ((struct sockaddr_in *)src)->sin_addr.s_addr != + ((struct sockaddr_in *)dst)->sin_addr.s_addr; + case AF_INET6: { + struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)src; + struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst; + bool link_local; + + if (ipv6_addr_cmp(&src_addr6->sin6_addr, + &dst_addr6->sin6_addr)) + return 1; + link_local = ipv6_addr_type(&dst_addr6->sin6_addr) & + IPV6_ADDR_LINKLOCAL; + /* Link local must match their scope_ids */ + return link_local ? (src_addr6->sin6_scope_id != + dst_addr6->sin6_scope_id) : + 0; + } + default: return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr, &((struct sockaddr_ib *) dst)->sib_addr); @@ -1403,7 +1604,7 @@ static bool validate_ipv4_net_dev(struct net_device *net_dev, return false; memset(&fl4, 0, sizeof(fl4)); - fl4.flowi4_iif = net_dev->ifindex; + fl4.flowi4_oif = net_dev->ifindex; fl4.daddr = daddr; fl4.saddr = saddr; @@ -1466,6 +1667,7 @@ static struct net_device * roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event) { const struct ib_gid_attr *sgid_attr = NULL; + struct net_device *ndev; if (ib_event->event == IB_CM_REQ_RECEIVED) sgid_attr = ib_event->param.req_rcvd.ppath_sgid_attr; @@ -1474,8 +1676,15 @@ roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event) if (!sgid_attr) return NULL; - dev_hold(sgid_attr->ndev); - return sgid_attr->ndev; + + rcu_read_lock(); + ndev = rdma_read_gid_attr_ndev_rcu(sgid_attr); + if (IS_ERR(ndev)) + ndev = NULL; + else + dev_hold(ndev); + rcu_read_unlock(); + return ndev; } static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event, @@ -1549,7 +1758,7 @@ static bool cma_match_private_data(struct rdma_id_private *id_priv, static bool cma_protocol_roce(const struct rdma_cm_id *id) { struct ib_device *device = id->device; - const int port_num = id->port_num ?: rdma_start_port(device); + const u32 port_num = id->port_num ?: rdma_start_port(device); return rdma_protocol_roce(device, port_num); } @@ -1603,6 +1812,8 @@ static struct rdma_id_private *cma_find_listener( { struct rdma_id_private *id_priv, *id_priv_dev; + lockdep_assert_held(&lock); + if (!bind_list) return ERR_PTR(-EINVAL); @@ -1613,7 +1824,7 @@ static struct rdma_id_private *cma_find_listener( return id_priv; list_for_each_entry(id_priv_dev, &id_priv->listen_list, - listen_list) { + listen_item) { if (id_priv_dev->id.device == cm_id->device && cma_match_net_dev(&id_priv_dev->id, net_dev, req)) @@ -1649,6 +1860,7 @@ cma_ib_id_from_event(struct ib_cm_id *cm_id, } } + mutex_lock(&lock); /* * Net namespace might be getting deleted while route lookup, * cm_id 
lookup is in progress. Therefore, perform netdevice @@ -1677,8 +1889,8 @@ cma_ib_id_from_event(struct ib_cm_id *cm_id, } if (!validate_net_dev(*net_dev, - (struct sockaddr *)&req->listen_addr_storage, - (struct sockaddr *)&req->src_addr_storage)) { + (struct sockaddr *)&req->src_addr_storage, + (struct sockaddr *)&req->listen_addr_storage)) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } @@ -1690,6 +1902,7 @@ cma_ib_id_from_event(struct ib_cm_id *cm_id, id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev); err: rcu_read_unlock(); + mutex_unlock(&lock); if (IS_ERR(id_priv) && *net_dev) { dev_put(*net_dev); *net_dev = NULL; @@ -1710,28 +1923,36 @@ static void cma_cancel_route(struct rdma_id_private *id_priv) } } -static void cma_cancel_listens(struct rdma_id_private *id_priv) +static void _cma_cancel_listens(struct rdma_id_private *id_priv) { struct rdma_id_private *dev_id_priv; + lockdep_assert_held(&lock); + /* * Remove from listen_any_list to prevent added devices from spawning * additional listen requests. */ - mutex_lock(&lock); - list_del(&id_priv->list); + list_del_init(&id_priv->listen_any_item); while (!list_empty(&id_priv->listen_list)) { - dev_id_priv = list_entry(id_priv->listen_list.next, - struct rdma_id_private, listen_list); + dev_id_priv = + list_first_entry(&id_priv->listen_list, + struct rdma_id_private, listen_item); /* sync with device removal to avoid duplicate destruction */ - list_del_init(&dev_id_priv->list); - list_del(&dev_id_priv->listen_list); + list_del_init(&dev_id_priv->device_item); + list_del_init(&dev_id_priv->listen_item); mutex_unlock(&lock); rdma_destroy_id(&dev_id_priv->id); mutex_lock(&lock); } +} + +static void cma_cancel_listens(struct rdma_id_private *id_priv) +{ + mutex_lock(&lock); + _cma_cancel_listens(id_priv); mutex_unlock(&lock); } @@ -1740,6 +1961,14 @@ static void cma_cancel_operation(struct rdma_id_private *id_priv, { switch (state) { case RDMA_CM_ADDR_QUERY: + /* + * We can avoid doing the rdma_addr_cancel() based on state, + * only RDMA_CM_ADDR_QUERY has a work that could still execute. + * Notice that the addr_handler work could still be exiting + * outside this state, however due to the interaction with the + * handler_mutex the work is guaranteed not to touch id_priv + * during exit. 
+ */ rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; case RDMA_CM_ROUTE_QUERY: @@ -1771,19 +2000,39 @@ static void cma_release_port(struct rdma_id_private *id_priv) mutex_unlock(&lock); } -static void cma_leave_roce_mc_group(struct rdma_id_private *id_priv, - struct cma_multicast *mc) +static void destroy_mc(struct rdma_id_private *id_priv, + struct cma_multicast *mc) { - struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; - struct net_device *ndev = NULL; + bool send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); - if (dev_addr->bound_dev_if) - ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); - if (ndev) { - cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, false); + if (rdma_cap_ib_mcast(id_priv->id.device, id_priv->id.port_num)) + ib_sa_free_multicast(mc->sa_mc); + + if (rdma_protocol_roce(id_priv->id.device, id_priv->id.port_num)) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(dev_addr->net, + dev_addr->bound_dev_if); + if (ndev && !send_only) { + enum ib_gid_type gid_type; + union ib_gid mgid; + + gid_type = id_priv->cma_dev->default_gid_type + [id_priv->id.port_num - + rdma_start_port( + id_priv->cma_dev->device)]; + cma_iboe_set_mgid((struct sockaddr *)&mc->addr, &mgid, + gid_type); + cma_igmp_send(ndev, &mgid, false); + } dev_put(ndev); + + cancel_work_sync(&mc->iboe_join.work); } - kref_put(&mc->mcref, release_mc); + kfree(mc); } static void cma_leave_mc_groups(struct rdma_id_private *id_priv) @@ -1791,36 +2040,20 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) struct cma_multicast *mc; while (!list_empty(&id_priv->mc_list)) { - mc = container_of(id_priv->mc_list.next, - struct cma_multicast, list); + mc = list_first_entry(&id_priv->mc_list, struct cma_multicast, + list); list_del(&mc->list); - if (rdma_cap_ib_mcast(id_priv->cma_dev->device, - id_priv->id.port_num)) { - ib_sa_free_multicast(mc->multicast.ib); - kfree(mc); - } else { - cma_leave_roce_mc_group(id_priv, mc); - } + destroy_mc(id_priv, mc); } } -void rdma_destroy_id(struct rdma_cm_id *id) +static void _destroy_id(struct rdma_id_private *id_priv, + enum rdma_cm_state state) { - struct rdma_id_private *id_priv; - enum rdma_cm_state state; - - id_priv = container_of(id, struct rdma_id_private, id); - state = cma_exch(id_priv, RDMA_CM_DESTROYING); cma_cancel_operation(id_priv, state); - /* - * Wait for any active callback to finish. New callbacks will find - * the id_priv state set to destroying and abort. - */ - mutex_lock(&id_priv->handler_mutex); - mutex_unlock(&id_priv->handler_mutex); - rdma_restrack_del(&id_priv->res); + cma_remove_id_from_tree(id_priv); if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) @@ -1834,20 +2067,56 @@ void rdma_destroy_id(struct rdma_cm_id *id) } cma_release_port(id_priv); - cma_deref_id(id_priv); + cma_id_put(id_priv); wait_for_completion(&id_priv->comp); if (id_priv->internal_id) - cma_deref_id(id_priv->id.context); + cma_id_put(id_priv->id.context); kfree(id_priv->id.route.path_rec); - - if (id_priv->id.route.addr.dev_addr.sgid_attr) - rdma_put_gid_attr(id_priv->id.route.addr.dev_addr.sgid_attr); + kfree(id_priv->id.route.path_rec_inbound); + kfree(id_priv->id.route.path_rec_outbound); + kfree(id_priv->id.route.service_recs); put_net(id_priv->id.route.addr.dev_addr.net); kfree(id_priv); } + +/* + * destroy an ID from within the handler_mutex. 
This ensures that no other + * handlers can start running concurrently. + */ +static void destroy_id_handler_unlock(struct rdma_id_private *id_priv) + __releases(&id_priv->handler_mutex) +{ + enum rdma_cm_state state; + unsigned long flags; + + trace_cm_id_destroy(id_priv); + + /* + * Setting the state to destroyed under the handler mutex provides a + * fence against calling handler callbacks. If this is invoked due to + * the failure of a handler callback then it guarantees that no future + * handlers will be called. + */ + lockdep_assert_held(&id_priv->handler_mutex); + spin_lock_irqsave(&id_priv->lock, flags); + state = id_priv->state; + id_priv->state = RDMA_CM_DESTROYING; + spin_unlock_irqrestore(&id_priv->lock, flags); + mutex_unlock(&id_priv->handler_mutex); + _destroy_id(id_priv, state); +} + +void rdma_destroy_id(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + mutex_lock(&id_priv->handler_mutex); + destroy_id_handler_unlock(id_priv); +} EXPORT_SYMBOL(rdma_destroy_id); static int cma_rep_recv(struct rdma_id_private *id_priv) @@ -1862,6 +2131,7 @@ static int cma_rep_recv(struct rdma_id_private *id_priv) if (ret) goto reject; + trace_cm_send_rtu(id_priv); ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); if (ret) goto reject; @@ -1870,6 +2140,7 @@ static int cma_rep_recv(struct rdma_id_private *id_priv) reject: pr_debug_ratelimited("RDMA CM: CONNECT_ERROR: failed to handle reply. status %d\n", ret); cma_modify_qp_err(id_priv); + trace_cm_send_rej(id_priv); ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); return ret; @@ -1887,6 +2158,22 @@ static void cma_set_rep_event_data(struct rdma_cm_event *event, event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; event->param.conn.srq = rep_data->srq; event->param.conn.qp_num = rep_data->remote_qpn; + + event->ece.vendor_id = rep_data->ece.vendor_id; + event->ece.attr_mod = rep_data->ece.attr_mod; +} + +static int cma_cm_event_handler(struct rdma_id_private *id_priv, + struct rdma_cm_event *event) +{ + int ret; + + lockdep_assert_held(&id_priv->handler_mutex); + + trace_cm_event_handler(id_priv, event); + ret = id_priv->id.event_handler(&id_priv->id, event); + trace_cm_event_done(id_priv, event, ret); + return ret; } static int cma_ib_handler(struct ib_cm_id *cm_id, @@ -1894,13 +2181,15 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event = {}; - int ret = 0; + enum rdma_cm_state state; + int ret; mutex_lock(&id_priv->handler_mutex); + state = READ_ONCE(id_priv->state); if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && - id_priv->state != RDMA_CM_CONNECT) || + state != RDMA_CM_CONNECT) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && - id_priv->state != RDMA_CM_DISCONNECT)) + state != RDMA_CM_DISCONNECT)) goto out; switch (ib_event->event) { @@ -1910,9 +2199,11 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: - if (cma_comp(id_priv, RDMA_CM_CONNECT) && - (id_priv->id.qp_type != IB_QPT_UD)) - ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + if (state == RDMA_CM_CONNECT && - (id_priv->id.qp_type != IB_QPT_UD)) { + trace_cm_prepare_mra(id_priv); + ib_prepare_cm_mra(cm_id); + } if (id_priv->id.qp) { event.status = cma_rep_recv(id_priv); event.event = event.status ? 
RDMA_CM_EVENT_CONNECT_ERROR : @@ -1928,7 +2219,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, event.event = RDMA_CM_EVENT_ESTABLISHED; break; case IB_CM_DREQ_ERROR: - event.status = -ETIMEDOUT; /* fall through */ + event.status = -ETIMEDOUT; + fallthrough; case IB_CM_DREQ_RECEIVED: case IB_CM_DREP_RECEIVED: if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, @@ -1957,18 +2249,16 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, goto out; } - ret = id_priv->id.event_handler(&id_priv->id, &event); + ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; - cma_exch(id_priv, RDMA_CM_DESTROYING); - mutex_unlock(&id_priv->handler_mutex); - rdma_destroy_id(&id_priv->id); + destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); - return ret; + return 0; } static struct rdma_id_private * @@ -1987,28 +2277,29 @@ cma_ib_new_conn_id(const struct rdma_cm_id *listen_id, int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); - id = __rdma_create_id(listen_id->route.addr.dev_addr.net, - listen_id->event_handler, listen_id->context, - listen_id->ps, ib_event->param.req_rcvd.qp_type, - listen_id_priv->res.kern_name); - if (IS_ERR(id)) + id_priv = __rdma_create_id(listen_id->route.addr.dev_addr.net, + listen_id->event_handler, listen_id->context, + listen_id->ps, + ib_event->param.req_rcvd.qp_type, + listen_id_priv); + if (IS_ERR(id_priv)) return NULL; - id_priv = container_of(id, struct rdma_id_private, id); + id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, service_id)) goto err; rt = &id->route; - rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; - rt->path_rec = kmalloc_array(rt->num_paths, sizeof(*rt->path_rec), - GFP_KERNEL); + rt->num_pri_alt_paths = ib_event->param.req_rcvd.alternate_path ? 
2 : 1; + rt->path_rec = kmalloc_array(rt->num_pri_alt_paths, + sizeof(*rt->path_rec), GFP_KERNEL); if (!rt->path_rec) goto err; rt->path_rec[0] = *path; - if (rt->num_paths == 2) + if (rt->num_pri_alt_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (net_dev) { @@ -2048,13 +2339,13 @@ cma_ib_new_udp_id(const struct rdma_cm_id *listen_id, int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); - id = __rdma_create_id(net, listen_id->event_handler, listen_id->context, - listen_id->ps, IB_QPT_UD, - listen_id_priv->res.kern_name); - if (IS_ERR(id)) + id_priv = __rdma_create_id(net, listen_id->event_handler, + listen_id->context, listen_id->ps, IB_QPT_UD, + listen_id_priv); + if (IS_ERR(id_priv)) return NULL; - id_priv = container_of(id, struct rdma_id_private, id); + id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, @@ -2092,6 +2383,9 @@ static void cma_set_req_event_data(struct rdma_cm_event *event, event->param.conn.rnr_retry_count = req_data->rnr_retry_count; event->param.conn.srq = req_data->srq; event->param.conn.qp_num = req_data->remote_qpn; + + event->ece.vendor_id = req_data->ece.vendor_id; + event->ece.attr_mod = req_data->ece.attr_mod; } static int cma_ib_check_req_qp_type(const struct rdma_cm_id *id, @@ -2118,15 +2412,16 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id, if (IS_ERR(listen_id)) return PTR_ERR(listen_id); + trace_cm_req_handler(listen_id, ib_event->event); if (!cma_ib_check_req_qp_type(&listen_id->id, ib_event)) { ret = -EINVAL; goto net_dev_put; } mutex_lock(&listen_id->handler_mutex); - if (listen_id->state != RDMA_CM_LISTEN) { + if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) { ret = -ECONNABORTED; - goto err1; + goto err_unlock; } offset = cma_user_data_offset(listen_id); @@ -2143,57 +2438,41 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id, } if (!conn_id) { ret = -ENOMEM; - goto err1; + goto err_unlock; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); ret = cma_ib_acquire_dev(conn_id, listen_id, &req); - if (ret) - goto err2; + if (ret) { + destroy_id_handler_unlock(conn_id); + goto err_unlock; + } conn_id->cm_id.ib = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_ib_handler; - /* - * Protect against the user destroying conn_id from another thread - * until we're done accessing it. - */ - atomic_inc(&conn_id->refcount); - ret = conn_id->id.event_handler(&conn_id->id, &event); - if (ret) - goto err3; - /* - * Acquire mutex to prevent user executing rdma_destroy_id() - * while we're accessing the cm_id. - */ - mutex_lock(&lock); - if (cma_comp(conn_id, RDMA_CM_CONNECT) && - (conn_id->id.qp_type != IB_QPT_UD)) - ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); - mutex_unlock(&lock); - mutex_unlock(&conn_id->handler_mutex); - mutex_unlock(&listen_id->handler_mutex); - cma_deref_id(conn_id); - if (net_dev) - dev_put(net_dev); - return 0; + ret = cma_cm_event_handler(conn_id, &event); + if (ret) { + /* Destroy the CM ID by returning a non-zero value. */ + conn_id->cm_id.ib = NULL; + mutex_unlock(&listen_id->handler_mutex); + destroy_id_handler_unlock(conn_id); + goto net_dev_put; + } -err3: - cma_deref_id(conn_id); - /* Destroy the CM ID by returning a non-zero value. 
*/ - conn_id->cm_id.ib = NULL; -err2: - cma_exch(conn_id, RDMA_CM_DESTROYING); + if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT && + conn_id->id.qp_type != IB_QPT_UD) { + trace_cm_prepare_mra(cm_id->context); + ib_prepare_cm_mra(cm_id); + } mutex_unlock(&conn_id->handler_mutex); -err1: + +err_unlock: mutex_unlock(&listen_id->handler_mutex); - if (conn_id) - rdma_destroy_id(&conn_id->id); net_dev_put: - if (net_dev) - dev_put(net_dev); + dev_put(net_dev); return ret; } @@ -2243,7 +2522,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; mutex_lock(&id_priv->handler_mutex); - if (id_priv->state != RDMA_CM_CONNECT) + if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (iw_event->event) { @@ -2285,13 +2564,11 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) event.status = iw_event->status; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; - ret = id_priv->id.event_handler(&id_priv->id, &event); + ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.iw = NULL; - cma_exch(id_priv, RDMA_CM_DESTROYING); - mutex_unlock(&id_priv->handler_mutex); - rdma_destroy_id(&id_priv->id); + destroy_id_handler_unlock(id_priv); return ret; } @@ -2303,7 +2580,6 @@ out: static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { - struct rdma_cm_id *new_cm_id; struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event = {}; int ret = -ECONNABORTED; @@ -2319,35 +2595,33 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, listen_id = cm_id->context; mutex_lock(&listen_id->handler_mutex); - if (listen_id->state != RDMA_CM_LISTEN) + if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) goto out; /* Create a new RDMA id for the new IW CM ID */ - new_cm_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, - listen_id->id.event_handler, - listen_id->id.context, - RDMA_PS_TCP, IB_QPT_RC, - listen_id->res.kern_name); - if (IS_ERR(new_cm_id)) { + conn_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, + listen_id->id.event_handler, + listen_id->id.context, RDMA_PS_TCP, + IB_QPT_RC, listen_id); + if (IS_ERR(conn_id)) { ret = -ENOMEM; goto out; } - conn_id = container_of(new_cm_id, struct rdma_id_private, id); mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr); if (ret) { - mutex_unlock(&conn_id->handler_mutex); - rdma_destroy_id(new_cm_id); - goto out; + mutex_unlock(&listen_id->handler_mutex); + destroy_id_handler_unlock(conn_id); + return ret; } ret = cma_iw_acquire_dev(conn_id, listen_id); if (ret) { - mutex_unlock(&conn_id->handler_mutex); - rdma_destroy_id(new_cm_id); - goto out; + mutex_unlock(&listen_id->handler_mutex); + destroy_id_handler_unlock(conn_id); + return ret; } conn_id->cm_id.iw = cm_id; @@ -2357,24 +2631,16 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); - /* - * Protect against the user destroying conn_id from another thread - * until we're done accessing it. 
- */ - atomic_inc(&conn_id->refcount); - ret = conn_id->id.event_handler(&conn_id->id, &event); + ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; - cma_exch(conn_id, RDMA_CM_DESTROYING); - mutex_unlock(&conn_id->handler_mutex); - cma_deref_id(conn_id); - rdma_destroy_id(&conn_id->id); - goto out; + mutex_unlock(&listen_id->handler_mutex); + destroy_id_handler_unlock(conn_id); + return ret; } mutex_unlock(&conn_id->handler_mutex); - cma_deref_id(conn_id); out: mutex_unlock(&listen_id->handler_mutex); @@ -2409,7 +2675,11 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) if (IS_ERR(id)) return PTR_ERR(id); + mutex_lock(&id_priv->qp_mutex); id->tos = id_priv->tos; + id->tos_set = id_priv->tos_set; + mutex_unlock(&id_priv->qp_mutex); + id->afonly = id_priv->afonly; id_priv->cm_id.iw = id; memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), @@ -2430,54 +2700,88 @@ static int cma_listen_handler(struct rdma_cm_id *id, { struct rdma_id_private *id_priv = id->context; + /* Listening IDs are always destroyed on removal */ + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + return -1; + id->context = id_priv->id.context; id->event_handler = id_priv->id.event_handler; + trace_cm_event_handler(id_priv, event); return id_priv->id.event_handler(id, event); } -static void cma_listen_on_dev(struct rdma_id_private *id_priv, - struct cma_device *cma_dev) +static int cma_listen_on_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev, + struct rdma_id_private **to_destroy) { struct rdma_id_private *dev_id_priv; - struct rdma_cm_id *id; struct net *net = id_priv->id.route.addr.dev_addr.net; int ret; - if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) - return; + lockdep_assert_held(&lock); - id = __rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, - id_priv->id.qp_type, id_priv->res.kern_name); - if (IS_ERR(id)) - return; + *to_destroy = NULL; + if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) + return 0; - dev_id_priv = container_of(id, struct rdma_id_private, id); + dev_id_priv = + __rdma_create_id(net, cma_listen_handler, id_priv, + id_priv->id.ps, id_priv->id.qp_type, id_priv); + if (IS_ERR(dev_id_priv)) + return PTR_ERR(dev_id_priv); dev_id_priv->state = RDMA_CM_ADDR_BOUND; memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); _cma_attach_to_dev(dev_id_priv, cma_dev); - list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); - atomic_inc(&id_priv->refcount); + rdma_restrack_add(&dev_id_priv->res); + cma_id_get(id_priv); dev_id_priv->internal_id = 1; dev_id_priv->afonly = id_priv->afonly; + mutex_lock(&id_priv->qp_mutex); + dev_id_priv->tos_set = id_priv->tos_set; + dev_id_priv->tos = id_priv->tos; + mutex_unlock(&id_priv->qp_mutex); - ret = rdma_listen(id, id_priv->backlog); + ret = rdma_listen(&dev_id_priv->id, id_priv->backlog); if (ret) - dev_warn(&cma_dev->device->dev, - "RDMA CMA: cma_listen_on_dev, error %d\n", ret); + goto err_listen; + list_add_tail(&dev_id_priv->listen_item, &id_priv->listen_list); + return 0; +err_listen: + /* Caller must destroy this after releasing lock */ + *to_destroy = dev_id_priv; + dev_warn(&cma_dev->device->dev, "RDMA CMA: %s, error %d\n", __func__, ret); + return ret; } -static void cma_listen_on_all(struct rdma_id_private *id_priv) +static int cma_listen_on_all(struct rdma_id_private *id_priv) { + struct rdma_id_private *to_destroy; struct 
cma_device *cma_dev; + int ret; mutex_lock(&lock); - list_add_tail(&id_priv->list, &listen_any_list); - list_for_each_entry(cma_dev, &dev_list, list) - cma_listen_on_dev(id_priv, cma_dev); + list_add_tail(&id_priv->listen_any_item, &listen_any_list); + list_for_each_entry(cma_dev, &dev_list, list) { + ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); + if (ret) { + /* Prevent racing with cma_process_remove() */ + if (to_destroy) + list_del_init(&to_destroy->device_item); + goto err_listen; + } + } mutex_unlock(&lock); + return 0; + +err_listen: + _cma_cancel_listens(id_priv); + mutex_unlock(&lock); + if (to_destroy) + rdma_destroy_id(&to_destroy->id); + return ret; } void rdma_set_service_type(struct rdma_cm_id *id, int tos) @@ -2485,31 +2789,154 @@ void rdma_set_service_type(struct rdma_cm_id *id, int tos) struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); + mutex_lock(&id_priv->qp_mutex); id_priv->tos = (u8) tos; id_priv->tos_set = true; + mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_set_service_type); +/** + * rdma_set_ack_timeout() - Set the ack timeout of QP associated + * with a connection identifier. + * @id: Communication identifier to associated with service type. + * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec. + * + * This function should be called before rdma_connect() on active side, + * and on passive side before rdma_accept(). It is applicable to primary + * path only. The timeout will affect the local side of the QP, it is not + * negotiated with remote side and zero disables the timer. In case it is + * set before rdma_resolve_route, the value will also be used to determine + * PacketLifeTime for RoCE. + * + * Return: 0 for success + */ +int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout) +{ + struct rdma_id_private *id_priv; + + if (id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_INI) + return -EINVAL; + + id_priv = container_of(id, struct rdma_id_private, id); + mutex_lock(&id_priv->qp_mutex); + id_priv->timeout = timeout; + id_priv->timeout_set = true; + mutex_unlock(&id_priv->qp_mutex); + + return 0; +} +EXPORT_SYMBOL(rdma_set_ack_timeout); + +/** + * rdma_set_min_rnr_timer() - Set the minimum RNR Retry timer of the + * QP associated with a connection identifier. + * @id: Communication identifier to associated with service type. + * @min_rnr_timer: 5-bit value encoded as Table 45: "Encoding for RNR NAK + * Timer Field" in the IBTA specification. + * + * This function should be called before rdma_connect() on active + * side, and on passive side before rdma_accept(). The timer value + * will be associated with the local QP. When it receives a send it is + * not read to handle, typically if the receive queue is empty, an RNR + * Retry NAK is returned to the requester with the min_rnr_timer + * encoded. The requester will then wait at least the time specified + * in the NAK before retrying. The default is zero, which translates + * to a minimum RNR Timer value of 655 ms. 
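
Illustrative sketch, not part of this patch: a kernel ULP holding an RC rdma_cm_id could set both of the knobs documented above before calling rdma_connect() or rdma_accept(). The helper name and the encoded values below are invented examples, not recommended settings.

    #include <rdma/rdma_cm.h>

    /* Example only: tune ACK timeout and minimum RNR NAK timer on an RC id. */
    static int tune_rc_timeouts(struct rdma_cm_id *id)
    {
        int ret;

        /* local ACK timeout = 4.096 us * 2^14, roughly 67 ms per retry */
        ret = rdma_set_ack_timeout(id, 14);
        if (ret)
            return ret;

        /* 5-bit IBTA encoding; 0 keeps the 655 ms default described above */
        return rdma_set_min_rnr_timer(id, 0);
    }
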
+ * + * Return: 0 for success + */ +int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer) +{ + struct rdma_id_private *id_priv; + + /* It is a five-bit value */ + if (min_rnr_timer & 0xe0) + return -EINVAL; + + if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT)) + return -EINVAL; + + id_priv = container_of(id, struct rdma_id_private, id); + mutex_lock(&id_priv->qp_mutex); + id_priv->min_rnr_timer = min_rnr_timer; + id_priv->min_rnr_timer_set = true; + mutex_unlock(&id_priv->qp_mutex); + + return 0; +} +EXPORT_SYMBOL(rdma_set_min_rnr_timer); + +static int route_set_path_rec_inbound(struct cma_work *work, + struct sa_path_rec *path_rec) +{ + struct rdma_route *route = &work->id->id.route; + + if (!route->path_rec_inbound) { + route->path_rec_inbound = + kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL); + if (!route->path_rec_inbound) + return -ENOMEM; + } + + *route->path_rec_inbound = *path_rec; + return 0; +} + +static int route_set_path_rec_outbound(struct cma_work *work, + struct sa_path_rec *path_rec) +{ + struct rdma_route *route = &work->id->id.route; + + if (!route->path_rec_outbound) { + route->path_rec_outbound = + kzalloc(sizeof(*route->path_rec_outbound), GFP_KERNEL); + if (!route->path_rec_outbound) + return -ENOMEM; + } + + *route->path_rec_outbound = *path_rec; + return 0; +} + static void cma_query_handler(int status, struct sa_path_rec *path_rec, - void *context) + unsigned int num_prs, void *context) { struct cma_work *work = context; struct rdma_route *route; + int i; route = &work->id->id.route; - if (!status) { - route->num_paths = 1; - *route->path_rec = *path_rec; - } else { - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ADDR_RESOLVED; - work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; - work->event.status = status; - pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n", - status); + if (status) + goto fail; + + for (i = 0; i < num_prs; i++) { + if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP)) + *route->path_rec = path_rec[i]; + else if (path_rec[i].flags & IB_PATH_INBOUND) + status = route_set_path_rec_inbound(work, &path_rec[i]); + else if (path_rec[i].flags & IB_PATH_OUTBOUND) + status = route_set_path_rec_outbound(work, + &path_rec[i]); + else + status = -EINVAL; + + if (status) + goto fail; } + route->num_pri_alt_paths = 1; + queue_work(cma_wq, &work->work); + return; + +fail: + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; + work->event.status = status; + pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n", + status); queue_work(cma_wq, &work->work); } @@ -2566,49 +2993,54 @@ static int cma_query_ib_route(struct rdma_id_private *id_priv, return (id_priv->query_id < 0) ? 
id_priv->query_id : 0; } -static void cma_work_handler(struct work_struct *_work) +static void cma_iboe_join_work_handler(struct work_struct *work) { - struct cma_work *work = container_of(_work, struct cma_work, work); - struct rdma_id_private *id_priv = work->id; - int destroy = 0; + struct cma_multicast *mc = + container_of(work, struct cma_multicast, iboe_join.work); + struct rdma_cm_event *event = &mc->iboe_join.event; + struct rdma_id_private *id_priv = mc->id_priv; + int ret; mutex_lock(&id_priv->handler_mutex); - if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) - goto out; + if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || + READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) + goto out_unlock; - if (id_priv->id.event_handler(&id_priv->id, &work->event)) { - cma_exch(id_priv, RDMA_CM_DESTROYING); - destroy = 1; - } -out: + ret = cma_cm_event_handler(id_priv, event); + WARN_ON(ret); + +out_unlock: mutex_unlock(&id_priv->handler_mutex); - cma_deref_id(id_priv); - if (destroy) - rdma_destroy_id(&id_priv->id); - kfree(work); + if (event->event == RDMA_CM_EVENT_MULTICAST_JOIN) + rdma_destroy_ah_attr(&event->param.ud.ah_attr); } -static void cma_ndev_work_handler(struct work_struct *_work) +static void cma_work_handler(struct work_struct *_work) { - struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work); + struct cma_work *work = container_of(_work, struct cma_work, work); struct rdma_id_private *id_priv = work->id; - int destroy = 0; mutex_lock(&id_priv->handler_mutex); - if (id_priv->state == RDMA_CM_DESTROYING || - id_priv->state == RDMA_CM_DEVICE_REMOVAL) - goto out; + if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || + READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) + goto out_unlock; + if (work->old_state != 0 || work->new_state != 0) { + if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) + goto out_unlock; + } - if (id_priv->id.event_handler(&id_priv->id, &work->event)) { - cma_exch(id_priv, RDMA_CM_DESTROYING); - destroy = 1; + if (cma_cm_event_handler(id_priv, &work->event)) { + cma_id_put(id_priv); + destroy_id_handler_unlock(id_priv); + goto out_free; } -out: +out_unlock: mutex_unlock(&id_priv->handler_mutex); - cma_deref_id(id_priv); - if (destroy) - rdma_destroy_id(&id_priv->id); + cma_id_put(id_priv); +out_free: + if (work->event.event == RDMA_CM_EVENT_MULTICAST_JOIN) + rdma_destroy_ah_attr(&work->event.param.ud.ah_attr); kfree(work); } @@ -2622,14 +3054,19 @@ static void cma_init_resolve_route_work(struct cma_work *work, work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; } -static void cma_init_resolve_addr_work(struct cma_work *work, - struct rdma_id_private *id_priv) +static void enqueue_resolve_addr_work(struct cma_work *work, + struct rdma_id_private *id_priv) { + /* Balances with cma_id_put() in cma_work_handler */ + cma_id_get(id_priv); + work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + + queue_work(cma_wq, &work->work); } static int cma_resolve_ib_route(struct rdma_id_private *id_priv, @@ -2645,7 +3082,8 @@ static int cma_resolve_ib_route(struct rdma_id_private *id_priv, cma_init_resolve_route_work(work, id_priv); - route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); + if (!route->path_rec) + route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; @@ -2743,7 +3181,7 @@ int 
rdma_set_ib_path(struct rdma_cm_id *id, dev_put(ndev); } - id->route.num_paths = 1; + id->route.num_pri_alt_paths = 1; return 0; err_free: @@ -2768,22 +3206,86 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv) return 0; } -static int iboe_tos_to_sl(struct net_device *ndev, int tos) +static int get_vlan_ndev_tc(struct net_device *vlan_ndev, int prio) { - int prio; struct net_device *dev; - prio = rt_tos2priority(tos); - dev = is_vlan_dev(ndev) ? vlan_dev_real_dev(ndev) : ndev; + dev = vlan_dev_real_dev(vlan_ndev); if (dev->num_tc) return netdev_get_prio_tc_map(dev, prio); -#if IS_ENABLED(CONFIG_VLAN_8021Q) + return (vlan_dev_get_egress_qos_mask(vlan_ndev, prio) & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; +} + +struct iboe_prio_tc_map { + int input_prio; + int output_tc; + bool found; +}; + +static int get_lower_vlan_dev_tc(struct net_device *dev, + struct netdev_nested_priv *priv) +{ + struct iboe_prio_tc_map *map = (struct iboe_prio_tc_map *)priv->data; + + if (is_vlan_dev(dev)) + map->output_tc = get_vlan_ndev_tc(dev, map->input_prio); + else if (dev->num_tc) + map->output_tc = netdev_get_prio_tc_map(dev, map->input_prio); + else + map->output_tc = 0; + /* We are interested only in first level VLAN device, so always + * return 1 to stop iterating over next level devices. + */ + map->found = true; + return 1; +} + +static int iboe_tos_to_sl(struct net_device *ndev, int tos) +{ + struct iboe_prio_tc_map prio_tc_map = {}; + int prio = rt_tos2priority(tos); + struct netdev_nested_priv priv; + + /* If VLAN device, get it directly from the VLAN netdev */ if (is_vlan_dev(ndev)) - return (vlan_dev_get_egress_qos_mask(ndev, prio) & - VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; -#endif - return 0; + return get_vlan_ndev_tc(ndev, prio); + + prio_tc_map.input_prio = prio; + priv.data = (void *)&prio_tc_map; + rcu_read_lock(); + netdev_walk_all_lower_dev_rcu(ndev, + get_lower_vlan_dev_tc, + &priv); + rcu_read_unlock(); + /* If map is found from lower device, use it; Otherwise + * continue with the current netdevice to get priority to tc map. + */ + if (prio_tc_map.found) + return prio_tc_map.output_tc; + else if (ndev->num_tc) + return netdev_get_prio_tc_map(ndev, prio); + else + return 0; +} + +static __be32 cma_get_roce_udp_flow_label(struct rdma_id_private *id_priv) +{ + struct sockaddr_in6 *addr6; + u16 dport, sport; + u32 hash, fl; + + addr6 = (struct sockaddr_in6 *)cma_src_addr(id_priv); + fl = be32_to_cpu(addr6->sin6_flowinfo) & IB_GRH_FLOWLABEL_MASK; + if ((cma_family(id_priv) != AF_INET6) || !fl) { + dport = be16_to_cpu(cma_port(cma_dst_addr(id_priv))); + sport = be16_to_cpu(cma_port(cma_src_addr(id_priv))); + hash = (u32)sport * 31 + dport; + fl = hash & IB_GRH_FLOWLABEL_MASK; + } + + return cpu_to_be32(fl); } static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) @@ -2796,8 +3298,11 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; - u8 tos = id_priv->tos_set ? id_priv->tos : default_roce_tos; + u8 tos; + mutex_lock(&id_priv->qp_mutex); + tos = id_priv->tos_set ? 
id_priv->tos : default_roce_tos; + mutex_unlock(&id_priv->qp_mutex); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) @@ -2809,7 +3314,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) goto err1; } - route->num_paths = 1; + route->num_pri_alt_paths = 1; ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { @@ -2834,15 +3339,33 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->path_rec->traffic_class = tos; route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; - route->path_rec->rate = iboe_get_rate(ndev); + route->path_rec->rate = IB_RATE_PORT_CURRENT; dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; - route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; + /* In case ACK timeout is set, use this value to calculate + * PacketLifeTime. As per IBTA 12.7.34, + * local ACK timeout = (2 * PacketLifeTime + Local CA’s ACK delay). + * Assuming a negligible local ACK delay, we can use + * PacketLifeTime = local ACK timeout/2 + * as a reasonable approximation for RoCE networks. + */ + mutex_lock(&id_priv->qp_mutex); + if (id_priv->timeout_set && id_priv->timeout) + route->path_rec->packet_life_time = id_priv->timeout - 1; + else + route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; + mutex_unlock(&id_priv->qp_mutex); + if (!route->path_rec->mtu) { ret = -EINVAL; goto err2; } + if (rdma_protocol_roce_udp_encap(id_priv->id.device, + id_priv->id.port_num)) + route->path_rec->flow_label = + cma_get_roce_udp_flow_label(id_priv); + cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); @@ -2851,6 +3374,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) err2: kfree(route->path_rec); route->path_rec = NULL; + route->num_pri_alt_paths = 0; err1: kfree(work); return ret; @@ -2859,17 +3383,28 @@ err1: int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) { struct rdma_id_private *id_priv; + enum rdma_cm_state state; int ret; + if (!timeout_ms) + return -EINVAL; + id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) + state = id_priv->state; + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, + RDMA_CM_ROUTE_QUERY) && + !cma_comp_exch(id_priv, RDMA_CM_ADDRINFO_RESOLVED, + RDMA_CM_ROUTE_QUERY)) return -EINVAL; - atomic_inc(&id_priv->refcount); + cma_id_get(id_priv); if (rdma_cap_ib_sa(id->device, id->port_num)) ret = cma_resolve_ib_route(id_priv, timeout_ms); - else if (rdma_protocol_roce(id->device, id->port_num)) + else if (rdma_protocol_roce(id->device, id->port_num)) { ret = cma_resolve_iboe_route(id_priv); + if (!ret) + cma_add_id_to_tree(id_priv); + } else if (rdma_protocol_iwarp(id->device, id->port_num)) ret = cma_resolve_iw_route(id_priv); else @@ -2880,8 +3415,8 @@ int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) return 0; err: - cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); - cma_deref_id(id_priv); + cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, state); + cma_id_put(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_route); @@ -2908,9 +3443,9 @@ static int cma_bind_loopback(struct rdma_id_private *id_priv) struct cma_device *cma_dev, *cur_dev; union ib_gid gid; enum ib_port_state port_state; + unsigned int p; u16 pkey; int ret; - u8 p; cma_dev = NULL; mutex_lock(&lock); @@ -2922,7 +3457,7 @@ static int cma_bind_loopback(struct rdma_id_private *id_priv) if (!cma_dev) cma_dev = cur_dev; - 
for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { + rdma_for_each_port (cur_dev->device, p) { if (!ib_get_cached_port_state(cur_dev->device, p, &port_state) && port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; @@ -2955,6 +3490,7 @@ port_found: ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); + rdma_restrack_add(&id_priv->res); cma_set_loopback(cma_src_addr(id_priv)); out: mutex_unlock(&lock); @@ -2966,23 +3502,35 @@ static void addr_handler(int status, struct sockaddr *src_addr, { struct rdma_id_private *id_priv = context; struct rdma_cm_event event = {}; + struct sockaddr *addr; + struct sockaddr_storage old_addr; mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_RESOLVED)) goto out; - memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr)); + /* + * Store the previous src address, so that if we fail to acquire + * matching rdma device, old address can be restored back, which helps + * to cancel the cma listen operation correctly. + */ + addr = cma_src_addr(id_priv); + memcpy(&old_addr, addr, rdma_addr_size(addr)); + memcpy(addr, src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) { status = cma_acquire_dev_by_src_ip(id_priv); if (status) pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n", status); - } else { + rdma_restrack_add(&id_priv->res); + } else if (status) { pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. status %d\n", status); } if (status) { + memcpy(addr, &old_addr, + rdma_addr_size((struct sockaddr *)&old_addr)); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ADDR_BOUND)) goto out; @@ -2991,10 +3539,8 @@ static void addr_handler(int status, struct sockaddr *src_addr, } else event.event = RDMA_CM_EVENT_ADDR_RESOLVED; - if (id_priv->id.event_handler(&id_priv->id, &event)) { - cma_exch(id_priv, RDMA_CM_DESTROYING); - mutex_unlock(&id_priv->handler_mutex); - rdma_destroy_id(&id_priv->id); + if (cma_cm_event_handler(id_priv, &event)) { + destroy_id_handler_unlock(id_priv); return; } out: @@ -3020,8 +3566,7 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv) rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); - cma_init_resolve_addr_work(work, id_priv); - queue_work(cma_wq, &work->work); + enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); @@ -3046,77 +3591,13 @@ static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); - cma_init_resolve_addr_work(work, id_priv); - queue_work(cma_wq, &work->work); + enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); return ret; } -static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - const struct sockaddr *dst_addr) -{ - if (!src_addr || !src_addr->sa_family) { - src_addr = (struct sockaddr *) &id->route.addr.src_addr; - src_addr->sa_family = dst_addr->sa_family; - if (IS_ENABLED(CONFIG_IPV6) && - dst_addr->sa_family == AF_INET6) { - struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr; - struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr; - src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; - if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - id->route.addr.dev_addr.bound_dev_if = 
dst_addr6->sin6_scope_id; - } else if (dst_addr->sa_family == AF_IB) { - ((struct sockaddr_ib *) src_addr)->sib_pkey = - ((struct sockaddr_ib *) dst_addr)->sib_pkey; - } - } - return rdma_bind_addr(id, src_addr); -} - -int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - const struct sockaddr *dst_addr, unsigned long timeout_ms) -{ - struct rdma_id_private *id_priv; - int ret; - - id_priv = container_of(id, struct rdma_id_private, id); - if (id_priv->state == RDMA_CM_IDLE) { - ret = cma_bind_addr(id, src_addr, dst_addr); - if (ret) - return ret; - } - - if (cma_family(id_priv) != dst_addr->sa_family) - return -EINVAL; - - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) - return -EINVAL; - - memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); - if (cma_any_addr(dst_addr)) { - ret = cma_resolve_loopback(id_priv); - } else { - if (dst_addr->sa_family == AF_IB) { - ret = cma_resolve_ib_addr(id_priv); - } else { - ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, - &id->route.addr.dev_addr, - timeout_ms, addr_handler, - false, id_priv); - } - } - if (ret) - goto err; - - return 0; -err: - cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); - return ret; -} -EXPORT_SYMBOL(rdma_resolve_addr); - int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) { struct rdma_id_private *id_priv; @@ -3125,7 +3606,8 @@ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); - if (reuse || id_priv->state == RDMA_CM_IDLE) { + if ((reuse && id_priv->state != RDMA_CM_LISTEN) || + id_priv->state == RDMA_CM_IDLE) { id_priv->reuseaddr = reuse; ret = 0; } else { @@ -3164,6 +3646,8 @@ static void cma_bind_port(struct rdma_bind_list *bind_list, u64 sid, mask; __be16 port; + lockdep_assert_held(&lock); + addr = cma_src_addr(id_priv); port = htons(bind_list->port); @@ -3192,6 +3676,8 @@ static int cma_alloc_port(enum rdma_ucm_port_space ps, struct rdma_bind_list *bind_list; int ret; + lockdep_assert_held(&lock); + bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; @@ -3202,7 +3688,7 @@ static int cma_alloc_port(enum rdma_ucm_port_space ps, goto err; bind_list->ps = ps; - bind_list->port = (unsigned short)ret; + bind_list->port = snum; cma_bind_port(bind_list, id_priv); return 0; err: @@ -3218,6 +3704,8 @@ static int cma_port_is_unique(struct rdma_bind_list *bind_list, struct sockaddr *saddr = cma_src_addr(id_priv); __be16 dport = cma_port(daddr); + lockdep_assert_held(&lock); + hlist_for_each_entry(cur_id, &bind_list->owners, node) { struct sockaddr *cur_daddr = cma_dst_addr(cur_id); struct sockaddr *cur_saddr = cma_src_addr(cur_id); @@ -3257,9 +3745,11 @@ static int cma_alloc_any_port(enum rdma_ucm_port_space ps, unsigned int rover; struct net *net = id_priv->id.route.addr.dev_addr.net; + lockdep_assert_held(&lock); + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rover = prandom_u32() % remaining + low; + rover = get_random_u32_inclusive(low, remaining + low - 1); retry: if (last_used_port != rover) { struct rdma_bind_list *bind_list; @@ -3304,13 +3794,14 @@ static int cma_check_port(struct rdma_bind_list *bind_list, struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; + lockdep_assert_held(&lock); + addr = cma_src_addr(id_priv); hlist_for_each_entry(cur_id, &bind_list->owners, node) { if (id_priv == cur_id) continue; - if ((cur_id->state != RDMA_CM_LISTEN) && reuseaddr && - 
cur_id->reuseaddr) + if (reuseaddr && cur_id->reuseaddr) continue; cur_addr = cma_src_addr(cur_id); @@ -3334,6 +3825,8 @@ static int cma_use_port(enum rdma_ucm_port_space ps, unsigned short snum; int ret; + lockdep_assert_held(&lock); + snum = ntohs(cma_port(cma_src_addr(id_priv))); if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; @@ -3349,18 +3842,6 @@ static int cma_use_port(enum rdma_ucm_port_space ps, return ret; } -static int cma_bind_listen(struct rdma_id_private *id_priv) -{ - struct rdma_bind_list *bind_list = id_priv->bind_list; - int ret = 0; - - mutex_lock(&lock); - if (bind_list->owners.first->next) - ret = cma_check_port(bind_list, id_priv, 0); - mutex_unlock(&lock); - return ret; -} - static enum rdma_ucm_port_space cma_select_inet_ps(struct rdma_id_private *id_priv) { @@ -3454,28 +3935,41 @@ static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, int rdma_listen(struct rdma_cm_id *id, int backlog) { - struct rdma_id_private *id_priv; + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); int ret; - id_priv = container_of(id, struct rdma_id_private, id); - if (id_priv->state == RDMA_CM_IDLE) { - id->route.addr.src_addr.ss_family = AF_INET; - ret = rdma_bind_addr(id, cma_src_addr(id_priv)); + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) { + struct sockaddr_in any_in = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + }; + + /* For a well behaved ULP state will be RDMA_CM_IDLE */ + ret = rdma_bind_addr(id, (struct sockaddr *)&any_in); if (ret) return ret; + if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, + RDMA_CM_LISTEN))) + return -EINVAL; } - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) - return -EINVAL; - + /* + * Once the ID reaches RDMA_CM_LISTEN it is not allowed to be reusable + * any more, and has to be unique in the bind list. + */ if (id_priv->reuseaddr) { - ret = cma_bind_listen(id_priv); + mutex_lock(&lock); + ret = cma_check_port(id_priv->bind_list, id_priv, 0); + if (!ret) + id_priv->reuseaddr = 0; + mutex_unlock(&lock); if (ret) goto err; } id_priv->backlog = backlog; - if (id->device) { + if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id->device, 1)) { ret = cma_ib_listen(id_priv); if (ret) @@ -3488,38 +3982,44 @@ int rdma_listen(struct rdma_cm_id *id, int backlog) ret = -ENOSYS; goto err; } - } else - cma_listen_on_all(id_priv); + } else { + ret = cma_listen_on_all(id_priv); + if (ret) + goto err; + } return 0; err: id_priv->backlog = 0; + /* + * All the failure paths that lead here will not allow the req_handler's + * to have run. 
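
As a sketch of the listen path this hunk reworks (none of the following is in the patch; the handler argument and port number are made up): a kernel ULP binds an address and then listens, and with this change a failed rdma_listen() unwinds the id back to the bound state itself.

    #include <linux/err.h>
    #include <linux/in.h>
    #include <net/net_namespace.h>
    #include <rdma/rdma_cm.h>

    /* Example listener bring-up; cm_handler and port 12345 are hypothetical. */
    static struct rdma_cm_id *start_listener(rdma_cm_event_handler cm_handler)
    {
        struct sockaddr_in sin = {
            .sin_family = AF_INET,
            .sin_addr.s_addr = htonl(INADDR_ANY),
            .sin_port = htons(12345),
        };
        struct rdma_cm_id *id;
        int ret;

        id = rdma_create_id(&init_net, cm_handler, NULL, RDMA_PS_TCP,
                            IB_QPT_RC);
        if (IS_ERR(id))
            return id;

        ret = rdma_bind_addr(id, (struct sockaddr *)&sin);
        if (!ret)
            ret = rdma_listen(id, 128);
        if (ret) {
            rdma_destroy_id(id);
            return ERR_PTR(ret);
        }
        return id;
    }
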
+ */ cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_listen); -int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) +static int rdma_bind_addr_dst(struct rdma_id_private *id_priv, + struct sockaddr *addr, const struct sockaddr *daddr) { - struct rdma_id_private *id_priv; + struct sockaddr *id_daddr; int ret; - struct sockaddr *daddr; if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && addr->sa_family != AF_IB) return -EAFNOSUPPORT; - id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) return -EINVAL; - ret = cma_check_linklocal(&id->route.addr.dev_addr, addr); + ret = cma_check_linklocal(&id_priv->id.route.addr.dev_addr, addr); if (ret) goto err1; memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); if (!cma_any_addr(addr)) { - ret = cma_translate_addr(addr, &id->route.addr.dev_addr); + ret = cma_translate_addr(addr, &id_priv->id.route.addr.dev_addr); if (ret) goto err1; @@ -3539,22 +4039,148 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) } #endif } - daddr = cma_dst_addr(id_priv); - daddr->sa_family = addr->sa_family; + id_daddr = cma_dst_addr(id_priv); + if (daddr != id_daddr) + memcpy(id_daddr, daddr, rdma_addr_size(addr)); + id_daddr->sa_family = addr->sa_family; ret = cma_get_port(id_priv); if (ret) goto err2; + if (!cma_any_addr(addr)) + rdma_restrack_add(&id_priv->res); return 0; err2: - rdma_restrack_del(&id_priv->res); if (id_priv->cma_dev) cma_release_dev(id_priv); err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; } + +static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + const struct sockaddr *dst_addr) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + struct sockaddr_storage zero_sock = {}; + + if (src_addr && src_addr->sa_family) + return rdma_bind_addr_dst(id_priv, src_addr, dst_addr); + + /* + * When the src_addr is not specified, automatically supply an any addr + */ + zero_sock.ss_family = dst_addr->sa_family; + if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) { + struct sockaddr_in6 *src_addr6 = + (struct sockaddr_in6 *)&zero_sock; + struct sockaddr_in6 *dst_addr6 = + (struct sockaddr_in6 *)dst_addr; + + src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; + if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + id->route.addr.dev_addr.bound_dev_if = + dst_addr6->sin6_scope_id; + } else if (dst_addr->sa_family == AF_IB) { + ((struct sockaddr_ib *)&zero_sock)->sib_pkey = + ((struct sockaddr_ib *)dst_addr)->sib_pkey; + } + return rdma_bind_addr_dst(id_priv, (struct sockaddr *)&zero_sock, dst_addr); +} + +/* + * If required, resolve the source address for bind and leave the id_priv in + * state RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior + * calls made by ULP, a previously bound ID will not be re-bound and src_addr is + * ignored. 
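
A minimal sketch of the active-side flow this helper serves (illustrative only; the function names and 5000 ms timeouts are invented): the ULP may pass a NULL source address and let the CM bind a wildcard source, then kick off route resolution from the ADDR_RESOLVED event.

    #include <rdma/rdma_cm.h>

    /* Example only: start address resolution with an unspecified source. */
    static int start_resolution(struct rdma_cm_id *id, struct sockaddr *dst)
    {
        /* src_addr == NULL lets the CM supply a wildcard source address */
        return rdma_resolve_addr(id, NULL, dst, 5000);
    }

    static int example_cm_handler(struct rdma_cm_id *id,
                                  struct rdma_cm_event *event)
    {
        switch (event->event) {
        case RDMA_CM_EVENT_ADDR_RESOLVED:
            /* a non-zero return here asks the CM to destroy the id */
            return rdma_resolve_route(id, 5000);
        default:
            return 0;
        }
    }
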
+ */ +static int resolve_prepare_src(struct rdma_id_private *id_priv, + struct sockaddr *src_addr, + const struct sockaddr *dst_addr) +{ + int ret; + + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { + /* For a well behaved ULP state will be RDMA_CM_IDLE */ + ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr); + if (ret) + return ret; + if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, + RDMA_CM_ADDR_QUERY))) + return -EINVAL; + + } else { + memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); + } + + if (cma_family(id_priv) != dst_addr->sa_family) { + ret = -EINVAL; + goto err_state; + } + return 0; + +err_state: + cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); + return ret; +} + +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + const struct sockaddr *dst_addr, unsigned long timeout_ms) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + int ret; + + ret = resolve_prepare_src(id_priv, src_addr, dst_addr); + if (ret) + return ret; + + if (cma_any_addr(dst_addr)) { + ret = cma_resolve_loopback(id_priv); + } else { + if (dst_addr->sa_family == AF_IB) { + ret = cma_resolve_ib_addr(id_priv); + } else { + /* + * The FSM can return back to RDMA_CM_ADDR_BOUND after + * rdma_resolve_ip() is called, eg through the error + * path in addr_handler(). If this happens the existing + * request must be canceled before issuing a new one. + * Since canceling a request is a bit slow and this + * oddball path is rare, keep track once a request has + * been issued. The track turns out to be a permanent + * state since this is the only cancel as it is + * immediately before rdma_resolve_ip(). + */ + if (id_priv->used_resolve_ip) + rdma_addr_cancel(&id->route.addr.dev_addr); + else + id_priv->used_resolve_ip = 1; + ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, + &id->route.addr.dev_addr, + timeout_ms, addr_handler, + false, id_priv); + } + } + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_addr); + +int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + return rdma_bind_addr_dst(id_priv, addr, cma_dst_addr(id_priv)); +} EXPORT_SYMBOL(rdma_bind_addr); static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) @@ -3594,10 +4220,10 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, struct rdma_cm_event event = {}; const struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; - int ret = 0; + int ret; mutex_lock(&id_priv->handler_mutex); - if (id_priv->state != RDMA_CM_CONNECT) + if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (ib_event->event) { @@ -3638,20 +4264,18 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, goto out; } - ret = id_priv->id.event_handler(&id_priv->id, &event); + ret = cma_cm_event_handler(id_priv, &event); rdma_destroy_ah_attr(&event.param.ud.ah_attr); if (ret) { /* Destroy the CM ID by returning a non-zero value. 
*/ id_priv->cm_id.ib = NULL; - cma_exch(id_priv, RDMA_CM_DESTROYING); - mutex_unlock(&id_priv->handler_mutex); - rdma_destroy_id(&id_priv->id); + destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); - return ret; + return 0; } static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, @@ -3665,8 +4289,7 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); - req.private_data_len = offset + conn_param->private_data_len; - if (req.private_data_len < conn_param->private_data_len) + if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) return -EINVAL; if (req.private_data_len) { @@ -3702,6 +4325,7 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; + trace_cm_send_sidr_req(id_priv); ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); @@ -3724,8 +4348,7 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); - req.private_data_len = offset + conn_param->private_data_len; - if (req.private_data_len < conn_param->private_data_len) + if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) return -EINVAL; if (req.private_data_len) { @@ -3756,7 +4379,9 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, } req.primary_path = &route->path_rec[0]; - if (route->num_paths == 2) + req.primary_path_inbound = route->path_rec_inbound; + req.primary_path_outbound = route->path_rec_outbound; + if (route->num_pri_alt_paths == 2) req.alternate_path = &route->path_rec[1]; req.ppath_sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; @@ -3774,7 +4399,10 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 1 : 0; + req.ece.vendor_id = id_priv->ece.vendor_id; + req.ece.attr_mod = id_priv->ece.attr_mod; + trace_cm_send_req(id_priv); ret = ib_send_cm_req(id_priv->cm_id.ib, &req); out: if (ret && !IS_ERR(id)) { @@ -3797,7 +4425,11 @@ static int cma_connect_iw(struct rdma_id_private *id_priv, if (IS_ERR(cm_id)) return PTR_ERR(cm_id); + mutex_lock(&id_priv->qp_mutex); cm_id->tos = id_priv->tos; + cm_id->tos_set = id_priv->tos_set; + mutex_unlock(&id_priv->qp_mutex); + id_priv->cm_id.iw = cm_id; memcpy(&cm_id->local_addr, cma_src_addr(id_priv), @@ -3828,12 +4460,23 @@ out: return ret; } -int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +/** + * rdma_connect_locked - Initiate an active connection request. + * @id: Connection identifier to connect. + * @conn_param: Connection information used for connected QPs. + * + * Same as rdma_connect() but can only be called from the + * RDMA_CM_EVENT_ROUTE_RESOLVED handler callback. 
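
A short sketch of how that reads in a ULP event handler (illustrative; the helper name and rdma_conn_param contents are invented, and QP setup is elided):

    #include <rdma/rdma_cm.h>

    /* Example only: connect from inside the ROUTE_RESOLVED callback. */
    static int example_on_route_resolved(struct rdma_cm_id *id)
    {
        struct rdma_conn_param param = {
            .retry_count = 7,
            .rnr_retry_count = 7,
        };

        /*
         * The event dispatcher already holds handler_mutex here, so a plain
         * rdma_connect() would deadlock; the _locked variant must be used.
         */
        return rdma_connect_locked(id, &param);
    }
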
+ */ +int rdma_connect_locked(struct rdma_cm_id *id, + struct rdma_conn_param *conn_param) { - struct rdma_id_private *id_priv; + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); int ret; - id_priv = container_of(id, struct rdma_id_private, id); + lockdep_assert_held(&id_priv->handler_mutex); + if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) return -EINVAL; @@ -3847,20 +4490,66 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); - } else if (rdma_cap_iw_cm(id->device, id->port_num)) + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = cma_connect_iw(id_priv, conn_param); - else + } else { ret = -ENOSYS; + } if (ret) - goto err; - + goto err_state; return 0; -err: +err_state: cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); return ret; } +EXPORT_SYMBOL(rdma_connect_locked); + +/** + * rdma_connect - Initiate an active connection request. + * @id: Connection identifier to connect. + * @conn_param: Connection information used for connected QPs. + * + * Users must have resolved a route for the rdma_cm_id to connect with by having + * called rdma_resolve_route before calling this routine. + * + * This call will either connect to a remote QP or obtain remote QP information + * for unconnected rdma_cm_id's. The actual operation is based on the + * rdma_cm_id's port space. + */ +int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + int ret; + + mutex_lock(&id_priv->handler_mutex); + ret = rdma_connect_locked(id, conn_param); + mutex_unlock(&id_priv->handler_mutex); + return ret; +} EXPORT_SYMBOL(rdma_connect); +/** + * rdma_connect_ece - Initiate an active connection request with ECE data. + * @id: Connection identifier to connect. + * @conn_param: Connection information used for connected QPs. + * @ece: ECE parameters + * + * See rdma_connect() explanation. + */ +int rdma_connect_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + struct rdma_ucm_ece *ece) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + id_priv->ece.vendor_id = ece->vendor_id; + id_priv->ece.attr_mod = ece->attr_mod; + + return rdma_connect(id, conn_param); +} +EXPORT_SYMBOL(rdma_connect_ece); + static int cma_accept_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { @@ -3886,7 +4575,10 @@ static int cma_accept_ib(struct rdma_id_private *id_priv, rep.flow_control = conn_param->flow_control; rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); rep.srq = id_priv->srq ? 
1 : 0; + rep.ece.vendor_id = id_priv->ece.vendor_id; + rep.ece.attr_mod = id_priv->ece.attr_mod; + trace_cm_send_rep(id_priv); ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); out: return ret; @@ -3909,9 +4601,9 @@ static int cma_accept_iw(struct rdma_id_private *id_priv, iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; - if (id_priv->id.qp) { + if (id_priv->id.qp) iw_param.qpn = id_priv->qp_num; - } else + else iw_param.qpn = conn_param->qp_num; return iw_cm_accept(id_priv->cm_id.iw, &iw_param); @@ -3927,29 +4619,53 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv, memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { - ret = cma_set_qkey(id_priv, qkey); + if (qkey) + ret = cma_set_qkey(id_priv, qkey); + else + ret = cma_set_default_qkey(id_priv); if (ret) return ret; rep.qp_num = id_priv->qp_num; rep.qkey = id_priv->qkey; + + rep.ece.vendor_id = id_priv->ece.vendor_id; + rep.ece.attr_mod = id_priv->ece.attr_mod; } + rep.private_data = private_data; rep.private_data_len = private_data_len; + trace_cm_send_sidr_rep(id_priv); return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } -int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, - const char *caller) +/** + * rdma_accept - Called to accept a connection request or response. + * @id: Connection identifier associated with the request. + * @conn_param: Information needed to establish the connection. This must be + * provided if accepting a connection request. If accepting a connection + * response, this parameter must be NULL. + * + * Typically, this routine is only called by the listener to accept a connection + * request. It must also be called on the active side of a connection if the + * user is performing their own QP transitions. + * + * In the case of error, a reject message is sent to the remote side and the + * state of the qp associated with the id is modified to error, such that any + * previously posted receive buffers would be flushed. + * + * This function is for use by kernel ULPs and must be called from under the + * handler callback. 
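
For illustration only (not part of the patch; the admission check and names are invented, and QP creation is elided): a listener's handler might accept or reject a CONNECT_REQUEST as below, using the reject reason that rdma_reject() now takes explicitly. A ULP that defers the decision to another context would instead wrap the later rdma_accept()/rdma_reject() call in rdma_lock_handler()/rdma_unlock_handler().

    #include <linux/errno.h>
    #include <rdma/ib_cm.h>
    #include <rdma/rdma_cm.h>

    /* Example CONNECT_REQUEST handling; the private-data check is made up. */
    static int example_on_connect_request(struct rdma_cm_id *new_id,
                                          struct rdma_cm_event *event)
    {
        struct rdma_conn_param param = {
            .responder_resources = 1,
            .initiator_depth = 1,
        };

        if (!event->param.conn.private_data_len) {
            /* the IB CM reject reason is now passed in by the caller */
            rdma_reject(new_id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED);
            /* a non-zero return tells the CM to destroy the new id */
            return -ECONNREFUSED;
        }

        /* QP creation against new_id->device would normally happen here */
        return rdma_accept(new_id, &param);
    }
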
+ */ +int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { - struct rdma_id_private *id_priv; + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); int ret; - id_priv = container_of(id, struct rdma_id_private, id); - - rdma_restrack_set_task(&id_priv->res, caller); + lockdep_assert_held(&id_priv->handler_mutex); - if (!cma_comp(id_priv, RDMA_CM_CONNECT)) + if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) return -EINVAL; if (!id->qp && conn_param) { @@ -3973,21 +4689,52 @@ int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, else ret = cma_rep_recv(id_priv); } - } else if (rdma_cap_iw_cm(id->device, id->port_num)) + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = cma_accept_iw(id_priv, conn_param); - else + } else { ret = -ENOSYS; - + } if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); - rdma_reject(id, NULL, 0); + rdma_reject(id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); return ret; } -EXPORT_SYMBOL(__rdma_accept); +EXPORT_SYMBOL(rdma_accept); + +int rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + struct rdma_ucm_ece *ece) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + id_priv->ece.vendor_id = ece->vendor_id; + id_priv->ece.attr_mod = ece->attr_mod; + + return rdma_accept(id, conn_param); +} +EXPORT_SYMBOL(rdma_accept_ece); + +void rdma_lock_handler(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + mutex_lock(&id_priv->handler_mutex); +} +EXPORT_SYMBOL(rdma_lock_handler); + +void rdma_unlock_handler(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + mutex_unlock(&id_priv->handler_mutex); +} +EXPORT_SYMBOL(rdma_unlock_handler); int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { @@ -4011,7 +4758,7 @@ int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) EXPORT_SYMBOL(rdma_notify); int rdma_reject(struct rdma_cm_id *id, const void *private_data, - u8 private_data_len) + u8 private_data_len, u8 reason) { struct rdma_id_private *id_priv; int ret; @@ -4021,18 +4768,20 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data, return -EINVAL; if (rdma_cap_ib_cm(id->device, id->port_num)) { - if (id->qp_type == IB_QPT_UD) + if (id->qp_type == IB_QPT_UD) { ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0, private_data, private_data_len); - else - ret = ib_send_cm_rej(id_priv->cm_id.ib, - IB_CM_REJ_CONSUMER_DEFINED, NULL, - 0, private_data, private_data_len); + } else { + trace_cm_send_rej(id_priv); + ret = ib_send_cm_rej(id_priv->cm_id.ib, reason, NULL, 0, + private_data, private_data_len); + } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_reject(id_priv->cm_id.iw, private_data, private_data_len); - } else + } else { ret = -ENOSYS; + } return ret; } @@ -4052,8 +4801,13 @@ int rdma_disconnect(struct rdma_cm_id *id) if (ret) goto out; /* Initiate or respond to a disconnect. 
*/ - if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) - ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0); + trace_cm_disconnect(id_priv); + if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) { + if (!ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0)) + trace_cm_sent_drep(id_priv); + } else { + trace_cm_sent_dreq(id_priv); + } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); } else @@ -4064,70 +4818,68 @@ out: } EXPORT_SYMBOL(rdma_disconnect); +static void cma_make_mc_event(int status, struct rdma_id_private *id_priv, + struct ib_sa_multicast *multicast, + struct rdma_cm_event *event, + struct cma_multicast *mc) +{ + struct rdma_dev_addr *dev_addr; + enum ib_gid_type gid_type; + struct net_device *ndev; + + if (status) + pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n", + status); + + event->status = status; + event->param.ud.private_data = mc->context; + if (status) { + event->event = RDMA_CM_EVENT_MULTICAST_ERROR; + return; + } + + dev_addr = &id_priv->id.route.addr.dev_addr; + ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); + gid_type = + id_priv->cma_dev + ->default_gid_type[id_priv->id.port_num - + rdma_start_port( + id_priv->cma_dev->device)]; + + event->event = RDMA_CM_EVENT_MULTICAST_JOIN; + if (ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, + &multicast->rec, ndev, gid_type, + &event->param.ud.ah_attr)) { + event->event = RDMA_CM_EVENT_MULTICAST_ERROR; + goto out; + } + + event->param.ud.qp_num = 0xFFFFFF; + event->param.ud.qkey = id_priv->qkey; + +out: + dev_put(ndev); +} + static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) { - struct rdma_id_private *id_priv; struct cma_multicast *mc = multicast->context; + struct rdma_id_private *id_priv = mc->id_priv; struct rdma_cm_event event = {}; int ret = 0; - id_priv = mc->id_priv; mutex_lock(&id_priv->handler_mutex); - if (id_priv->state != RDMA_CM_ADDR_BOUND && - id_priv->state != RDMA_CM_ADDR_RESOLVED) + if (READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL || + READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING) goto out; - if (!status) - status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); - else - pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n", - status); - mutex_lock(&id_priv->qp_mutex); - if (!status && id_priv->id.qp) { - status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, - be16_to_cpu(multicast->rec.mlid)); - if (status) - pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to attach QP. 
status %d\n", - status); + ret = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); + if (!ret) { + cma_make_mc_event(status, id_priv, multicast, &event, mc); + ret = cma_cm_event_handler(id_priv, &event); } - mutex_unlock(&id_priv->qp_mutex); - - event.status = status; - event.param.ud.private_data = mc->context; - if (!status) { - struct rdma_dev_addr *dev_addr = - &id_priv->id.route.addr.dev_addr; - struct net_device *ndev = - dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); - enum ib_gid_type gid_type = - id_priv->cma_dev->default_gid_type[id_priv->id.port_num - - rdma_start_port(id_priv->cma_dev->device)]; - - event.event = RDMA_CM_EVENT_MULTICAST_JOIN; - ret = ib_init_ah_from_mcmember(id_priv->id.device, - id_priv->id.port_num, - &multicast->rec, - ndev, gid_type, - &event.param.ud.ah_attr); - if (ret) - event.event = RDMA_CM_EVENT_MULTICAST_ERROR; - - event.param.ud.qp_num = 0xFFFFFF; - event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); - if (ndev) - dev_put(ndev); - } else - event.event = RDMA_CM_EVENT_MULTICAST_ERROR; - - ret = id_priv->id.event_handler(&id_priv->id, &event); - rdma_destroy_ah_attr(&event.param.ud.ah_attr); - if (ret) { - cma_exch(id_priv, RDMA_CM_DESTROYING); - mutex_unlock(&id_priv->handler_mutex); - rdma_destroy_id(&id_priv->id); - return 0; - } + WARN_ON(ret); out: mutex_unlock(&id_priv->handler_mutex); @@ -4178,9 +4930,11 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, if (ret) return ret; - ret = cma_set_qkey(id_priv, 0); - if (ret) - return ret; + if (!id_priv->qkey) { + ret = cma_set_default_qkey(id_priv); + if (ret) + return ret; + } cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); rec.qkey = cpu_to_be32(id_priv->qkey); @@ -4188,17 +4942,6 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); rec.join_state = mc->join_state; - if ((rec.join_state == BIT(SENDONLY_FULLMEMBER_JOIN)) && - (!ib_sa_sendonly_fullmem_support(&sa_client, - id_priv->id.device, - id_priv->id.port_num))) { - dev_warn( - &id_priv->id.device->dev, - "RDMA CM: port %u Unable to multicast join: SM doesn't support Send Only Full Member option\n", - id_priv->id.port_num); - return -EOPNOTSUPP; - } - comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL | @@ -4212,23 +4955,10 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, IB_SA_MCMEMBER_REC_MTU | IB_SA_MCMEMBER_REC_HOP_LIMIT; - mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device, - id_priv->id.port_num, &rec, - comp_mask, GFP_KERNEL, - cma_ib_mc_handler, mc); - return PTR_ERR_OR_ZERO(mc->multicast.ib); -} - -static void iboe_mcast_work_handler(struct work_struct *work) -{ - struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work); - struct cma_multicast *mc = mw->mc; - struct ib_sa_multicast *m = mc->multicast.ib; - - mc->multicast.ib->context = mc; - cma_ib_mc_handler(0, m); - kref_put(&mc->mcref, release_mc); - kfree(mw); + mc->sa_mc = ib_sa_join_multicast(&sa_client, id_priv->id.device, + id_priv->id.port_num, &rec, comp_mask, + GFP_KERNEL, cma_ib_mc_handler, mc); + return PTR_ERR_OR_ZERO(mc->sa_mc); } static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, @@ -4263,52 +4993,38 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, 
struct cma_multicast *mc) { - struct iboe_mcast_work *work; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; + struct ib_sa_multicast ib = {}; enum ib_gid_type gid_type; bool send_only; send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); - if (cma_zero_addr((struct sockaddr *)&mc->addr)) + if (cma_zero_addr(addr)) return -EINVAL; - work = kzalloc(sizeof *work, GFP_KERNEL); - if (!work) - return -ENOMEM; - - mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL); - if (!mc->multicast.ib) { - err = -ENOMEM; - goto out1; - } - gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; - cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid, gid_type); - - mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff); - if (id_priv->id.ps == RDMA_PS_UDP) - mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); + cma_iboe_set_mgid(addr, &ib.rec.mgid, gid_type); + ib.rec.pkey = cpu_to_be16(0xffff); if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); - if (!ndev) { - err = -ENODEV; - goto out2; - } - mc->multicast.ib->rec.rate = iboe_get_rate(ndev); - mc->multicast.ib->rec.hop_limit = 1; - mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); + if (!ndev) + return -ENODEV; + + ib.rec.rate = IB_RATE_PORT_CURRENT; + ib.rec.hop_limit = 1; + ib.rec.mtu = iboe_get_mtu(ndev->mtu); if (addr->sa_family == AF_INET) { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { - mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; + ib.rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; if (!send_only) { - err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, + err = cma_igmp_send(ndev, &ib.rec.mgid, true); } } @@ -4317,44 +5033,41 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, err = -ENOTSUPP; } dev_put(ndev); - if (err || !mc->multicast.ib->rec.mtu) { - if (!err) - err = -EINVAL; - goto out2; - } - rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, - &mc->multicast.ib->rec.port_gid); - work->id = id_priv; - work->mc = mc; - INIT_WORK(&work->work, iboe_mcast_work_handler); - kref_get(&mc->mcref); - queue_work(cma_wq, &work->work); + if (err || !ib.rec.mtu) + return err ?: -EINVAL; - return 0; + if (!id_priv->qkey) + cma_set_default_qkey(id_priv); -out2: - kfree(mc->multicast.ib); -out1: - kfree(work); - return err; + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &ib.rec.port_gid); + INIT_WORK(&mc->iboe_join.work, cma_iboe_join_work_handler); + cma_make_mc_event(0, id_priv, &ib, &mc->iboe_join.event, mc); + queue_work(cma_wq, &mc->iboe_join.work); + return 0; } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, u8 join_state, void *context) { - struct rdma_id_private *id_priv; + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); struct cma_multicast *mc; int ret; - if (!id->device) + /* Not supported for kernel QPs */ + if (WARN_ON(id->qp)) return -EINVAL; - id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp(id_priv, RDMA_CM_ADDR_BOUND) && - !cma_comp(id_priv, RDMA_CM_ADDR_RESOLVED)) + /* ULP is calling this wrong. 
*/ + if (!id->device || (READ_ONCE(id_priv->state) != RDMA_CM_ADDR_BOUND && + READ_ONCE(id_priv->state) != RDMA_CM_ADDR_RESOLVED)) return -EINVAL; - mc = kmalloc(sizeof *mc, GFP_KERNEL); + if (id_priv->id.qp_type != IB_QPT_UD) + return -EINVAL; + + mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return -ENOMEM; @@ -4364,7 +5077,6 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, mc->join_state = join_state; if (rdma_protocol_roce(id->device, id->port_num)) { - kref_init(&mc->mcref); ret = cma_iboe_join_multicast(id_priv, mc); if (ret) goto out_err; @@ -4396,25 +5108,14 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irq(&id_priv->lock); list_for_each_entry(mc, &id_priv->mc_list, list) { - if (!memcmp(&mc->addr, addr, rdma_addr_size(addr))) { - list_del(&mc->list); - spin_unlock_irq(&id_priv->lock); - - if (id->qp) - ib_detach_mcast(id->qp, - &mc->multicast.ib->rec.mgid, - be16_to_cpu(mc->multicast.ib->rec.mlid)); - - BUG_ON(id_priv->cma_dev->device != id->device); - - if (rdma_cap_ib_mcast(id->device, id->port_num)) { - ib_sa_free_multicast(mc->multicast.ib); - kfree(mc); - } else if (rdma_protocol_roce(id->device, id->port_num)) { - cma_leave_roce_mc_group(id_priv, mc); - } - return; - } + if (memcmp(&mc->addr, addr, rdma_addr_size(addr)) != 0) + continue; + list_del(&mc->list); + spin_unlock_irq(&id_priv->lock); + + WARN_ON(id_priv->cma_dev->device != id->device); + destroy_mc(id_priv, mc); + return; } spin_unlock_irq(&id_priv->lock); } @@ -4423,7 +5124,7 @@ EXPORT_SYMBOL(rdma_leave_multicast); static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr; - struct cma_ndev_work *work; + struct cma_work *work; dev_addr = &id_priv->id.route.addr.dev_addr; @@ -4436,10 +5137,10 @@ static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id if (!work) return -ENOMEM; - INIT_WORK(&work->work, cma_ndev_work_handler); + INIT_WORK(&work->work, cma_work_handler); work->id = id_priv; work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; - atomic_inc(&id_priv->refcount); + cma_id_get(id_priv); queue_work(cma_wq, &work->work); } @@ -4462,7 +5163,7 @@ static int cma_netdev_callback(struct notifier_block *self, unsigned long event, mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) - list_for_each_entry(id_priv, &cma_dev->id_list, list) { + list_for_each_entry(id_priv, &cma_dev->id_list, device_item) { ret = cma_netdev_change(ndev, id_priv); if (ret) goto out; @@ -4473,35 +5174,194 @@ out: return ret; } +static void cma_netevent_work_handler(struct work_struct *_work) +{ + struct rdma_id_private *id_priv = + container_of(_work, struct rdma_id_private, id.net_work); + struct rdma_cm_event event = {}; + + mutex_lock(&id_priv->handler_mutex); + + if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || + READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) + goto out_unlock; + + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = -ETIMEDOUT; + + if (cma_cm_event_handler(id_priv, &event)) { + __acquire(&id_priv->handler_mutex); + id_priv->cm_id.ib = NULL; + cma_id_put(id_priv); + destroy_id_handler_unlock(id_priv); + return; + } + +out_unlock: + mutex_unlock(&id_priv->handler_mutex); + cma_id_put(id_priv); +} + +static int cma_netevent_callback(struct notifier_block *self, + unsigned long event, void *ctx) +{ + struct id_table_entry *ips_node = NULL; + struct rdma_id_private *current_id; + struct 
neighbour *neigh = ctx; + unsigned long flags; + + if (event != NETEVENT_NEIGH_UPDATE) + return NOTIFY_DONE; + + spin_lock_irqsave(&id_table_lock, flags); + if (neigh->tbl->family == AF_INET6) { + struct sockaddr_in6 neigh_sock_6; + + neigh_sock_6.sin6_family = AF_INET6; + neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key; + ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, + (struct sockaddr *)&neigh_sock_6); + } else if (neigh->tbl->family == AF_INET) { + struct sockaddr_in neigh_sock_4; + + neigh_sock_4.sin_family = AF_INET; + neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key); + ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, + (struct sockaddr *)&neigh_sock_4); + } else + goto out; + + if (!ips_node) + goto out; + + list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) { + if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr, + neigh->ha, ETH_ALEN)) + continue; + cma_id_get(current_id); + if (!queue_work(cma_wq, &current_id->id.net_work)) + cma_id_put(current_id); + } +out: + spin_unlock_irqrestore(&id_table_lock, flags); + return NOTIFY_DONE; +} + static struct notifier_block cma_nb = { .notifier_call = cma_netdev_callback }; -static void cma_add_one(struct ib_device *device) +static struct notifier_block cma_netevent_cb = { + .notifier_call = cma_netevent_callback +}; + +static void cma_send_device_removal_put(struct rdma_id_private *id_priv) +{ + struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL }; + enum rdma_cm_state state; + unsigned long flags; + + mutex_lock(&id_priv->handler_mutex); + /* Record that we want to remove the device */ + spin_lock_irqsave(&id_priv->lock, flags); + state = id_priv->state; + if (state == RDMA_CM_DESTROYING || state == RDMA_CM_DEVICE_REMOVAL) { + spin_unlock_irqrestore(&id_priv->lock, flags); + mutex_unlock(&id_priv->handler_mutex); + cma_id_put(id_priv); + return; + } + id_priv->state = RDMA_CM_DEVICE_REMOVAL; + spin_unlock_irqrestore(&id_priv->lock, flags); + + if (cma_cm_event_handler(id_priv, &event)) { + /* + * At this point the ULP promises it won't call + * rdma_destroy_id() concurrently + */ + cma_id_put(id_priv); + mutex_unlock(&id_priv->handler_mutex); + trace_cm_id_destroy(id_priv); + _destroy_id(id_priv, state); + return; + } + mutex_unlock(&id_priv->handler_mutex); + + /* + * If this races with destroy then the thread that first assigns state + * to a destroying does the cancel. 
+ */ + cma_cancel_operation(id_priv, state); + cma_id_put(id_priv); +} + +static void cma_process_remove(struct cma_device *cma_dev) { + mutex_lock(&lock); + while (!list_empty(&cma_dev->id_list)) { + struct rdma_id_private *id_priv = list_first_entry( + &cma_dev->id_list, struct rdma_id_private, device_item); + + list_del_init(&id_priv->listen_item); + list_del_init(&id_priv->device_item); + cma_id_get(id_priv); + mutex_unlock(&lock); + + cma_send_device_removal_put(id_priv); + + mutex_lock(&lock); + } + mutex_unlock(&lock); + + cma_dev_put(cma_dev); + wait_for_completion(&cma_dev->comp); +} + +static bool cma_supported(struct ib_device *device) +{ + u32 i; + + rdma_for_each_port(device, i) { + if (rdma_cap_ib_cm(device, i) || rdma_cap_iw_cm(device, i)) + return true; + } + return false; +} + +static int cma_add_one(struct ib_device *device) +{ + struct rdma_id_private *to_destroy; struct cma_device *cma_dev; struct rdma_id_private *id_priv; - unsigned int i; unsigned long supported_gids = 0; + int ret; + u32 i; + + if (!cma_supported(device)) + return -EOPNOTSUPP; - cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL); + cma_dev = kmalloc(sizeof(*cma_dev), GFP_KERNEL); if (!cma_dev) - return; + return -ENOMEM; cma_dev->device = device; cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_gid_type), GFP_KERNEL); - if (!cma_dev->default_gid_type) + if (!cma_dev->default_gid_type) { + ret = -ENOMEM; goto free_cma_dev; + } cma_dev->default_roce_tos = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_roce_tos), GFP_KERNEL); - if (!cma_dev->default_roce_tos) + if (!cma_dev->default_roce_tos) { + ret = -ENOMEM; goto free_gid_type; + } - for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + rdma_for_each_port (device, i) { supported_gids = roce_gid_type_mask_support(device, i); WARN_ON(!supported_gids); if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE)) @@ -4514,86 +5374,42 @@ static void cma_add_one(struct ib_device *device) } init_completion(&cma_dev->comp); - atomic_set(&cma_dev->refcount, 1); + refcount_set(&cma_dev->refcount, 1); INIT_LIST_HEAD(&cma_dev->id_list); ib_set_client_data(device, &cma_client, cma_dev); mutex_lock(&lock); list_add_tail(&cma_dev->list, &dev_list); - list_for_each_entry(id_priv, &listen_any_list, list) - cma_listen_on_dev(id_priv, cma_dev); + list_for_each_entry(id_priv, &listen_any_list, listen_any_item) { + ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); + if (ret) + goto free_listen; + } mutex_unlock(&lock); - return; + trace_cm_add_one(device); + return 0; + +free_listen: + list_del(&cma_dev->list); + mutex_unlock(&lock); + /* cma_process_remove() will delete to_destroy */ + cma_process_remove(cma_dev); + kfree(cma_dev->default_roce_tos); free_gid_type: kfree(cma_dev->default_gid_type); free_cma_dev: kfree(cma_dev); - - return; -} - -static int cma_remove_id_dev(struct rdma_id_private *id_priv) -{ - struct rdma_cm_event event = {}; - enum rdma_cm_state state; - int ret = 0; - - /* Record that we want to remove the device */ - state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL); - if (state == RDMA_CM_DESTROYING) - return 0; - - cma_cancel_operation(id_priv, state); - mutex_lock(&id_priv->handler_mutex); - - /* Check for destruction from another callback. 
*/ - if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL)) - goto out; - - event.event = RDMA_CM_EVENT_DEVICE_REMOVAL; - ret = id_priv->id.event_handler(&id_priv->id, &event); -out: - mutex_unlock(&id_priv->handler_mutex); return ret; } -static void cma_process_remove(struct cma_device *cma_dev) -{ - struct rdma_id_private *id_priv; - int ret; - - mutex_lock(&lock); - while (!list_empty(&cma_dev->id_list)) { - id_priv = list_entry(cma_dev->id_list.next, - struct rdma_id_private, list); - - list_del(&id_priv->listen_list); - list_del_init(&id_priv->list); - atomic_inc(&id_priv->refcount); - mutex_unlock(&lock); - - ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv); - cma_deref_id(id_priv); - if (ret) - rdma_destroy_id(&id_priv->id); - - mutex_lock(&lock); - } - mutex_unlock(&lock); - - cma_deref_dev(cma_dev); - wait_for_completion(&cma_dev->comp); -} - static void cma_remove_one(struct ib_device *device, void *client_data) { struct cma_device *cma_dev = client_data; - if (!cma_dev) - return; + trace_cm_remove_one(device); mutex_lock(&lock); list_del(&cma_dev->list); @@ -4605,93 +5421,14 @@ static void cma_remove_one(struct ib_device *device, void *client_data) kfree(cma_dev); } -static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct nlmsghdr *nlh; - struct rdma_cm_id_stats *id_stats; - struct rdma_id_private *id_priv; - struct rdma_cm_id *id = NULL; - struct cma_device *cma_dev; - int i_dev = 0, i_id = 0; - - /* - * We export all of the IDs as a sequence of messages. Each - * ID gets its own netlink message. - */ - mutex_lock(&lock); - - list_for_each_entry(cma_dev, &dev_list, list) { - if (i_dev < cb->args[0]) { - i_dev++; - continue; - } - - i_id = 0; - list_for_each_entry(id_priv, &cma_dev->id_list, list) { - if (i_id < cb->args[1]) { - i_id++; - continue; - } - - id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq, - sizeof *id_stats, RDMA_NL_RDMA_CM, - RDMA_NL_RDMA_CM_ID_STATS, - NLM_F_MULTI); - if (!id_stats) - goto out; - - memset(id_stats, 0, sizeof *id_stats); - id = &id_priv->id; - id_stats->node_type = id->route.addr.dev_addr.dev_type; - id_stats->port_num = id->port_num; - id_stats->bound_dev_if = - id->route.addr.dev_addr.bound_dev_if; - - if (ibnl_put_attr(skb, nlh, - rdma_addr_size(cma_src_addr(id_priv)), - cma_src_addr(id_priv), - RDMA_NL_RDMA_CM_ATTR_SRC_ADDR)) - goto out; - if (ibnl_put_attr(skb, nlh, - rdma_addr_size(cma_dst_addr(id_priv)), - cma_dst_addr(id_priv), - RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) - goto out; - - id_stats->pid = task_pid_vnr(id_priv->res.task); - id_stats->port_space = id->ps; - id_stats->cm_state = id_priv->state; - id_stats->qp_num = id_priv->qp_num; - id_stats->qp_type = id->qp_type; - - i_id++; - nlmsg_end(skb, nlh); - } - - cb->args[1] = 0; - i_dev++; - } - -out: - mutex_unlock(&lock); - cb->args[0] = i_dev; - cb->args[1] = i_id; - - return skb->len; -} - -static const struct rdma_nl_cbs cma_cb_table[RDMA_NL_RDMA_CM_NUM_OPS] = { - [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats}, -}; - static int cma_init_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); - idr_init(&pernet->tcp_ps); - idr_init(&pernet->udp_ps); - idr_init(&pernet->ipoib_ps); - idr_init(&pernet->ib_ps); + xa_init(&pernet->tcp_ps); + xa_init(&pernet->udp_ps); + xa_init(&pernet->ipoib_ps); + xa_init(&pernet->ib_ps); return 0; } @@ -4700,10 +5437,10 @@ static void cma_exit_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); - idr_destroy(&pernet->tcp_ps); - idr_destroy(&pernet->udp_ps); - 
idr_destroy(&pernet->ipoib_ps); - idr_destroy(&pernet->ib_ps); + WARN_ON(!xa_empty(&pernet->tcp_ps)); + WARN_ON(!xa_empty(&pernet->udp_ps)); + WARN_ON(!xa_empty(&pernet->ipoib_ps)); + WARN_ON(!xa_empty(&pernet->ib_ps)); } static struct pernet_operations cma_pernet_operations = { @@ -4717,6 +5454,19 @@ static int __init cma_init(void) { int ret; + /* + * There is a rare lock ordering dependency in cma_netdev_callback() + * that only happens when bonding is enabled. Teach lockdep that rtnl + * must never be nested under lock so it can find these without having + * to test with bonding. + */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + rtnl_lock(); + mutex_lock(&lock); + mutex_unlock(&lock); + rtnl_unlock(); + } + cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM); if (!cma_wq) return -ENOMEM; @@ -4727,19 +5477,25 @@ static int __init cma_init(void) ib_sa_register_client(&sa_client); register_netdevice_notifier(&cma_nb); + register_netevent_notifier(&cma_netevent_cb); ret = ib_register_client(&cma_client); if (ret) goto err; - rdma_nl_register(RDMA_NL_RDMA_CM, cma_cb_table); - cma_configfs_init(); + ret = cma_configfs_init(); + if (ret) + goto err_ib; return 0; +err_ib: + ib_unregister_client(&cma_client); err: + unregister_netevent_notifier(&cma_netevent_cb); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); + unregister_pernet_subsys(&cma_pernet_operations); err_wq: destroy_workqueue(cma_wq); return ret; @@ -4748,15 +5504,139 @@ err_wq: static void __exit cma_cleanup(void) { cma_configfs_exit(); - rdma_nl_unregister(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); + unregister_netevent_notifier(&cma_netevent_cb); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); destroy_workqueue(cma_wq); } -MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_RDMA_CM, 1); - module_init(cma_init); module_exit(cma_cleanup); + +static void cma_query_ib_service_handler(int status, + struct sa_service_rec *recs, + unsigned int num_recs, void *context) +{ + struct cma_work *work = context; + struct rdma_id_private *id_priv = work->id; + struct sockaddr_ib *addr; + + if (status) + goto fail; + + if (!num_recs) { + status = -ENOENT; + goto fail; + } + + if (id_priv->id.route.service_recs) { + status = -EALREADY; + goto fail; + } + + id_priv->id.route.service_recs = + kmalloc_array(num_recs, sizeof(*recs), GFP_KERNEL); + if (!id_priv->id.route.service_recs) { + status = -ENOMEM; + goto fail; + } + + id_priv->id.route.num_service_recs = num_recs; + memcpy(id_priv->id.route.service_recs, recs, sizeof(*recs) * num_recs); + + addr = (struct sockaddr_ib *)&id_priv->id.route.addr.dst_addr; + addr->sib_family = AF_IB; + addr->sib_addr = *(struct ib_addr *)&recs->gid; + addr->sib_pkey = recs->pkey; + addr->sib_sid = recs->id; + rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, + (union ib_gid *)&addr->sib_addr); + ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, + ntohs(addr->sib_pkey)); + + queue_work(cma_wq, &work->work); + return; + +fail: + work->old_state = RDMA_CM_ADDRINFO_QUERY; + work->new_state = RDMA_CM_ADDR_BOUND; + work->event.event = RDMA_CM_EVENT_ADDRINFO_ERROR; + work->event.status = status; + pr_debug_ratelimited( + "RDMA CM: SERVICE_ERROR: failed to query service record. 
status %d\n", + status); + queue_work(cma_wq, &work->work); +} + +static int cma_resolve_ib_service(struct rdma_id_private *id_priv, + struct rdma_ucm_ib_service *ibs) +{ + struct sa_service_rec sr = {}; + ib_sa_comp_mask mask = 0; + struct cma_work *work; + + work = kzalloc(sizeof(*work), GFP_KERNEL); + if (!work) + return -ENOMEM; + + cma_id_get(id_priv); + + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ADDRINFO_QUERY; + work->new_state = RDMA_CM_ADDRINFO_RESOLVED; + work->event.event = RDMA_CM_EVENT_ADDRINFO_RESOLVED; + + if (ibs->flags & RDMA_USER_CM_IB_SERVICE_FLAG_ID) { + sr.id = cpu_to_be64(ibs->service_id); + mask |= IB_SA_SERVICE_REC_SERVICE_ID; + } + if (ibs->flags & RDMA_USER_CM_IB_SERVICE_FLAG_NAME) { + strscpy(sr.name, ibs->service_name, sizeof(sr.name)); + mask |= IB_SA_SERVICE_REC_SERVICE_NAME; + } + + id_priv->query_id = ib_sa_service_rec_get(&sa_client, + id_priv->id.device, + id_priv->id.port_num, + &sr, mask, + 2000, GFP_KERNEL, + cma_query_ib_service_handler, + work, &id_priv->query); + + if (id_priv->query_id < 0) { + cma_id_put(id_priv); + kfree(work); + return id_priv->query_id; + } + + return 0; +} + +int rdma_resolve_ib_service(struct rdma_cm_id *id, + struct rdma_ucm_ib_service *ibs) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!id_priv->cma_dev || + !cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDRINFO_QUERY)) + return -EINVAL; + + if (rdma_cap_ib_sa(id->device, id->port_num)) + ret = cma_resolve_ib_service(id_priv, ibs); + else + ret = -EOPNOTSUPP; + + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, RDMA_CM_ADDRINFO_QUERY, RDMA_CM_ADDR_BOUND); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_ib_service); diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c index 3ec2c415bb70..f2fb2d8a6597 100644 --- a/drivers/infiniband/core/cma_configfs.c +++ b/drivers/infiniband/core/cma_configfs.c @@ -30,7 +30,6 @@ * SOFTWARE. 
*/ -#include <linux/module.h> #include <linux/configfs.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> @@ -43,7 +42,7 @@ struct cma_device; struct cma_dev_group; struct cma_dev_port_group { - unsigned int port_num; + u32 port_num; struct cma_dev_group *cma_dev_group; struct config_group group; }; @@ -94,7 +93,7 @@ static int cma_configfs_params_get(struct config_item *item, static void cma_configfs_params_put(struct cma_device *cma_dev) { - cma_deref_dev(cma_dev); + cma_dev_put(cma_dev); } static ssize_t default_roce_mode_show(struct config_item *item, @@ -115,7 +114,7 @@ static ssize_t default_roce_mode_show(struct config_item *item, if (gid_type < 0) return gid_type; - return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_type)); + return sysfs_emit(buf, "%s\n", ib_cache_gid_type_str(gid_type)); } static ssize_t default_roce_mode_store(struct config_item *item, @@ -123,16 +122,19 @@ static ssize_t default_roce_mode_store(struct config_item *item, { struct cma_device *cma_dev; struct cma_dev_port_group *group; - int gid_type = ib_cache_gid_parse_type_str(buf); + int gid_type; ssize_t ret; - if (gid_type < 0) - return -EINVAL; - ret = cma_configfs_params_get(item, &cma_dev, &group); if (ret) return ret; + gid_type = ib_cache_gid_parse_type_str(buf); + if (gid_type < 0) { + cma_configfs_params_put(cma_dev); + return -EINVAL; + } + ret = cma_set_default_gid_type(cma_dev, group->port_num, gid_type); cma_configfs_params_put(cma_dev); @@ -156,7 +158,7 @@ static ssize_t default_roce_tos_show(struct config_item *item, char *buf) tos = cma_get_default_roce_tos(cma_dev, group->port_num); cma_configfs_params_put(cma_dev); - return sprintf(buf, "%u\n", tos); + return sysfs_emit(buf, "%u\n", tos); } static ssize_t default_roce_tos_store(struct config_item *item, @@ -197,11 +199,10 @@ static const struct config_item_type cma_port_group_type = { static int make_cma_ports(struct cma_dev_group *cma_dev_group, struct cma_device *cma_dev) { - struct ib_device *ibdev; - unsigned int i; - unsigned int ports_num; struct cma_dev_port_group *ports; - int err; + struct ib_device *ibdev; + u32 ports_num; + u32 i; ibdev = cma_get_ib_dev(cma_dev); @@ -212,13 +213,11 @@ static int make_cma_ports(struct cma_dev_group *cma_dev_group, ports = kcalloc(ports_num, sizeof(*cma_dev_group->ports), GFP_KERNEL); - if (!ports) { - err = -ENOMEM; - goto free; - } + if (!ports) + return -ENOMEM; for (i = 0; i < ports_num; i++) { - char port_str[10]; + char port_str[11]; ports[i].port_num = i + 1; snprintf(port_str, sizeof(port_str), "%u", i + 1); @@ -231,12 +230,7 @@ static int make_cma_ports(struct cma_dev_group *cma_dev_group, } cma_dev_group->ports = ports; - return 0; -free: - kfree(ports); - cma_dev_group->ports = NULL; - return err; } static void release_cma_dev(struct config_item *item) @@ -298,7 +292,7 @@ static struct config_group *make_cma_dev(struct config_group *group, goto fail; } - strlcpy(cma_dev_group->name, name, sizeof(cma_dev_group->name)); + strscpy(cma_dev_group->name, name, sizeof(cma_dev_group->name)); config_group_init_type_name(&cma_dev_group->ports_group, "ports", &cma_ports_group_type); @@ -312,18 +306,31 @@ static struct config_group *make_cma_dev(struct config_group *group, configfs_add_default_group(&cma_dev_group->ports_group, &cma_dev_group->device_group); - cma_deref_dev(cma_dev); + cma_dev_put(cma_dev); return &cma_dev_group->device_group; fail: if (cma_dev) - cma_deref_dev(cma_dev); + cma_dev_put(cma_dev); kfree(cma_dev_group); return ERR_PTR(err); } +static void drop_cma_dev(struct 
config_group *cgroup, struct config_item *item) +{ + struct config_group *group = + container_of(item, struct config_group, cg_item); + struct cma_dev_group *cma_dev_group = + container_of(group, struct cma_dev_group, device_group); + + configfs_remove_default_groups(&cma_dev_group->ports_group); + configfs_remove_default_groups(&cma_dev_group->device_group); + config_item_put(item); +} + static struct configfs_group_operations cma_subsys_group_ops = { .make_group = make_cma_dev, + .drop_item = drop_cma_dev, }; static const struct config_item_type cma_subsys_type = { @@ -342,12 +349,18 @@ static struct configfs_subsystem cma_subsys = { int __init cma_configfs_init(void) { + int ret; + config_group_init(&cma_subsys.su_group); mutex_init(&cma_subsys.su_mutex); - return configfs_register_subsystem(&cma_subsys); + ret = configfs_register_subsystem(&cma_subsys); + if (ret) + mutex_destroy(&cma_subsys.su_mutex); + return ret; } void __exit cma_configfs_exit(void) { configfs_unregister_subsystem(&cma_subsys); + mutex_destroy(&cma_subsys.su_mutex); } diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h index cf47c69436a7..c604b601f4d9 100644 --- a/drivers/infiniband/core/cma_priv.h +++ b/drivers/infiniband/core/cma_priv.h @@ -47,7 +47,9 @@ enum rdma_cm_state { RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN, RDMA_CM_DEVICE_REMOVAL, - RDMA_CM_DESTROYING + RDMA_CM_DESTROYING, + RDMA_CM_ADDRINFO_QUERY, + RDMA_CM_ADDRINFO_RESOLVED }; struct rdma_id_private { @@ -55,8 +57,16 @@ struct rdma_id_private { struct rdma_bind_list *bind_list; struct hlist_node node; - struct list_head list; /* listen_any_list or cma_device.list */ - struct list_head listen_list; /* per device listens */ + union { + struct list_head device_item; /* On cma_device->id_list */ + struct list_head listen_any_item; /* On listen_any_list */ + }; + union { + /* On rdma_id_private->listen_list */ + struct list_head listen_item; + struct list_head listen_list; + }; + struct list_head id_list_entry; struct cma_device *cma_dev; struct list_head mc_list; @@ -66,7 +76,7 @@ struct rdma_id_private { struct mutex qp_mutex; struct completion comp; - atomic_t refcount; + refcount_t refcount; struct mutex handler_mutex; int backlog; @@ -84,15 +94,21 @@ struct rdma_id_private { u32 options; u8 srq; u8 tos; - bool tos_set; + u8 tos_set:1; + u8 timeout_set:1; + u8 min_rnr_timer_set:1; u8 reuseaddr; u8 afonly; + u8 timeout; + u8 min_rnr_timer; + u8 used_resolve_ip; enum ib_gid_type gid_type; /* * Internal to RDMA/core, don't use in the drivers */ struct rdma_restrack_entry res; + struct rdma_ucm_ece ece; }; #if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) @@ -109,16 +125,16 @@ static inline void cma_configfs_exit(void) } #endif -void cma_ref_dev(struct cma_device *dev); -void cma_deref_dev(struct cma_device *dev); +void cma_dev_get(struct cma_device *dev); +void cma_dev_put(struct cma_device *dev); typedef bool (*cma_device_filter)(struct ib_device *, void *); struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, void *cookie); -int cma_get_default_gid_type(struct cma_device *dev, unsigned int port); -int cma_set_default_gid_type(struct cma_device *dev, unsigned int port, +int cma_get_default_gid_type(struct cma_device *dev, u32 port); +int cma_set_default_gid_type(struct cma_device *dev, u32 port, enum ib_gid_type default_gid_type); -int cma_get_default_roce_tos(struct cma_device *dev, unsigned int port); -int cma_set_default_roce_tos(struct cma_device *dev, unsigned int port, +int 
cma_get_default_roce_tos(struct cma_device *dev, u32 port); +int cma_set_default_roce_tos(struct cma_device *dev, u32 port, u8 default_roce_tos); struct ib_device *cma_get_ib_dev(struct cma_device *dev); diff --git a/drivers/infiniband/core/cma_trace.c b/drivers/infiniband/core/cma_trace.c new file mode 100644 index 000000000000..b314a281e10e --- /dev/null +++ b/drivers/infiniband/core/cma_trace.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Trace points for the RDMA Connection Manager. + * + * Author: Chuck Lever <chuck.lever@oracle.com> + * + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. + */ + +#define CREATE_TRACE_POINTS + +#include <rdma/rdma_cm.h> +#include <rdma/ib_cm.h> +#include "cma_priv.h" + +#include "cma_trace.h" diff --git a/drivers/infiniband/core/cma_trace.h b/drivers/infiniband/core/cma_trace.h new file mode 100644 index 000000000000..3456d5f3aa47 --- /dev/null +++ b/drivers/infiniband/core/cma_trace.h @@ -0,0 +1,361 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Trace point definitions for the RDMA Connect Manager. + * + * Author: Chuck Lever <chuck.lever@oracle.com> + * + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rdma_cma + +#if !defined(_TRACE_RDMA_CMA_H) || defined(TRACE_HEADER_MULTI_READ) + +#define _TRACE_RDMA_CMA_H + +#include <linux/tracepoint.h> +#include <trace/misc/rdma.h> + + +DECLARE_EVENT_CLASS(cma_fsm_class, + TP_PROTO( + const struct rdma_id_private *id_priv + ), + + TP_ARGS(id_priv), + + TP_STRUCT__entry( + __field(u32, cm_id) + __field(u32, tos) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) + ), + + TP_fast_assign( + __entry->cm_id = id_priv->res.id; + __entry->tos = id_priv->tos; + memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + ), + + TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u", + __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos + ) +); + +#define DEFINE_CMA_FSM_EVENT(name) \ + DEFINE_EVENT(cma_fsm_class, cm_##name, \ + TP_PROTO( \ + const struct rdma_id_private *id_priv \ + ), \ + TP_ARGS(id_priv)) + +DEFINE_CMA_FSM_EVENT(send_rtu); +DEFINE_CMA_FSM_EVENT(send_rej); +DEFINE_CMA_FSM_EVENT(prepare_mra); +DEFINE_CMA_FSM_EVENT(send_sidr_req); +DEFINE_CMA_FSM_EVENT(send_sidr_rep); +DEFINE_CMA_FSM_EVENT(disconnect); +DEFINE_CMA_FSM_EVENT(sent_drep); +DEFINE_CMA_FSM_EVENT(sent_dreq); +DEFINE_CMA_FSM_EVENT(id_destroy); + +TRACE_EVENT(cm_id_attach, + TP_PROTO( + const struct rdma_id_private *id_priv, + const struct ib_device *device + ), + + TP_ARGS(id_priv, device), + + TP_STRUCT__entry( + __field(u32, cm_id) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) + __string(devname, device->name) + ), + + TP_fast_assign( + __entry->cm_id = id_priv->res.id; + memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + __assign_str(devname); + ), + + TP_printk("cm.id=%u src=%pISpc dst=%pISpc device=%s", + __entry->cm_id, __entry->srcaddr, __entry->dstaddr, + __get_str(devname) + ) +); + +DECLARE_EVENT_CLASS(cma_qp_class, + TP_PROTO( + const struct rdma_id_private *id_priv + ), + + TP_ARGS(id_priv), 
+ + TP_STRUCT__entry( + __field(u32, cm_id) + __field(u32, tos) + __field(u32, qp_num) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) + ), + + TP_fast_assign( + __entry->cm_id = id_priv->res.id; + __entry->tos = id_priv->tos; + __entry->qp_num = id_priv->qp_num; + memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + ), + + TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u qp_num=%u", + __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, + __entry->qp_num + ) +); + +#define DEFINE_CMA_QP_EVENT(name) \ + DEFINE_EVENT(cma_qp_class, cm_##name, \ + TP_PROTO( \ + const struct rdma_id_private *id_priv \ + ), \ + TP_ARGS(id_priv)) + +DEFINE_CMA_QP_EVENT(send_req); +DEFINE_CMA_QP_EVENT(send_rep); +DEFINE_CMA_QP_EVENT(qp_destroy); + +/* + * enum ib_wp_type, from include/rdma/ib_verbs.h + */ +#define IB_QP_TYPE_LIST \ + ib_qp_type(SMI) \ + ib_qp_type(GSI) \ + ib_qp_type(RC) \ + ib_qp_type(UC) \ + ib_qp_type(UD) \ + ib_qp_type(RAW_IPV6) \ + ib_qp_type(RAW_ETHERTYPE) \ + ib_qp_type(RAW_PACKET) \ + ib_qp_type(XRC_INI) \ + ib_qp_type_end(XRC_TGT) + +#undef ib_qp_type +#undef ib_qp_type_end + +#define ib_qp_type(x) TRACE_DEFINE_ENUM(IB_QPT_##x); +#define ib_qp_type_end(x) TRACE_DEFINE_ENUM(IB_QPT_##x); + +IB_QP_TYPE_LIST + +#undef ib_qp_type +#undef ib_qp_type_end + +#define ib_qp_type(x) { IB_QPT_##x, #x }, +#define ib_qp_type_end(x) { IB_QPT_##x, #x } + +#define rdma_show_qp_type(x) \ + __print_symbolic(x, IB_QP_TYPE_LIST) + + +TRACE_EVENT(cm_qp_create, + TP_PROTO( + const struct rdma_id_private *id_priv, + const struct ib_pd *pd, + const struct ib_qp_init_attr *qp_init_attr, + int rc + ), + + TP_ARGS(id_priv, pd, qp_init_attr, rc), + + TP_STRUCT__entry( + __field(u32, cm_id) + __field(u32, pd_id) + __field(u32, tos) + __field(u32, qp_num) + __field(u32, send_wr) + __field(u32, recv_wr) + __field(int, rc) + __field(unsigned long, qp_type) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) + ), + + TP_fast_assign( + __entry->cm_id = id_priv->res.id; + __entry->pd_id = pd->res.id; + __entry->tos = id_priv->tos; + __entry->send_wr = qp_init_attr->cap.max_send_wr; + __entry->recv_wr = qp_init_attr->cap.max_recv_wr; + __entry->rc = rc; + if (!rc) { + __entry->qp_num = id_priv->qp_num; + __entry->qp_type = id_priv->id.qp_type; + } else { + __entry->qp_num = 0; + __entry->qp_type = 0; + } + memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + ), + + TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u pd.id=%u qp_type=%s" + " send_wr=%u recv_wr=%u qp_num=%u rc=%d", + __entry->cm_id, __entry->srcaddr, __entry->dstaddr, + __entry->tos, __entry->pd_id, + rdma_show_qp_type(__entry->qp_type), __entry->send_wr, + __entry->recv_wr, __entry->qp_num, __entry->rc + ) +); + +TRACE_EVENT(cm_req_handler, + TP_PROTO( + const struct rdma_id_private *id_priv, + int event + ), + + TP_ARGS(id_priv, event), + + TP_STRUCT__entry( + __field(u32, cm_id) + __field(u32, tos) + __field(unsigned long, event) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) + ), + + TP_fast_assign( + __entry->cm_id = id_priv->res.id; + __entry->tos = 
id_priv->tos; + __entry->event = event; + memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + ), + + TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s (%lu)", + __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, + rdma_show_ib_cm_event(__entry->event), __entry->event + ) +); + +TRACE_EVENT(cm_event_handler, + TP_PROTO( + const struct rdma_id_private *id_priv, + const struct rdma_cm_event *event + ), + + TP_ARGS(id_priv, event), + + TP_STRUCT__entry( + __field(u32, cm_id) + __field(u32, tos) + __field(unsigned long, event) + __field(int, status) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) + ), + + TP_fast_assign( + __entry->cm_id = id_priv->res.id; + __entry->tos = id_priv->tos; + __entry->event = event->event; + __entry->status = event->status; + memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + ), + + TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s (%lu/%d)", + __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, + rdma_show_cm_event(__entry->event), __entry->event, + __entry->status + ) +); + +TRACE_EVENT(cm_event_done, + TP_PROTO( + const struct rdma_id_private *id_priv, + const struct rdma_cm_event *event, + int result + ), + + TP_ARGS(id_priv, event, result), + + TP_STRUCT__entry( + __field(u32, cm_id) + __field(u32, tos) + __field(unsigned long, event) + __field(int, result) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) + ), + + TP_fast_assign( + __entry->cm_id = id_priv->res.id; + __entry->tos = id_priv->tos; + __entry->event = event->event; + __entry->result = result; + memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + ), + + TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s consumer returns %d", + __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, + rdma_show_cm_event(__entry->event), __entry->result + ) +); + +DECLARE_EVENT_CLASS(cma_client_class, + TP_PROTO( + const struct ib_device *device + ), + + TP_ARGS(device), + + TP_STRUCT__entry( + __string(name, device->name) + ), + + TP_fast_assign( + __assign_str(name); + ), + + TP_printk("device name=%s", + __get_str(name) + ) +); + +#define DEFINE_CMA_CLIENT_EVENT(name) \ + DEFINE_EVENT(cma_client_class, cm_##name, \ + TP_PROTO( \ + const struct ib_device *device \ + ), \ + TP_ARGS(device)) + +DEFINE_CMA_CLIENT_EVENT(add_one); +DEFINE_CMA_CLIENT_EVENT(remove_one); + +#endif /* _TRACE_RDMA_CMA_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE cma_trace + +#include <trace/define_trace.h> diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 3cd830d52967..05102769a918 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -36,12 +36,15 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/cgroup_rdma.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> #include <rdma/ib_verbs.h> #include <rdma/opa_addr.h> #include <rdma/ib_mad.h> #include <rdma/restrack.h> #include "mad_priv.h" +#include "restrack.h" /* Total number of ports combined across all struct ib_devices's */ #define RDMA_MAX_PORTS 8192 @@ -54,18 +57,39 @@ struct pkey_index_qp_list { struct list_head qp_list; }; -int ib_device_register_sysfs(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)); -void ib_device_unregister_sysfs(struct ib_device *device); +/** + * struct rdma_dev_net - rdma net namespace metadata for a net + * @nl_sock: Pointer to netlink socket + * @net: Pointer to owner net namespace + * @id: xarray id to identify the net namespace. + */ +struct rdma_dev_net { + struct sock *nl_sock; + possible_net_t net; + u32 id; +}; + +extern const struct attribute_group ib_dev_attr_group; +extern bool ib_devices_shared_netns; +extern unsigned int rdma_dev_net_id; + +static inline struct rdma_dev_net *rdma_net_to_dev_net(struct net *net) +{ + return net_generic(net, rdma_dev_net_id); +} + int ib_device_rename(struct ib_device *ibdev, const char *name); +int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim); -typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, +typedef void (*roce_netdev_callback)(struct ib_device *device, u32 port, struct net_device *idev, void *cookie); -typedef bool (*roce_netdev_filter)(struct ib_device *device, u8 port, +typedef bool (*roce_netdev_filter)(struct ib_device *device, u32 port, struct net_device *idev, void *cookie); +struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, + u32 port); + void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_filter filter, void *filter_cookie, @@ -84,6 +108,15 @@ typedef int (*nldev_callback)(struct ib_device *device, int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, struct netlink_callback *cb); +struct ib_client_nl_info { + struct sk_buff *nl_msg; + struct device *cdev; + u32 port; + u64 abi; +}; +int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, + struct ib_client_nl_info *res); + enum ib_cache_gid_default_mode { IB_CACHE_GID_DEFAULT_MODE_SET, IB_CACHE_GID_DEFAULT_MODE_DELETE @@ -93,31 +126,32 @@ int ib_cache_gid_parse_type_str(const char *buf); const char *ib_cache_gid_type_str(enum ib_gid_type gid_type); -void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, +void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u32 port, struct net_device *ndev, unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode); -int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, +int ib_cache_gid_add(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr); -int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, +int ib_cache_gid_del(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr); -int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, +int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev); int 
roce_gid_mgmt_init(void); void roce_gid_mgmt_cleanup(void); -unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port); +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port); int ib_cache_setup_one(struct ib_device *device); void ib_cache_cleanup_one(struct ib_device *device); void ib_cache_release_one(struct ib_device *device); +void ib_dispatch_event_clients(struct ib_event *event); #ifdef CONFIG_CGROUP_RDMA -int ib_device_register_rdmacg(struct ib_device *device); +void ib_device_register_rdmacg(struct ib_device *device); void ib_device_unregister_rdmacg(struct ib_device *device); int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, @@ -128,21 +162,26 @@ void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index); #else -static inline int ib_device_register_rdmacg(struct ib_device *device) -{ return 0; } +static inline void ib_device_register_rdmacg(struct ib_device *device) +{ +} static inline void ib_device_unregister_rdmacg(struct ib_device *device) -{ } +{ +} static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index) -{ return 0; } +{ + return 0; +} static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index) -{ } +{ +} #endif static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, @@ -160,7 +199,7 @@ void ib_mad_cleanup(void); int ib_sa_init(void); void ib_sa_cleanup(void); -int rdma_nl_init(void); +void rdma_nl_init(void); void rdma_nl_exit(void); int ib_nl_handle_resolve_resp(struct sk_buff *skb, @@ -173,15 +212,15 @@ int ib_nl_handle_ip_res_resp(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); -int ib_get_cached_subnet_prefix(struct ib_device *device, - u8 port_num, - u64 *sn_pfx); +void ib_get_cached_subnet_prefix(struct ib_device *device, + u32 port_num, + u64 *sn_pfx); #ifdef CONFIG_SECURITY_INFINIBAND -void ib_security_destroy_port_pkey_list(struct ib_device *device); +void ib_security_release_port_pkey_list(struct ib_device *device); void ib_security_cache_change(struct ib_device *device, - u8 port_num, + u32 port_num, u64 subnet_prefix); int ib_security_modify_qp(struct ib_qp *qp, @@ -199,13 +238,14 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, enum ib_qp_type qp_type); void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent); int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index); +void ib_mad_agent_security_change(void); #else -static inline void ib_security_destroy_port_pkey_list(struct ib_device *device) +static inline void ib_security_release_port_pkey_list(struct ib_device *device) { } static inline void ib_security_cache_change(struct ib_device *device, - u8 port_num, + u32 port_num, u64 subnet_prefix) { } @@ -264,53 +304,27 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map, { return 0; } + +static inline void ib_mad_agent_security_change(void) +{ +} #endif -struct ib_device *ib_device_get_by_index(u32 ifindex); -void ib_device_put(struct ib_device *device); +struct ib_device *ib_device_get_by_index(const struct net *net, u32 index); + /* RDMA device netlink */ void nldev_init(void); void nldev_exit(void); -static inline struct ib_qp *_ib_create_qp(struct ib_device *dev, - struct ib_pd *pd, - struct ib_qp_init_attr *attr, - struct ib_udata *udata, - struct ib_uobject *uobj) -{ - struct ib_qp 
*qp; - - if (!dev->ops.create_qp) - return ERR_PTR(-EOPNOTSUPP); - - qp = dev->ops.create_qp(pd, attr, udata); - if (IS_ERR(qp)) - return qp; - - qp->device = dev; - qp->pd = pd; - qp->uobject = uobj; - /* - * We don't track XRC QPs for now, because they don't have PD - * and more importantly they are created internaly by driver, - * see mlx5 create_dev_resources() as an example. - */ - if (attr->qp_type < IB_QPT_XRC_INI) { - qp->res.type = RDMA_RESTRACK_QP; - if (uobj) - rdma_restrack_uadd(&qp->res); - else - rdma_restrack_kadd(&qp->res); - } else - qp->res.valid = false; - - return qp; -} +struct ib_qp *ib_create_qp_user(struct ib_device *dev, struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata, + struct ib_uqp_object *uobj, const char *caller); + +void ib_qp_usecnt_inc(struct ib_qp *qp); +void ib_qp_usecnt_dec(struct ib_qp *qp); struct rdma_dev_addr; -int rdma_resolve_ip_route(struct sockaddr *src_addr, - const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr); int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, @@ -324,4 +338,37 @@ int roce_resolve_route_from_path(struct sa_path_rec *rec, const struct ib_gid_attr *attr); struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr); + +void ib_free_port_attrs(struct ib_core_device *coredev); +int ib_setup_port_attrs(struct ib_core_device *coredev); +struct rdma_hw_stats *ib_get_hw_stats_port(struct ib_device *ibdev, u32 port_num); +void ib_device_release_hw_stats(struct hw_stats_device_data *data); +int ib_setup_device_attrs(struct ib_device *ibdev); + +int rdma_compatdev_set(u8 enable); + +int ib_port_register_client_groups(struct ib_device *ibdev, u32 port_num, + const struct attribute_group **groups); +void ib_port_unregister_client_groups(struct ib_device *ibdev, u32 port_num, + const struct attribute_group **groups); + +int ib_device_set_netns_put(struct sk_buff *skb, + struct ib_device *dev, u32 ns_fd); + +int rdma_nl_net_init(struct rdma_dev_net *rnet); +void rdma_nl_net_exit(struct rdma_dev_net *rnet); + +struct rdma_umap_priv { + struct vm_area_struct *vma; + struct list_head list; + struct rdma_user_mmap_entry *entry; +}; + +void rdma_umap_priv_init(struct rdma_umap_priv *priv, + struct vm_area_struct *vma, + struct rdma_user_mmap_entry *entry); + +void ib_cq_pool_cleanup(struct ib_device *dev); + +bool rdma_nl_get_privileged_qkey(void); #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c new file mode 100644 index 000000000000..c3aa6d7fc66b --- /dev/null +++ b/drivers/infiniband/core/counters.c @@ -0,0 +1,681 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2019 Mellanox Technologies. All rights reserved. 
+ */ +#include <rdma/ib_verbs.h> +#include <rdma/rdma_counter.h> + +#include "core_priv.h" +#include "restrack.h" + +#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE | RDMA_COUNTER_MASK_PID) + +static int __counter_set_mode(struct rdma_port_counter *port_counter, + enum rdma_nl_counter_mode new_mode, + enum rdma_nl_counter_mask new_mask, + bool bind_opcnt) +{ + if (new_mode == RDMA_COUNTER_MODE_AUTO) { + if (new_mask & (~ALL_AUTO_MODE_MASKS)) + return -EINVAL; + if (port_counter->num_counters) + return -EBUSY; + } + + port_counter->mode.mode = new_mode; + port_counter->mode.mask = new_mask; + port_counter->mode.bind_opcnt = bind_opcnt; + return 0; +} + +/* + * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode + * + * @dev: Device to operate + * @port: Port to use + * @mask: Mask to configure + * @extack: Message to the user + * + * Return 0 on success. If counter mode wasn't changed then it is considered + * as success as well. + * Return -EBUSY when changing to auto mode while there are bounded counters. + * + */ +int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port, + enum rdma_nl_counter_mask mask, + bool bind_opcnt, + struct netlink_ext_ack *extack) +{ + struct rdma_port_counter *port_counter; + enum rdma_nl_counter_mode mode; + int ret; + + port_counter = &dev->port_data[port].port_counter; + if (!port_counter->hstats) + return -EOPNOTSUPP; + + mutex_lock(&port_counter->lock); + if (mask) + mode = RDMA_COUNTER_MODE_AUTO; + else + mode = (port_counter->num_counters) ? RDMA_COUNTER_MODE_MANUAL : + RDMA_COUNTER_MODE_NONE; + + if (port_counter->mode.mode == mode && + port_counter->mode.mask == mask && + port_counter->mode.bind_opcnt == bind_opcnt) { + ret = 0; + goto out; + } + + ret = __counter_set_mode(port_counter, mode, mask, bind_opcnt); + +out: + mutex_unlock(&port_counter->lock); + if (ret == -EBUSY) + NL_SET_ERR_MSG( + extack, + "Modifying auto mode is not allowed when there is a bound QP"); + return ret; +} + +static void auto_mode_init_counter(struct rdma_counter *counter, + const struct ib_qp *qp, + enum rdma_nl_counter_mask new_mask) +{ + struct auto_mode_param *param = &counter->mode.param; + + counter->mode.mode = RDMA_COUNTER_MODE_AUTO; + counter->mode.mask = new_mask; + + if (new_mask & RDMA_COUNTER_MASK_QP_TYPE) + param->qp_type = qp->qp_type; +} + +static int __rdma_counter_bind_qp(struct rdma_counter *counter, + struct ib_qp *qp, u32 port) +{ + int ret; + + if (qp->counter) + return -EINVAL; + + if (!qp->device->ops.counter_bind_qp) + return -EOPNOTSUPP; + + mutex_lock(&counter->lock); + ret = qp->device->ops.counter_bind_qp(counter, qp, port); + mutex_unlock(&counter->lock); + + return ret; +} + +int rdma_counter_modify(struct ib_device *dev, u32 port, + unsigned int index, bool enable) +{ + struct rdma_hw_stats *stats; + int ret = 0; + + if (!dev->ops.modify_hw_stat) + return -EOPNOTSUPP; + + stats = ib_get_hw_stats_port(dev, port); + if (!stats || index >= stats->num_counters || + !(stats->descs[index].flags & IB_STAT_FLAG_OPTIONAL)) + return -EINVAL; + + mutex_lock(&stats->lock); + + if (enable != test_bit(index, stats->is_disabled)) + goto out; + + ret = dev->ops.modify_hw_stat(dev, port, index, enable); + if (ret) + goto out; + + if (enable) + clear_bit(index, stats->is_disabled); + else + set_bit(index, stats->is_disabled); +out: + mutex_unlock(&stats->lock); + return ret; +} + +static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port, + struct ib_qp *qp, + enum rdma_nl_counter_mode mode, + bool bind_opcnt) +{ + 
struct rdma_port_counter *port_counter; + struct rdma_counter *counter; + int ret; + + if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats) + return NULL; + + counter = rdma_zalloc_drv_obj(dev, rdma_counter); + if (!counter) + return NULL; + + counter->device = dev; + counter->port = port; + + dev->ops.counter_init(counter); + + rdma_restrack_new(&counter->res, RDMA_RESTRACK_COUNTER); + counter->stats = dev->ops.counter_alloc_stats(counter); + if (!counter->stats) + goto err_stats; + + port_counter = &dev->port_data[port].port_counter; + mutex_lock(&port_counter->lock); + switch (mode) { + case RDMA_COUNTER_MODE_MANUAL: + ret = __counter_set_mode(port_counter, RDMA_COUNTER_MODE_MANUAL, + 0, bind_opcnt); + if (ret) { + mutex_unlock(&port_counter->lock); + goto err_mode; + } + break; + case RDMA_COUNTER_MODE_AUTO: + auto_mode_init_counter(counter, qp, port_counter->mode.mask); + break; + default: + ret = -EOPNOTSUPP; + mutex_unlock(&port_counter->lock); + goto err_mode; + } + + port_counter->num_counters++; + mutex_unlock(&port_counter->lock); + + counter->mode.mode = mode; + counter->mode.bind_opcnt = bind_opcnt; + kref_init(&counter->kref); + mutex_init(&counter->lock); + + ret = __rdma_counter_bind_qp(counter, qp, port); + if (ret) + goto err_mode; + + rdma_restrack_parent_name(&counter->res, &qp->res); + rdma_restrack_add(&counter->res); + return counter; + +err_mode: + rdma_free_hw_stats_struct(counter->stats); +err_stats: + rdma_restrack_put(&counter->res); + kfree(counter); + return NULL; +} + +static void rdma_counter_free(struct rdma_counter *counter) +{ + struct rdma_port_counter *port_counter; + + port_counter = &counter->device->port_data[counter->port].port_counter; + mutex_lock(&port_counter->lock); + port_counter->num_counters--; + if (!port_counter->num_counters && + (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL)) + __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0, + false); + + mutex_unlock(&port_counter->lock); + + rdma_restrack_del(&counter->res); + rdma_free_hw_stats_struct(counter->stats); + kfree(counter); +} + +static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter, + enum rdma_nl_counter_mask auto_mask) +{ + struct auto_mode_param *param = &counter->mode.param; + bool match = true; + + if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE) + match &= (param->qp_type == qp->qp_type); + + if (auto_mask & RDMA_COUNTER_MASK_PID) + match &= (task_pid_nr(counter->res.task) == + task_pid_nr(qp->res.task)); + + return match; +} + +static int __rdma_counter_unbind_qp(struct ib_qp *qp, u32 port) +{ + struct rdma_counter *counter = qp->counter; + int ret; + + if (!qp->device->ops.counter_unbind_qp) + return -EOPNOTSUPP; + + mutex_lock(&counter->lock); + ret = qp->device->ops.counter_unbind_qp(qp, port); + mutex_unlock(&counter->lock); + + return ret; +} + +static void counter_history_stat_update(struct rdma_counter *counter) +{ + struct ib_device *dev = counter->device; + struct rdma_port_counter *port_counter; + int i; + + port_counter = &dev->port_data[counter->port].port_counter; + if (!port_counter->hstats) + return; + + rdma_counter_query_stats(counter); + + for (i = 0; i < counter->stats->num_counters; i++) + port_counter->hstats->value[i] += counter->stats->value[i]; +} + +/* + * rdma_get_counter_auto_mode - Find the counter that @qp should be bound + * with in auto mode + * + * Return: The counter (with ref-count increased) if found + */ +static struct rdma_counter *rdma_get_counter_auto_mode(struct ib_qp *qp, + u32 port) +{ + struct 
rdma_port_counter *port_counter; + struct rdma_counter *counter = NULL; + struct ib_device *dev = qp->device; + struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; + unsigned long id = 0; + + port_counter = &dev->port_data[port].port_counter; + rt = &dev->res[RDMA_RESTRACK_COUNTER]; + xa_lock(&rt->xa); + xa_for_each(&rt->xa, id, res) { + counter = container_of(res, struct rdma_counter, res); + if ((counter->device != qp->device) || (counter->port != port)) + goto next; + + if (auto_mode_match(qp, counter, port_counter->mode.mask)) + break; +next: + counter = NULL; + } + + if (counter && !kref_get_unless_zero(&counter->kref)) + counter = NULL; + + xa_unlock(&rt->xa); + return counter; +} + +static void counter_release(struct kref *kref) +{ + struct rdma_counter *counter; + + counter = container_of(kref, struct rdma_counter, kref); + counter_history_stat_update(counter); + counter->device->ops.counter_dealloc(counter); + rdma_counter_free(counter); +} + +/* + * rdma_counter_bind_qp_auto - Check and bind the QP to a counter base on + * the auto-mode rule + */ +int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port) +{ + struct rdma_port_counter *port_counter; + struct ib_device *dev = qp->device; + struct rdma_counter *counter; + int ret; + + if (!rdma_restrack_is_tracked(&qp->res) || rdma_is_kernel_res(&qp->res)) + return 0; + + if (!rdma_is_port_valid(dev, port)) + return -EINVAL; + + port_counter = &dev->port_data[port].port_counter; + if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) + return 0; + + counter = rdma_get_counter_auto_mode(qp, port); + if (counter) { + ret = __rdma_counter_bind_qp(counter, qp, port); + if (ret) { + kref_put(&counter->kref, counter_release); + return ret; + } + } else { + counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO, + port_counter->mode.bind_opcnt); + if (!counter) + return -ENOMEM; + } + + return 0; +} + +/* + * rdma_counter_unbind_qp - Unbind a qp from a counter + * @force: + * true - Decrease the counter ref-count anyway (e.g., qp destroy) + */ +int rdma_counter_unbind_qp(struct ib_qp *qp, u32 port, bool force) +{ + struct rdma_counter *counter = qp->counter; + int ret; + + if (!counter) + return -EINVAL; + + ret = __rdma_counter_unbind_qp(qp, port); + if (ret && !force) + return ret; + + kref_put(&counter->kref, counter_release); + return 0; +} + +int rdma_counter_query_stats(struct rdma_counter *counter) +{ + struct ib_device *dev = counter->device; + int ret; + + if (!dev->ops.counter_update_stats) + return -EINVAL; + + mutex_lock(&counter->lock); + ret = dev->ops.counter_update_stats(counter); + mutex_unlock(&counter->lock); + + return ret; +} + +static u64 get_running_counters_hwstat_sum(struct ib_device *dev, + u32 port, u32 index) +{ + struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; + struct rdma_counter *counter; + unsigned long id = 0; + u64 sum = 0; + + rt = &dev->res[RDMA_RESTRACK_COUNTER]; + xa_lock(&rt->xa); + xa_for_each(&rt->xa, id, res) { + if (!rdma_restrack_get(res)) + continue; + + xa_unlock(&rt->xa); + + counter = container_of(res, struct rdma_counter, res); + if ((counter->device != dev) || (counter->port != port) || + rdma_counter_query_stats(counter)) + goto next; + + sum += counter->stats->value[index]; + +next: + xa_lock(&rt->xa); + rdma_restrack_put(res); + } + + xa_unlock(&rt->xa); + return sum; +} + +/* + * rdma_counter_get_hwstat_value() - Get the sum value of all counters on a + * specific port, including the running ones and history data + */ +u64 
rdma_counter_get_hwstat_value(struct ib_device *dev, u32 port, u32 index) +{ + struct rdma_port_counter *port_counter; + u64 sum; + + port_counter = &dev->port_data[port].port_counter; + if (!port_counter->hstats) + return 0; + + sum = get_running_counters_hwstat_sum(dev, port, index); + sum += port_counter->hstats->value[index]; + + return sum; +} + +static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num) +{ + struct rdma_restrack_entry *res = NULL; + struct ib_qp *qp = NULL; + + res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_QP, qp_num); + if (IS_ERR(res)) + return NULL; + + qp = container_of(res, struct ib_qp, res); + if (qp->qp_type == IB_QPT_RAW_PACKET && !rdma_dev_has_raw_cap(dev)) + goto err; + + return qp; + +err: + rdma_restrack_put(res); + return NULL; +} + +static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev, + u32 counter_id) +{ + struct rdma_restrack_entry *res; + struct rdma_counter *counter; + + res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_COUNTER, counter_id); + if (IS_ERR(res)) + return NULL; + + counter = container_of(res, struct rdma_counter, res); + kref_get(&counter->kref); + rdma_restrack_put(res); + + return counter; +} + +/* + * rdma_counter_bind_qpn() - Bind QP @qp_num to counter @counter_id + */ +int rdma_counter_bind_qpn(struct ib_device *dev, u32 port, + u32 qp_num, u32 counter_id) +{ + struct rdma_port_counter *port_counter; + struct rdma_counter *counter; + struct ib_qp *qp; + int ret; + + port_counter = &dev->port_data[port].port_counter; + if (port_counter->mode.mode == RDMA_COUNTER_MODE_AUTO) + return -EINVAL; + + qp = rdma_counter_get_qp(dev, qp_num); + if (!qp) + return -ENOENT; + + counter = rdma_get_counter_by_id(dev, counter_id); + if (!counter) { + ret = -ENOENT; + goto err; + } + + if (rdma_is_kernel_res(&counter->res) != rdma_is_kernel_res(&qp->res)) { + ret = -EINVAL; + goto err_task; + } + + if ((counter->device != qp->device) || (counter->port != qp->port)) { + ret = -EINVAL; + goto err_task; + } + + ret = __rdma_counter_bind_qp(counter, qp, port); + if (ret) + goto err_task; + + rdma_restrack_put(&qp->res); + return 0; + +err_task: + kref_put(&counter->kref, counter_release); +err: + rdma_restrack_put(&qp->res); + return ret; +} + +/* + * rdma_counter_bind_qpn_alloc() - Alloc a counter and bind QP @qp_num to it + * The id of new counter is returned in @counter_id + */ +int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u32 port, + u32 qp_num, u32 *counter_id) +{ + struct rdma_port_counter *port_counter; + struct rdma_counter *counter; + struct ib_qp *qp; + int ret; + + if (!rdma_is_port_valid(dev, port)) + return -EINVAL; + + port_counter = &dev->port_data[port].port_counter; + if (!port_counter->hstats) + return -EOPNOTSUPP; + + if (port_counter->mode.mode == RDMA_COUNTER_MODE_AUTO) + return -EINVAL; + + qp = rdma_counter_get_qp(dev, qp_num); + if (!qp) + return -ENOENT; + + if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { + ret = -EINVAL; + goto err; + } + + counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_MANUAL, true); + if (!counter) { + ret = -ENOMEM; + goto err; + } + + if (counter_id) + *counter_id = counter->id; + + rdma_restrack_put(&qp->res); + return 0; + +err: + rdma_restrack_put(&qp->res); + return ret; +} + +/* + * rdma_counter_unbind_qpn() - Unbind QP @qp_num from a counter + */ +int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port, + u32 qp_num, u32 counter_id) +{ + struct rdma_port_counter *port_counter; + struct ib_qp *qp; + int ret; + + if 
(!rdma_is_port_valid(dev, port)) + return -EINVAL; + + qp = rdma_counter_get_qp(dev, qp_num); + if (!qp) + return -ENOENT; + + if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { + ret = -EINVAL; + goto out; + } + + port_counter = &dev->port_data[port].port_counter; + if (!qp->counter || qp->counter->id != counter_id || + port_counter->mode.mode != RDMA_COUNTER_MODE_MANUAL) { + ret = -EINVAL; + goto out; + } + + ret = rdma_counter_unbind_qp(qp, port, false); + +out: + rdma_restrack_put(&qp->res); + return ret; +} + +int rdma_counter_get_mode(struct ib_device *dev, u32 port, + enum rdma_nl_counter_mode *mode, + enum rdma_nl_counter_mask *mask, + bool *opcnt) +{ + struct rdma_port_counter *port_counter; + + port_counter = &dev->port_data[port].port_counter; + *mode = port_counter->mode.mode; + *mask = port_counter->mode.mask; + *opcnt = port_counter->mode.bind_opcnt; + + return 0; +} + +void rdma_counter_init(struct ib_device *dev) +{ + struct rdma_port_counter *port_counter; + u32 port, i; + + if (!dev->port_data) + return; + + rdma_for_each_port(dev, port) { + port_counter = &dev->port_data[port].port_counter; + port_counter->mode.mode = RDMA_COUNTER_MODE_NONE; + mutex_init(&port_counter->lock); + + if (!dev->ops.alloc_hw_port_stats) + continue; + + port_counter->hstats = dev->ops.alloc_hw_port_stats(dev, port); + if (!port_counter->hstats) + goto fail; + } + + return; + +fail: + for (i = port; i >= rdma_start_port(dev); i--) { + port_counter = &dev->port_data[port].port_counter; + rdma_free_hw_stats_struct(port_counter->hstats); + port_counter->hstats = NULL; + mutex_destroy(&port_counter->lock); + } +} + +void rdma_counter_release(struct ib_device *dev) +{ + struct rdma_port_counter *port_counter; + u32 port; + + rdma_for_each_port(dev, port) { + port_counter = &dev->port_data[port].port_counter; + rdma_free_hw_stats_struct(port_counter->hstats); + mutex_destroy(&port_counter->lock); + } +} diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index d61e5e1427c2..584537c71545 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -1,20 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2015 HGST, a Western Digital Company. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. 
*/ -#include <linux/module.h> #include <linux/err.h> #include <linux/slab.h> #include <rdma/ib_verbs.h> +#include "core_priv.h" + +#include <trace/events/rdma_core.h> +/* Max size for shared CQ, may require tuning */ +#define IB_MAX_SHARED_CQ_SZ 4096U + /* # of WCs to poll for with a single call to ib_poll_cq */ #define IB_POLL_BATCH 16 #define IB_POLL_BATCH_DIRECT 8 @@ -26,18 +23,86 @@ #define IB_POLL_FLAGS \ (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) +static const struct dim_cq_moder +rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = { + {1, 0, 1, 0}, + {1, 0, 4, 0}, + {2, 0, 4, 0}, + {2, 0, 8, 0}, + {4, 0, 8, 0}, + {16, 0, 8, 0}, + {16, 0, 16, 0}, + {32, 0, 16, 0}, + {32, 0, 32, 0}, +}; + +static void ib_cq_rdma_dim_work(struct work_struct *w) +{ + struct dim *dim = container_of(w, struct dim, work); + struct ib_cq *cq = dim->priv; + + u16 usec = rdma_dim_prof[dim->profile_ix].usec; + u16 comps = rdma_dim_prof[dim->profile_ix].comps; + + dim->state = DIM_START_MEASURE; + + trace_cq_modify(cq, comps, usec); + cq->device->ops.modify_cq(cq, comps, usec); +} + +static void rdma_dim_init(struct ib_cq *cq) +{ + struct dim *dim; + + if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim || + cq->poll_ctx == IB_POLL_DIRECT) + return; + + dim = kzalloc(sizeof(struct dim), GFP_KERNEL); + if (!dim) + return; + + dim->state = DIM_START_MEASURE; + dim->tune_state = DIM_GOING_RIGHT; + dim->profile_ix = RDMA_DIM_START_PROFILE; + dim->priv = cq; + cq->dim = dim; + + INIT_WORK(&dim->work, ib_cq_rdma_dim_work); +} + +static void rdma_dim_destroy(struct ib_cq *cq) +{ + if (!cq->dim) + return; + + cancel_work_sync(&cq->dim->work); + kfree(cq->dim); +} + +static int __poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc) +{ + int rc; + + rc = ib_poll_cq(cq, num_entries, wc); + trace_cq_poll(cq, num_entries, rc); + return rc; +} + static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs, int batch) { int i, n, completed = 0; + trace_cq_process(cq); + /* * budget might be (-1) if the caller does not * want to bound this call, thus we need unsigned * minimum here. 
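 *
 * For example, with budget == -1 the u32 subtraction budget - completed
 * wraps to a huge value, so min_t(u32, batch, budget - completed)
 * evaluates to @batch on every iteration and the loop keeps polling
 * full batches until the CQ is drained.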
*/ - while ((n = ib_poll_cq(cq, min_t(u32, batch, - budget - completed), wcs)) > 0) { + while ((n = __poll_cq(cq, min_t(u32, batch, + budget - completed), wcs)) > 0) { for (i = 0; i < n; i++) { struct ib_wc *wc = &wcs[i]; @@ -57,7 +122,7 @@ static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs, } /** - * ib_process_direct_cq - process a CQ in caller context + * ib_process_cq_direct - process a CQ in caller context * @cq: CQ to process * @budget: number of CQEs to poll for * @@ -86,20 +151,27 @@ static void ib_cq_completion_direct(struct ib_cq *cq, void *private) static int ib_poll_handler(struct irq_poll *iop, int budget) { struct ib_cq *cq = container_of(iop, struct ib_cq, iop); + struct dim *dim = cq->dim; int completed; completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH); if (completed < budget) { irq_poll_complete(&cq->iop); - if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) + if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) { + trace_cq_reschedule(cq); irq_poll_sched(&cq->iop); + } } + if (dim) + rdma_dim(dim, completed); + return completed; } static void ib_cq_completion_softirq(struct ib_cq *cq, void *private) { + trace_cq_schedule(cq); irq_poll_sched(&cq->iop); } @@ -113,10 +185,13 @@ static void ib_cq_poll_work(struct work_struct *work) if (completed >= IB_POLL_BUDGET_WORKQUEUE || ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) queue_work(cq->comp_wq, &cq->work); + else if (cq->dim) + rdma_dim(cq->dim, completed); } static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) { + trace_cq_schedule(cq); queue_work(cq->comp_wq, &cq->work); } @@ -134,9 +209,9 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id * to use this CQ abstraction. 
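 *
 * A minimal usage sketch (illustrative only, not part of this change;
 * "my_req" and "my_done" are placeholder ULP names):
 *
 *	static void my_done(struct ib_cq *cq, struct ib_wc *wc)
 *	{
 *		struct my_req *req = container_of(wc->wr_cqe,
 *						  struct my_req, cqe);
 *		...
 *	}
 *
 *	cq = ib_alloc_cq(dev, NULL, 128, 0, IB_POLL_WORKQUEUE);
 *	if (IS_ERR(cq))
 *		return PTR_ERR(cq);
 *	req->cqe.done = my_done;
 *	wr.wr_cqe = &req->cqe;
 *	...
 *	ib_free_cq(cq);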
*/ -struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, - int nr_cqe, int comp_vector, - enum ib_poll_context poll_ctx, const char *caller) +struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe, + int comp_vector, enum ib_poll_context poll_ctx, + const char *caller) { struct ib_cq_init_attr cq_attr = { .cqe = nr_cqe, @@ -145,24 +220,28 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, struct ib_cq *cq; int ret = -ENOMEM; - cq = dev->ops.create_cq(dev, &cq_attr, NULL, NULL); - if (IS_ERR(cq)) - return cq; + cq = rdma_zalloc_drv_obj(dev, ib_cq); + if (!cq) + return ERR_PTR(ret); cq->device = dev; - cq->uobject = NULL; - cq->event_handler = NULL; cq->cq_context = private; cq->poll_ctx = poll_ctx; atomic_set(&cq->usecnt, 0); + cq->comp_vector = comp_vector; cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL); if (!cq->wc) - goto out_destroy_cq; + goto out_free_cq; + + rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); + rdma_restrack_set_name(&cq->res, caller); - cq->res.type = RDMA_RESTRACK_CQ; - rdma_restrack_set_task(&cq->res, caller); - rdma_restrack_kadd(&cq->res); + ret = dev->ops.create_cq(cq, &cq_attr, NULL); + if (ret) + goto out_free_wc; + + rdma_dim_init(cq); switch (cq->poll_ctx) { case IB_POLL_DIRECT: @@ -184,30 +263,71 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, break; default: ret = -EINVAL; - goto out_free_wc; + goto out_destroy_cq; } + rdma_restrack_add(&cq->res); + trace_cq_alloc(cq, nr_cqe, comp_vector, poll_ctx); return cq; +out_destroy_cq: + rdma_dim_destroy(cq); + cq->device->ops.destroy_cq(cq, NULL); out_free_wc: + rdma_restrack_put(&cq->res); kfree(cq->wc); - rdma_restrack_del(&cq->res); -out_destroy_cq: - cq->device->ops.destroy_cq(cq); +out_free_cq: + kfree(cq); + trace_cq_alloc_error(nr_cqe, comp_vector, poll_ctx, ret); return ERR_PTR(ret); } EXPORT_SYMBOL(__ib_alloc_cq); /** + * __ib_alloc_cq_any - allocate a completion queue + * @dev: device to allocate the CQ for + * @private: driver private data, accessible from cq->cq_context + * @nr_cqe: number of CQEs to allocate + * @poll_ctx: context to poll the CQ from + * @caller: module owner name + * + * Attempt to spread ULP Completion Queues over each device's interrupt + * vectors. A simple best-effort mechanism is used. + */ +struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private, + int nr_cqe, enum ib_poll_context poll_ctx, + const char *caller) +{ + static atomic_t counter; + int comp_vector = 0; + + if (dev->num_comp_vectors > 1) + comp_vector = + atomic_inc_return(&counter) % + min_t(int, dev->num_comp_vectors, num_online_cpus()); + + return __ib_alloc_cq(dev, private, nr_cqe, comp_vector, poll_ctx, + caller); +} +EXPORT_SYMBOL(__ib_alloc_cq_any); + +/** * ib_free_cq - free a completion queue * @cq: completion queue to free. 
*/ void ib_free_cq(struct ib_cq *cq) { - int ret; + int ret = 0; if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) return; + if (WARN_ON_ONCE(cq->cqe_used)) + return; + + if (cq->device->ops.pre_destroy_cq) { + ret = cq->device->ops.pre_destroy_cq(cq); + WARN_ONCE(ret, "Disable of kernel CQ shouldn't fail"); + } switch (cq->poll_ctx) { case IB_POLL_DIRECT: @@ -223,9 +343,173 @@ void ib_free_cq(struct ib_cq *cq) WARN_ON_ONCE(1); } - kfree(cq->wc); + rdma_dim_destroy(cq); + trace_cq_free(cq); + if (cq->device->ops.post_destroy_cq) + cq->device->ops.post_destroy_cq(cq); + else + ret = cq->device->ops.destroy_cq(cq, NULL); + WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail"); rdma_restrack_del(&cq->res); - ret = cq->device->ops.destroy_cq(cq); - WARN_ON_ONCE(ret); + kfree(cq->wc); + kfree(cq); } EXPORT_SYMBOL(ib_free_cq); + +void ib_cq_pool_cleanup(struct ib_device *dev) +{ + struct ib_cq *cq, *n; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) { + list_for_each_entry_safe(cq, n, &dev->cq_pools[i], + pool_entry) { + WARN_ON(cq->cqe_used); + list_del(&cq->pool_entry); + cq->shared = false; + ib_free_cq(cq); + } + } +} + +static int ib_alloc_cqs(struct ib_device *dev, unsigned int nr_cqes, + enum ib_poll_context poll_ctx) +{ + LIST_HEAD(tmp_list); + unsigned int nr_cqs, i; + struct ib_cq *cq, *n; + int ret; + + if (poll_ctx > IB_POLL_LAST_POOL_TYPE) { + WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE); + return -EINVAL; + } + + /* + * Allocate at least as many CQEs as requested, and otherwise + * a reasonable batch size so that we can share CQs between + * multiple users instead of allocating a larger number of CQs. + */ + nr_cqes = min_t(unsigned int, dev->attrs.max_cqe, + max(nr_cqes, IB_MAX_SHARED_CQ_SZ)); + nr_cqs = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus()); + for (i = 0; i < nr_cqs; i++) { + cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + goto out_free_cqs; + } + cq->shared = true; + list_add_tail(&cq->pool_entry, &tmp_list); + } + + spin_lock_irq(&dev->cq_pools_lock); + list_splice(&tmp_list, &dev->cq_pools[poll_ctx]); + spin_unlock_irq(&dev->cq_pools_lock); + + return 0; + +out_free_cqs: + list_for_each_entry_safe(cq, n, &tmp_list, pool_entry) { + cq->shared = false; + ib_free_cq(cq); + } + return ret; +} + +/** + * ib_cq_pool_get() - Find the least used completion queue that matches + * a given cpu hint (or least used for wild card affinity) and fits + * nr_cqe. + * @dev: rdma device + * @nr_cqe: number of needed cqe entries + * @comp_vector_hint: completion vector hint (-1) for the driver to assign + * a comp vector based on internal counter + * @poll_ctx: cq polling context + * + * Finds a cq that satisfies @comp_vector_hint and @nr_cqe requirements and + * claim entries in it for us. In case there is no available cq, allocate + * a new cq with the requirements and add it to the device pool. + * IB_POLL_DIRECT cannot be used for shared cqs so it is not a valid value + * for @poll_ctx. 
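 *
 * A minimal usage sketch (illustrative only, not part of this change):
 *
 *	cq = ib_cq_pool_get(dev, nr_cqe, -1, IB_POLL_SOFTIRQ);
 *	if (IS_ERR(cq))
 *		return PTR_ERR(cq);
 *	...	use the CQ, e.g. as the send/recv CQ of a newly created QP
 *	ib_cq_pool_put(cq, nr_cqe);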
+ */ +struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe, + int comp_vector_hint, + enum ib_poll_context poll_ctx) +{ + static unsigned int default_comp_vector; + unsigned int vector, num_comp_vectors; + struct ib_cq *cq, *found = NULL; + int ret; + + if (poll_ctx > IB_POLL_LAST_POOL_TYPE) { + WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE); + return ERR_PTR(-EINVAL); + } + + num_comp_vectors = + min_t(unsigned int, dev->num_comp_vectors, num_online_cpus()); + /* Project the affinty to the device completion vector range */ + if (comp_vector_hint < 0) { + comp_vector_hint = + (READ_ONCE(default_comp_vector) + 1) % num_comp_vectors; + WRITE_ONCE(default_comp_vector, comp_vector_hint); + } + vector = comp_vector_hint % num_comp_vectors; + + /* + * Find the least used CQ with correct affinity and + * enough free CQ entries + */ + while (!found) { + spin_lock_irq(&dev->cq_pools_lock); + list_for_each_entry(cq, &dev->cq_pools[poll_ctx], + pool_entry) { + /* + * Check to see if we have found a CQ with the + * correct completion vector + */ + if (vector != cq->comp_vector) + continue; + if (cq->cqe_used + nr_cqe > cq->cqe) + continue; + found = cq; + break; + } + + if (found) { + found->cqe_used += nr_cqe; + spin_unlock_irq(&dev->cq_pools_lock); + + return found; + } + spin_unlock_irq(&dev->cq_pools_lock); + + /* + * Didn't find a match or ran out of CQs in the device + * pool, allocate a new array of CQs. + */ + ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx); + if (ret) + return ERR_PTR(ret); + } + + return found; +} +EXPORT_SYMBOL(ib_cq_pool_get); + +/** + * ib_cq_pool_put - Return a CQ taken from a shared pool. + * @cq: The CQ to return. + * @nr_cqe: The max number of cqes that the user had requested. + */ +void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe) +{ + if (WARN_ON_ONCE(nr_cqe > cq->cqe_used)) + return; + + spin_lock_irq(&cq->device->cq_pools_lock); + cq->cqe_used -= nr_cqe; + spin_unlock_irq(&cq->device->cq_pools_lock); +} +EXPORT_SYMBOL(ib_cq_pool_put); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 8872453e26c0..13e8a1714bbd 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -37,64 +37,244 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> -#include <linux/mutex.h> #include <linux/netdevice.h> +#include <net/net_namespace.h> #include <linux/security.h> #include <linux/notifier.h> +#include <linux/hashtable.h> #include <rdma/rdma_netlink.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> +#include <rdma/rdma_counter.h> #include "core_priv.h" +#include "restrack.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); -struct ib_client_data { - struct list_head list; - struct ib_client *client; - void * data; - /* The device or client is going down. Do not call client or device - * callbacks other than remove(). */ - bool going_down; -}; - struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); +static struct workqueue_struct *ib_unreg_wq; + +/* + * Each of the three rwsem locks (devices, clients, client_data) protects the + * xarray of the same name. Specifically it allows the caller to assert that + * the MARK will/will not be changing under the lock, and for devices and + * clients, that the value in the xarray is still a valid pointer. 
Change of + * the MARK is linked to the object state, so holding the lock and testing the + * MARK also asserts that the contained object is in a certain state. + * + * This is used to build a two stage register/unregister flow where objects + * can continue to be in the xarray even though they are still in progress to + * register/unregister. + * + * The xarray itself provides additional locking, and restartable iteration, + * which is also relied on. + * + * Locks should not be nested, with the exception of client_data, which is + * allowed to nest under the read side of the other two locks. + * + * The devices_rwsem also protects the device name list, any change or + * assignment of device name must also hold the write side to guarantee unique + * names. + */ + +/* + * devices contains devices that have had their names assigned. The + * devices may not be registered. Users that care about the registration + * status need to call ib_device_try_get() on the device to ensure it is + * registered, and keep it registered, for the required duration. + * + */ +static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); +static DECLARE_RWSEM(devices_rwsem); +#define DEVICE_REGISTERED XA_MARK_1 -/* The device_list and client_list contain devices and clients after their - * registration has completed, and the devices and clients are removed - * during unregistration. */ -static LIST_HEAD(device_list); -static LIST_HEAD(client_list); +static u32 highest_client_id; +#define CLIENT_REGISTERED XA_MARK_1 +static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); +static DECLARE_RWSEM(clients_rwsem); + +static void ib_client_put(struct ib_client *client) +{ + if (refcount_dec_and_test(&client->uses)) + complete(&client->uses_zero); +} + +/* + * If client_data is registered then the corresponding client must also still + * be registered. + */ +#define CLIENT_DATA_REGISTERED XA_MARK_1 + +unsigned int rdma_dev_net_id; /* - * device_mutex and lists_rwsem protect access to both device_list and - * client_list. device_mutex protects writer access by device and client - * registration / de-registration. lists_rwsem protects reader access to - * these lists. Iterators of these lists must lock it for read, while updates - * to the lists must be done with a write lock. A special case is when the - * device_mutex is locked. In this case locking the lists for read access is - * not necessary as the device_mutex implies it. + * A list of net namespaces is maintained in an xarray. This is necessary + * because we can't get the locking right using the existing net ns list. We + * would require a init_net callback after the list is updated. + */ +static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC); +/* + * rwsem to protect accessing the rdma_nets xarray entries. + */ +static DECLARE_RWSEM(rdma_nets_rwsem); + +bool ib_devices_shared_netns = true; +module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444); +MODULE_PARM_DESC(netns_mode, + "Share device among net namespaces; default=1 (shared)"); +/** + * rdma_dev_access_netns() - Return whether an rdma device can be accessed + * from a specified net namespace or not. + * @dev: Pointer to rdma device which needs to be checked + * @net: Pointer to net namesapce for which access to be checked * - * lists_rwsem also protects access to the client data list. + * When the rdma device is in shared mode, it ignores the net namespace. + * When the rdma device is exclusive to a net namespace, rdma device net + * namespace is checked against the specified one. 
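 *
 * Callers typically use this as a visibility filter when walking devices,
 * for instance on behalf of a netlink dump (illustrative sketch, not part
 * of this change):
 *
 *	if (!rdma_dev_access_netns(device, sock_net(skb->sk)))
 *		continue;
 *
 * so that a device bound to another net namespace is simply skipped.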
*/ -static DEFINE_MUTEX(device_mutex); -static DECLARE_RWSEM(lists_rwsem); +bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net) +{ + return (ib_devices_shared_netns || + net_eq(read_pnet(&dev->coredev.rdma_net), net)); +} +EXPORT_SYMBOL(rdma_dev_access_netns); +/** + * rdma_dev_has_raw_cap() - Returns whether a specified rdma device has + * CAP_NET_RAW capability or not. + * + * @dev: Pointer to rdma device whose capability to be checked + * + * Returns true if a rdma device's owning user namespace has CAP_NET_RAW + * capability, otherwise false. When rdma subsystem is in legacy shared network, + * namespace mode, the default net namespace is considered. + */ +bool rdma_dev_has_raw_cap(const struct ib_device *dev) +{ + const struct net *net; + + /* Network namespace is the resource whose user namespace + * to be considered. When in shared mode, there is no reliable + * network namespace resource, so consider the default net namespace. + */ + if (ib_devices_shared_netns) + net = &init_net; + else + net = read_pnet(&dev->coredev.rdma_net); + + return ns_capable(net->user_ns, CAP_NET_RAW); +} +EXPORT_SYMBOL(rdma_dev_has_raw_cap); + +/* + * xarray has this behavior where it won't iterate over NULL values stored in + * allocated arrays. So we need our own iterator to see all values stored in + * the array. This does the same thing as xa_for_each except that it also + * returns NULL valued entries if the array is allocating. Simplified to only + * work on simple xarrays. + */ +static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, + xa_mark_t filter) +{ + XA_STATE(xas, xa, *indexp); + void *entry; + + rcu_read_lock(); + do { + entry = xas_find_marked(&xas, ULONG_MAX, filter); + if (xa_is_zero(entry)) + break; + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + if (entry) { + *indexp = xas.xa_index; + if (xa_is_zero(entry)) + return NULL; + return entry; + } + return XA_ERROR(-ENOENT); +} +#define xan_for_each_marked(xa, index, entry, filter) \ + for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ + !xa_is_err(entry); \ + (index)++, entry = xan_find_marked(xa, &(index), filter)) + +/* RCU hash table mapping netdevice pointers to struct ib_port_data */ +static DEFINE_SPINLOCK(ndev_hash_lock); +static DECLARE_HASHTABLE(ndev_hash, 5); + +static void free_netdevs(struct ib_device *ib_dev); +static void ib_unregister_work(struct work_struct *work); +static void __ib_unregister_device(struct ib_device *device); static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data); static void ib_policy_change_task(struct work_struct *work); static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); +static void __ibdev_printk(const char *level, const struct ib_device *ibdev, + struct va_format *vaf) +{ + if (ibdev && ibdev->dev.parent) + dev_printk_emit(level[1] - '0', + ibdev->dev.parent, + "%s %s %s: %pV", + dev_driver_string(ibdev->dev.parent), + dev_name(ibdev->dev.parent), + dev_name(&ibdev->dev), + vaf); + else if (ibdev) + printk("%s%s: %pV", + level, dev_name(&ibdev->dev), vaf); + else + printk("%s(NULL ib_device): %pV", level, vaf); +} + +#define define_ibdev_printk_level(func, level) \ +void func(const struct ib_device *ibdev, const char *fmt, ...) 
\ +{ \ + struct va_format vaf; \ + va_list args; \ + \ + va_start(args, fmt); \ + \ + vaf.fmt = fmt; \ + vaf.va = &args; \ + \ + __ibdev_printk(level, ibdev, &vaf); \ + \ + va_end(args); \ +} \ +EXPORT_SYMBOL(func); + +define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); +define_ibdev_printk_level(ibdev_alert, KERN_ALERT); +define_ibdev_printk_level(ibdev_crit, KERN_CRIT); +define_ibdev_printk_level(ibdev_err, KERN_ERR); +define_ibdev_printk_level(ibdev_warn, KERN_WARNING); +define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); +define_ibdev_printk_level(ibdev_info, KERN_INFO); + static struct notifier_block ibdev_lsm_nb = { .notifier_call = ib_security_change, }; -static int ib_device_check_mandatory(struct ib_device *device) +static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, + struct net *net); + +/* Pointer to the RCU head at the start of the ib_port_data array */ +struct ib_port_data_rcu { + struct rcu_head rcu_head; + struct ib_port_data pdata[]; +}; + +static void ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } static const struct { @@ -103,7 +283,6 @@ static int ib_device_check_mandatory(struct ib_device *device) } mandatory_table[] = { IB_MANDATORY_FUNC(query_device), IB_MANDATORY_FUNC(query_port), - IB_MANDATORY_FUNC(query_pkey), IB_MANDATORY_FUNC(alloc_pd), IB_MANDATORY_FUNC(dealloc_pd), IB_MANDATORY_FUNC(create_qp), @@ -116,142 +295,235 @@ static int ib_device_check_mandatory(struct ib_device *device) IB_MANDATORY_FUNC(poll_cq), IB_MANDATORY_FUNC(req_notify_cq), IB_MANDATORY_FUNC(get_dma_mr), + IB_MANDATORY_FUNC(reg_user_mr), IB_MANDATORY_FUNC(dereg_mr), IB_MANDATORY_FUNC(get_port_immutable) }; int i; + device->kverbs_provider = true; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) &device->ops + mandatory_table[i].offset)) { - dev_warn(&device->dev, - "Device is missing mandatory function %s\n", - mandatory_table[i].name); - return -EINVAL; + device->kverbs_provider = false; + break; } } - - return 0; -} - -static struct ib_device *__ib_device_get_by_index(u32 index) -{ - struct ib_device *device; - - list_for_each_entry(device, &device_list, core_list) - if (device->index == index) - return device; - - return NULL; } /* * Caller must perform ib_device_put() to return the device reference count * when ib_device_get_by_index() returns valid device pointer. */ -struct ib_device *ib_device_get_by_index(u32 index) +struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) { struct ib_device *device; - down_read(&lists_rwsem); - device = __ib_device_get_by_index(index); + down_read(&devices_rwsem); + device = xa_load(&devices, index); if (device) { - /* Do not return a device if unregistration has started. */ - if (!refcount_inc_not_zero(&device->refcount)) + if (!rdma_dev_access_netns(device, net)) { + device = NULL; + goto out; + } + + if (!ib_device_try_get(device)) device = NULL; } - up_read(&lists_rwsem); +out: + up_read(&devices_rwsem); return device; } +/** + * ib_device_put - Release IB device reference + * @device: device whose reference to be released + * + * ib_device_put() releases reference to the IB device to allow it to be + * unregistered and eventually free. 
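 *
 * It pairs with ib_device_get_by_index()/ib_device_get_by_name() or a
 * successful ib_device_try_get(), e.g. (illustrative sketch, not part of
 * this change):
 *
 *	dev = ib_device_get_by_index(net, index);
 *	if (!dev)
 *		return -ENODEV;
 *	...	the device stays registered while the reference is held
 *	ib_device_put(dev);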
+ */ void ib_device_put(struct ib_device *device) { if (refcount_dec_and_test(&device->refcount)) complete(&device->unreg_completion); } +EXPORT_SYMBOL(ib_device_put); static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; + unsigned long index; - list_for_each_entry(device, &device_list, core_list) + xa_for_each (&devices, index, device) if (!strcmp(name, dev_name(&device->dev))) return device; return NULL; } -int ib_device_rename(struct ib_device *ibdev, const char *name) +/** + * ib_device_get_by_name - Find an IB device by name + * @name: The name to look for + * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) + * + * Find and hold an ib_device by its name. The caller must call + * ib_device_put() on the returned pointer. + */ +struct ib_device *ib_device_get_by_name(const char *name, + enum rdma_driver_id driver_id) { struct ib_device *device; - int ret = 0; - if (!strcmp(name, dev_name(&ibdev->dev))) - return ret; + down_read(&devices_rwsem); + device = __ib_device_get_by_name(name); + if (device && driver_id != RDMA_DRIVER_UNKNOWN && + device->ops.driver_id != driver_id) + device = NULL; - mutex_lock(&device_mutex); - list_for_each_entry(device, &device_list, core_list) { - if (!strcmp(name, dev_name(&device->dev))) { - ret = -EEXIST; - goto out; + if (device) { + if (!ib_device_try_get(device)) + device = NULL; + } + up_read(&devices_rwsem); + return device; +} +EXPORT_SYMBOL(ib_device_get_by_name); + +static int rename_compat_devs(struct ib_device *device) +{ + struct ib_core_device *cdev; + unsigned long index; + int ret = 0; + + mutex_lock(&device->compat_devs_mutex); + xa_for_each (&device->compat_devs, index, cdev) { + ret = device_rename(&cdev->dev, dev_name(&device->dev)); + if (ret) { + dev_warn(&cdev->dev, + "Fail to rename compatdev to new name %s\n", + dev_name(&device->dev)); + break; } } + mutex_unlock(&device->compat_devs_mutex); + return ret; +} + +int ib_device_rename(struct ib_device *ibdev, const char *name) +{ + unsigned long index; + void *client_data; + int ret; + + down_write(&devices_rwsem); + if (!strcmp(name, dev_name(&ibdev->dev))) { + up_write(&devices_rwsem); + return 0; + } + + if (__ib_device_get_by_name(name)) { + up_write(&devices_rwsem); + return -EEXIST; + } ret = device_rename(&ibdev->dev, name); - if (ret) - goto out; - strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); -out: - mutex_unlock(&device_mutex); - return ret; + if (ret) { + up_write(&devices_rwsem); + return ret; + } + + strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX); + ret = rename_compat_devs(ibdev); + + downgrade_write(&devices_rwsem); + down_read(&ibdev->client_data_rwsem); + xan_for_each_marked(&ibdev->client_data, index, client_data, + CLIENT_DATA_REGISTERED) { + struct ib_client *client = xa_load(&clients, index); + + if (!client || !client->rename) + continue; + + client->rename(ibdev, client_data); + } + up_read(&ibdev->client_data_rwsem); + rdma_nl_notify_event(ibdev, 0, RDMA_RENAME_EVENT); + up_read(&devices_rwsem); + return 0; +} + +int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) +{ + if (use_dim > 1) + return -EINVAL; + ibdev->use_cq_dim = use_dim; + + return 0; } static int alloc_name(struct ib_device *ibdev, const char *name) { - unsigned long *inuse; struct ib_device *device; + unsigned long index; + struct ida inuse; + int rc; int i; - inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL); - if (!inuse) - return -ENOMEM; - - list_for_each_entry(device, &device_list, core_list) { + 
lockdep_assert_held_write(&devices_rwsem); + ida_init(&inuse); + xa_for_each (&devices, index, device) { char buf[IB_DEVICE_NAME_MAX]; if (sscanf(dev_name(&device->dev), name, &i) != 1) continue; - if (i < 0 || i >= PAGE_SIZE * 8) + if (i < 0 || i >= INT_MAX) continue; snprintf(buf, sizeof buf, name, i); - if (!strcmp(buf, dev_name(&device->dev))) - set_bit(i, inuse); + if (strcmp(buf, dev_name(&device->dev)) != 0) + continue; + + rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); + if (rc < 0) + goto out; } - i = find_first_zero_bit(inuse, PAGE_SIZE * 8); - free_page((unsigned long) inuse); + rc = ida_alloc(&inuse, GFP_KERNEL); + if (rc < 0) + goto out; - return dev_set_name(&ibdev->dev, name, i); + rc = dev_set_name(&ibdev->dev, name, rc); +out: + ida_destroy(&inuse); + return rc; } static void ib_device_release(struct device *device) { struct ib_device *dev = container_of(device, struct ib_device, dev); - WARN_ON(dev->reg_state == IB_DEV_REGISTERED); - if (dev->reg_state == IB_DEV_UNREGISTERED) { - /* - * In IB_DEV_UNINITIALIZED state, cache or port table - * is not even created. Free cache and port table only when - * device reaches UNREGISTERED state. - */ + free_netdevs(dev); + WARN_ON(refcount_read(&dev->refcount)); + if (dev->hw_stats_data) + ib_device_release_hw_stats(dev->hw_stats_data); + if (dev->port_data) { ib_cache_release_one(dev); - kfree(dev->port_immutable); + ib_security_release_port_pkey_list(dev); + rdma_counter_release(dev); + kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, + pdata[0]), + rcu_head); } - kfree(dev); + + mutex_destroy(&dev->subdev_lock); + mutex_destroy(&dev->unregistration_lock); + mutex_destroy(&dev->compat_devs_mutex); + + xa_destroy(&dev->compat_devs); + xa_destroy(&dev->client_data); + kfree_rcu(dev, rcu_head); } -static int ib_device_uevent(struct device *device, +static int ib_device_uevent(const struct device *device, struct kobj_uevent_env *env) { if (add_uevent_var(env, "NAME=%s", dev_name(device))) @@ -264,15 +536,56 @@ static int ib_device_uevent(struct device *device, return 0; } +static const void *net_namespace(const struct device *d) +{ + const struct ib_core_device *coredev = + container_of(d, struct ib_core_device, dev); + + return read_pnet(&coredev->rdma_net); +} + static struct class ib_class = { .name = "infiniband", .dev_release = ib_device_release, .dev_uevent = ib_device_uevent, + .ns_type = &net_ns_type_operations, + .namespace = net_namespace, }; +static void rdma_init_coredev(struct ib_core_device *coredev, + struct ib_device *dev, struct net *net) +{ + bool is_full_dev = &dev->coredev == coredev; + + /* This BUILD_BUG_ON is intended to catch layout change + * of union of ib_core_device and device. + * dev must be the first element as ib_core and providers + * driver uses it. Adding anything in ib_core_device before + * device will break this assumption. + */ + BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != + offsetof(struct ib_device, dev)); + + coredev->dev.class = &ib_class; + coredev->dev.groups = dev->groups; + + /* + * Don't expose hw counters outside of the init namespace. 
+ */ + if (!is_full_dev && dev->hw_stats_attr_index) + coredev->dev.groups[dev->hw_stats_attr_index] = NULL; + + device_initialize(&coredev->dev); + coredev->owner = dev; + INIT_LIST_HEAD(&coredev->port_list); + write_pnet(&coredev->rdma_net, net); +} + /** - * ib_alloc_device - allocate an IB device struct + * _ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate + * @net: network namespace device should be located in, namespace + * must stay valid until ib_register_device() is completed. * * Low-level drivers should use ib_alloc_device() to allocate &struct * ib_device. @size is the size of the structure to be allocated, @@ -280,9 +593,10 @@ static struct class ib_class = { * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). */ -struct ib_device *ib_alloc_device(size_t size) +struct ib_device *_ib_alloc_device(size_t size, struct net *net) { struct ib_device *device; + unsigned int i; if (WARN_ON(size < sizeof(struct ib_device))) return NULL; @@ -291,24 +605,81 @@ struct ib_device *ib_alloc_device(size_t size) if (!device) return NULL; - rdma_restrack_init(&device->res); - - device->dev.class = &ib_class; - device_initialize(&device->dev); + if (rdma_restrack_init(device)) { + kfree(device); + return NULL; + } - dev_set_drvdata(&device->dev, device); + /* ib_devices_shared_netns can't change while we have active namespaces + * in the system which means either init_net is passed or the user has + * no idea what they are doing. + * + * To avoid breaking backward compatibility, when in shared mode, + * force to init the device in the init_net. + */ + net = ib_devices_shared_netns ? &init_net : net; + rdma_init_coredev(&device->coredev, device, net); INIT_LIST_HEAD(&device->event_handler_list); - spin_lock_init(&device->event_handler_lock); - rwlock_init(&device->client_data_lock); - INIT_LIST_HEAD(&device->client_data_list); - INIT_LIST_HEAD(&device->port_list); - refcount_set(&device->refcount, 1); + spin_lock_init(&device->qp_open_list_lock); + init_rwsem(&device->event_handler_rwsem); + mutex_init(&device->unregistration_lock); + /* + * client_data needs to be alloc because we don't want our mark to be + * destroyed if the user stores NULL in the client data. 
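 * (An allocating xarray keeps a NULL value as a reserved "zero entry",
 * so the index stays occupied and marks such as CLIENT_DATA_REGISTERED
 * survive even while the stored client data pointer is NULL.)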
+ */ + xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); + init_rwsem(&device->client_data_rwsem); + xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); + mutex_init(&device->compat_devs_mutex); init_completion(&device->unreg_completion); + INIT_WORK(&device->unregistration_work, ib_unregister_work); + + spin_lock_init(&device->cq_pools_lock); + for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++) + INIT_LIST_HEAD(&device->cq_pools[i]); + + rwlock_init(&device->cache_lock); + + device->uverbs_cmd_mask = + BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) | + BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) | + BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) | + BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) | + BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) | + BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) | + BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) | + BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) | + BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) | + BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) | + BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) | + BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) | + BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) | + BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) | + BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) | + BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) | + BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) | + BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | + BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) | + BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) | + BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) | + BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) | + BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) | + BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) | + BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) | + BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) | + BIT_ULL(IB_USER_VERBS_CMD_REG_MR) | + BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) | + BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ); + + mutex_init(&device->subdev_lock); + INIT_LIST_HEAD(&device->subdev_list_head); + INIT_LIST_HEAD(&device->subdev_list); return device; } -EXPORT_SYMBOL(ib_alloc_device); +EXPORT_SYMBOL(_ib_alloc_device); /** * ib_dealloc_device - free an IB device struct @@ -318,64 +689,200 @@ EXPORT_SYMBOL(ib_alloc_device); */ void ib_dealloc_device(struct ib_device *device) { - WARN_ON(!list_empty(&device->client_data_list)); - WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && - device->reg_state != IB_DEV_UNINITIALIZED); - rdma_restrack_clean(&device->res); + if (device->ops.dealloc_driver) + device->ops.dealloc_driver(device); + + /* + * ib_unregister_driver() requires all devices to remain in the xarray + * while their ops are callable. The last op we call is dealloc_driver + * above. This is needed to create a fence on op callbacks prior to + * allowing the driver module to unload. + */ + down_write(&devices_rwsem); + if (xa_load(&devices, device->index) == device) + xa_erase(&devices, device->index); + up_write(&devices_rwsem); + + /* Expedite releasing netdev references */ + free_netdevs(device); + + WARN_ON(!xa_empty(&device->compat_devs)); + WARN_ON(!xa_empty(&device->client_data)); + WARN_ON(refcount_read(&device->refcount)); + rdma_restrack_clean(device); + /* Balances with device_initialize */ put_device(&device->dev); } EXPORT_SYMBOL(ib_dealloc_device); -static int add_client_context(struct ib_device *device, struct ib_client *client) +/* + * add_client_context() and remove_client_context() must be safe against + * parallel calls on the same device - registration/unregistration of both the + * device and client can be occurring in parallel. + * + * The routines need to be a fence, any caller must not return until the add + * or remove is fully completed. 
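 *
 * For reference, an ib_client using this machinery typically looks like
 * (illustrative sketch, not part of this change; the "my_*" names are
 * placeholders):
 *
 *	static struct ib_client my_client;
 *
 *	static int my_add_one(struct ib_device *device)
 *	{
 *		struct my_data *md = kzalloc(sizeof(*md), GFP_KERNEL);
 *
 *		if (!md)
 *			return -ENOMEM;
 *		ib_set_client_data(device, &my_client, md);
 *		return 0;
 *	}
 *
 *	static void my_remove_one(struct ib_device *device, void *client_data)
 *	{
 *		kfree(client_data);
 *	}
 *
 *	static struct ib_client my_client = {
 *		.name	= "my_client",
 *		.add	= my_add_one,
 *		.remove	= my_remove_one,
 *	};
 *
 *	...
 *	ret = ib_register_client(&my_client);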
+ */ +static int add_client_context(struct ib_device *device, + struct ib_client *client) { - struct ib_client_data *context; + int ret = 0; - context = kmalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return -ENOMEM; + if (!device->kverbs_provider && !client->no_kverbs_req) + return 0; + + down_write(&device->client_data_rwsem); + /* + * So long as the client is registered hold both the client and device + * unregistration locks. + */ + if (!refcount_inc_not_zero(&client->uses)) + goto out_unlock; + refcount_inc(&device->refcount); - context->client = client; - context->data = NULL; - context->going_down = false; + /* + * Another caller to add_client_context got here first and has already + * completely initialized context. + */ + if (xa_get_mark(&device->client_data, client->client_id, + CLIENT_DATA_REGISTERED)) + goto out; - down_write(&lists_rwsem); - write_lock_irq(&device->client_data_lock); - list_add(&context->list, &device->client_data_list); - write_unlock_irq(&device->client_data_lock); - up_write(&lists_rwsem); + ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, + GFP_KERNEL)); + if (ret) + goto out; + downgrade_write(&device->client_data_rwsem); + if (client->add) { + if (client->add(device)) { + /* + * If a client fails to add then the error code is + * ignored, but we won't call any more ops on this + * client. + */ + xa_erase(&device->client_data, client->client_id); + up_read(&device->client_data_rwsem); + ib_device_put(device); + ib_client_put(client); + return 0; + } + } + /* Readers shall not see a client until add has been completed */ + xa_set_mark(&device->client_data, client->client_id, + CLIENT_DATA_REGISTERED); + up_read(&device->client_data_rwsem); return 0; + +out: + ib_device_put(device); + ib_client_put(client); +out_unlock: + up_write(&device->client_data_rwsem); + return ret; } -static int verify_immutable(const struct ib_device *dev, u8 port) +static void remove_client_context(struct ib_device *device, + unsigned int client_id) { - return WARN_ON(!rdma_cap_ib_mad(dev, port) && - rdma_max_mad_size(dev, port) != 0); + struct ib_client *client; + void *client_data; + + down_write(&device->client_data_rwsem); + if (!xa_get_mark(&device->client_data, client_id, + CLIENT_DATA_REGISTERED)) { + up_write(&device->client_data_rwsem); + return; + } + client_data = xa_load(&device->client_data, client_id); + xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); + client = xa_load(&clients, client_id); + up_write(&device->client_data_rwsem); + + /* + * Notice we cannot be holding any exclusive locks when calling the + * remove callback as the remove callback can recurse back into any + * public functions in this module and thus try for any locks those + * functions take. + * + * For this reason clients and drivers should not call the + * unregistration functions will holdling any locks. 
+ */ + if (client->remove) + client->remove(device, client_data); + + xa_erase(&device->client_data, client_id); + ib_device_put(device); + ib_client_put(client); } -static int read_port_immutable(struct ib_device *device) +static int alloc_port_data(struct ib_device *device) { - int ret; - u8 start_port = rdma_start_port(device); - u8 end_port = rdma_end_port(device); - u8 port; + struct ib_port_data_rcu *pdata_rcu; + u32 port; - /** - * device->port_immutable is indexed directly by the port number to make + if (device->port_data) + return 0; + + /* This can only be called once the physical port range is defined */ + if (WARN_ON(!device->phys_port_cnt)) + return -EINVAL; + + /* Reserve U32_MAX so the logic to go over all the ports is sane */ + if (WARN_ON(device->phys_port_cnt == U32_MAX)) + return -EINVAL; + + /* + * device->port_data is indexed directly by the port number to make * access to this data as efficient as possible. * - * Therefore port_immutable is declared as a 1 based array with - * potential empty slots at the beginning. + * Therefore port_data is declared as a 1 based array with potential + * empty slots at the beginning. */ - device->port_immutable = kcalloc(end_port + 1, - sizeof(*device->port_immutable), - GFP_KERNEL); - if (!device->port_immutable) + pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata, + size_add(rdma_end_port(device), 1)), + GFP_KERNEL); + if (!pdata_rcu) return -ENOMEM; + /* + * The rcu_head is put in front of the port data array and the stored + * pointer is adjusted since we never need to see that member until + * kfree_rcu. + */ + device->port_data = pdata_rcu->pdata; + + rdma_for_each_port (device, port) { + struct ib_port_data *pdata = &device->port_data[port]; + + pdata->ib_dev = device; + spin_lock_init(&pdata->pkey_list_lock); + INIT_LIST_HEAD(&pdata->pkey_list); + spin_lock_init(&pdata->netdev_lock); + INIT_HLIST_NODE(&pdata->ndev_hash_link); + } + return 0; +} - for (port = start_port; port <= end_port; ++port) { - ret = device->ops.get_port_immutable( - device, port, &device->port_immutable[port]); +static int verify_immutable(const struct ib_device *dev, u32 port) +{ + return WARN_ON(!rdma_cap_ib_mad(dev, port) && + rdma_max_mad_size(dev, port) != 0); +} + +static int setup_port_data(struct ib_device *device) +{ + u32 port; + int ret; + + ret = alloc_port_data(device); + if (ret) + return ret; + + rdma_for_each_port (device, port) { + struct ib_port_data *pdata = &device->port_data[port]; + + ret = device->ops.get_port_immutable(device, port, + &pdata->immutable); if (ret) return ret; @@ -385,6 +892,20 @@ static int read_port_immutable(struct ib_device *device) return 0; } +/** + * ib_port_immutable_read() - Read rdma port's immutable data + * @dev: IB device + * @port: port number whose immutable data to read. It starts with index 1 and + * valid upto including rdma_end_port(). 
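 *
 * For example (illustrative sketch, not part of this change):
 *
 *	rdma_for_each_port(dev, port) {
 *		const struct ib_port_immutable *immutable =
 *			ib_port_immutable_read(dev, port);
 *
 *		if (immutable->core_cap_flags & RDMA_CORE_CAP_IB_MAD)
 *			...	handle a MAD-capable port
 *	}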
+ */ +const struct ib_port_immutable* +ib_port_immutable_read(struct ib_device *dev, unsigned int port) +{ + WARN_ON(!rdma_is_port_valid(dev, port)); + return &dev->port_data[port].immutable; +} +EXPORT_SYMBOL(ib_port_immutable_read); + void ib_get_device_fw_str(struct ib_device *dev, char *str) { if (dev->ops.get_dev_fw_str) @@ -394,52 +915,22 @@ void ib_get_device_fw_str(struct ib_device *dev, char *str) } EXPORT_SYMBOL(ib_get_device_fw_str); -static int setup_port_pkey_list(struct ib_device *device) -{ - int i; - - /** - * device->port_pkey_list is indexed directly by the port number, - * Therefore it is declared as a 1 based array with potential empty - * slots at the beginning. - */ - device->port_pkey_list = kcalloc(rdma_end_port(device) + 1, - sizeof(*device->port_pkey_list), - GFP_KERNEL); - - if (!device->port_pkey_list) - return -ENOMEM; - - for (i = 0; i < (rdma_end_port(device) + 1); i++) { - spin_lock_init(&device->port_pkey_list[i].list_lock); - INIT_LIST_HEAD(&device->port_pkey_list[i].pkey_list); - } - - return 0; -} - static void ib_policy_change_task(struct work_struct *work) { struct ib_device *dev; + unsigned long index; - down_read(&lists_rwsem); - list_for_each_entry(dev, &device_list, core_list) { - int i; + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { + unsigned int i; - for (i = rdma_start_port(dev); i <= rdma_end_port(dev); i++) { + rdma_for_each_port (dev, i) { u64 sp; - int ret = ib_get_cached_subnet_prefix(dev, - i, - &sp); - - WARN_ONCE(ret, - "ib_get_cached_subnet_prefix err: %d, this should never happen here\n", - ret); - if (!ret) - ib_security_cache_change(dev, i, sp); + ib_get_cached_subnet_prefix(dev, i, &sp); + ib_security_cache_change(dev, i, sp); } } - up_read(&lists_rwsem); + up_read(&devices_rwsem); } static int ib_security_change(struct notifier_block *nb, unsigned long event, @@ -449,90 +940,346 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event, return NOTIFY_DONE; schedule_work(&ib_policy_change_work); + ib_mad_agent_security_change(); return NOTIFY_OK; } -/** - * __dev_new_index - allocate an device index - * - * Returns a suitable unique value for a new device interface - * number. It assumes that there are less than 2^32-1 ib devices - * will be present in the system. - */ -static u32 __dev_new_index(void) +static void compatdev_release(struct device *dev) +{ + struct ib_core_device *cdev = + container_of(dev, struct ib_core_device, dev); + + kfree(cdev); +} + +static int add_one_compat_dev(struct ib_device *device, + struct rdma_dev_net *rnet) { + struct ib_core_device *cdev; + int ret; + + lockdep_assert_held(&rdma_nets_rwsem); + if (!ib_devices_shared_netns) + return 0; + /* - * The device index to allow stable naming. - * Similar to struct net -> ifindex. + * Create and add compat device in all namespaces other than where it + * is currently bound to. */ - static u32 index; + if (net_eq(read_pnet(&rnet->net), + read_pnet(&device->coredev.rdma_net))) + return 0; - for (;;) { - if (!(++index)) - index = 1; + /* + * The first of init_net() or ib_register_device() to take the + * compat_devs_mutex wins and gets to add the device. Others will wait + * for completion here. 
+ */ + mutex_lock(&device->compat_devs_mutex); + cdev = xa_load(&device->compat_devs, rnet->id); + if (cdev) { + ret = 0; + goto done; + } + ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); + if (ret) + goto done; - if (!__ib_device_get_by_index(index)) - return index; + cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); + if (!cdev) { + ret = -ENOMEM; + goto cdev_err; } + + cdev->dev.parent = device->dev.parent; + rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); + cdev->dev.release = compatdev_release; + ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); + if (ret) + goto add_err; + + ret = device_add(&cdev->dev); + if (ret) + goto add_err; + ret = ib_setup_port_attrs(cdev); + if (ret) + goto port_err; + + ret = xa_err(xa_store(&device->compat_devs, rnet->id, + cdev, GFP_KERNEL)); + if (ret) + goto insert_err; + + mutex_unlock(&device->compat_devs_mutex); + return 0; + +insert_err: + ib_free_port_attrs(cdev); +port_err: + device_del(&cdev->dev); +add_err: + put_device(&cdev->dev); +cdev_err: + xa_release(&device->compat_devs, rnet->id); +done: + mutex_unlock(&device->compat_devs_mutex); + return ret; } -static void setup_dma_device(struct ib_device *device) +static void remove_one_compat_dev(struct ib_device *device, u32 id) { - struct device *parent = device->dev.parent; + struct ib_core_device *cdev; + + mutex_lock(&device->compat_devs_mutex); + cdev = xa_erase(&device->compat_devs, id); + mutex_unlock(&device->compat_devs_mutex); + if (cdev) { + ib_free_port_attrs(cdev); + device_del(&cdev->dev); + put_device(&cdev->dev); + } +} - WARN_ON_ONCE(device->dma_device); - if (device->dev.dma_ops) { - /* - * The caller provided custom DMA operations. Copy the - * DMA-related fields that are used by e.g. dma_alloc_coherent() - * into device->dev. +static void remove_compat_devs(struct ib_device *device) +{ + struct ib_core_device *cdev; + unsigned long index; + + xa_for_each (&device->compat_devs, index, cdev) + remove_one_compat_dev(device, index); +} + +static int add_compat_devs(struct ib_device *device) +{ + struct rdma_dev_net *rnet; + unsigned long index; + int ret = 0; + + lockdep_assert_held(&devices_rwsem); + + down_read(&rdma_nets_rwsem); + xa_for_each (&rdma_nets, index, rnet) { + ret = add_one_compat_dev(device, rnet); + if (ret) + break; + } + up_read(&rdma_nets_rwsem); + return ret; +} + +static void remove_all_compat_devs(void) +{ + struct ib_compat_device *cdev; + struct ib_device *dev; + unsigned long index; + + down_read(&devices_rwsem); + xa_for_each (&devices, index, dev) { + unsigned long c_index = 0; + + /* Hold nets_rwsem so that any other thread modifying this + * system param can sync with this thread. */ - device->dma_device = &device->dev; - if (!device->dev.dma_mask) { - if (parent) - device->dev.dma_mask = parent->dma_mask; - else - WARN_ON_ONCE(true); - } - if (!device->dev.coherent_dma_mask) { - if (parent) - device->dev.coherent_dma_mask = - parent->coherent_dma_mask; - else - WARN_ON_ONCE(true); + down_read(&rdma_nets_rwsem); + xa_for_each (&dev->compat_devs, c_index, cdev) + remove_one_compat_dev(dev, c_index); + up_read(&rdma_nets_rwsem); + } + up_read(&devices_rwsem); +} + +static int add_all_compat_devs(void) +{ + struct rdma_dev_net *rnet; + struct ib_device *dev; + unsigned long index; + int ret = 0; + + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { + unsigned long net_index = 0; + + /* Hold nets_rwsem so that any other thread modifying this + * system param can sync with this thread. 
+ */ + down_read(&rdma_nets_rwsem); + xa_for_each (&rdma_nets, net_index, rnet) { + ret = add_one_compat_dev(dev, rnet); + if (ret) + break; } - } else { + up_read(&rdma_nets_rwsem); + } + up_read(&devices_rwsem); + if (ret) + remove_all_compat_devs(); + return ret; +} + +int rdma_compatdev_set(u8 enable) +{ + struct rdma_dev_net *rnet; + unsigned long index; + int ret = 0; + + down_write(&rdma_nets_rwsem); + if (ib_devices_shared_netns == enable) { + up_write(&rdma_nets_rwsem); + return 0; + } + + /* enable/disable of compat devices is not supported + * when more than default init_net exists. + */ + xa_for_each (&rdma_nets, index, rnet) { + ret++; + break; + } + if (!ret) + ib_devices_shared_netns = enable; + up_write(&rdma_nets_rwsem); + if (ret) + return -EBUSY; + + if (enable) + ret = add_all_compat_devs(); + else + remove_all_compat_devs(); + return ret; +} + +static void rdma_dev_exit_net(struct net *net) +{ + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); + struct ib_device *dev; + unsigned long index; + int ret; + + down_write(&rdma_nets_rwsem); + /* + * Prevent the ID from being re-used and hide the id from xa_for_each. + */ + ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); + WARN_ON(ret); + up_write(&rdma_nets_rwsem); + + down_read(&devices_rwsem); + xa_for_each (&devices, index, dev) { + get_device(&dev->dev); + /* + * Release the devices_rwsem so that pontentially blocking + * device_del, doesn't hold the devices_rwsem for too long. + */ + up_read(&devices_rwsem); + + remove_one_compat_dev(dev, rnet->id); + /* - * The caller did not provide custom DMA operations. Use the - * DMA mapping operations of the parent device. + * If the real device is in the NS then move it back to init. */ - WARN_ON_ONCE(!parent); - device->dma_device = parent; + rdma_dev_change_netns(dev, net, &init_net); + + put_device(&dev->dev); + down_read(&devices_rwsem); } + up_read(&devices_rwsem); + + rdma_nl_net_exit(rnet); + xa_erase(&rdma_nets, rnet->id); } -static void cleanup_device(struct ib_device *device) +static __net_init int rdma_dev_init_net(struct net *net) { - ib_cache_cleanup_one(device); - ib_cache_release_one(device); - kfree(device->port_pkey_list); - kfree(device->port_immutable); + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); + unsigned long index; + struct ib_device *dev; + int ret; + + write_pnet(&rnet->net, net); + + ret = rdma_nl_net_init(rnet); + if (ret) + return ret; + + /* No need to create any compat devices in default init_net. */ + if (net_eq(net, &init_net)) + return 0; + + ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); + if (ret) { + rdma_nl_net_exit(rnet); + return ret; + } + + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { + /* Hold nets_rwsem so that netlink command cannot change + * system configuration for device sharing mode. + */ + down_read(&rdma_nets_rwsem); + ret = add_one_compat_dev(dev, rnet); + up_read(&rdma_nets_rwsem); + if (ret) + break; + } + up_read(&devices_rwsem); + + if (ret) + rdma_dev_exit_net(net); + + return ret; +} + +/* + * Assign the unique string device name and the unique device index. This is + * undone by ib_dealloc_device. 
+ */ +static int assign_name(struct ib_device *device, const char *name) +{ + static u32 last_id; + int ret; + + down_write(&devices_rwsem); + /* Assign a unique name to the device */ + if (strchr(name, '%')) + ret = alloc_name(device, name); + else + ret = dev_set_name(&device->dev, name); + if (ret) + goto out; + + if (__ib_device_get_by_name(dev_name(&device->dev))) { + ret = -ENFILE; + goto out; + } + strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); + + ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, + &last_id, GFP_KERNEL); + if (ret > 0) + ret = 0; + +out: + up_write(&devices_rwsem); + return ret; } +/* + * setup_device() allocates memory and sets up data that requires calling the + * device ops, this is the only reason these actions are not done during + * ib_alloc_device. It is undone by ib_dealloc_device(). + */ static int setup_device(struct ib_device *device) { struct ib_udata uhw = {.outlen = 0, .inlen = 0}; int ret; - ret = ib_device_check_mandatory(device); - if (ret) - return ret; + ib_device_check_mandatory(device); - ret = read_port_immutable(device); + ret = setup_port_data(device); if (ret) { - dev_warn(&device->dev, - "Couldn't create per port immutable data\n"); + dev_warn(&device->dev, "Couldn't create per-port data\n"); return ret; } @@ -541,164 +1288,554 @@ static int setup_device(struct ib_device *device) if (ret) { dev_warn(&device->dev, "Couldn't query the device attributes\n"); - goto port_cleanup; + return ret; } - ret = setup_port_pkey_list(device); - if (ret) { - dev_warn(&device->dev, "Couldn't create per port_pkey_list\n"); - goto port_cleanup; + return 0; +} + +static void disable_device(struct ib_device *device) +{ + u32 cid; + + WARN_ON(!refcount_read(&device->refcount)); + + down_write(&devices_rwsem); + xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); + up_write(&devices_rwsem); + + /* + * Remove clients in LIFO order, see assign_client_id. This could be + * more efficient if xarray learns to reverse iterate. Since no new + * clients can be added to this ib_device past this point we only need + * the maximum possible client_id value here. + */ + down_read(&clients_rwsem); + cid = highest_client_id; + up_read(&clients_rwsem); + while (cid) { + cid--; + remove_client_context(device, cid); } - ret = ib_cache_setup_one(device); - if (ret) { - dev_warn(&device->dev, - "Couldn't set up InfiniBand P_Key/GID cache\n"); - goto pkey_cleanup; + ib_cq_pool_cleanup(device); + + /* Pairs with refcount_set in enable_device */ + ib_device_put(device); + wait_for_completion(&device->unreg_completion); + + /* + * compat devices must be removed after device refcount drops to zero. + * Otherwise init_net() may add more compatdevs after removing compat + * devices and before device is disabled. + */ + remove_compat_devs(device); +} + +/* + * An enabled device is visible to all clients and to all the public facing + * APIs that return a device pointer. This always returns with a new get, even + * if it fails. + */ +static int enable_device_and_get(struct ib_device *device) +{ + struct ib_client *client; + unsigned long index; + int ret = 0; + + /* + * One ref belongs to the xa and the other belongs to this + * thread. This is needed to guard against parallel unregistration. 
+ */ + refcount_set(&device->refcount, 2); + down_write(&devices_rwsem); + xa_set_mark(&devices, device->index, DEVICE_REGISTERED); + + /* + * By using downgrade_write() we ensure that no other thread can clear + * DEVICE_REGISTERED while we are completing the client setup. + */ + downgrade_write(&devices_rwsem); + + if (device->ops.enable_driver) { + ret = device->ops.enable_driver(device); + if (ret) + goto out; } - return 0; -pkey_cleanup: - kfree(device->port_pkey_list); -port_cleanup: - kfree(device->port_immutable); + down_read(&clients_rwsem); + xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { + ret = add_client_context(device, client); + if (ret) + break; + } + up_read(&clients_rwsem); + if (!ret) + ret = add_compat_devs(device); +out: + up_read(&devices_rwsem); return ret; } +static void prevent_dealloc_device(struct ib_device *ib_dev) +{ +} + +static void ib_device_notify_register(struct ib_device *device) +{ + struct net_device *netdev; + u32 port; + int ret; + + down_read(&devices_rwsem); + + /* Mark for userspace that device is ready */ + kobject_uevent(&device->dev.kobj, KOBJ_ADD); + + ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); + if (ret) + goto out; + + rdma_for_each_port(device, port) { + netdev = ib_device_get_netdev(device, port); + if (!netdev) + continue; + + ret = rdma_nl_notify_event(device, port, + RDMA_NETDEV_ATTACH_EVENT); + dev_put(netdev); + if (ret) + goto out; + } + +out: + up_read(&devices_rwsem); +} + /** * ib_register_device - Register an IB device with IB core - * @device:Device to register + * @device: Device to register + * @name: unique string device name. This may include a '%' which will + * cause a unique index to be added to the passed device name. + * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB + * device will be used. In this case the caller should fully + * setup the ibdev for DMA. This usually means using dma_virt_ops. * * Low-level drivers use ib_register_device() to register their * devices with the IB core. All registered clients will receive a * callback for each device that is added. @device must be allocated * with ib_alloc_device(). + * + * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() + * asynchronously then the device pointer may become freed as soon as this + * function returns. */ int ib_register_device(struct ib_device *device, const char *name, - int (*port_callback)(struct ib_device *, u8, - struct kobject *)) + struct device *dma_device) { int ret; - struct ib_client *client; - setup_dma_device(device); + ret = assign_name(device, name); + if (ret) + return ret; - mutex_lock(&device_mutex); + /* + * If the caller does not provide a DMA capable device then the IB core + * will set up ib_sge and scatterlist structures that stash the kernel + * virtual address into the address field. 
+ */ + WARN_ON(dma_device && !dma_device->dma_parms); + device->dma_device = dma_device; - if (strchr(name, '%')) { - ret = alloc_name(device, name); - if (ret) - goto out; - } else { - ret = dev_set_name(&device->dev, name); - if (ret) - goto out; - } - if (__ib_device_get_by_name(dev_name(&device->dev))) { - ret = -ENFILE; - goto out; + ret = setup_device(device); + if (ret) + return ret; + + ret = ib_cache_setup_one(device); + if (ret) { + dev_warn(&device->dev, + "Couldn't set up InfiniBand P_Key/GID cache\n"); + return ret; } - strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); - ret = setup_device(device); + device->groups[0] = &ib_dev_attr_group; + device->groups[1] = device->ops.device_group; + ret = ib_setup_device_attrs(device); if (ret) - goto out; + goto cache_cleanup; + + ib_device_register_rdmacg(device); - device->index = __dev_new_index(); + rdma_counter_init(device); + + /* + * Ensure that ADD uevent is not fired because it + * is too early amd device is not initialized yet. + */ + dev_set_uevent_suppress(&device->dev, true); + ret = device_add(&device->dev); + if (ret) + goto cg_cleanup; - ret = ib_device_register_rdmacg(device); + ret = ib_setup_port_attrs(&device->coredev); if (ret) { dev_warn(&device->dev, - "Couldn't register device with rdma cgroup\n"); + "Couldn't register device with driver model\n"); goto dev_cleanup; } - ret = ib_device_register_sysfs(device, port_callback); + ret = enable_device_and_get(device); if (ret) { - dev_warn(&device->dev, - "Couldn't register device with driver model\n"); - goto cg_cleanup; + void (*dealloc_fn)(struct ib_device *); + + /* + * If we hit this error flow then we don't want to + * automatically dealloc the device since the caller is + * expected to call ib_dealloc_device() after + * ib_register_device() fails. This is tricky due to the + * possibility for a parallel unregistration along with this + * error flow. Since we have a refcount here we know any + * parallel flow is stopped in disable_device and will see the + * special dealloc_driver pointer, causing the responsibility to + * ib_dealloc_device() to revert back to this thread. + */ + dealloc_fn = device->ops.dealloc_driver; + device->ops.dealloc_driver = prevent_dealloc_device; + ib_device_put(device); + __ib_unregister_device(device); + device->ops.dealloc_driver = dealloc_fn; + dev_set_uevent_suppress(&device->dev, false); + return ret; } + dev_set_uevent_suppress(&device->dev, false); - device->reg_state = IB_DEV_REGISTERED; + ib_device_notify_register(device); - list_for_each_entry(client, &client_list, list) - if (!add_client_context(device, client) && client->add) - client->add(device); + ib_device_put(device); - down_write(&lists_rwsem); - list_add_tail(&device->core_list, &device_list); - up_write(&lists_rwsem); - mutex_unlock(&device_mutex); return 0; +dev_cleanup: + device_del(&device->dev); cg_cleanup: + dev_set_uevent_suppress(&device->dev, false); ib_device_unregister_rdmacg(device); -dev_cleanup: - cleanup_device(device); -out: - mutex_unlock(&device_mutex); +cache_cleanup: + ib_cache_cleanup_one(device); return ret; } EXPORT_SYMBOL(ib_register_device); +/* Callers must hold a get on the device. 
*/ +static void __ib_unregister_device(struct ib_device *ib_dev) +{ + struct ib_device *sub, *tmp; + + mutex_lock(&ib_dev->subdev_lock); + list_for_each_entry_safe_reverse(sub, tmp, + &ib_dev->subdev_list_head, + subdev_list) { + list_del(&sub->subdev_list); + ib_dev->ops.del_sub_dev(sub); + ib_device_put(ib_dev); + } + mutex_unlock(&ib_dev->subdev_lock); + + /* + * We have a registration lock so that all the calls to unregister are + * fully fenced, once any unregister returns the device is truly + * unregistered even if multiple callers are unregistering it at the + * same time. This also interacts with the registration flow and + * provides sane semantics if register and unregister are racing. + */ + mutex_lock(&ib_dev->unregistration_lock); + if (!refcount_read(&ib_dev->refcount)) + goto out; + + disable_device(ib_dev); + rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT); + + /* Expedite removing unregistered pointers from the hash table */ + free_netdevs(ib_dev); + + ib_free_port_attrs(&ib_dev->coredev); + device_del(&ib_dev->dev); + ib_device_unregister_rdmacg(ib_dev); + ib_cache_cleanup_one(ib_dev); + + /* + * Drivers using the new flow may not call ib_dealloc_device except + * in error unwind prior to registration success. + */ + if (ib_dev->ops.dealloc_driver && + ib_dev->ops.dealloc_driver != prevent_dealloc_device) { + WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); + ib_dealloc_device(ib_dev); + } +out: + mutex_unlock(&ib_dev->unregistration_lock); +} + /** * ib_unregister_device - Unregister an IB device - * @device:Device to unregister + * @ib_dev: The device to unregister * * Unregister an IB device. All clients will receive a remove callback. + * + * Callers should call this routine only once, and protect against races with + * registration. Typically it should only be called as part of a remove + * callback in an implementation of driver core's struct device_driver and + * related. + * + * If ops.dealloc_driver is used then ib_dev will be freed upon return from + * this function. */ -void ib_unregister_device(struct ib_device *device) +void ib_unregister_device(struct ib_device *ib_dev) { - struct ib_client_data *context, *tmp; - unsigned long flags; + get_device(&ib_dev->dev); + __ib_unregister_device(ib_dev); + put_device(&ib_dev->dev); +} +EXPORT_SYMBOL(ib_unregister_device); + +/** + * ib_unregister_device_and_put - Unregister a device while holding a 'get' + * @ib_dev: The device to unregister + * + * This is the same as ib_unregister_device(), except it includes an internal + * ib_device_put() that should match a 'get' obtained by the caller. + * + * It is safe to call this routine concurrently from multiple threads while + * holding the 'get'. When the function returns the device is fully + * unregistered. + * + * Drivers using this flow MUST use the driver_unregister callback to clean up + * their resources associated with the device and dealloc it. + */ +void ib_unregister_device_and_put(struct ib_device *ib_dev) +{ + WARN_ON(!ib_dev->ops.dealloc_driver); + get_device(&ib_dev->dev); + ib_device_put(ib_dev); + __ib_unregister_device(ib_dev); + put_device(&ib_dev->dev); +} +EXPORT_SYMBOL(ib_unregister_device_and_put); + +/** + * ib_unregister_driver - Unregister all IB devices for a driver + * @driver_id: The driver to unregister + * + * This implements a fence for device unregistration. It only returns once all + * devices associated with the driver_id have fully completed their + * unregistration and returned from ib_unregister_device*(). 
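/*
 * Illustrative sketch (not part of the patch): how a low-level driver might
 * pair ib_register_device() with ib_unregister_device(). The "mydrv"/"mydev"
 * names are hypothetical placeholders.
 */
#include <rdma/ib_verbs.h>

struct mydrv_dev {
	struct ib_device ibdev;	/* ib_alloc_device() requires this at offset 0 */
	/* driver-private state would follow */
};

static int mydrv_probe(struct device *dma_capable_parent)
{
	struct mydrv_dev *mdev;
	int ret;

	mdev = ib_alloc_device(mydrv_dev, ibdev);
	if (!mdev)
		return -ENOMEM;

	/* ops/port setup elided; see ib_set_device_ops() later in this file */
	ret = ib_register_device(&mdev->ibdev, "mydrv%d", dma_capable_parent);
	if (ret)
		ib_dealloc_device(&mdev->ibdev); /* caller still owns it on failure */
	return ret;
}

static void mydrv_remove(struct mydrv_dev *mdev)
{
	/* every registered client has seen its remove() callback on return */
	ib_unregister_device(&mdev->ibdev);
}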
+ * + * If device's are not yet unregistered it goes ahead and starts unregistering + * them. + * + * This does not block creation of new devices with the given driver_id, that + * is the responsibility of the caller. + */ +void ib_unregister_driver(enum rdma_driver_id driver_id) +{ + struct ib_device *ib_dev; + unsigned long index; + + down_read(&devices_rwsem); + xa_for_each (&devices, index, ib_dev) { + if (ib_dev->ops.driver_id != driver_id) + continue; + + get_device(&ib_dev->dev); + up_read(&devices_rwsem); + + WARN_ON(!ib_dev->ops.dealloc_driver); + __ib_unregister_device(ib_dev); + + put_device(&ib_dev->dev); + down_read(&devices_rwsem); + } + up_read(&devices_rwsem); +} +EXPORT_SYMBOL(ib_unregister_driver); + +static void ib_unregister_work(struct work_struct *work) +{ + struct ib_device *ib_dev = + container_of(work, struct ib_device, unregistration_work); + + __ib_unregister_device(ib_dev); + put_device(&ib_dev->dev); +} + +/** + * ib_unregister_device_queued - Unregister a device using a work queue + * @ib_dev: The device to unregister + * + * This schedules an asynchronous unregistration using a WQ for the device. A + * driver should use this to avoid holding locks while doing unregistration, + * such as holding the RTNL lock. + * + * Drivers using this API must use ib_unregister_driver before module unload + * to ensure that all scheduled unregistrations have completed. + */ +void ib_unregister_device_queued(struct ib_device *ib_dev) +{ + WARN_ON(!refcount_read(&ib_dev->refcount)); + WARN_ON(!ib_dev->ops.dealloc_driver); + get_device(&ib_dev->dev); + if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work)) + put_device(&ib_dev->dev); +} +EXPORT_SYMBOL(ib_unregister_device_queued); + +/* + * The caller must pass in a device that has the kref held and the refcount + * released. If the device is in cur_net and still registered then it is moved + * into net. + */ +static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, + struct net *net) +{ + int ret2 = -EINVAL; + int ret; + + mutex_lock(&device->unregistration_lock); /* - * Wait for all netlink command callers to finish working on the - * device. + * If a device not under ib_device_get() or if the unregistration_lock + * is not held, the namespace can be changed, or it can be unregistered. + * Check again under the lock. */ - ib_device_put(device); - wait_for_completion(&device->unreg_completion); + if (refcount_read(&device->refcount) == 0 || + !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { + ret = -ENODEV; + goto out; + } - mutex_lock(&device_mutex); + kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); + disable_device(device); - down_write(&lists_rwsem); - list_del(&device->core_list); - write_lock_irq(&device->client_data_lock); - list_for_each_entry(context, &device->client_data_list, list) - context->going_down = true; - write_unlock_irq(&device->client_data_lock); - downgrade_write(&lists_rwsem); + /* + * At this point no one can be using the device, so it is safe to + * change the namespace. + */ + write_pnet(&device->coredev.rdma_net, net); - list_for_each_entry(context, &device->client_data_list, list) { - if (context->client->remove) - context->client->remove(device, context->data); + down_read(&devices_rwsem); + /* + * Currently rdma devices are system wide unique. So the device name + * is guaranteed free in the new namespace. Publish the new namespace + * at the sysfs level. 
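/*
 * Illustrative sketch (not part of the patch): the queued unregistration flow
 * described above, for a driver that supplies ops.dealloc_driver. The "mydrv"
 * names are hypothetical and RDMA_DRIVER_RXE is used only as a stand-in
 * driver_id.
 */
#include <linux/module.h>
#include <rdma/ib_verbs.h>

static void mydrv_netdev_gone(struct ib_device *ibdev)
{
	/* called under locks (e.g. RTNL), so defer to the core's unreg workqueue */
	ib_unregister_device_queued(ibdev);
}

static void __exit mydrv_exit(void)
{
	/* fences all unregistrations queued above before the module text goes away */
	ib_unregister_driver(RDMA_DRIVER_RXE);
}
module_exit(mydrv_exit);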
+ */ + ret = device_rename(&device->dev, dev_name(&device->dev)); + up_read(&devices_rwsem); + if (ret) { + dev_warn(&device->dev, + "%s: Couldn't rename device after namespace change\n", + __func__); + /* Try and put things back and re-enable the device */ + write_pnet(&device->coredev.rdma_net, cur_net); } - up_read(&lists_rwsem); - ib_device_unregister_sysfs(device); - ib_device_unregister_rdmacg(device); + ret2 = enable_device_and_get(device); + if (ret2) { + /* + * This shouldn't really happen, but if it does, let the user + * retry at later point. So don't disable the device. + */ + dev_warn(&device->dev, + "%s: Couldn't re-enable device after namespace change\n", + __func__); + } + kobject_uevent(&device->dev.kobj, KOBJ_ADD); - mutex_unlock(&device_mutex); + ib_device_put(device); +out: + mutex_unlock(&device->unregistration_lock); + if (ret) + return ret; + return ret2; +} - ib_cache_cleanup_one(device); +int ib_device_set_netns_put(struct sk_buff *skb, + struct ib_device *dev, u32 ns_fd) +{ + struct net *net; + int ret; - ib_security_destroy_port_pkey_list(device); - kfree(device->port_pkey_list); + net = get_net_ns_by_fd(ns_fd); + if (IS_ERR(net)) { + ret = PTR_ERR(net); + goto net_err; + } + + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + goto ns_err; + } - down_write(&lists_rwsem); - write_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry_safe(context, tmp, &device->client_data_list, - list) { - list_del(&context->list); - kfree(context); + /* + * All the ib_clients, including uverbs, are reset when the namespace is + * changed and this cannot be blocked waiting for userspace to do + * something, so disassociation is mandatory. + */ + if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) { + ret = -EOPNOTSUPP; + goto ns_err; } - write_unlock_irqrestore(&device->client_data_lock, flags); - up_write(&lists_rwsem); - device->reg_state = IB_DEV_UNREGISTERED; + get_device(&dev->dev); + ib_device_put(dev); + ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); + put_device(&dev->dev); + + put_net(net); + return ret; + +ns_err: + put_net(net); +net_err: + ib_device_put(dev); + return ret; +} + +static struct pernet_operations rdma_dev_net_ops = { + .init = rdma_dev_init_net, + .exit = rdma_dev_exit_net, + .id = &rdma_dev_net_id, + .size = sizeof(struct rdma_dev_net), +}; + +static int assign_client_id(struct ib_client *client) +{ + int ret; + + lockdep_assert_held(&clients_rwsem); + /* + * The add/remove callbacks must be called in FIFO/LIFO order. To + * achieve this we assign client_ids so they are sorted in + * registration order. 
+ */ + client->client_id = highest_client_id; + ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); + if (ret) + return ret; + + highest_client_id++; + xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); + return 0; +} + +static void remove_client_id(struct ib_client *client) +{ + down_write(&clients_rwsem); + xa_erase(&clients, client->client_id); + for (; highest_client_id; highest_client_id--) + if (xa_load(&clients, highest_client_id - 1)) + break; + up_write(&clients_rwsem); } -EXPORT_SYMBOL(ib_unregister_device); /** * ib_register_client - Register an IB client @@ -716,20 +1853,36 @@ EXPORT_SYMBOL(ib_unregister_device); int ib_register_client(struct ib_client *client) { struct ib_device *device; + unsigned long index; + bool need_unreg = false; + int ret; - mutex_lock(&device_mutex); - - list_for_each_entry(device, &device_list, core_list) - if (!add_client_context(device, client) && client->add) - client->add(device); - - down_write(&lists_rwsem); - list_add_tail(&client->list, &client_list); - up_write(&lists_rwsem); + refcount_set(&client->uses, 1); + init_completion(&client->uses_zero); - mutex_unlock(&device_mutex); + /* + * The devices_rwsem is held in write mode to ensure that a racing + * ib_register_device() sees a consisent view of clients and devices. + */ + down_write(&devices_rwsem); + down_write(&clients_rwsem); + ret = assign_client_id(client); + if (ret) + goto out; - return 0; + need_unreg = true; + xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { + ret = add_client_context(device, client); + if (ret) + goto out; + } + ret = 0; +out: + up_write(&clients_rwsem); + up_write(&devices_rwsem); + if (need_unreg && ret) + ib_unregister_client(client); + return ret; } EXPORT_SYMBOL(ib_register_client); @@ -740,80 +1893,140 @@ EXPORT_SYMBOL(ib_register_client); * Upper level users use ib_unregister_client() to remove their client * registration. When ib_unregister_client() is called, the client * will receive a remove callback for each IB device still registered. + * + * This is a full fence, once it returns no client callbacks will be called, + * or are running in another thread. */ void ib_unregister_client(struct ib_client *client) { - struct ib_client_data *context; struct ib_device *device; + unsigned long index; - mutex_lock(&device_mutex); + down_write(&clients_rwsem); + ib_client_put(client); + xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); + up_write(&clients_rwsem); - down_write(&lists_rwsem); - list_del(&client->list); - up_write(&lists_rwsem); + /* We do not want to have locks while calling client->remove() */ + rcu_read_lock(); + xa_for_each (&devices, index, device) { + if (!ib_device_try_get(device)) + continue; + rcu_read_unlock(); - list_for_each_entry(device, &device_list, core_list) { - struct ib_client_data *found_context = NULL; + remove_client_context(device, client->client_id); - down_write(&lists_rwsem); - write_lock_irq(&device->client_data_lock); - list_for_each_entry(context, &device->client_data_list, list) - if (context->client == client) { - context->going_down = true; - found_context = context; - break; - } - write_unlock_irq(&device->client_data_lock); - up_write(&lists_rwsem); + ib_device_put(device); + rcu_read_lock(); + } + rcu_read_unlock(); + + /* + * remove_client_context() is not a fence, it can return even though a + * removal is ongoing. Wait until all removals are completed. 
+ */ + wait_for_completion(&client->uses_zero); + remove_client_id(client); +} +EXPORT_SYMBOL(ib_unregister_client); + +static int __ib_get_global_client_nl_info(const char *client_name, + struct ib_client_nl_info *res) +{ + struct ib_client *client; + unsigned long index; + int ret = -ENOENT; + + down_read(&clients_rwsem); + xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { + if (strcmp(client->name, client_name) != 0) + continue; + if (!client->get_global_nl_info) { + ret = -EOPNOTSUPP; + break; + } + ret = client->get_global_nl_info(res); + if (WARN_ON(ret == -ENOENT)) + ret = -EINVAL; + if (!ret && res->cdev) + get_device(res->cdev); + break; + } + up_read(&clients_rwsem); + return ret; +} - if (client->remove) - client->remove(device, found_context ? - found_context->data : NULL); +static int __ib_get_client_nl_info(struct ib_device *ibdev, + const char *client_name, + struct ib_client_nl_info *res) +{ + unsigned long index; + void *client_data; + int ret = -ENOENT; - if (!found_context) { - dev_warn(&device->dev, - "No client context found for %s\n", - client->name); + down_read(&ibdev->client_data_rwsem); + xan_for_each_marked (&ibdev->client_data, index, client_data, + CLIENT_DATA_REGISTERED) { + struct ib_client *client = xa_load(&clients, index); + + if (!client || strcmp(client->name, client_name) != 0) continue; + if (!client->get_nl_info) { + ret = -EOPNOTSUPP; + break; } + ret = client->get_nl_info(ibdev, client_data, res); + if (WARN_ON(ret == -ENOENT)) + ret = -EINVAL; - down_write(&lists_rwsem); - write_lock_irq(&device->client_data_lock); - list_del(&found_context->list); - write_unlock_irq(&device->client_data_lock); - up_write(&lists_rwsem); - kfree(found_context); + /* + * The cdev is guaranteed valid as long as we are inside the + * client_data_rwsem as remove_one can't be called. Keep it + * valid for the caller. + */ + if (!ret && res->cdev) + get_device(res->cdev); + break; } + up_read(&ibdev->client_data_rwsem); - mutex_unlock(&device_mutex); + return ret; } -EXPORT_SYMBOL(ib_unregister_client); /** - * ib_get_client_data - Get IB client context - * @device:Device to get context for - * @client:Client to get context for - * - * ib_get_client_data() returns client context set with - * ib_set_client_data(). 
+ * ib_get_client_nl_info - Fetch the nl_info from a client + * @ibdev: IB device + * @client_name: Name of the client + * @res: Result of the query */ -void *ib_get_client_data(struct ib_device *device, struct ib_client *client) +int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, + struct ib_client_nl_info *res) { - struct ib_client_data *context; - void *ret = NULL; - unsigned long flags; + int ret; - read_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry(context, &device->client_data_list, list) - if (context->client == client) { - ret = context->data; - break; - } - read_unlock_irqrestore(&device->client_data_lock, flags); + if (ibdev) + ret = __ib_get_client_nl_info(ibdev, client_name, res); + else + ret = __ib_get_global_client_nl_info(client_name, res); +#ifdef CONFIG_MODULES + if (ret == -ENOENT) { + request_module("rdma-client-%s", client_name); + if (ibdev) + ret = __ib_get_client_nl_info(ibdev, client_name, res); + else + ret = __ib_get_global_client_nl_info(client_name, res); + } +#endif + if (ret) { + if (ret == -ENOENT) + return -EOPNOTSUPP; + return ret; + } - return ret; + if (WARN_ON(!res->cdev)) + return -EINVAL; + return 0; } -EXPORT_SYMBOL(ib_get_client_data); /** * ib_set_client_data - Set IB client context @@ -821,27 +2034,22 @@ EXPORT_SYMBOL(ib_get_client_data); * @client:Client to set context for * @data:Context to set * - * ib_set_client_data() sets client context that can be retrieved with - * ib_get_client_data(). + * ib_set_client_data() sets client context data that can be retrieved with + * ib_get_client_data(). This can only be called while the client is + * registered to the device, once the ib_client remove() callback returns this + * cannot be called. */ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data) { - struct ib_client_data *context; - unsigned long flags; - - write_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry(context, &device->client_data_list, list) - if (context->client == client) { - context->data = data; - goto out; - } + void *rc; - dev_warn(&device->dev, "No client context found for %s\n", - client->name); + if (WARN_ON(IS_ERR(data))) + data = NULL; -out: - write_unlock_irqrestore(&device->client_data_lock, flags); + rc = xa_store(&device->client_data, client->client_id, data, + GFP_KERNEL); + WARN_ON(xa_is_err(rc)); } EXPORT_SYMBOL(ib_set_client_data); @@ -851,17 +2059,15 @@ EXPORT_SYMBOL(ib_set_client_data); * * ib_register_event_handler() registers an event handler that will be * called back when asynchronous IB events occur (as defined in - * chapter 11 of the InfiniBand Architecture Specification). This - * callback may occur in interrupt context. + * chapter 11 of the InfiniBand Architecture Specification). This + * callback occurs in workqueue context. 
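/*
 * Illustrative sketch (not part of the patch): a minimal ib_client that stores
 * per-device state with ib_set_client_data() (readable later through
 * ib_get_client_data()) and registers an asynchronous event handler, which,
 * per the note above, now runs from workqueue context. All "myclient" names
 * are hypothetical; add() returning int follows the post-patch client API.
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <rdma/ib_verbs.h>

static struct ib_client myclient;

struct myclient_state {
	struct ib_event_handler eh;
};

static void myclient_event(struct ib_event_handler *handler,
			   struct ib_event *event)
{
	if (event->event == IB_EVENT_PORT_ACTIVE ||
	    event->event == IB_EVENT_PORT_ERR)
		pr_info("%s: port %u changed state\n",
			dev_name(&event->device->dev), event->element.port_num);
}

static int myclient_add(struct ib_device *ibdev)
{
	struct myclient_state *st = kzalloc(sizeof(*st), GFP_KERNEL);

	if (!st)
		return -ENOMEM;

	ib_set_client_data(ibdev, &myclient, st);

	INIT_IB_EVENT_HANDLER(&st->eh, ibdev, myclient_event);
	ib_register_event_handler(&st->eh);
	return 0;
}

static void myclient_remove(struct ib_device *ibdev, void *client_data)
{
	struct myclient_state *st = client_data;	/* what add() stored */

	ib_unregister_event_handler(&st->eh);
	kfree(st);
}

static struct ib_client myclient = {
	.name   = "myclient",
	.add    = myclient_add,
	.remove = myclient_remove,
};

static int __init myclient_init(void)
{
	return ib_register_client(&myclient);	/* add() fires for existing devices */
}
module_init(myclient_init);

static void __exit myclient_exit(void)
{
	ib_unregister_client(&myclient);	/* full fence: remove() done on return */
}
module_exit(myclient_exit);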
*/ void ib_register_event_handler(struct ib_event_handler *event_handler) { - unsigned long flags; - - spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); + down_write(&event_handler->device->event_handler_rwsem); list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); - spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); + up_write(&event_handler->device->event_handler_rwsem); } EXPORT_SYMBOL(ib_register_event_handler); @@ -874,35 +2080,83 @@ EXPORT_SYMBOL(ib_register_event_handler); */ void ib_unregister_event_handler(struct ib_event_handler *event_handler) { - unsigned long flags; - - spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); + down_write(&event_handler->device->event_handler_rwsem); list_del(&event_handler->list); - spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); + up_write(&event_handler->device->event_handler_rwsem); } EXPORT_SYMBOL(ib_unregister_event_handler); -/** - * ib_dispatch_event - Dispatch an asynchronous event - * @event:Event to dispatch - * - * Low-level drivers must call ib_dispatch_event() to dispatch the - * event to all registered event handlers when an asynchronous event - * occurs. - */ -void ib_dispatch_event(struct ib_event *event) +void ib_dispatch_event_clients(struct ib_event *event) { - unsigned long flags; struct ib_event_handler *handler; - spin_lock_irqsave(&event->device->event_handler_lock, flags); + down_read(&event->device->event_handler_rwsem); list_for_each_entry(handler, &event->device->event_handler_list, list) handler->handler(handler, event); - spin_unlock_irqrestore(&event->device->event_handler_lock, flags); + up_read(&event->device->event_handler_rwsem); +} + +static int iw_query_port(struct ib_device *device, + u32 port_num, + struct ib_port_attr *port_attr) +{ + struct in_device *inetdev; + struct net_device *netdev; + + memset(port_attr, 0, sizeof(*port_attr)); + + netdev = ib_device_get_netdev(device, port_num); + if (!netdev) + return -ENODEV; + + port_attr->max_mtu = IB_MTU_4096; + port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); + + if (!netif_carrier_ok(netdev)) { + port_attr->state = IB_PORT_DOWN; + port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; + } else { + rcu_read_lock(); + inetdev = __in_dev_get_rcu(netdev); + + if (inetdev && inetdev->ifa_list) { + port_attr->state = IB_PORT_ACTIVE; + port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; + } else { + port_attr->state = IB_PORT_INIT; + port_attr->phys_state = + IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; + } + + rcu_read_unlock(); + } + + dev_put(netdev); + return device->ops.query_port(device, port_num, port_attr); +} + +static int __ib_query_port(struct ib_device *device, + u32 port_num, + struct ib_port_attr *port_attr) +{ + int err; + + memset(port_attr, 0, sizeof(*port_attr)); + + err = device->ops.query_port(device, port_num, port_attr); + if (err || port_attr->subnet_prefix) + return err; + + if (rdma_port_get_link_layer(device, port_num) != + IB_LINK_LAYER_INFINIBAND) + return 0; + + ib_get_cached_subnet_prefix(device, port_num, + &port_attr->subnet_prefix); + return 0; } -EXPORT_SYMBOL(ib_dispatch_event); /** * ib_query_port - Query IB port attributes @@ -914,31 +2168,227 @@ EXPORT_SYMBOL(ib_dispatch_event); * @port_attr pointer. 
*/ int ib_query_port(struct ib_device *device, - u8 port_num, + u32 port_num, struct ib_port_attr *port_attr) { - union ib_gid gid; - int err; - if (!rdma_is_port_valid(device, port_num)) return -EINVAL; - memset(port_attr, 0, sizeof(*port_attr)); - err = device->ops.query_port(device, port_num, port_attr); - if (err || port_attr->subnet_prefix) - return err; + if (rdma_protocol_iwarp(device, port_num)) + return iw_query_port(device, port_num, port_attr); + else + return __ib_query_port(device, port_num, port_attr); +} +EXPORT_SYMBOL(ib_query_port); + +static void add_ndev_hash(struct ib_port_data *pdata) +{ + unsigned long flags; + + might_sleep(); - if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND) + spin_lock_irqsave(&ndev_hash_lock, flags); + if (hash_hashed(&pdata->ndev_hash_link)) { + hash_del_rcu(&pdata->ndev_hash_link); + spin_unlock_irqrestore(&ndev_hash_lock, flags); + /* + * We cannot do hash_add_rcu after a hash_del_rcu until the + * grace period + */ + synchronize_rcu(); + spin_lock_irqsave(&ndev_hash_lock, flags); + } + if (pdata->netdev) + hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, + (uintptr_t)pdata->netdev); + spin_unlock_irqrestore(&ndev_hash_lock, flags); +} + +/** + * ib_device_set_netdev - Associate the ib_dev with an underlying net_device + * @ib_dev: Device to modify + * @ndev: net_device to affiliate, may be NULL + * @port: IB port the net_device is connected to + * + * Drivers should use this to link the ib_device to a netdev so the netdev + * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be + * affiliated with any port. + * + * The caller must ensure that the given ndev is not unregistered or + * unregistering, and that either the ib_device is unregistered or + * ib_device_set_netdev() is called with NULL when the ndev sends a + * NETDEV_UNREGISTER event. + */ +int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, + u32 port) +{ + enum rdma_nl_notify_event_type etype; + struct net_device *old_ndev; + struct ib_port_data *pdata; + unsigned long flags; + int ret; + + if (!rdma_is_port_valid(ib_dev, port)) + return -EINVAL; + + /* + * Drivers wish to call this before ib_register_driver, so we have to + * setup the port data early. + */ + ret = alloc_port_data(ib_dev); + if (ret) + return ret; + + pdata = &ib_dev->port_data[port]; + spin_lock_irqsave(&pdata->netdev_lock, flags); + old_ndev = rcu_dereference_protected( + pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); + if (old_ndev == ndev) { + spin_unlock_irqrestore(&pdata->netdev_lock, flags); return 0; + } - err = device->ops.query_gid(device, port_num, 0, &gid); - if (err) - return err; + rcu_assign_pointer(pdata->netdev, ndev); + netdev_put(old_ndev, &pdata->netdev_tracker); + netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); + spin_unlock_irqrestore(&pdata->netdev_lock, flags); + + add_ndev_hash(pdata); + + /* Make sure that the device is registered before we send events */ + if (xa_load(&devices, ib_dev->index) != ib_dev) + return 0; + + etype = ndev ? 
RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT; + rdma_nl_notify_event(ib_dev, port, etype); - port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix); return 0; } -EXPORT_SYMBOL(ib_query_port); +EXPORT_SYMBOL(ib_device_set_netdev); + +static void free_netdevs(struct ib_device *ib_dev) +{ + unsigned long flags; + u32 port; + + if (!ib_dev->port_data) + return; + + rdma_for_each_port (ib_dev, port) { + struct ib_port_data *pdata = &ib_dev->port_data[port]; + struct net_device *ndev; + + spin_lock_irqsave(&pdata->netdev_lock, flags); + ndev = rcu_dereference_protected( + pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); + if (ndev) { + spin_lock(&ndev_hash_lock); + hash_del_rcu(&pdata->ndev_hash_link); + spin_unlock(&ndev_hash_lock); + + /* + * If this is the last dev_put there is still a + * synchronize_rcu before the netdev is kfreed, so we + * can continue to rely on unlocked pointer + * comparisons after the put + */ + rcu_assign_pointer(pdata->netdev, NULL); + netdev_put(ndev, &pdata->netdev_tracker); + } + spin_unlock_irqrestore(&pdata->netdev_lock, flags); + } +} + +struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, + u32 port) +{ + struct ib_port_data *pdata; + struct net_device *res; + + if (!rdma_is_port_valid(ib_dev, port)) + return NULL; + + if (!ib_dev->port_data) + return NULL; + + pdata = &ib_dev->port_data[port]; + + /* + * New drivers should use ib_device_set_netdev() not the legacy + * get_netdev(). + */ + if (ib_dev->ops.get_netdev) + res = ib_dev->ops.get_netdev(ib_dev, port); + else { + spin_lock(&pdata->netdev_lock); + res = rcu_dereference_protected( + pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); + dev_hold(res); + spin_unlock(&pdata->netdev_lock); + } + + return res; +} +EXPORT_SYMBOL(ib_device_get_netdev); + +/** + * ib_query_netdev_port - Query the port number of a net_device + * associated with an ibdev + * @ibdev: IB device + * @ndev: Network device + * @port: IB port the net_device is connected to + */ +int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, + u32 *port) +{ + struct net_device *ib_ndev; + u32 port_num; + + rdma_for_each_port(ibdev, port_num) { + ib_ndev = ib_device_get_netdev(ibdev, port_num); + if (ndev == ib_ndev) { + *port = port_num; + dev_put(ib_ndev); + return 0; + } + dev_put(ib_ndev); + } + + return -ENOENT; +} +EXPORT_SYMBOL(ib_query_netdev_port); + +/** + * ib_device_get_by_netdev - Find an IB device associated with a netdev + * @ndev: netdev to locate + * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) + * + * Find and hold an ib_device that is associated with a netdev via + * ib_device_set_netdev(). The caller must call ib_device_put() on the + * returned pointer. 
+ */ +struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, + enum rdma_driver_id driver_id) +{ + struct ib_device *res = NULL; + struct ib_port_data *cur; + + rcu_read_lock(); + hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, + (uintptr_t)ndev) { + if (rcu_access_pointer(cur->netdev) == ndev && + (driver_id == RDMA_DRIVER_UNKNOWN || + cur->ib_dev->ops.driver_id == driver_id) && + ib_device_try_get(cur->ib_dev)) { + res = cur->ib_dev; + break; + } + } + rcu_read_unlock(); + + return res; +} +EXPORT_SYMBOL(ib_device_get_by_netdev); /** * ib_enum_roce_netdev - enumerate all RoCE ports @@ -958,27 +2408,16 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_callback cb, void *cookie) { - u8 port; + u32 port; - for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev); - port++) + rdma_for_each_port (ib_dev, port) if (rdma_protocol_roce(ib_dev, port)) { - struct net_device *idev = NULL; - - if (ib_dev->ops.get_netdev) - idev = ib_dev->ops.get_netdev(ib_dev, port); - - if (idev && - idev->reg_state >= NETREG_UNREGISTERED) { - dev_put(idev); - idev = NULL; - } + struct net_device *idev = + ib_device_get_netdev(ib_dev, port); if (filter(ib_dev, port, idev, filter_cookie)) cb(ib_dev, port, idev, cookie); - - if (idev) - dev_put(idev); + dev_put(idev); } } @@ -999,14 +2438,15 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, void *cookie) { struct ib_device *dev; + unsigned long index; - down_read(&lists_rwsem); - list_for_each_entry(dev, &device_list, core_list) + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); - up_read(&lists_rwsem); + up_read(&devices_rwsem); } -/** +/* * ib_enum_all_devs - enumerate all ib_devices * @cb: Callback to call for each found ib_device * @@ -1015,19 +2455,22 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, struct netlink_callback *cb) { + unsigned long index; struct ib_device *dev; unsigned int idx = 0; int ret = 0; - down_read(&lists_rwsem); - list_for_each_entry(dev, &device_list, core_list) { + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { + if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) + continue; + ret = nldev_cb(dev, skb, cb, idx); if (ret) break; idx++; } - - up_read(&lists_rwsem); + up_read(&devices_rwsem); return ret; } @@ -1041,11 +2484,14 @@ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, * ib_query_pkey() fetches the specified P_Key table entry. */ int ib_query_pkey(struct ib_device *device, - u8 port_num, u16 index, u16 *pkey) + u32 port_num, u16 index, u16 *pkey) { if (!rdma_is_port_valid(device, port_num)) return -EINVAL; + if (!device->ops.query_pkey) + return -EOPNOTSUPP; + return device->ops.query_pkey(device, port_num, index, pkey); } EXPORT_SYMBOL(ib_query_pkey); @@ -1064,7 +2510,7 @@ int ib_modify_device(struct ib_device *device, struct ib_device_modify *device_modify) { if (!device->ops.modify_device) - return -ENOSYS; + return -EOPNOTSUPP; return device->ops.modify_device(device, device_modify_mask, device_modify); @@ -1083,7 +2529,7 @@ EXPORT_SYMBOL(ib_modify_device); * @port_modify_mask and @port_modify structure. 
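/*
 * Illustrative sketch (not part of the patch): resolving the RDMA device and
 * port behind a net_device with the helpers above. RDMA_DRIVER_UNKNOWN
 * matches any driver; the reference must be dropped with ib_device_put().
 */
#include <linux/netdevice.h>
#include <rdma/ib_verbs.h>

static void my_report_rdma_binding(struct net_device *ndev)
{
	struct ib_device *ibdev;
	u32 port;

	ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
	if (!ibdev)
		return;		/* no RDMA device is bound to this netdev */

	if (!ib_query_netdev_port(ibdev, ndev, &port))
		pr_info("%s is port %u of %s\n",
			ndev->name, port, dev_name(&ibdev->dev));

	ib_device_put(ibdev);
}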
*/ int ib_modify_port(struct ib_device *device, - u8 port_num, int port_modify_mask, + u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { int rc; @@ -1095,8 +2541,12 @@ int ib_modify_port(struct ib_device *device, rc = device->ops.modify_port(device, port_num, port_modify_mask, port_modify); + else if (rdma_protocol_roce(device, port_num) && + ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 || + (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0)) + rc = 0; else - rc = rdma_protocol_roce(device, port_num) ? 0 : -ENOSYS; + rc = -EOPNOTSUPP; return rc; } EXPORT_SYMBOL(ib_modify_port); @@ -1111,19 +2561,22 @@ EXPORT_SYMBOL(ib_modify_port); * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, - u8 *port_num, u16 *index) + u32 *port_num, u16 *index) { union ib_gid tmp_gid; - int ret, port, i; + u32 port; + int ret, i; - for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { + rdma_for_each_port (device, port) { if (!rdma_protocol_ib(device, port)) continue; - for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { + for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; + ++i) { ret = rdma_query_gid(device, port, i, &tmp_gid); if (ret) - return ret; + continue; + if (!memcmp(&tmp_gid, gid, sizeof *gid)) { *port_num = port; if (index) @@ -1146,13 +2599,14 @@ EXPORT_SYMBOL(ib_find_gid); * @index: The index into the PKey table where the PKey was found. */ int ib_find_pkey(struct ib_device *device, - u8 port_num, u16 pkey, u16 *index) + u32 port_num, u16 pkey, u16 *index) { int ret, i; u16 tmp_pkey; int partial_ix = -1; - for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) { + for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; + ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; @@ -1185,37 +2639,39 @@ EXPORT_SYMBOL(ib_find_pkey); * @gid: A GID that the net_dev uses to communicate. * @addr: Contains the IP address that the request specified as its * destination. 
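/*
 * Illustrative sketch (not part of the patch): walking a device's ports with
 * rdma_for_each_port() and looking up the full default P_Key (0xffff) with
 * ib_query_port()/ib_find_pkey() from the helpers above.
 */
#include <rdma/ib_verbs.h>

static void my_scan_default_pkey(struct ib_device *ibdev)
{
	struct ib_port_attr attr;
	u32 port;
	u16 index;

	rdma_for_each_port(ibdev, port) {
		if (ib_query_port(ibdev, port, &attr) ||
		    attr.state != IB_PORT_ACTIVE)
			continue;
		if (!ib_find_pkey(ibdev, port, 0xffff, &index))
			pr_info("port %u: default pkey at index %u\n",
				port, index);
	}
}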
+ * */ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, - u8 port, + u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr) { struct net_device *net_dev = NULL; - struct ib_client_data *context; + unsigned long index; + void *client_data; if (!rdma_protocol_ib(dev, port)) return NULL; - down_read(&lists_rwsem); - - list_for_each_entry(context, &dev->client_data_list, list) { - struct ib_client *client = context->client; + /* + * Holding the read side guarantees that the client will not become + * unregistered while we are calling get_net_dev_by_params() + */ + down_read(&dev->client_data_rwsem); + xan_for_each_marked (&dev->client_data, index, client_data, + CLIENT_DATA_REGISTERED) { + struct ib_client *client = xa_load(&clients, index); - if (context->going_down) + if (!client || !client->get_net_dev_by_params) continue; - if (client->get_net_dev_by_params) { - net_dev = client->get_net_dev_by_params(dev, port, pkey, - gid, addr, - context->data); - if (net_dev) - break; - } + net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, + addr, client_data); + if (net_dev) + break; } - - up_read(&lists_rwsem); + up_read(&dev->client_data_rwsem); return net_dev; } @@ -1231,12 +2687,32 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) (ptr)->name = ops->name; \ } while (0) +#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) + + if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { + WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && + dev_ops->driver_id != ops->driver_id); + dev_ops->driver_id = ops->driver_id; + } + if (ops->owner) { + WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); + dev_ops->owner = ops->owner; + } + if (ops->uverbs_abi_ver) + dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; + + dev_ops->uverbs_no_driver_id_binding |= + ops->uverbs_no_driver_id_binding; + SET_DEVICE_OP(dev_ops, add_gid); + SET_DEVICE_OP(dev_ops, add_sub_dev); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); - SET_DEVICE_OP(dev_ops, alloc_fmr); - SET_DEVICE_OP(dev_ops, alloc_hw_stats); + SET_DEVICE_OP(dev_ops, alloc_dmah); + SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); + SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); SET_DEVICE_OP(dev_ops, alloc_mr); + SET_DEVICE_OP(dev_ops, alloc_mr_integrity); SET_DEVICE_OP(dev_ops, alloc_mw); SET_DEVICE_OP(dev_ops, alloc_pd); SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); @@ -1244,22 +2720,31 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, alloc_xrcd); SET_DEVICE_OP(dev_ops, attach_mcast); SET_DEVICE_OP(dev_ops, check_mr_status); + SET_DEVICE_OP(dev_ops, counter_alloc_stats); + SET_DEVICE_OP(dev_ops, counter_bind_qp); + SET_DEVICE_OP(dev_ops, counter_dealloc); + SET_DEVICE_OP(dev_ops, counter_init); + SET_DEVICE_OP(dev_ops, counter_unbind_qp); + SET_DEVICE_OP(dev_ops, counter_update_stats); SET_DEVICE_OP(dev_ops, create_ah); SET_DEVICE_OP(dev_ops, create_counters); SET_DEVICE_OP(dev_ops, create_cq); + SET_DEVICE_OP(dev_ops, create_cq_umem); SET_DEVICE_OP(dev_ops, create_flow); - SET_DEVICE_OP(dev_ops, create_flow_action_esp); SET_DEVICE_OP(dev_ops, create_qp); SET_DEVICE_OP(dev_ops, create_rwq_ind_table); SET_DEVICE_OP(dev_ops, create_srq); + SET_DEVICE_OP(dev_ops, create_user_ah); SET_DEVICE_OP(dev_ops, create_wq); SET_DEVICE_OP(dev_ops, dealloc_dm); - SET_DEVICE_OP(dev_ops, dealloc_fmr); + SET_DEVICE_OP(dev_ops, dealloc_dmah); + SET_DEVICE_OP(dev_ops, dealloc_driver); SET_DEVICE_OP(dev_ops, dealloc_mw); 
SET_DEVICE_OP(dev_ops, dealloc_pd); SET_DEVICE_OP(dev_ops, dealloc_ucontext); SET_DEVICE_OP(dev_ops, dealloc_xrcd); SET_DEVICE_OP(dev_ops, del_gid); + SET_DEVICE_OP(dev_ops, del_sub_dev); SET_DEVICE_OP(dev_ops, dereg_mr); SET_DEVICE_OP(dev_ops, destroy_ah); SET_DEVICE_OP(dev_ops, destroy_counters); @@ -1270,32 +2755,58 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); SET_DEVICE_OP(dev_ops, destroy_srq); SET_DEVICE_OP(dev_ops, destroy_wq); + SET_DEVICE_OP(dev_ops, device_group); SET_DEVICE_OP(dev_ops, detach_mcast); SET_DEVICE_OP(dev_ops, disassociate_ucontext); SET_DEVICE_OP(dev_ops, drain_rq); SET_DEVICE_OP(dev_ops, drain_sq); + SET_DEVICE_OP(dev_ops, enable_driver); + SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); + SET_DEVICE_OP(dev_ops, fill_res_cq_entry); + SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); + SET_DEVICE_OP(dev_ops, fill_res_mr_entry); + SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); + SET_DEVICE_OP(dev_ops, fill_res_qp_entry); + SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); + SET_DEVICE_OP(dev_ops, fill_res_srq_entry); + SET_DEVICE_OP(dev_ops, fill_res_srq_entry_raw); + SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); SET_DEVICE_OP(dev_ops, get_dev_fw_str); SET_DEVICE_OP(dev_ops, get_dma_mr); SET_DEVICE_OP(dev_ops, get_hw_stats); SET_DEVICE_OP(dev_ops, get_link_layer); SET_DEVICE_OP(dev_ops, get_netdev); + SET_DEVICE_OP(dev_ops, get_numa_node); SET_DEVICE_OP(dev_ops, get_port_immutable); SET_DEVICE_OP(dev_ops, get_vector_affinity); SET_DEVICE_OP(dev_ops, get_vf_config); + SET_DEVICE_OP(dev_ops, get_vf_guid); SET_DEVICE_OP(dev_ops, get_vf_stats); + SET_DEVICE_OP(dev_ops, iw_accept); + SET_DEVICE_OP(dev_ops, iw_add_ref); + SET_DEVICE_OP(dev_ops, iw_connect); + SET_DEVICE_OP(dev_ops, iw_create_listen); + SET_DEVICE_OP(dev_ops, iw_destroy_listen); + SET_DEVICE_OP(dev_ops, iw_get_qp); + SET_DEVICE_OP(dev_ops, iw_reject); + SET_DEVICE_OP(dev_ops, iw_rem_ref); SET_DEVICE_OP(dev_ops, map_mr_sg); - SET_DEVICE_OP(dev_ops, map_phys_fmr); + SET_DEVICE_OP(dev_ops, map_mr_sg_pi); SET_DEVICE_OP(dev_ops, mmap); + SET_DEVICE_OP(dev_ops, mmap_free); SET_DEVICE_OP(dev_ops, modify_ah); SET_DEVICE_OP(dev_ops, modify_cq); SET_DEVICE_OP(dev_ops, modify_device); - SET_DEVICE_OP(dev_ops, modify_flow_action_esp); + SET_DEVICE_OP(dev_ops, modify_hw_stat); SET_DEVICE_OP(dev_ops, modify_port); SET_DEVICE_OP(dev_ops, modify_qp); SET_DEVICE_OP(dev_ops, modify_srq); SET_DEVICE_OP(dev_ops, modify_wq); SET_DEVICE_OP(dev_ops, peek_cq); + SET_DEVICE_OP(dev_ops, pre_destroy_cq); SET_DEVICE_OP(dev_ops, poll_cq); + SET_DEVICE_OP(dev_ops, port_groups); + SET_DEVICE_OP(dev_ops, post_destroy_cq); SET_DEVICE_OP(dev_ops, post_recv); SET_DEVICE_OP(dev_ops, post_send); SET_DEVICE_OP(dev_ops, post_srq_recv); @@ -1307,20 +2818,99 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, query_port); SET_DEVICE_OP(dev_ops, query_qp); SET_DEVICE_OP(dev_ops, query_srq); + SET_DEVICE_OP(dev_ops, query_ucontext); SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); SET_DEVICE_OP(dev_ops, read_counters); SET_DEVICE_OP(dev_ops, reg_dm_mr); SET_DEVICE_OP(dev_ops, reg_user_mr); - SET_DEVICE_OP(dev_ops, req_ncomp_notif); + SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf); SET_DEVICE_OP(dev_ops, req_notify_cq); SET_DEVICE_OP(dev_ops, rereg_user_mr); SET_DEVICE_OP(dev_ops, resize_cq); SET_DEVICE_OP(dev_ops, set_vf_guid); SET_DEVICE_OP(dev_ops, set_vf_link_state); - SET_DEVICE_OP(dev_ops, unmap_fmr); + 
SET_DEVICE_OP(dev_ops, ufile_hw_cleanup); + SET_DEVICE_OP(dev_ops, report_port_event); + + SET_OBJ_SIZE(dev_ops, ib_ah); + SET_OBJ_SIZE(dev_ops, ib_counters); + SET_OBJ_SIZE(dev_ops, ib_cq); + SET_OBJ_SIZE(dev_ops, ib_dmah); + SET_OBJ_SIZE(dev_ops, ib_mw); + SET_OBJ_SIZE(dev_ops, ib_pd); + SET_OBJ_SIZE(dev_ops, ib_qp); + SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); + SET_OBJ_SIZE(dev_ops, ib_srq); + SET_OBJ_SIZE(dev_ops, ib_ucontext); + SET_OBJ_SIZE(dev_ops, ib_xrcd); + SET_OBJ_SIZE(dev_ops, rdma_counter); } EXPORT_SYMBOL(ib_set_device_ops); +int ib_add_sub_device(struct ib_device *parent, + enum rdma_nl_dev_type type, + const char *name) +{ + struct ib_device *sub; + int ret = 0; + + if (!parent->ops.add_sub_dev || !parent->ops.del_sub_dev) + return -EOPNOTSUPP; + + if (!ib_device_try_get(parent)) + return -EINVAL; + + sub = parent->ops.add_sub_dev(parent, type, name); + if (IS_ERR(sub)) { + ib_device_put(parent); + return PTR_ERR(sub); + } + + sub->type = type; + sub->parent = parent; + + mutex_lock(&parent->subdev_lock); + list_add_tail(&parent->subdev_list_head, &sub->subdev_list); + mutex_unlock(&parent->subdev_lock); + + return ret; +} +EXPORT_SYMBOL(ib_add_sub_device); + +int ib_del_sub_device_and_put(struct ib_device *sub) +{ + struct ib_device *parent = sub->parent; + + if (!parent) + return -EOPNOTSUPP; + + mutex_lock(&parent->subdev_lock); + list_del(&sub->subdev_list); + mutex_unlock(&parent->subdev_lock); + + ib_device_put(sub); + parent->ops.del_sub_dev(sub); + ib_device_put(parent); + + return 0; +} +EXPORT_SYMBOL(ib_del_sub_device_and_put); + +#ifdef CONFIG_INFINIBAND_VIRT_DMA +int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) +{ + struct scatterlist *s; + int i; + + for_each_sg(sg, s, nents, i) { + sg_dma_address(s) = (uintptr_t)sg_virt(s); + sg_dma_len(s) = s->length; + } + return nents; +} +EXPORT_SYMBOL(ib_dma_virt_map_sg); +#endif /* CONFIG_INFINIBAND_VIRT_DMA */ + static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { [RDMA_NL_LS_OP_RESOLVE] = { .doit = ib_nl_handle_resolve_resp, @@ -1336,29 +2926,121 @@ static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { }, }; +void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev) +{ + enum ib_port_state curr_state; + struct ib_event ibevent = {}; + u32 port; + + if (ib_query_netdev_port(ibdev, ndev, &port)) + return; + + curr_state = ib_get_curr_port_state(ndev); + + write_lock_irq(&ibdev->cache_lock); + if (ibdev->port_data[port].cache.last_port_state == curr_state) { + write_unlock_irq(&ibdev->cache_lock); + return; + } + ibdev->port_data[port].cache.last_port_state = curr_state; + write_unlock_irq(&ibdev->cache_lock); + + ibevent.event = (curr_state == IB_PORT_DOWN) ? + IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE; + ibevent.device = ibdev; + ibevent.element.port_num = port; + ib_dispatch_event(&ibevent); +} +EXPORT_SYMBOL(ib_dispatch_port_state_event); + +static void handle_port_event(struct net_device *ndev, unsigned long event) +{ + struct ib_device *ibdev; + + /* Currently, link events in bonding scenarios are still + * reported by drivers that support bonding. 
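/*
 * Illustrative sketch (not part of the patch): how a driver describes itself
 * through ib_set_device_ops(). Only identity fields are shown; ops left NULL
 * are simply not merged. RDMA_DRIVER_RXE and the ABI version are stand-ins.
 */
#include <linux/module.h>
#include <rdma/ib_verbs.h>

static const struct ib_device_ops mydrv_dev_ops = {
	.owner          = THIS_MODULE,
	.driver_id      = RDMA_DRIVER_RXE,
	.uverbs_abi_ver = 1,
	/* .query_device, .query_port, .create_qp, ... set by a real driver */
};

static void mydrv_set_ops(struct ib_device *ibdev)
{
	/* can be called repeatedly; only non-NULL entries overwrite earlier ones */
	ib_set_device_ops(ibdev, &mydrv_dev_ops);
}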
+ */ + if (netif_is_lag_master(ndev) || netif_is_lag_port(ndev)) + return; + + ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); + if (!ibdev) + return; + + if (ibdev->ops.report_port_event) { + ibdev->ops.report_port_event(ibdev, ndev, event); + goto put_ibdev; + } + + ib_dispatch_port_state_event(ibdev, ndev); + +put_ibdev: + ib_device_put(ibdev); +}; + +static int ib_netdevice_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct ib_device *ibdev; + u32 port; + + switch (event) { + case NETDEV_CHANGENAME: + ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); + if (!ibdev) + return NOTIFY_DONE; + + if (ib_query_netdev_port(ibdev, ndev, &port)) { + ib_device_put(ibdev); + break; + } + + rdma_nl_notify_event(ibdev, port, RDMA_NETDEV_RENAME_EVENT); + ib_device_put(ibdev); + break; + + case NETDEV_UP: + case NETDEV_CHANGE: + case NETDEV_DOWN: + handle_port_event(ndev, event); + break; + + default: + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block nb_netdevice = { + .notifier_call = ib_netdevice_event, +}; + static int __init ib_core_init(void) { - int ret; + int ret = -ENOMEM; - ib_wq = alloc_workqueue("infiniband", 0, 0); + ib_wq = alloc_workqueue("infiniband", WQ_PERCPU, 0); if (!ib_wq) return -ENOMEM; - ib_comp_wq = alloc_workqueue("ib-comp-wq", - WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); - if (!ib_comp_wq) { - ret = -ENOMEM; + ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND, + WQ_UNBOUND_MAX_ACTIVE); + if (!ib_unreg_wq) goto err; - } + + ib_comp_wq = alloc_workqueue("ib-comp-wq", + WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS | WQ_PERCPU, 0); + if (!ib_comp_wq) + goto err_unbound; ib_comp_unbound_wq = alloc_workqueue("ib-comp-unb-wq", WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); - if (!ib_comp_unbound_wq) { - ret = -ENOMEM; + if (!ib_comp_unbound_wq) goto err_comp; - } ret = class_register(&ib_class); if (ret) { @@ -1366,15 +3048,11 @@ static int __init ib_core_init(void) goto err_comp_unbound; } - ret = rdma_nl_init(); - if (ret) { - pr_warn("Couldn't init IB netlink interface: err %d\n", ret); - goto err_sysfs; - } + rdma_nl_init(); ret = addr_init(); if (ret) { - pr_warn("Could't init IB address resolution\n"); + pr_warn("Couldn't init IB address resolution\n"); goto err_ibnl; } @@ -1390,18 +3068,36 @@ static int __init ib_core_init(void) goto err_mad; } - ret = register_lsm_notifier(&ibdev_lsm_nb); + ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); if (ret) { pr_warn("Couldn't register LSM notifier. ret %d\n", ret); goto err_sa; } + ret = register_pernet_device(&rdma_dev_net_ops); + if (ret) { + pr_warn("Couldn't init compat dev. 
ret %d\n", ret); + goto err_compat; + } + nldev_init(); rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); - roce_gid_mgmt_init(); + ret = roce_gid_mgmt_init(); + if (ret) { + pr_warn("Couldn't init RoCE GID management\n"); + goto err_parent; + } + + register_netdevice_notifier(&nb_netdevice); return 0; +err_parent: + rdma_nl_unregister(RDMA_NL_LS); + nldev_exit(); + unregister_pernet_device(&rdma_dev_net_ops); +err_compat: + unregister_blocking_lsm_notifier(&ibdev_lsm_nb); err_sa: ib_sa_cleanup(); err_mad: @@ -1409,13 +3105,13 @@ err_mad: err_addr: addr_cleanup(); err_ibnl: - rdma_nl_exit(); -err_sysfs: class_unregister(&ib_class); err_comp_unbound: destroy_workqueue(ib_comp_unbound_wq); err_comp: destroy_workqueue(ib_comp_wq); +err_unbound: + destroy_workqueue(ib_unreg_wq); err: destroy_workqueue(ib_wq); return ret; @@ -1423,10 +3119,12 @@ err: static void __exit ib_core_cleanup(void) { + unregister_netdevice_notifier(&nb_netdevice); roce_gid_mgmt_cleanup(); - nldev_exit(); rdma_nl_unregister(RDMA_NL_LS); - unregister_lsm_notifier(&ibdev_lsm_nb); + nldev_exit(); + unregister_pernet_device(&rdma_dev_net_ops); + unregister_blocking_lsm_notifier(&ibdev_lsm_nb); ib_sa_cleanup(); ib_mad_cleanup(); addr_cleanup(); @@ -1436,9 +3134,15 @@ static void __exit ib_core_cleanup(void) destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); + destroy_workqueue(ib_unreg_wq); + WARN_ON(!xa_empty(&clients)); + WARN_ON(!xa_empty(&devices)); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); -subsys_initcall(ib_core_init); +/* ib core relies on netdev stack to first register net_ns_type_operations + * ns kobject type before ib_core initialization. + */ +fs_initcall(ib_core_init); module_exit(ib_core_cleanup); diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c deleted file mode 100644 index 7d841b689a1e..000000000000 --- a/drivers/infiniband/core/fmr_pool.c +++ /dev/null @@ -1,507 +0,0 @@ -/* - * Copyright (c) 2004 Topspin Communications. All rights reserved. - * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include <linux/errno.h> -#include <linux/spinlock.h> -#include <linux/export.h> -#include <linux/slab.h> -#include <linux/jhash.h> -#include <linux/kthread.h> - -#include <rdma/ib_fmr_pool.h> - -#include "core_priv.h" - -#define PFX "fmr_pool: " - -enum { - IB_FMR_MAX_REMAPS = 32, - - IB_FMR_HASH_BITS = 8, - IB_FMR_HASH_SIZE = 1 << IB_FMR_HASH_BITS, - IB_FMR_HASH_MASK = IB_FMR_HASH_SIZE - 1 -}; - -/* - * If an FMR is not in use, then the list member will point to either - * its pool's free_list (if the FMR can be mapped again; that is, - * remap_count < pool->max_remaps) or its pool's dirty_list (if the - * FMR needs to be unmapped before being remapped). In either of - * these cases it is a bug if the ref_count is not 0. In other words, - * if ref_count is > 0, then the list member must not be linked into - * either free_list or dirty_list. - * - * The cache_node member is used to link the FMR into a cache bucket - * (if caching is enabled). This is independent of the reference - * count of the FMR. When a valid FMR is released, its ref_count is - * decremented, and if ref_count reaches 0, the FMR is placed in - * either free_list or dirty_list as appropriate. However, it is not - * removed from the cache and may be "revived" if a call to - * ib_fmr_register_physical() occurs before the FMR is remapped. In - * this case we just increment the ref_count and remove the FMR from - * free_list/dirty_list. - * - * Before we remap an FMR from free_list, we remove it from the cache - * (to prevent another user from obtaining a stale FMR). When an FMR - * is released, we add it to the tail of the free list, so that our - * cache eviction policy is "least recently used." - * - * All manipulation of ref_count, list and cache_node is protected by - * pool_lock to maintain consistency. 
- */ - -struct ib_fmr_pool { - spinlock_t pool_lock; - - int pool_size; - int max_pages; - int max_remaps; - int dirty_watermark; - int dirty_len; - struct list_head free_list; - struct list_head dirty_list; - struct hlist_head *cache_bucket; - - void (*flush_function)(struct ib_fmr_pool *pool, - void * arg); - void *flush_arg; - - struct kthread_worker *worker; - struct kthread_work work; - - atomic_t req_ser; - atomic_t flush_ser; - - wait_queue_head_t force_wait; -}; - -static inline u32 ib_fmr_hash(u64 first_page) -{ - return jhash_2words((u32) first_page, (u32) (first_page >> 32), 0) & - (IB_FMR_HASH_SIZE - 1); -} - -/* Caller must hold pool_lock */ -static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool, - u64 *page_list, - int page_list_len, - u64 io_virtual_address) -{ - struct hlist_head *bucket; - struct ib_pool_fmr *fmr; - - if (!pool->cache_bucket) - return NULL; - - bucket = pool->cache_bucket + ib_fmr_hash(*page_list); - - hlist_for_each_entry(fmr, bucket, cache_node) - if (io_virtual_address == fmr->io_virtual_address && - page_list_len == fmr->page_list_len && - !memcmp(page_list, fmr->page_list, - page_list_len * sizeof *page_list)) - return fmr; - - return NULL; -} - -static void ib_fmr_batch_release(struct ib_fmr_pool *pool) -{ - int ret; - struct ib_pool_fmr *fmr; - LIST_HEAD(unmap_list); - LIST_HEAD(fmr_list); - - spin_lock_irq(&pool->pool_lock); - - list_for_each_entry(fmr, &pool->dirty_list, list) { - hlist_del_init(&fmr->cache_node); - fmr->remap_count = 0; - list_add_tail(&fmr->fmr->list, &fmr_list); - -#ifdef DEBUG - if (fmr->ref_count !=0) { - pr_warn(PFX "Unmapping FMR 0x%08x with ref count %d\n", - fmr, fmr->ref_count); - } -#endif - } - - list_splice_init(&pool->dirty_list, &unmap_list); - pool->dirty_len = 0; - - spin_unlock_irq(&pool->pool_lock); - - if (list_empty(&unmap_list)) { - return; - } - - ret = ib_unmap_fmr(&fmr_list); - if (ret) - pr_warn(PFX "ib_unmap_fmr returned %d\n", ret); - - spin_lock_irq(&pool->pool_lock); - list_splice(&unmap_list, &pool->free_list); - spin_unlock_irq(&pool->pool_lock); -} - -static void ib_fmr_cleanup_func(struct kthread_work *work) -{ - struct ib_fmr_pool *pool = container_of(work, struct ib_fmr_pool, work); - - ib_fmr_batch_release(pool); - atomic_inc(&pool->flush_ser); - wake_up_interruptible(&pool->force_wait); - - if (pool->flush_function) - pool->flush_function(pool, pool->flush_arg); - - if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) - kthread_queue_work(pool->worker, &pool->work); -} - -/** - * ib_create_fmr_pool - Create an FMR pool - * @pd:Protection domain for FMRs - * @params:FMR pool parameters - * - * Create a pool of FMRs. Return value is pointer to new pool or - * error code if creation failed. 
- */ -struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, - struct ib_fmr_pool_param *params) -{ - struct ib_device *device; - struct ib_fmr_pool *pool; - int i; - int ret; - int max_remaps; - - if (!params) - return ERR_PTR(-EINVAL); - - device = pd->device; - if (!device->ops.alloc_fmr || !device->ops.dealloc_fmr || - !device->ops.map_phys_fmr || !device->ops.unmap_fmr) { - dev_info(&device->dev, "Device does not support FMRs\n"); - return ERR_PTR(-ENOSYS); - } - - if (!device->attrs.max_map_per_fmr) - max_remaps = IB_FMR_MAX_REMAPS; - else - max_remaps = device->attrs.max_map_per_fmr; - - pool = kmalloc(sizeof *pool, GFP_KERNEL); - if (!pool) - return ERR_PTR(-ENOMEM); - - pool->cache_bucket = NULL; - pool->flush_function = params->flush_function; - pool->flush_arg = params->flush_arg; - - INIT_LIST_HEAD(&pool->free_list); - INIT_LIST_HEAD(&pool->dirty_list); - - if (params->cache) { - pool->cache_bucket = - kmalloc_array(IB_FMR_HASH_SIZE, - sizeof(*pool->cache_bucket), - GFP_KERNEL); - if (!pool->cache_bucket) { - ret = -ENOMEM; - goto out_free_pool; - } - - for (i = 0; i < IB_FMR_HASH_SIZE; ++i) - INIT_HLIST_HEAD(pool->cache_bucket + i); - } - - pool->pool_size = 0; - pool->max_pages = params->max_pages_per_fmr; - pool->max_remaps = max_remaps; - pool->dirty_watermark = params->dirty_watermark; - pool->dirty_len = 0; - spin_lock_init(&pool->pool_lock); - atomic_set(&pool->req_ser, 0); - atomic_set(&pool->flush_ser, 0); - init_waitqueue_head(&pool->force_wait); - - pool->worker = - kthread_create_worker(0, "ib_fmr(%s)", dev_name(&device->dev)); - if (IS_ERR(pool->worker)) { - pr_warn(PFX "couldn't start cleanup kthread worker\n"); - ret = PTR_ERR(pool->worker); - goto out_free_pool; - } - kthread_init_work(&pool->work, ib_fmr_cleanup_func); - - { - struct ib_pool_fmr *fmr; - struct ib_fmr_attr fmr_attr = { - .max_pages = params->max_pages_per_fmr, - .max_maps = pool->max_remaps, - .page_shift = params->page_shift - }; - int bytes_per_fmr = sizeof *fmr; - - if (pool->cache_bucket) - bytes_per_fmr += params->max_pages_per_fmr * sizeof (u64); - - for (i = 0; i < params->pool_size; ++i) { - fmr = kmalloc(bytes_per_fmr, GFP_KERNEL); - if (!fmr) - goto out_fail; - - fmr->pool = pool; - fmr->remap_count = 0; - fmr->ref_count = 0; - INIT_HLIST_NODE(&fmr->cache_node); - - fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr); - if (IS_ERR(fmr->fmr)) { - pr_warn(PFX "fmr_create failed for FMR %d\n", - i); - kfree(fmr); - goto out_fail; - } - - list_add_tail(&fmr->list, &pool->free_list); - ++pool->pool_size; - } - } - - return pool; - - out_free_pool: - kfree(pool->cache_bucket); - kfree(pool); - - return ERR_PTR(ret); - - out_fail: - ib_destroy_fmr_pool(pool); - - return ERR_PTR(-ENOMEM); -} -EXPORT_SYMBOL(ib_create_fmr_pool); - -/** - * ib_destroy_fmr_pool - Free FMR pool - * @pool:FMR pool to free - * - * Destroy an FMR pool and free all associated resources. 
- */ -void ib_destroy_fmr_pool(struct ib_fmr_pool *pool) -{ - struct ib_pool_fmr *fmr; - struct ib_pool_fmr *tmp; - LIST_HEAD(fmr_list); - int i; - - kthread_destroy_worker(pool->worker); - ib_fmr_batch_release(pool); - - i = 0; - list_for_each_entry_safe(fmr, tmp, &pool->free_list, list) { - if (fmr->remap_count) { - INIT_LIST_HEAD(&fmr_list); - list_add_tail(&fmr->fmr->list, &fmr_list); - ib_unmap_fmr(&fmr_list); - } - ib_dealloc_fmr(fmr->fmr); - list_del(&fmr->list); - kfree(fmr); - ++i; - } - - if (i < pool->pool_size) - pr_warn(PFX "pool still has %d regions registered\n", - pool->pool_size - i); - - kfree(pool->cache_bucket); - kfree(pool); -} -EXPORT_SYMBOL(ib_destroy_fmr_pool); - -/** - * ib_flush_fmr_pool - Invalidate all unmapped FMRs - * @pool:FMR pool to flush - * - * Ensure that all unmapped FMRs are fully invalidated. - */ -int ib_flush_fmr_pool(struct ib_fmr_pool *pool) -{ - int serial; - struct ib_pool_fmr *fmr, *next; - - /* - * The free_list holds FMRs that may have been used - * but have not been remapped enough times to be dirty. - * Put them on the dirty list now so that the cleanup - * thread will reap them too. - */ - spin_lock_irq(&pool->pool_lock); - list_for_each_entry_safe(fmr, next, &pool->free_list, list) { - if (fmr->remap_count > 0) - list_move(&fmr->list, &pool->dirty_list); - } - spin_unlock_irq(&pool->pool_lock); - - serial = atomic_inc_return(&pool->req_ser); - kthread_queue_work(pool->worker, &pool->work); - - if (wait_event_interruptible(pool->force_wait, - atomic_read(&pool->flush_ser) - serial >= 0)) - return -EINTR; - - return 0; -} -EXPORT_SYMBOL(ib_flush_fmr_pool); - -/** - * ib_fmr_pool_map_phys - Map an FMR from an FMR pool. - * @pool_handle: FMR pool to allocate FMR from - * @page_list: List of pages to map - * @list_len: Number of pages in @page_list - * @io_virtual_address: I/O virtual address for new FMR - */ -struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, - u64 *page_list, - int list_len, - u64 io_virtual_address) -{ - struct ib_fmr_pool *pool = pool_handle; - struct ib_pool_fmr *fmr; - unsigned long flags; - int result; - - if (list_len < 1 || list_len > pool->max_pages) - return ERR_PTR(-EINVAL); - - spin_lock_irqsave(&pool->pool_lock, flags); - fmr = ib_fmr_cache_lookup(pool, - page_list, - list_len, - io_virtual_address); - if (fmr) { - /* found in cache */ - ++fmr->ref_count; - if (fmr->ref_count == 1) { - list_del(&fmr->list); - } - - spin_unlock_irqrestore(&pool->pool_lock, flags); - - return fmr; - } - - if (list_empty(&pool->free_list)) { - spin_unlock_irqrestore(&pool->pool_lock, flags); - return ERR_PTR(-EAGAIN); - } - - fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list); - list_del(&fmr->list); - hlist_del_init(&fmr->cache_node); - spin_unlock_irqrestore(&pool->pool_lock, flags); - - result = ib_map_phys_fmr(fmr->fmr, page_list, list_len, - io_virtual_address); - - if (result) { - spin_lock_irqsave(&pool->pool_lock, flags); - list_add(&fmr->list, &pool->free_list); - spin_unlock_irqrestore(&pool->pool_lock, flags); - - pr_warn(PFX "fmr_map returns %d\n", result); - - return ERR_PTR(result); - } - - ++fmr->remap_count; - fmr->ref_count = 1; - - if (pool->cache_bucket) { - fmr->io_virtual_address = io_virtual_address; - fmr->page_list_len = list_len; - memcpy(fmr->page_list, page_list, list_len * sizeof(*page_list)); - - spin_lock_irqsave(&pool->pool_lock, flags); - hlist_add_head(&fmr->cache_node, - pool->cache_bucket + ib_fmr_hash(fmr->page_list[0])); - 
spin_unlock_irqrestore(&pool->pool_lock, flags); - } - - return fmr; -} -EXPORT_SYMBOL(ib_fmr_pool_map_phys); - -/** - * ib_fmr_pool_unmap - Unmap FMR - * @fmr:FMR to unmap - * - * Unmap an FMR. The FMR mapping may remain valid until the FMR is - * reused (or until ib_flush_fmr_pool() is called). - */ -void ib_fmr_pool_unmap(struct ib_pool_fmr *fmr) -{ - struct ib_fmr_pool *pool; - unsigned long flags; - - pool = fmr->pool; - - spin_lock_irqsave(&pool->pool_lock, flags); - - --fmr->ref_count; - if (!fmr->ref_count) { - if (fmr->remap_count < pool->max_remaps) { - list_add_tail(&fmr->list, &pool->free_list); - } else { - list_add_tail(&fmr->list, &pool->dirty_list); - if (++pool->dirty_len >= pool->dirty_watermark) { - atomic_inc(&pool->req_ser); - kthread_queue_work(pool->worker, &pool->work); - } - } - } - -#ifdef DEBUG - if (fmr->ref_count < 0) - pr_warn(PFX "FMR %p has ref count %d < 0\n", - fmr, fmr->ref_count); -#endif - - spin_unlock_irqrestore(&pool->pool_lock, flags); -} -EXPORT_SYMBOL(ib_fmr_pool_unmap); diff --git a/drivers/infiniband/core/ib_core_uverbs.c b/drivers/infiniband/core/ib_core_uverbs.c new file mode 100644 index 000000000000..b51bd7087a88 --- /dev/null +++ b/drivers/infiniband/core/ib_core_uverbs.c @@ -0,0 +1,367 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2019 Marvell. All rights reserved. + */ +#include <linux/xarray.h> +#include "uverbs.h" +#include "core_priv.h" + +/** + * rdma_umap_priv_init() - Initialize the private data of a vma + * + * @priv: The already allocated private data + * @vma: The vm area struct that needs private data + * @entry: entry into the mmap_xa that needs to be linked with + * this vma + * + * Each time we map IO memory into user space this keeps track of the + * mapping. When the device is hot-unplugged we 'zap' the mmaps in user space + * to point to the zero page and allow the hot unplug to proceed. + * + * This is necessary for cases like PCI physical hot unplug as the actual BAR + * memory may vanish after this and access to it from userspace could MCE. + * + * RDMA drivers supporting disassociation must have their user space designed + * to cope in some way with their IO pages going to the zero page. + * + */ +void rdma_umap_priv_init(struct rdma_umap_priv *priv, + struct vm_area_struct *vma, + struct rdma_user_mmap_entry *entry) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + + priv->vma = vma; + if (entry) { + kref_get(&entry->ref); + priv->entry = entry; + } + vma->vm_private_data = priv; + /* vm_ops is setup in ib_uverbs_mmap() to avoid module dependencies */ + + mutex_lock(&ufile->umap_lock); + list_add(&priv->list, &ufile->umaps); + mutex_unlock(&ufile->umap_lock); +} +EXPORT_SYMBOL(rdma_umap_priv_init); + +/** + * rdma_user_mmap_io() - Map IO memory into a process + * + * @ucontext: associated user context + * @vma: the vma related to the current mmap call + * @pfn: pfn to map + * @size: size to map + * @prot: pgprot to use in remap call + * @entry: mmap_entry retrieved from rdma_user_mmap_entry_get(), or NULL + * if mmap_entry is not used by the driver + * + * This is to be called by drivers as part of their mmap() functions if they + * wish to send something like PCI-E BAR memory to userspace. + * + * Return -EINVAL on wrong flags or size, -EAGAIN on failure to map. 0 on + * success. 
+ */
+int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
+		      unsigned long pfn, unsigned long size, pgprot_t prot,
+		      struct rdma_user_mmap_entry *entry)
+{
+	struct ib_uverbs_file *ufile = ucontext->ufile;
+	struct rdma_umap_priv *priv;
+
+	if (!(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	if (vma->vm_end - vma->vm_start != size)
+		return -EINVAL;
+
+	/* Driver is using this wrong, must be called by ib_uverbs_mmap */
+	if (WARN_ON(!vma->vm_file ||
+		    vma->vm_file->private_data != ufile))
+		return -EINVAL;
+	lockdep_assert_held(&ufile->device->disassociate_srcu);
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	vma->vm_page_prot = prot;
+	if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) {
+		kfree(priv);
+		return -EAGAIN;
+	}
+
+	rdma_umap_priv_init(priv, vma, entry);
+	return 0;
+}
+EXPORT_SYMBOL(rdma_user_mmap_io);
+
+/**
+ * rdma_user_mmap_entry_get_pgoff() - Get an entry from the mmap_xa
+ *
+ * @ucontext: associated user context
+ * @pgoff: The mmap offset >> PAGE_SHIFT
+ *
+ * This function is called when a user tries to mmap with an offset (returned
+ * by rdma_user_mmap_get_offset()) it initially received from the driver. The
+ * rdma_user_mmap_entry was created by the function
+ * rdma_user_mmap_entry_insert(). This function increases the refcnt of the
+ * entry so that it won't be deleted from the xarray in the meantime.
+ *
+ * Return a reference to an entry if one exists or NULL if there is no
+ * match. rdma_user_mmap_entry_put() must be called to put the reference.
+ */
+struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get_pgoff(struct ib_ucontext *ucontext,
+			       unsigned long pgoff)
+{
+	struct rdma_user_mmap_entry *entry;
+
+	if (pgoff > U32_MAX)
+		return NULL;
+
+	xa_lock(&ucontext->mmap_xa);
+
+	entry = xa_load(&ucontext->mmap_xa, pgoff);
+
+	/*
+	 * If refcount is zero, the entry is already being deleted;
+	 * driver_removed indicates that no further mmaps are possible and we
+	 * are waiting for the active VMAs to be closed.
+	 */
+	if (!entry || entry->start_pgoff != pgoff || entry->driver_removed ||
+	    !kref_get_unless_zero(&entry->ref))
+		goto err;
+
+	xa_unlock(&ucontext->mmap_xa);
+
+	ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] returned\n",
+		  pgoff, entry->npages);
+
+	return entry;
+
+err:
+	xa_unlock(&ucontext->mmap_xa);
+	return NULL;
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_get_pgoff);
+
+/**
+ * rdma_user_mmap_entry_get() - Get an entry from the mmap_xa
+ *
+ * @ucontext: associated user context
+ * @vma: the vma being mmap'd into
+ *
+ * This function is like rdma_user_mmap_entry_get_pgoff() except that it also
+ * checks that the VMA is correct.
+ */
+struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get(struct ib_ucontext *ucontext,
+			 struct vm_area_struct *vma)
+{
+	struct rdma_user_mmap_entry *entry;
+
+	if (!(vma->vm_flags & VM_SHARED))
+		return NULL;
+	entry = rdma_user_mmap_entry_get_pgoff(ucontext, vma->vm_pgoff);
+	if (!entry)
+		return NULL;
+	if (entry->npages * PAGE_SIZE != vma->vm_end - vma->vm_start) {
+		rdma_user_mmap_entry_put(entry);
+		return NULL;
+	}
+	return entry;
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_get);
+
+static void rdma_user_mmap_entry_free(struct kref *kref)
+{
+	struct rdma_user_mmap_entry *entry =
+		container_of(kref, struct rdma_user_mmap_entry, ref);
+	struct ib_ucontext *ucontext = entry->ucontext;
+	unsigned long i;
+
+	/*
+	 * Erase all entries occupied by this single entry, this is deferred
+	 * until all VMAs are closed so that the mmap offsets remain unique.
+	 */
+	xa_lock(&ucontext->mmap_xa);
+	for (i = 0; i < entry->npages; i++)
+		__xa_erase(&ucontext->mmap_xa, entry->start_pgoff + i);
+	xa_unlock(&ucontext->mmap_xa);
+
+	ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] removed\n",
+		  entry->start_pgoff, entry->npages);
+
+	if (ucontext->device->ops.mmap_free)
+		ucontext->device->ops.mmap_free(entry);
+}
+
+/**
+ * rdma_user_mmap_entry_put() - Drop reference to the mmap entry
+ *
+ * @entry: an entry in the mmap_xa
+ *
+ * This function is called when the mapping is closed if it was
+ * an io mapping or when the driver is done with the entry for
+ * some other reason.
+ * Should be called after rdma_user_mmap_entry_get was called
+ * and entry is no longer needed. This function will erase the
+ * entry and free it if its refcnt reaches zero.
+ */
+void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry)
+{
+	kref_put(&entry->ref, rdma_user_mmap_entry_free);
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_put);
+
+/**
+ * rdma_user_mmap_entry_remove() - Drop reference to entry and
+ * mark it as unmmapable
+ *
+ * @entry: the entry to insert into the mmap_xa
+ *
+ * Drivers can call this to prevent userspace from creating more mappings for
+ * entry, however existing mmaps continue to exist and ops->mmap_free() will
+ * not be called until all user mmaps are destroyed.
+ */
+void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry)
+{
+	if (!entry)
+		return;
+
+	xa_lock(&entry->ucontext->mmap_xa);
+	entry->driver_removed = true;
+	xa_unlock(&entry->ucontext->mmap_xa);
+	kref_put(&entry->ref, rdma_user_mmap_entry_free);
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_remove);
+
+/**
+ * rdma_user_mmap_entry_insert_range() - Insert an entry to the mmap_xa
+ * in a given range.
+ *
+ * @ucontext: associated user context.
+ * @entry: the entry to insert into the mmap_xa
+ * @length: length of the address that will be mmapped
+ * @min_pgoff: minimum pgoff to be returned
+ * @max_pgoff: maximum pgoff to be returned
+ *
+ * This function should be called by drivers that use the rdma_user_mmap
+ * interface for implementing their mmap syscall. A database of mmap offsets is
+ * handled in the core and helper functions are provided to insert entries
+ * into the database and extract entries when the user calls mmap with the
+ * given offset. The function allocates a unique page offset in a given range
+ * that should be provided to user, the user will use the offset to retrieve
+ * information such as address to be mapped and how.
+ * + * Return: 0 on success and -ENOMEM on failure + */ +int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext, + struct rdma_user_mmap_entry *entry, + size_t length, u32 min_pgoff, + u32 max_pgoff) +{ + struct ib_uverbs_file *ufile = ucontext->ufile; + XA_STATE(xas, &ucontext->mmap_xa, min_pgoff); + u32 xa_first, xa_last, npages; + int err; + u32 i; + + if (!entry) + return -EINVAL; + + kref_init(&entry->ref); + entry->ucontext = ucontext; + + /* + * We want the whole allocation to be done without interruption from a + * different thread. The allocation requires finding a free range and + * storing. During the xa_insert the lock could be released, possibly + * allowing another thread to choose the same range. + */ + mutex_lock(&ufile->umap_lock); + + xa_lock(&ucontext->mmap_xa); + + /* We want to find an empty range */ + npages = (u32)DIV_ROUND_UP(length, PAGE_SIZE); + entry->npages = npages; + while (true) { + /* First find an empty index */ + xas_find_marked(&xas, max_pgoff, XA_FREE_MARK); + if (xas.xa_node == XAS_RESTART) + goto err_unlock; + + xa_first = xas.xa_index; + + /* Is there enough room to have the range? */ + if (check_add_overflow(xa_first, npages, &xa_last)) + goto err_unlock; + + /* + * Now look for the next present entry. If an entry doesn't + * exist, we found an empty range and can proceed. + */ + xas_next_entry(&xas, xa_last - 1); + if (xas.xa_node == XAS_BOUNDS || xas.xa_index >= xa_last) + break; + } + + for (i = xa_first; i < xa_last; i++) { + err = __xa_insert(&ucontext->mmap_xa, i, entry, GFP_KERNEL); + if (err) + goto err_undo; + } + + /* + * Internally the kernel uses a page offset, in libc this is a byte + * offset. Drivers should not return pgoff to userspace. + */ + entry->start_pgoff = xa_first; + xa_unlock(&ucontext->mmap_xa); + mutex_unlock(&ufile->umap_lock); + + ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#x] inserted\n", + entry->start_pgoff, npages); + + return 0; + +err_undo: + for (; i > xa_first; i--) + __xa_erase(&ucontext->mmap_xa, i - 1); + +err_unlock: + xa_unlock(&ucontext->mmap_xa); + mutex_unlock(&ufile->umap_lock); + return -ENOMEM; +} +EXPORT_SYMBOL(rdma_user_mmap_entry_insert_range); + +/** + * rdma_user_mmap_entry_insert() - Insert an entry to the mmap_xa. + * + * @ucontext: associated user context. + * @entry: the entry to insert into the mmap_xa + * @length: length of the address that will be mmapped + * + * This function should be called by drivers that use the rdma_user_mmap + * interface for handling user mmapped addresses. The database is handled in + * the core and helper functions are provided to insert entries into the + * database and extract entries when the user calls mmap with the given offset. + * The function allocates a unique page offset that should be provided to user, + * the user will use the offset to retrieve information such as address to + * be mapped and how. 
+ * + * Return: 0 on success and -ENOMEM on failure + */ +int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext, + struct rdma_user_mmap_entry *entry, + size_t length) +{ + return rdma_user_mmap_entry_insert_range(ucontext, entry, length, 0, + U32_MAX); +} +EXPORT_SYMBOL(rdma_user_mmap_entry_insert); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 476abc74178e..62410578dec3 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -87,7 +87,8 @@ static struct rdma_nl_cbs iwcm_nl_cb_table[RDMA_NL_IWPM_NUM_OPS] = { [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb}, [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, - [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} + [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}, + [RDMA_NL_IWPM_HELLO] = {.dump = iwpm_hello_cb} }; static struct workqueue_struct *iwcm_wq; @@ -108,9 +109,10 @@ static struct ctl_table iwcm_ctl_table[] = { .data = &default_backlog, .maxlen = sizeof(default_backlog), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, - { } }; /* @@ -143,8 +145,8 @@ static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv) if (list_empty(&cm_id_priv->work_free_list)) return NULL; - work = list_entry(cm_id_priv->work_free_list.next, struct iwcm_work, - free_list); + work = list_first_entry(&cm_id_priv->work_free_list, struct iwcm_work, + free_list); list_del_init(&work->free_list); return work; } @@ -158,8 +160,10 @@ static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv) { struct list_head *e, *tmp; - list_for_each_safe(e, tmp, &cm_id_priv->work_free_list) + list_for_each_safe(e, tmp, &cm_id_priv->work_free_list) { + list_del(e); kfree(list_entry(e, struct iwcm_work, free_list)); + } } static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count) @@ -204,25 +208,24 @@ static void free_cm_id(struct iwcm_id_private *cm_id_priv) /* * Release a reference on cm_id. If the last reference is being - * released, free the cm_id and return 1. + * released, free the cm_id and return 'true'. */ -static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv) +static bool iwcm_deref_id(struct iwcm_id_private *cm_id_priv) { - BUG_ON(atomic_read(&cm_id_priv->refcount)==0); - if (atomic_dec_and_test(&cm_id_priv->refcount)) { + if (refcount_dec_and_test(&cm_id_priv->refcount)) { BUG_ON(!list_empty(&cm_id_priv->work_list)); free_cm_id(cm_id_priv); - return 1; + return true; } - return 0; + return false; } static void add_ref(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); - atomic_inc(&cm_id_priv->refcount); + refcount_inc(&cm_id_priv->refcount); } static void rem_ref(struct iw_cm_id *cm_id) @@ -254,7 +257,7 @@ struct iw_cm_id *iw_create_cm_id(struct ib_device *device, cm_id_priv->id.add_ref = add_ref; cm_id_priv->id.rem_ref = rem_ref; spin_lock_init(&cm_id_priv->lock); - atomic_set(&cm_id_priv->refcount, 1); + refcount_set(&cm_id_priv->refcount, 1); init_waitqueue_head(&cm_id_priv->connect_wait); init_completion(&cm_id_priv->destroy_comp); INIT_LIST_HEAD(&cm_id_priv->work_list); @@ -365,12 +368,12 @@ EXPORT_SYMBOL(iw_cm_disconnect); /* * CM_ID <-- DESTROYING * - * Clean up all resources associated with the connection and release - * the initial reference taken by iw_create_cm_id. 
+ * Clean up all resources associated with the connection. */ static void destroy_cm_id(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; + struct ib_qp *qp; unsigned long flags; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); @@ -388,19 +391,22 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) set_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags); spin_lock_irqsave(&cm_id_priv->lock, flags); + qp = cm_id_priv->qp; + cm_id_priv->qp = NULL; + switch (cm_id_priv->state) { case IW_CM_STATE_LISTEN: cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); /* destroy the listening endpoint */ - cm_id->device->iwcm->destroy_listen(cm_id); + cm_id->device->ops.iw_destroy_listen(cm_id); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_ESTABLISHED: cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); /* Abrupt close of the connection */ - (void)iwcm_modify_qp_err(cm_id_priv->qp); + (void)iwcm_modify_qp_err(qp); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_IDLE: @@ -416,7 +422,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) */ cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); - cm_id->device->iwcm->reject(cm_id, NULL, 0); + cm_id->device->ops.iw_reject(cm_id, NULL, 0); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_CONN_SENT: @@ -425,29 +431,30 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) BUG(); break; } - if (cm_id_priv->qp) { - cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); - cm_id_priv->qp = NULL; - } spin_unlock_irqrestore(&cm_id_priv->lock, flags); + if (qp) + cm_id_priv->id.device->ops.iw_rem_ref(qp); if (cm_id->mapped) { iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr); iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM); } - - (void)iwcm_deref_id(cm_id_priv); } /* - * This function is only called by the application thread and cannot - * be called by the event thread. The function will wait for all - * references to be released on the cm_id and then kfree the cm_id - * object. + * Destroy cm_id. If the cm_id still has other references, wait for all + * references to be released on the cm_id and then release the initial + * reference taken by iw_create_cm_id. 
*/ void iw_destroy_cm_id(struct iw_cm_id *cm_id) { + struct iwcm_id_private *cm_id_priv; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); destroy_cm_id(cm_id); + if (refcount_read(&cm_id_priv->refcount) > 1) + flush_workqueue(iwcm_wq); + iwcm_deref_id(cm_id_priv); } EXPORT_SYMBOL(iw_destroy_cm_id); @@ -503,8 +510,8 @@ static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr, static int iw_cm_map(struct iw_cm_id *cm_id, bool active) { const char *devname = dev_name(&cm_id->device->dev); - const char *ifname = cm_id->device->iwcm->ifname; - struct iwpm_dev_data pm_reg_msg; + const char *ifname = cm_id->device->iw_ifname; + struct iwpm_dev_data pm_reg_msg = {}; struct iwpm_sa_data pm_msg; int status; @@ -515,8 +522,8 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) cm_id->m_local_addr = cm_id->local_addr; cm_id->m_remote_addr = cm_id->remote_addr; - strncpy(pm_reg_msg.dev_name, devname, sizeof(pm_reg_msg.dev_name)); - strncpy(pm_reg_msg.if_name, ifname, sizeof(pm_reg_msg.if_name)); + strcpy(pm_reg_msg.dev_name, devname); + strcpy(pm_reg_msg.if_name, ifname); if (iwpm_register_pid(&pm_reg_msg, RDMA_NL_IWCM) || !iwpm_valid_pid()) @@ -525,6 +532,8 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) cm_id->mapped = true; pm_msg.loc_addr = cm_id->local_addr; pm_msg.rem_addr = cm_id->remote_addr; + pm_msg.flags = (cm_id->device->iw_driver_flags & IW_F_NO_PORT_MAP) ? + IWPM_FLAGS_NO_PORT_MAP : 0; if (active) status = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_IWCM); @@ -543,7 +552,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) return iwpm_create_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr, - RDMA_NL_IWCM); + RDMA_NL_IWCM, pm_msg.flags); } /* @@ -574,7 +583,8 @@ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog) spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = iw_cm_map(cm_id, false); if (!ret) - ret = cm_id->device->iwcm->create_listen(cm_id, backlog); + ret = cm_id->device->ops.iw_create_listen(cm_id, + backlog); if (ret) cm_id_priv->state = IW_CM_STATE_IDLE; spin_lock_irqsave(&cm_id_priv->lock, flags); @@ -614,7 +624,7 @@ int iw_cm_reject(struct iw_cm_id *cm_id, cm_id_priv->state = IW_CM_STATE_IDLE; spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ret = cm_id->device->iwcm->reject(cm_id, private_data, + ret = cm_id->device->ops.iw_reject(cm_id, private_data, private_data_len); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); @@ -650,28 +660,28 @@ int iw_cm_accept(struct iw_cm_id *cm_id, return -EINVAL; } /* Get the ib_qp given the QPN */ - qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); + qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn); if (!qp) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } - cm_id->device->iwcm->add_ref(qp); + cm_id->device->ops.iw_add_ref(qp); cm_id_priv->qp = qp; spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ret = cm_id->device->iwcm->accept(cm_id, iw_param); + ret = cm_id->device->ops.iw_accept(cm_id, iw_param); if (ret) { /* An error on accept precludes provider events */ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV); cm_id_priv->state = IW_CM_STATE_IDLE; spin_lock_irqsave(&cm_id_priv->lock, flags); - if (cm_id_priv->qp) { - cm_id->device->iwcm->rem_ref(qp); - cm_id_priv->qp = NULL; - } + qp = cm_id_priv->qp; + cm_id_priv->qp = NULL; spin_unlock_irqrestore(&cm_id_priv->lock, flags); + if (qp) + cm_id->device->ops.iw_rem_ref(qp); 
clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); } @@ -692,7 +702,7 @@ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) struct iwcm_id_private *cm_id_priv; int ret; unsigned long flags; - struct ib_qp *qp; + struct ib_qp *qp = NULL; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); @@ -709,30 +719,30 @@ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) } /* Get the ib_qp given the QPN */ - qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); + qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn); if (!qp) { ret = -EINVAL; goto err; } - cm_id->device->iwcm->add_ref(qp); + cm_id->device->ops.iw_add_ref(qp); cm_id_priv->qp = qp; cm_id_priv->state = IW_CM_STATE_CONN_SENT; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = iw_cm_map(cm_id, true); if (!ret) - ret = cm_id->device->iwcm->connect(cm_id, iw_param); + ret = cm_id->device->ops.iw_connect(cm_id, iw_param); if (!ret) return 0; /* success */ spin_lock_irqsave(&cm_id_priv->lock, flags); - if (cm_id_priv->qp) { - cm_id->device->iwcm->rem_ref(qp); - cm_id_priv->qp = NULL; - } + qp = cm_id_priv->qp; + cm_id_priv->qp = NULL; cm_id_priv->state = IW_CM_STATE_IDLE; err: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + if (qp) + cm_id->device->ops.iw_rem_ref(qp); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return ret; @@ -874,6 +884,7 @@ static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv, static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { + struct ib_qp *qp = NULL; unsigned long flags; int ret; @@ -892,11 +903,13 @@ static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv, cm_id_priv->state = IW_CM_STATE_ESTABLISHED; } else { /* REJECTED or RESET */ - cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); + qp = cm_id_priv->qp; cm_id_priv->qp = NULL; cm_id_priv->state = IW_CM_STATE_IDLE; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); + if (qp) + cm_id_priv->id.device->ops.iw_rem_ref(qp); ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); if (iw_event->private_data_len) @@ -938,21 +951,18 @@ static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv, static int cm_close_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { + struct ib_qp *qp; unsigned long flags; - int ret = 0; + int ret = 0, notify_event = 0; spin_lock_irqsave(&cm_id_priv->lock, flags); + qp = cm_id_priv->qp; + cm_id_priv->qp = NULL; - if (cm_id_priv->qp) { - cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); - cm_id_priv->qp = NULL; - } switch (cm_id_priv->state) { case IW_CM_STATE_ESTABLISHED: case IW_CM_STATE_CLOSING: cm_id_priv->state = IW_CM_STATE_IDLE; - spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); - spin_lock_irqsave(&cm_id_priv->lock, flags); + notify_event = 1; break; case IW_CM_STATE_DESTROYING: break; @@ -961,6 +971,10 @@ static int cm_close_handler(struct iwcm_id_private *cm_id_priv, } spin_unlock_irqrestore(&cm_id_priv->lock, flags); + if (qp) + cm_id_priv->id.device->ops.iw_rem_ref(qp); + if (notify_event) + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); return ret; } @@ -1007,30 +1021,27 @@ static void cm_work_handler(struct work_struct *_work) struct iw_cm_event levent; struct iwcm_id_private *cm_id_priv = work->cm_id; unsigned long flags; - int empty; int ret = 0; 
spin_lock_irqsave(&cm_id_priv->lock, flags); - empty = list_empty(&cm_id_priv->work_list); - while (!empty) { - work = list_entry(cm_id_priv->work_list.next, - struct iwcm_work, list); + while (!list_empty(&cm_id_priv->work_list)) { + work = list_first_entry(&cm_id_priv->work_list, + struct iwcm_work, list); list_del_init(&work->list); - empty = list_empty(&cm_id_priv->work_list); levent = work->event; put_work(work); spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { ret = process_event(cm_id_priv, &levent); - if (ret) + if (ret) { destroy_cm_id(&cm_id_priv->id); + WARN_ON_ONCE(iwcm_deref_id(cm_id_priv)); + } } else pr_debug("dropping event %d\n", levent.event); if (iwcm_deref_id(cm_id_priv)) return; - if (empty) - return; spin_lock_irqsave(&cm_id_priv->lock, flags); } spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -1082,12 +1093,9 @@ static int cm_event_handler(struct iw_cm_id *cm_id, } } - atomic_inc(&cm_id_priv->refcount); - if (list_empty(&cm_id_priv->work_list)) { - list_add_tail(&work->list, &cm_id_priv->work_list); - queue_work(iwcm_wq, &work->work); - } else - list_add_tail(&work->list, &cm_id_priv->work_list); + refcount_inc(&cm_id_priv->refcount); + list_add_tail(&work->list, &cm_id_priv->work_list); + queue_work(iwcm_wq, &work->work); out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; @@ -1175,29 +1183,34 @@ static int __init iw_cm_init(void) ret = iwpm_init(RDMA_NL_IWCM); if (ret) - pr_err("iw_cm: couldn't init iwpm\n"); - else - rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table); - iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", 0); + return ret; + + iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM); if (!iwcm_wq) - return -ENOMEM; + goto err_alloc; iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm", iwcm_ctl_table); if (!iwcm_ctl_table_hdr) { pr_err("iw_cm: couldn't register sysctl paths\n"); - destroy_workqueue(iwcm_wq); - return -ENOMEM; + goto err_sysctl; } + rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table); return 0; + +err_sysctl: + destroy_workqueue(iwcm_wq); +err_alloc: + iwpm_exit(RDMA_NL_IWCM); + return -ENOMEM; } static void __exit iw_cm_cleanup(void) { + rdma_nl_unregister(RDMA_NL_IWCM); unregister_net_sysctl_table(iwcm_ctl_table_hdr); destroy_workqueue(iwcm_wq); - rdma_nl_unregister(RDMA_NL_IWCM); iwpm_exit(RDMA_NL_IWCM); } diff --git a/drivers/infiniband/core/iwcm.h b/drivers/infiniband/core/iwcm.h index 82c2cd1b0a80..bf74639be128 100644 --- a/drivers/infiniband/core/iwcm.h +++ b/drivers/infiniband/core/iwcm.h @@ -52,7 +52,7 @@ struct iwcm_id_private { wait_queue_head_t connect_wait; struct list_head work_list; spinlock_t lock; - atomic_t refcount; + refcount_t refcount; struct list_head work_free_list; }; diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index 8861c052155a..3c9a9869212b 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -34,18 +34,25 @@ #include "iwpm_util.h" static const char iwpm_ulib_name[IWPM_ULIBNAME_SIZE] = "iWarpPortMapperUser"; -static int iwpm_ulib_version = 3; +u16 iwpm_ulib_version = IWPM_UABI_VERSION_MIN; static int iwpm_user_pid = IWPM_PID_UNDEFINED; static atomic_t echo_nlmsg_seq; +/** + * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid + * + * Returns true if the pid is greater than zero, otherwise returns false + */ int iwpm_valid_pid(void) { return iwpm_user_pid > 0; } -/* - * iwpm_register_pid - Send a netlink query to user space - 
* for the iwarp port mapper pid +/** + * iwpm_register_pid - Send a netlink query to userspace + * to get the iwarp port mapper pid + * @pm_msg: Contains driver info to send to the userspace port mapper + * @nl_client: The index of the netlink client * * nlmsg attributes: * [IWPM_NLA_REG_PID_SEQ] @@ -62,10 +69,6 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) const char *err_str = ""; int ret = -EINVAL; - if (!iwpm_valid_client(nl_client)) { - err_str = "Invalid port mapper client"; - goto pid_query_error; - } if (iwpm_check_registration(nl_client, IWPM_REG_VALID) || iwpm_user_pid == IWPM_PID_UNAVAILABLE) return 0; @@ -105,7 +108,7 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n", __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name); - ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL); + ret = rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNAVAILABLE; @@ -116,20 +119,26 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) ret = iwpm_wait_complete_req(nlmsg_request); return ret; pid_query_error: - pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); - if (skb) - dev_kfree_skb(skb); + pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client); + dev_kfree_skb(skb); if (nlmsg_request) iwpm_free_nlmsg_request(&nlmsg_request->kref); return ret; } -/* - * iwpm_add_mapping - Send a netlink add mapping message - * to the port mapper +/** + * iwpm_add_mapping - Send a netlink add mapping request to + * the userspace port mapper + * @pm_msg: Contains the local ip/tcp address info to send + * @nl_client: The index of the netlink client + * * nlmsg attributes: * [IWPM_NLA_MANAGE_MAPPING_SEQ] * [IWPM_NLA_MANAGE_ADDR] + * [IWPM_NLA_MANAGE_FLAGS] + * + * If the request is successful, the pm_msg stores + * the port mapper response (mapped address info) */ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) { @@ -140,10 +149,6 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) const char *err_str = ""; int ret = -EINVAL; - if (!iwpm_valid_client(nl_client)) { - err_str = "Invalid port mapper client"; - goto add_mapping_error; - } if (!iwpm_valid_pid()) return 0; if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) { @@ -173,10 +178,22 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) if (ret) goto add_mapping_error; + /* If flags are required and we're not V4, then return a quiet error */ + if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) { + ret = -EINVAL; + goto add_mapping_error_nowarn; + } + if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags, + IWPM_NLA_MANAGE_FLAGS); + if (ret) + goto add_mapping_error; + } + nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; - ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); + ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; @@ -186,21 +203,25 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) ret = iwpm_wait_complete_req(nlmsg_request); return ret; add_mapping_error: - pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); - if (skb) - dev_kfree_skb(skb); + pr_info("%s: %s (client = %u)\n", __func__, err_str, 
nl_client); +add_mapping_error_nowarn: + dev_kfree_skb(skb); if (nlmsg_request) iwpm_free_nlmsg_request(&nlmsg_request->kref); return ret; } -/* - * iwpm_add_and_query_mapping - Send a netlink add and query - * mapping message to the port mapper +/** + * iwpm_add_and_query_mapping - Process the port mapper response to + * iwpm_add_and_query_mapping request + * @pm_msg: Contains the local ip/tcp address info to send + * @nl_client: The index of the netlink client + * * nlmsg attributes: * [IWPM_NLA_QUERY_MAPPING_SEQ] * [IWPM_NLA_QUERY_LOCAL_ADDR] * [IWPM_NLA_QUERY_REMOTE_ADDR] + * [IWPM_NLA_QUERY_FLAGS] */ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) { @@ -211,10 +232,6 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) const char *err_str = ""; int ret = -EINVAL; - if (!iwpm_valid_client(nl_client)) { - err_str = "Invalid port mapper client"; - goto query_mapping_error; - } if (!iwpm_valid_pid()) return 0; if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) { @@ -251,10 +268,22 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) if (ret) goto query_mapping_error; + /* If flags are required and we're not V4, then return a quite error */ + if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) { + ret = -EINVAL; + goto query_mapping_error_nowarn; + } + if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags, + IWPM_NLA_QUERY_FLAGS); + if (ret) + goto query_mapping_error; + } + nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; - ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); + ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ err_str = "Unable to send a nlmsg"; @@ -263,17 +292,21 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) ret = iwpm_wait_complete_req(nlmsg_request); return ret; query_mapping_error: - pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); - if (skb) - dev_kfree_skb(skb); + pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client); +query_mapping_error_nowarn: + dev_kfree_skb(skb); if (nlmsg_request) iwpm_free_nlmsg_request(&nlmsg_request->kref); return ret; } -/* - * iwpm_remove_mapping - Send a netlink remove mapping message - * to the port mapper +/** + * iwpm_remove_mapping - Send a netlink remove mapping request + * to the userspace port mapper + * + * @local_addr: Local ip/tcp address to remove + * @nl_client: The index of the netlink client + * * nlmsg attributes: * [IWPM_NLA_MANAGE_MAPPING_SEQ] * [IWPM_NLA_MANAGE_ADDR] @@ -286,10 +319,6 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) const char *err_str = ""; int ret = -EINVAL; - if (!iwpm_valid_client(nl_client)) { - err_str = "Invalid port mapper client"; - goto remove_mapping_error; - } if (!iwpm_valid_pid()) return 0; if (iwpm_check_registration(nl_client, IWPM_REG_UNDEF)) { @@ -316,7 +345,7 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) nlmsg_end(skb, nlh); - ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); + ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; @@ -327,7 +356,7 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) "remove_mapping: Local sockaddr:"); return 0; remove_mapping_error: - pr_info("%s: %s (client = %d)\n", 
__func__, err_str, nl_client); + pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client); if (skb) dev_kfree_skb_any(skb); return ret; @@ -344,9 +373,14 @@ static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = { [IWPM_NLA_RREG_PID_ERR] = { .type = NLA_U16 } }; -/* - * iwpm_register_pid_cb - Process a port mapper response to - * iwpm_register_pid() +/** + * iwpm_register_pid_cb - Process the port mapper response to + * iwpm_register_pid query + * @skb: The socket buffer + * @cb: Contains the received message (payload and netlink header) + * + * If successful, the function receives the userspace port mapper pid + * which is used in future communication with the port mapper */ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -379,19 +413,22 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) /* check device name, ulib name and version */ if (strcmp(pm_msg->dev_name, dev_name) || strcmp(iwpm_ulib_name, iwpm_name) || - iwpm_version != iwpm_ulib_version) { + iwpm_version < IWPM_UABI_VERSION_MIN) { - pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n", + pr_info("%s: Incorrect info (dev = %s name = %s version = %u)\n", __func__, dev_name, iwpm_name, iwpm_version); nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; goto register_pid_response_exit; } iwpm_user_pid = cb->nlh->nlmsg_pid; + iwpm_ulib_version = iwpm_version; + if (iwpm_ulib_version < IWPM_UABI_VERSION) + pr_warn_once("%s: Down level iwpmd/pid %d. Continuing...", + __func__, iwpm_user_pid); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", __func__, iwpm_user_pid); - if (iwpm_valid_client(nl_client)) - iwpm_set_registration(nl_client, IWPM_REG_VALID); + iwpm_set_registration(nl_client, IWPM_REG_VALID); register_pid_response_exit: nlmsg_request->request_done = 1; /* always for found nlmsg_request */ @@ -403,15 +440,19 @@ register_pid_response_exit: /* netlink attribute policy for the received response to add mapping request */ static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = { - [IWPM_NLA_MANAGE_MAPPING_SEQ] = { .type = NLA_U32 }, - [IWPM_NLA_MANAGE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 } + [IWPM_NLA_RMANAGE_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_RMANAGE_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 } }; -/* - * iwpm_add_mapping_cb - Process a port mapper response to - * iwpm_add_mapping() +/** + * iwpm_add_mapping_cb - Process the port mapper response to + * iwpm_add_mapping request + * @skb: The socket buffer + * @cb: Contains the received message (payload and netlink header) */ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -430,7 +471,7 @@ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); - msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]); + msg_seq = nla_get_u32(nltb[IWPM_NLA_RMANAGE_MAPPING_SEQ]); nlmsg_request = iwpm_find_nlmsg_request(msg_seq); if (!nlmsg_request) { pr_info("%s: Could not find a matching request (seq = %u)\n", @@ -439,9 +480,9 @@ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) } pm_msg = 
nlmsg_request->req_buffer; local_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_MANAGE_ADDR]); + nla_data(nltb[IWPM_NLA_RMANAGE_ADDR]); mapped_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]); + nla_data(nltb[IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR]); if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) { nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; @@ -470,19 +511,26 @@ add_mapping_response_exit: } /* netlink attribute policy for the response to add and query mapping request - * and response with remote address info */ + * and response with remote address info + */ static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = { - [IWPM_NLA_QUERY_MAPPING_SEQ] = { .type = NLA_U32 }, - [IWPM_NLA_QUERY_LOCAL_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_QUERY_REMOTE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_RQUERY_LOCAL_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_REMOTE_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, [IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 } }; -/* - * iwpm_add_and_query_mapping_cb - Process a port mapper response to - * iwpm_add_and_query_mapping() +/** + * iwpm_add_and_query_mapping_cb - Process the port mapper response to + * iwpm_add_and_query_mapping request + * @skb: The socket buffer + * @cb: Contains the received message (payload and netlink header) */ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) @@ -502,7 +550,7 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, return -EINVAL; atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); - msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]); + msg_seq = nla_get_u32(nltb[IWPM_NLA_RQUERY_MAPPING_SEQ]); nlmsg_request = iwpm_find_nlmsg_request(msg_seq); if (!nlmsg_request) { pr_info("%s: Could not find a matching request (seq = %u)\n", @@ -511,9 +559,9 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, } pm_msg = nlmsg_request->req_buffer; local_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]); remote_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]); mapped_loc_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); mapped_rem_sockaddr = (struct sockaddr_storage *) @@ -560,9 +608,13 @@ query_mapping_response_exit: return 0; } -/* - * iwpm_remote_info_cb - Process a port mapper message, containing - * the remote connecting peer address info +/** + * iwpm_remote_info_cb - Process remote connecting peer address info, which + * the port mapper has received from the connecting peer + * @skb: The socket buffer + * @cb: Contains the received message (payload and netlink header) + * + * Stores the IPv4/IPv6 address info in a hash table */ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -580,17 +632,12 @@ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb) return ret; nl_client = 
RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); - if (!iwpm_valid_client(nl_client)) { - pr_info("%s: Invalid port mapper client = %d\n", - __func__, nl_client); - return ret; - } atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); local_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]); remote_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]); mapped_loc_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); mapped_rem_sockaddr = (struct sockaddr_storage *) @@ -635,8 +682,14 @@ static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = { [IWPM_NLA_MAPINFO_ULIB_VER] = { .type = NLA_U16 } }; -/* - * iwpm_mapping_info_cb - Process a port mapper request for mapping info +/** + * iwpm_mapping_info_cb - Process a notification that the userspace + * port mapper daemon is started + * @skb: The socket buffer + * @cb: Contains the received message (payload and netlink header) + * + * Using the received port mapper pid, send all the local mapping + * info records to the userspace port mapper */ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -655,20 +708,20 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]); iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]); if (strcmp(iwpm_ulib_name, iwpm_name) || - iwpm_version != iwpm_ulib_version) { - pr_info("%s: Invalid port mapper name = %s version = %d\n", + iwpm_version < IWPM_UABI_VERSION_MIN) { + pr_info("%s: Invalid port mapper name = %s version = %u\n", __func__, iwpm_name, iwpm_version); return ret; } nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); - if (!iwpm_valid_client(nl_client)) { - pr_info("%s: Invalid port mapper client = %d\n", - __func__, nl_client); - return ret; - } iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); iwpm_user_pid = cb->nlh->nlmsg_pid; + + if (iwpm_ulib_version < IWPM_UABI_VERSION) + pr_warn_once("%s: Down level iwpmd/pid %d. 
Continuing...", + __func__, iwpm_user_pid); + if (!iwpm_mapinfo_available()) return 0; pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", @@ -684,9 +737,11 @@ static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = { [IWPM_NLA_MAPINFO_ACK_NUM] = { .type = NLA_U32 } }; -/* - * iwpm_ack_mapping_info_cb - Process a port mapper ack for - * the provided mapping info records +/** + * iwpm_ack_mapping_info_cb - Process the port mapper ack for + * the provided local mapping info records + * @skb: The socket buffer + * @cb: Contains the received message (payload and netlink header) */ int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -712,8 +767,11 @@ static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = { [IWPM_NLA_ERR_CODE] = { .type = NLA_U16 }, }; -/* - * iwpm_mapping_error_cb - Process a port mapper error message +/** + * iwpm_mapping_error_cb - Process port mapper notification for error + * + * @skb: The socket buffer + * @cb: Contains the received message (payload and netlink header) */ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -748,3 +806,41 @@ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) up(&nlmsg_request->sem); return 0; } + +/* netlink attribute policy for the received hello request */ +static const struct nla_policy hello_policy[IWPM_NLA_HELLO_MAX] = { + [IWPM_NLA_HELLO_ABI_VERSION] = { .type = NLA_U16 } +}; + +/** + * iwpm_hello_cb - Process a hello message from iwpmd + * + * @skb: The socket buffer + * @cb: Contains the received message (payload and netlink header) + * + * Using the received port mapper pid, send the kernel's abi_version + * after adjusting it to support the iwpmd version. + */ +int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_HELLO_MAX]; + const char *msg_type = "Hello request"; + u8 nl_client; + u16 abi_version; + int ret = -EINVAL; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_HELLO_MAX, hello_policy, nltb, + msg_type)) { + pr_info("%s: Unable to parse nlmsg\n", __func__); + return ret; + } + abi_version = nla_get_u16(nltb[IWPM_NLA_HELLO_ABI_VERSION]); + nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + iwpm_ulib_version = min_t(u16, IWPM_UABI_VERSION, abi_version); + pr_debug("Using ABI version %u\n", iwpm_ulib_version); + iwpm_user_pid = cb->nlh->nlmsg_pid; + ret = iwpm_send_hello(nl_client, iwpm_user_pid, iwpm_ulib_version); + return ret; +} diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index cdb63f3f4de7..eecb369898f5 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -48,63 +48,47 @@ static DEFINE_SPINLOCK(iwpm_mapinfo_lock); static struct hlist_head *iwpm_reminfo_bucket; static DEFINE_SPINLOCK(iwpm_reminfo_lock); -static DEFINE_MUTEX(iwpm_admin_lock); static struct iwpm_admin_data iwpm_admin; +/** + * iwpm_init - Allocate resources for the iwarp port mapper + * @nl_client: The index of the netlink client + * + * Should be called when network interface goes up. 
+ */ int iwpm_init(u8 nl_client) { - int ret = 0; - mutex_lock(&iwpm_admin_lock); - if (atomic_read(&iwpm_admin.refcount) == 0) { - iwpm_hash_bucket = kcalloc(IWPM_MAPINFO_HASH_SIZE, - sizeof(struct hlist_head), - GFP_KERNEL); - if (!iwpm_hash_bucket) { - ret = -ENOMEM; - goto init_exit; - } - iwpm_reminfo_bucket = kcalloc(IWPM_REMINFO_HASH_SIZE, - sizeof(struct hlist_head), - GFP_KERNEL); - if (!iwpm_reminfo_bucket) { - kfree(iwpm_hash_bucket); - ret = -ENOMEM; - goto init_exit; - } - } - atomic_inc(&iwpm_admin.refcount); -init_exit: - mutex_unlock(&iwpm_admin_lock); - if (!ret) { - iwpm_set_valid(nl_client, 1); - iwpm_set_registration(nl_client, IWPM_REG_UNDEF); - pr_debug("%s: Mapinfo and reminfo tables are created\n", - __func__); + iwpm_hash_bucket = kcalloc(IWPM_MAPINFO_HASH_SIZE, + sizeof(struct hlist_head), GFP_KERNEL); + if (!iwpm_hash_bucket) + return -ENOMEM; + + iwpm_reminfo_bucket = kcalloc(IWPM_REMINFO_HASH_SIZE, + sizeof(struct hlist_head), GFP_KERNEL); + if (!iwpm_reminfo_bucket) { + kfree(iwpm_hash_bucket); + return -ENOMEM; } - return ret; + + iwpm_set_registration(nl_client, IWPM_REG_UNDEF); + pr_debug("%s: Mapinfo and reminfo tables are created\n", __func__); + return 0; } static void free_hash_bucket(void); static void free_reminfo_bucket(void); +/** + * iwpm_exit - Deallocate resources for the iwarp port mapper + * @nl_client: The index of the netlink client + * + * Should be called when network interface goes down. + */ int iwpm_exit(u8 nl_client) { - - if (!iwpm_valid_client(nl_client)) - return -EINVAL; - mutex_lock(&iwpm_admin_lock); - if (atomic_read(&iwpm_admin.refcount) == 0) { - mutex_unlock(&iwpm_admin_lock); - pr_err("%s Incorrect usage - negative refcount\n", __func__); - return -EINVAL; - } - if (atomic_dec_and_test(&iwpm_admin.refcount)) { - free_hash_bucket(); - free_reminfo_bucket(); - pr_debug("%s: Resources are destroyed\n", __func__); - } - mutex_unlock(&iwpm_admin_lock); - iwpm_set_valid(nl_client, 0); + free_hash_bucket(); + free_reminfo_bucket(); + pr_debug("%s: Resources are destroyed\n", __func__); iwpm_set_registration(nl_client, IWPM_REG_UNDEF); return 0; } @@ -112,17 +96,23 @@ int iwpm_exit(u8 nl_client) static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *, struct sockaddr_storage *); +/** + * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address + * info in a hash table + * @local_sockaddr: Local ip/tcp address + * @mapped_sockaddr: Mapped local ip/tcp address + * @nl_client: The index of the netlink client + * @map_flags: IWPM mapping flags + */ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_sockaddr, - u8 nl_client) + u8 nl_client, u32 map_flags) { struct hlist_head *hash_bucket_head = NULL; struct iwpm_mapping_info *map_info; unsigned long flags; int ret = -EINVAL; - if (!iwpm_valid_client(nl_client)) - return ret; map_info = kzalloc(sizeof(struct iwpm_mapping_info), GFP_KERNEL); if (!map_info) return -ENOMEM; @@ -132,6 +122,7 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, memcpy(&map_info->mapped_sockaddr, mapped_sockaddr, sizeof(struct sockaddr_storage)); map_info->nl_client = nl_client; + map_info->map_flags = map_flags; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); if (iwpm_hash_bucket) { @@ -150,6 +141,15 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, return ret; } +/** + * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address + * info from the hash table + * @local_sockaddr: Local ip/tcp address + * 
@mapped_local_addr: Mapped local ip/tcp address + * + * Returns err code if mapping info is not found in the hash table, + * otherwise returns 0 + */ int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_local_addr) { @@ -250,6 +250,17 @@ void iwpm_add_remote_info(struct iwpm_remote_info *rem_info) spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); } +/** + * iwpm_get_remote_info - Get the remote connecting peer address info + * + * @mapped_loc_addr: Mapped local address of the listening peer + * @mapped_rem_addr: Mapped remote address of the connecting peer + * @remote_addr: To store the remote address of the connecting peer + * @nl_client: The index of the netlink client + * + * The remote address info is retrieved and provided to the client in + * the remote_addr. After that it is removed from the hash table + */ int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, struct sockaddr_storage *mapped_rem_addr, struct sockaddr_storage *remote_addr, @@ -261,10 +272,6 @@ int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, unsigned long flags; int ret = -EINVAL; - if (!iwpm_valid_client(nl_client)) { - pr_info("%s: Invalid client = %d\n", __func__, nl_client); - return ret; - } spin_lock_irqsave(&iwpm_reminfo_lock, flags); if (iwpm_reminfo_bucket) { hash_bucket_head = get_reminfo_hash_bucket( @@ -300,7 +307,7 @@ get_remote_info_exit: struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, u8 nl_client, gfp_t gfp) { - struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct iwpm_nlmsg_request *nlmsg_request; unsigned long flags; nlmsg_request = kzalloc(sizeof(struct iwpm_nlmsg_request), gfp); @@ -379,16 +386,6 @@ int iwpm_get_nlmsg_seq(void) return atomic_inc_return(&iwpm_admin.nlmsg_seq); } -int iwpm_valid_client(u8 nl_client) -{ - return iwpm_admin.client_list[nl_client]; -} - -void iwpm_set_valid(u8 nl_client, int valid) -{ - iwpm_admin.client_list[nl_client] = valid; -} - /* valid client */ u32 iwpm_get_registration(u8 nl_client) { @@ -465,14 +462,14 @@ int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, int ret; const char *err_str = ""; - ret = nlmsg_validate(cb->nlh, nlh_len, policy_max - 1, nlmsg_policy, - NULL); + ret = nlmsg_validate_deprecated(cb->nlh, nlh_len, policy_max - 1, + nlmsg_policy, NULL); if (ret) { err_str = "Invalid attribute"; goto parse_nlmsg_error; } - ret = nlmsg_parse(cb->nlh, nlh_len, nltb, policy_max - 1, - nlmsg_policy, NULL); + ret = nlmsg_parse_deprecated(cb->nlh, nlh_len, nltb, policy_max - 1, + nlmsg_policy, NULL); if (ret) { err_str = "Unable to parse the nlmsg"; goto parse_nlmsg_error; @@ -604,18 +601,17 @@ static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) nlmsg_end(skb, nlh); - ret = rdma_nl_unicast(skb, iwpm_pid); + ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) { skb = NULL; err_str = "Unable to send a nlmsg"; goto mapinfo_num_error; } - pr_debug("%s: Sent mapping number = %d\n", __func__, mapping_num); + pr_debug("%s: Sent mapping number = %u\n", __func__, mapping_num); return 0; mapinfo_num_error: pr_info("%s: %s\n", __func__, err_str); - if (skb) - dev_kfree_skb(skb); + dev_kfree_skb(skb); return ret; } @@ -633,7 +629,7 @@ static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) return -ENOMEM; } nlh->nlmsg_type = NLMSG_DONE; - ret = rdma_nl_unicast(skb, iwpm_pid); + ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) pr_warn("%s Unable to send a nlmsg\n", __func__); return ret; @@ -686,6 +682,14 
@@ int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) if (ret) goto send_mapping_info_unlock; + if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { + ret = ibnl_put_attr(skb, nlh, sizeof(u32), + &map_info->map_flags, + IWPM_NLA_MAPINFO_FLAGS); + if (ret) + goto send_mapping_info_unlock; + } + nlmsg_end(skb, nlh); iwpm_print_sockaddr(&map_info->local_sockaddr, @@ -729,8 +733,7 @@ send_mapping_info_unlock: send_mapping_info_exit: if (ret) { pr_warn("%s: %s (ret = %d)\n", __func__, err_str, ret); - if (skb) - dev_kfree_skb(skb); + dev_kfree_skb(skb); return ret; } send_nlmsg_done(skb, nl_client, iwpm_pid); @@ -754,3 +757,37 @@ int iwpm_mapinfo_available(void) spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return full_bucket; } + +int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + const char *err_str; + int ret = -EINVAL; + + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_HELLO, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto hello_num_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + err_str = "Unable to put attribute of abi_version into nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u16), &abi_version, + IWPM_NLA_HELLO_ABI_VERSION); + if (ret) + goto hello_num_error; + nlmsg_end(skb, nlh); + + ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); + if (ret) { + skb = NULL; + err_str = "Unable to send a nlmsg"; + goto hello_num_error; + } + pr_debug("%s: Sent hello abi_version = %u\n", __func__, abi_version); + return 0; +hello_num_error: + pr_info("%s: %s\n", __func__, err_str); + dev_kfree_skb(skb); + return ret; +} diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h index af1fc14a0d3d..d6fc8402158a 100644 --- a/drivers/infiniband/core/iwpm_util.h +++ b/drivers/infiniband/core/iwpm_util.h @@ -33,7 +33,6 @@ #ifndef _IWPM_UTIL_H #define _IWPM_UTIL_H -#include <linux/module.h> #include <linux/io.h> #include <linux/in.h> #include <linux/in6.h> @@ -78,6 +77,7 @@ struct iwpm_mapping_info { struct sockaddr_storage local_sockaddr; struct sockaddr_storage mapped_sockaddr; u8 nl_client; + u32 map_flags; }; struct iwpm_remote_info { @@ -89,9 +89,7 @@ struct iwpm_remote_info { }; struct iwpm_admin_data { - atomic_t refcount; atomic_t nlmsg_seq; - int client_list[RDMA_NL_NUM_CLIENTS]; u32 reg_list[RDMA_NL_NUM_CLIENTS]; }; @@ -140,29 +138,13 @@ int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request); int iwpm_get_nlmsg_seq(void); /** - * iwpm_add_reminfo - Add remote address info of the connecting peer + * iwpm_add_remote_info - Add remote address info of the connecting peer * to the remote info hash table * @reminfo: The remote info to be added */ void iwpm_add_remote_info(struct iwpm_remote_info *reminfo); /** - * iwpm_valid_client - Check if the port mapper client is valid - * @nl_client: The index of the netlink client - * - * Valid clients need to call iwpm_init() before using - * the port mapper - */ -int iwpm_valid_client(u8 nl_client); - -/** - * iwpm_set_valid - Set the port mapper client to valid or not - * @nl_client: The index of the netlink client - * @valid: 1 if valid or 0 if invalid - */ -void iwpm_set_valid(u8 nl_client, int valid); - -/** * iwpm_check_registration - Check if the client registration * matches the given one * @nl_client: The index of the netlink client @@ -182,7 +164,7 @@ u32 iwpm_check_registration(u8 nl_client, u32 reg); void iwpm_set_registration(u8 nl_client, u32 reg); /** - * iwpm_get_registration + * iwpm_get_registration - Get 
the client registration * @nl_client: The index of the netlink client * * Returns the client registration type @@ -209,8 +191,10 @@ int iwpm_mapinfo_available(void); /** * iwpm_compare_sockaddr - Compare two sockaddr storage structs + * @a_sockaddr: first sockaddr to compare + * @b_sockaddr: second sockaddr to compare * - * Returns 0 if they are holding the same ip/tcp address info, + * Return: 0 if they are holding the same ip/tcp address info, * otherwise returns 1 */ int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, @@ -266,4 +250,16 @@ int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, * @msg: Message to print */ void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg); + +/** + * iwpm_send_hello - Send hello response to iwpmd + * + * @nl_client: The index of the netlink client + * @iwpm_pid: The pid of the user space port mapper + * @abi_version: The kernel's abi_version + * + * Returns 0 on success or a negative error code + */ +int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version); +extern u16 iwpm_ulib_version; #endif diff --git a/drivers/infiniband/core/lag.c b/drivers/infiniband/core/lag.c new file mode 100644 index 000000000000..8fd80adfe833 --- /dev/null +++ b/drivers/infiniband/core/lag.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020 Mellanox Technologies. All rights reserved. + */ + +#include <rdma/ib_verbs.h> +#include <rdma/ib_cache.h> +#include <rdma/lag.h> + +static struct sk_buff *rdma_build_skb(struct net_device *netdev, + struct rdma_ah_attr *ah_attr, + gfp_t flags) +{ + struct ipv6hdr *ip6h; + struct sk_buff *skb; + struct ethhdr *eth; + struct iphdr *iph; + struct udphdr *uh; + u8 smac[ETH_ALEN]; + bool is_ipv4; + int hdr_len; + + is_ipv4 = ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw); + hdr_len = ETH_HLEN + sizeof(struct udphdr) + LL_RESERVED_SPACE(netdev); + hdr_len += is_ipv4 ? sizeof(struct iphdr) : sizeof(struct ipv6hdr); + + skb = alloc_skb(hdr_len, flags); + if (!skb) + return NULL; + + skb->dev = netdev; + skb_reserve(skb, hdr_len); + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + uh->source = + htons(rdma_flow_label_to_udp_sport(ah_attr->grh.flow_label)); + uh->dest = htons(ROCE_V2_UDP_DPORT); + uh->len = htons(sizeof(struct udphdr)); + + if (is_ipv4) { + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + iph->frag_off = 0; + iph->version = 4; + iph->protocol = IPPROTO_UDP; + iph->ihl = 0x5; + iph->tot_len = htons(sizeof(struct udphdr) + sizeof(struct + iphdr)); + memcpy(&iph->saddr, ah_attr->grh.sgid_attr->gid.raw + 12, + sizeof(struct in_addr)); + memcpy(&iph->daddr, ah_attr->grh.dgid.raw + 12, + sizeof(struct in_addr)); + } else { + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + ip6h->version = 6; + ip6h->nexthdr = IPPROTO_UDP; + memcpy(&ip6h->flow_lbl, &ah_attr->grh.flow_label, + sizeof(*ip6h->flow_lbl)); + memcpy(&ip6h->saddr, ah_attr->grh.sgid_attr->gid.raw, + sizeof(struct in6_addr)); + memcpy(&ip6h->daddr, ah_attr->grh.dgid.raw, + sizeof(struct in6_addr)); + } + + skb_push(skb, sizeof(struct ethhdr)); + skb_reset_mac_header(skb); + eth = eth_hdr(skb); + skb->protocol = eth->h_proto = htons(is_ipv4 ? 
ETH_P_IP : ETH_P_IPV6); + rdma_read_gid_l2_fields(ah_attr->grh.sgid_attr, NULL, smac); + memcpy(eth->h_source, smac, ETH_ALEN); + memcpy(eth->h_dest, ah_attr->roce.dmac, ETH_ALEN); + + return skb; +} + +static struct net_device *rdma_get_xmit_slave_udp(struct ib_device *device, + struct net_device *master, + struct rdma_ah_attr *ah_attr, + gfp_t flags) +{ + struct net_device *slave; + struct sk_buff *skb; + + skb = rdma_build_skb(master, ah_attr, flags); + if (!skb) + return ERR_PTR(-ENOMEM); + + rcu_read_lock(); + slave = netdev_get_xmit_slave(master, skb, + !!(device->lag_flags & + RDMA_LAG_FLAGS_HASH_ALL_SLAVES)); + dev_hold(slave); + rcu_read_unlock(); + kfree_skb(skb); + return slave; +} + +void rdma_lag_put_ah_roce_slave(struct net_device *xmit_slave) +{ + dev_put(xmit_slave); +} + +struct net_device *rdma_lag_get_ah_roce_slave(struct ib_device *device, + struct rdma_ah_attr *ah_attr, + gfp_t flags) +{ + struct net_device *slave = NULL; + struct net_device *master; + + if (!(ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE && + ah_attr->grh.sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP && + ah_attr->grh.flow_label)) + return NULL; + + rcu_read_lock(); + master = rdma_read_gid_attr_ndev_rcu(ah_attr->grh.sgid_attr); + if (IS_ERR(master)) { + rcu_read_unlock(); + return master; + } + dev_hold(master); + rcu_read_unlock(); + + if (!netif_is_bond_master(master)) + goto put; + + slave = rdma_get_xmit_slave_udp(device, master, ah_attr, flags); +put: + dev_put(master); + return slave; +} diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 7870823bac47..8f26bfb69586 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -3,7 +3,7 @@ * Copyright (c) 2005 Intel Corporation. All rights reserved. * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2009 HNR Consulting. All rights reserved. - * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014,2018 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -38,10 +38,10 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/dma-mapping.h> -#include <linux/idr.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/security.h> +#include <linux/xarray.h> #include <rdma/ib_cache.h> #include "mad_priv.h" @@ -51,6 +51,27 @@ #include "opa_smi.h" #include "agent.h" +#define CREATE_TRACE_POINTS +#include <trace/events/ib_mad.h> + +#ifdef CONFIG_TRACEPOINTS +static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_qp_info *qp_info, + struct trace_event_raw_ib_mad_send_template *entry) +{ + struct ib_ud_wr *wr = &mad_send_wr->send_wr; + struct rdma_ah_attr attr = {}; + + rdma_query_ah(wr->ah, &attr); + + /* These are common */ + entry->sl = attr.sl; + entry->rqpn = wr->remote_qpn; + entry->rqkey = wr->remote_qkey; + entry->dlid = rdma_ah_get_dlid(&attr); +} +#endif + static int mad_sendq_size = IB_MAD_QP_SEND_SIZE; static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; @@ -59,12 +80,8 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests module_param_named(recv_queue_size, mad_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests"); -/* - * The mlx4 driver uses the top byte to distinguish which virtual function - * generated the MAD, so we must avoid using it. 
- */ -#define AGENT_ID_LIMIT (1 << 24) -static DEFINE_IDR(ib_mad_clients); +static DEFINE_XARRAY_ALLOC1(ib_mad_clients); +static u32 ib_mad_client_next; static struct list_head ib_mad_port_list; /* Port list lock */ @@ -96,7 +113,7 @@ static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc); * Assumes ib_mad_port_list_lock is being held */ static inline struct ib_mad_port_private * -__ib_get_mad_port(struct ib_device *device, int port_num) +__ib_get_mad_port(struct ib_device *device, u32 port_num) { struct ib_mad_port_private *entry; @@ -112,7 +129,7 @@ __ib_get_mad_port(struct ib_device *device, int port_num) * for a device/port */ static inline struct ib_mad_port_private * -ib_get_mad_port(struct ib_device *device, int port_num) +ib_get_mad_port(struct ib_device *device, u32 port_num) { struct ib_mad_port_private *entry; unsigned long flags; @@ -133,8 +150,7 @@ static inline u8 convert_mgmt_class(u8 mgmt_class) static int get_spl_qp_index(enum ib_qp_type qp_type) { - switch (qp_type) - { + switch (qp_type) { case IB_QPT_SMI: return 0; case IB_QPT_GSI: @@ -194,13 +210,36 @@ int ib_response_mad(const struct ib_mad_hdr *hdr) } EXPORT_SYMBOL(ib_response_mad); +#define SOL_FC_MAX_DEFAULT_FRAC 4 +#define SOL_FC_MAX_SA_FRAC 32 + +static int get_sol_fc_max_outstanding(struct ib_mad_reg_req *mad_reg_req) +{ + if (!mad_reg_req) + /* Send only agent */ + return mad_recvq_size / SOL_FC_MAX_DEFAULT_FRAC; + + switch (mad_reg_req->mgmt_class) { + case IB_MGMT_CLASS_CM: + return mad_recvq_size / SOL_FC_MAX_DEFAULT_FRAC; + case IB_MGMT_CLASS_SUBN_ADM: + return mad_recvq_size / SOL_FC_MAX_SA_FRAC; + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + return min(mad_recvq_size, IB_MAD_QP_RECV_SIZE) / + SOL_FC_MAX_DEFAULT_FRAC; + default: + return 0; + } +} + /* * ib_register_mad_agent - Register to send/receive MADs * * Context: Process context. */ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, - u8 port_num, + u32 port_num, enum ib_qp_type qp_type, struct ib_mad_reg_req *mad_reg_req, u8 rmpp_version, @@ -330,7 +369,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, /* Validate device and port */ port_priv = ib_get_mad_port(device, port_num); if (!port_priv) { - dev_dbg_ratelimited(&device->dev, "%s: Invalid port %d\n", + dev_dbg_ratelimited(&device->dev, "%s: Invalid port %u\n", __func__, port_num); ret = ERR_PTR(-ENODEV); goto error1; @@ -375,13 +414,17 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, spin_lock_init(&mad_agent_priv->lock); INIT_LIST_HEAD(&mad_agent_priv->send_list); INIT_LIST_HEAD(&mad_agent_priv->wait_list); - INIT_LIST_HEAD(&mad_agent_priv->done_list); INIT_LIST_HEAD(&mad_agent_priv->rmpp_list); + INIT_LIST_HEAD(&mad_agent_priv->backlog_list); INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends); INIT_LIST_HEAD(&mad_agent_priv->local_list); INIT_WORK(&mad_agent_priv->local_work, local_completions); - atomic_set(&mad_agent_priv->refcount, 1); + refcount_set(&mad_agent_priv->refcount, 1); init_completion(&mad_agent_priv->comp); + mad_agent_priv->sol_fc_send_count = 0; + mad_agent_priv->sol_fc_wait_count = 0; + mad_agent_priv->sol_fc_max = + recv_handler ? 
get_sol_fc_max_outstanding(mad_reg_req) : 0; ret2 = ib_mad_agent_security_setup(&mad_agent_priv->agent, qp_type); if (ret2) { @@ -389,18 +432,17 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, goto error4; } - idr_preload(GFP_KERNEL); - idr_lock(&ib_mad_clients); - ret2 = idr_alloc_cyclic(&ib_mad_clients, mad_agent_priv, 0, - AGENT_ID_LIMIT, GFP_ATOMIC); - idr_unlock(&ib_mad_clients); - idr_preload_end(); - + /* + * The mlx4 driver uses the top byte to distinguish which virtual + * function generated the MAD, so we must avoid using it. + */ + ret2 = xa_alloc_cyclic(&ib_mad_clients, &mad_agent_priv->agent.hi_tid, + mad_agent_priv, XA_LIMIT(0, (1 << 24) - 1), + &ib_mad_client_next, GFP_KERNEL); if (ret2 < 0) { ret = ERR_PTR(ret2); goto error5; } - mad_agent_priv->agent.hi_tid = ret2; /* * Make sure MAD registration (if supplied) @@ -445,12 +487,11 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, } spin_unlock_irq(&port_priv->reg_lock); + trace_ib_mad_create_agent(mad_agent_priv); return &mad_agent_priv->agent; error6: spin_unlock_irq(&port_priv->reg_lock); - idr_lock(&ib_mad_clients); - idr_remove(&ib_mad_clients, mad_agent_priv->agent.hi_tid); - idr_unlock(&ib_mad_clients); + xa_erase(&ib_mad_clients, mad_agent_priv->agent.hi_tid); error5: ib_mad_agent_security_cleanup(&mad_agent_priv->agent); error4: @@ -462,146 +503,18 @@ error1: } EXPORT_SYMBOL(ib_register_mad_agent); -static inline int is_snooping_sends(int mad_snoop_flags) -{ - return (mad_snoop_flags & - (/*IB_MAD_SNOOP_POSTED_SENDS | - IB_MAD_SNOOP_RMPP_SENDS |*/ - IB_MAD_SNOOP_SEND_COMPLETIONS /*| - IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS*/)); -} - -static inline int is_snooping_recvs(int mad_snoop_flags) -{ - return (mad_snoop_flags & - (IB_MAD_SNOOP_RECVS /*| - IB_MAD_SNOOP_RMPP_RECVS*/)); -} - -static int register_snoop_agent(struct ib_mad_qp_info *qp_info, - struct ib_mad_snoop_private *mad_snoop_priv) -{ - struct ib_mad_snoop_private **new_snoop_table; - unsigned long flags; - int i; - - spin_lock_irqsave(&qp_info->snoop_lock, flags); - /* Check for empty slot in array. */ - for (i = 0; i < qp_info->snoop_table_size; i++) - if (!qp_info->snoop_table[i]) - break; - - if (i == qp_info->snoop_table_size) { - /* Grow table. 
*/ - new_snoop_table = krealloc(qp_info->snoop_table, - sizeof mad_snoop_priv * - (qp_info->snoop_table_size + 1), - GFP_ATOMIC); - if (!new_snoop_table) { - i = -ENOMEM; - goto out; - } - - qp_info->snoop_table = new_snoop_table; - qp_info->snoop_table_size++; - } - qp_info->snoop_table[i] = mad_snoop_priv; - atomic_inc(&qp_info->snoop_count); -out: - spin_unlock_irqrestore(&qp_info->snoop_lock, flags); - return i; -} - -struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device, - u8 port_num, - enum ib_qp_type qp_type, - int mad_snoop_flags, - ib_mad_snoop_handler snoop_handler, - ib_mad_recv_handler recv_handler, - void *context) -{ - struct ib_mad_port_private *port_priv; - struct ib_mad_agent *ret; - struct ib_mad_snoop_private *mad_snoop_priv; - int qpn; - int err; - - /* Validate parameters */ - if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) || - (is_snooping_recvs(mad_snoop_flags) && !recv_handler)) { - ret = ERR_PTR(-EINVAL); - goto error1; - } - qpn = get_spl_qp_index(qp_type); - if (qpn == -1) { - ret = ERR_PTR(-EINVAL); - goto error1; - } - port_priv = ib_get_mad_port(device, port_num); - if (!port_priv) { - ret = ERR_PTR(-ENODEV); - goto error1; - } - /* Allocate structures */ - mad_snoop_priv = kzalloc(sizeof *mad_snoop_priv, GFP_KERNEL); - if (!mad_snoop_priv) { - ret = ERR_PTR(-ENOMEM); - goto error1; - } - - /* Now, fill in the various structures */ - mad_snoop_priv->qp_info = &port_priv->qp_info[qpn]; - mad_snoop_priv->agent.device = device; - mad_snoop_priv->agent.recv_handler = recv_handler; - mad_snoop_priv->agent.snoop_handler = snoop_handler; - mad_snoop_priv->agent.context = context; - mad_snoop_priv->agent.qp = port_priv->qp_info[qpn].qp; - mad_snoop_priv->agent.port_num = port_num; - mad_snoop_priv->mad_snoop_flags = mad_snoop_flags; - init_completion(&mad_snoop_priv->comp); - - err = ib_mad_agent_security_setup(&mad_snoop_priv->agent, qp_type); - if (err) { - ret = ERR_PTR(err); - goto error2; - } - - mad_snoop_priv->snoop_index = register_snoop_agent( - &port_priv->qp_info[qpn], - mad_snoop_priv); - if (mad_snoop_priv->snoop_index < 0) { - ret = ERR_PTR(mad_snoop_priv->snoop_index); - goto error3; - } - - atomic_set(&mad_snoop_priv->refcount, 1); - return &mad_snoop_priv->agent; -error3: - ib_mad_agent_security_cleanup(&mad_snoop_priv->agent); -error2: - kfree(mad_snoop_priv); -error1: - return ret; -} -EXPORT_SYMBOL(ib_register_mad_snoop); - static inline void deref_mad_agent(struct ib_mad_agent_private *mad_agent_priv) { - if (atomic_dec_and_test(&mad_agent_priv->refcount)) + if (refcount_dec_and_test(&mad_agent_priv->refcount)) complete(&mad_agent_priv->comp); } -static inline void deref_snoop_agent(struct ib_mad_snoop_private *mad_snoop_priv) -{ - if (atomic_dec_and_test(&mad_snoop_priv->refcount)) - complete(&mad_snoop_priv->comp); -} - static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) { struct ib_mad_port_private *port_priv; /* Note that we could still be handling received MADs */ + trace_ib_mad_unregister_agent(mad_agent_priv); /* * Canceling all sends results in dropping received response @@ -614,15 +527,13 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) spin_lock_irq(&port_priv->reg_lock); remove_mad_reg_req(mad_agent_priv); spin_unlock_irq(&port_priv->reg_lock); - idr_lock(&ib_mad_clients); - idr_remove(&ib_mad_clients, mad_agent_priv->agent.hi_tid); - idr_unlock(&ib_mad_clients); + xa_erase(&ib_mad_clients, mad_agent_priv->agent.hi_tid); flush_workqueue(port_priv->wq); 
- ib_cancel_rmpp_recvs(mad_agent_priv); deref_mad_agent(mad_agent_priv); wait_for_completion(&mad_agent_priv->comp); + ib_cancel_rmpp_recvs(mad_agent_priv); ib_mad_agent_security_cleanup(&mad_agent_priv->agent); @@ -630,25 +541,6 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) kfree_rcu(mad_agent_priv, rcu); } -static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv) -{ - struct ib_mad_qp_info *qp_info; - unsigned long flags; - - qp_info = mad_snoop_priv->qp_info; - spin_lock_irqsave(&qp_info->snoop_lock, flags); - qp_info->snoop_table[mad_snoop_priv->snoop_index] = NULL; - atomic_dec(&qp_info->snoop_count); - spin_unlock_irqrestore(&qp_info->snoop_lock, flags); - - deref_snoop_agent(mad_snoop_priv); - wait_for_completion(&mad_snoop_priv->comp); - - ib_mad_agent_security_cleanup(&mad_snoop_priv->agent); - - kfree(mad_snoop_priv); -} - /* * ib_unregister_mad_agent - Unregisters a client from using MAD services * @@ -657,20 +549,11 @@ static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv) void ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) { struct ib_mad_agent_private *mad_agent_priv; - struct ib_mad_snoop_private *mad_snoop_priv; - - /* If the TID is zero, the agent can only snoop. */ - if (mad_agent->hi_tid) { - mad_agent_priv = container_of(mad_agent, - struct ib_mad_agent_private, - agent); - unregister_mad_agent(mad_agent_priv); - } else { - mad_snoop_priv = container_of(mad_agent, - struct ib_mad_snoop_private, - agent); - unregister_mad_snoop(mad_snoop_priv); - } + + mad_agent_priv = container_of(mad_agent, + struct ib_mad_agent_private, + agent); + unregister_mad_agent(mad_agent_priv); } EXPORT_SYMBOL(ib_unregister_mad_agent); @@ -686,59 +569,8 @@ static void dequeue_mad(struct ib_mad_list_head *mad_list) spin_unlock_irqrestore(&mad_queue->lock, flags); } -static void snoop_send(struct ib_mad_qp_info *qp_info, - struct ib_mad_send_buf *send_buf, - struct ib_mad_send_wc *mad_send_wc, - int mad_snoop_flags) -{ - struct ib_mad_snoop_private *mad_snoop_priv; - unsigned long flags; - int i; - - spin_lock_irqsave(&qp_info->snoop_lock, flags); - for (i = 0; i < qp_info->snoop_table_size; i++) { - mad_snoop_priv = qp_info->snoop_table[i]; - if (!mad_snoop_priv || - !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags)) - continue; - - atomic_inc(&mad_snoop_priv->refcount); - spin_unlock_irqrestore(&qp_info->snoop_lock, flags); - mad_snoop_priv->agent.snoop_handler(&mad_snoop_priv->agent, - send_buf, mad_send_wc); - deref_snoop_agent(mad_snoop_priv); - spin_lock_irqsave(&qp_info->snoop_lock, flags); - } - spin_unlock_irqrestore(&qp_info->snoop_lock, flags); -} - -static void snoop_recv(struct ib_mad_qp_info *qp_info, - struct ib_mad_recv_wc *mad_recv_wc, - int mad_snoop_flags) -{ - struct ib_mad_snoop_private *mad_snoop_priv; - unsigned long flags; - int i; - - spin_lock_irqsave(&qp_info->snoop_lock, flags); - for (i = 0; i < qp_info->snoop_table_size; i++) { - mad_snoop_priv = qp_info->snoop_table[i]; - if (!mad_snoop_priv || - !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags)) - continue; - - atomic_inc(&mad_snoop_priv->refcount); - spin_unlock_irqrestore(&qp_info->snoop_lock, flags); - mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, NULL, - mad_recv_wc); - deref_snoop_agent(mad_snoop_priv); - spin_lock_irqsave(&qp_info->snoop_lock, flags); - } - spin_unlock_irqrestore(&qp_info->snoop_lock, flags); -} - static void build_smp_wc(struct ib_qp *qp, struct ib_cqe *cqe, u16 slid, - u16 pkey_index, 
u8 port_num, struct ib_wc *wc) + u16 pkey_index, u32 port_num, struct ib_wc *wc) { memset(wc, 0, sizeof *wc); wc->wr_cqe = cqe; @@ -797,7 +629,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_port_private *port_priv; struct ib_mad_agent_private *recv_mad_agent = NULL; struct ib_device *device = mad_agent_priv->agent.device; - u8 port_num; + u32 port_num; struct ib_wc mad_wc; struct ib_ud_wr *send_wr = &mad_send_wr->send_wr; size_t mad_size = port_mad_size(mad_agent_priv->qp_info->port_priv); @@ -821,6 +653,8 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, if (opa && smp->class_version == OPA_SM_CLASS_VERSION) { u32 opa_drslid; + trace_ib_mad_handle_out_opa_smi(opa_smp); + if ((opa_get_smp_direction(opa_smp) ? opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) == OPA_LID_PERMISSIVE && @@ -846,6 +680,8 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, opa_smi_check_local_returning_smp(opa_smp, device) == IB_SMI_DISCARD) goto out; } else { + trace_ib_mad_handle_out_ib_smi(smp); + if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) == IB_LID_PERMISSIVE && smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) == @@ -889,11 +725,10 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, /* No GRH for DR SMP */ ret = device->ops.process_mad(device, 0, port_num, &mad_wc, NULL, - (const struct ib_mad_hdr *)smp, mad_size, - (struct ib_mad_hdr *)mad_priv->mad, - &mad_size, &out_mad_pkey_index); - switch (ret) - { + (const struct ib_mad *)smp, + (struct ib_mad *)mad_priv->mad, &mad_size, + &out_mad_pkey_index); + switch (ret) { case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY: if (ib_response_mad((const struct ib_mad_hdr *)mad_priv->mad) && mad_agent_priv->agent.recv_handler) { @@ -903,7 +738,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, * Reference MAD agent until receive * side of local completion handled */ - atomic_inc(&mad_agent_priv->refcount); + refcount_inc(&mad_agent_priv->refcount); } else kfree(mad_priv); break; @@ -943,7 +778,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, local->return_wc_byte_len = mad_size; } /* Reference MAD agent until send side of local completion handled */ - atomic_inc(&mad_agent_priv->refcount); + refcount_inc(&mad_agent_priv->refcount); /* Queue local completion to local list */ spin_lock_irqsave(&mad_agent_priv->lock, flags); list_add_tail(&local->completion_list, &mad_agent_priv->local_list); @@ -992,7 +827,7 @@ static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr, /* Allocate data segments. 
*/ for (left = send_buf->data_len + pad; left > 0; left -= seg_size) { - seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask); + seg = kmalloc(sizeof(*seg) + seg_size, gfp_mask); if (!seg) { free_send_rmpp_list(send_wr); return -ENOMEM; @@ -1022,12 +857,11 @@ int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent) } EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent); -struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, - u32 remote_qpn, u16 pkey_index, - int rmpp_active, - int hdr_len, int data_len, - gfp_t gfp_mask, - u8 base_version) +struct ib_mad_send_buf *ib_create_send_mad(struct ib_mad_agent *mad_agent, + u32 remote_qpn, u16 pkey_index, + int rmpp_active, int hdr_len, + int data_len, gfp_t gfp_mask, + u8 base_version) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; @@ -1101,7 +935,7 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, } mad_send_wr->send_buf.mad_agent = mad_agent; - atomic_inc(&mad_agent_priv->refcount); + refcount_inc(&mad_agent_priv->refcount); return &mad_send_wr->send_buf; } EXPORT_SYMBOL(ib_create_send_mad); @@ -1223,6 +1057,7 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) spin_lock_irqsave(&qp_info->send_queue.lock, flags); if (qp_info->send_queue.count < qp_info->send_queue.max_active) { + trace_ib_mad_ib_send_mad(mad_send_wr, qp_info); ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr.wr, NULL); list = &qp_info->send_queue.list; @@ -1247,6 +1082,180 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) return ret; } +static void handle_queued_state(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_agent_private *mad_agent_priv) +{ + if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) { + mad_agent_priv->sol_fc_wait_count--; + list_move_tail(&mad_send_wr->agent_list, + &mad_agent_priv->backlog_list); + } else { + expect_mad_state(mad_send_wr, IB_MAD_STATE_INIT); + list_add_tail(&mad_send_wr->agent_list, + &mad_agent_priv->backlog_list); + } +} + +static void handle_send_state(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_agent_private *mad_agent_priv) +{ + if (mad_send_wr->state == IB_MAD_STATE_INIT) { + list_add_tail(&mad_send_wr->agent_list, + &mad_agent_priv->send_list); + } else { + expect_mad_state2(mad_send_wr, IB_MAD_STATE_WAIT_RESP, + IB_MAD_STATE_QUEUED); + list_move_tail(&mad_send_wr->agent_list, + &mad_agent_priv->send_list); + } + + if (mad_send_wr->is_solicited_fc) { + if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) + mad_agent_priv->sol_fc_wait_count--; + mad_agent_priv->sol_fc_send_count++; + } +} + +static void handle_wait_state(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_agent_private *mad_agent_priv) +{ + struct ib_mad_send_wr_private *temp_mad_send_wr; + struct list_head *list_item; + unsigned long delay; + + expect_mad_state3(mad_send_wr, IB_MAD_STATE_SEND_START, + IB_MAD_STATE_WAIT_RESP, IB_MAD_STATE_CANCELED); + if (mad_send_wr->state == IB_MAD_STATE_SEND_START && + mad_send_wr->is_solicited_fc) { + mad_agent_priv->sol_fc_send_count--; + mad_agent_priv->sol_fc_wait_count++; + } + + list_del_init(&mad_send_wr->agent_list); + delay = mad_send_wr->timeout; + mad_send_wr->timeout += jiffies; + + if (delay) { + list_for_each_prev(list_item, + &mad_agent_priv->wait_list) { + temp_mad_send_wr = list_entry( + list_item, + struct ib_mad_send_wr_private, + agent_list); + if (time_after(mad_send_wr->timeout, + temp_mad_send_wr->timeout)) + break; + } + } else { + list_item = 
&mad_agent_priv->wait_list; + } + + list_add(&mad_send_wr->agent_list, list_item); +} + +static void handle_early_resp_state(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_agent_private *mad_agent_priv) +{ + expect_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START); + mad_agent_priv->sol_fc_send_count -= mad_send_wr->is_solicited_fc; +} + +static void handle_canceled_state(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_agent_private *mad_agent_priv) +{ + not_expect_mad_state(mad_send_wr, IB_MAD_STATE_DONE); + if (mad_send_wr->is_solicited_fc) { + if (mad_send_wr->state == IB_MAD_STATE_SEND_START) + mad_agent_priv->sol_fc_send_count--; + else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) + mad_agent_priv->sol_fc_wait_count--; + } +} + +static void handle_done_state(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_agent_private *mad_agent_priv) +{ + if (mad_send_wr->is_solicited_fc) { + if (mad_send_wr->state == IB_MAD_STATE_SEND_START) + mad_agent_priv->sol_fc_send_count--; + else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) + mad_agent_priv->sol_fc_wait_count--; + } + + list_del_init(&mad_send_wr->agent_list); +} + +void change_mad_state(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_mad_state new_state) +{ + struct ib_mad_agent_private *mad_agent_priv = + mad_send_wr->mad_agent_priv; + + switch (new_state) { + case IB_MAD_STATE_INIT: + break; + case IB_MAD_STATE_QUEUED: + handle_queued_state(mad_send_wr, mad_agent_priv); + break; + case IB_MAD_STATE_SEND_START: + handle_send_state(mad_send_wr, mad_agent_priv); + break; + case IB_MAD_STATE_WAIT_RESP: + handle_wait_state(mad_send_wr, mad_agent_priv); + if (mad_send_wr->state == IB_MAD_STATE_CANCELED) + return; + break; + case IB_MAD_STATE_EARLY_RESP: + handle_early_resp_state(mad_send_wr, mad_agent_priv); + break; + case IB_MAD_STATE_CANCELED: + handle_canceled_state(mad_send_wr, mad_agent_priv); + break; + case IB_MAD_STATE_DONE: + handle_done_state(mad_send_wr, mad_agent_priv); + break; + } + + mad_send_wr->state = new_state; +} + +static bool is_solicited_fc_mad(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_rmpp_mad *rmpp_mad; + u8 mgmt_class; + + if (!mad_send_wr->timeout) + return 0; + + rmpp_mad = mad_send_wr->send_buf.mad; + if (mad_send_wr->mad_agent_priv->agent.rmpp_version && + (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) + return 0; + + mgmt_class = + ((struct ib_mad_hdr *)mad_send_wr->send_buf.mad)->mgmt_class; + return mgmt_class == IB_MGMT_CLASS_CM || + mgmt_class == IB_MGMT_CLASS_SUBN_ADM || + mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || + mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE; +} + +static bool mad_is_for_backlog(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_mad_agent_private *mad_agent_priv = + mad_send_wr->mad_agent_priv; + + if (!mad_send_wr->is_solicited_fc || !mad_agent_priv->sol_fc_max) + return false; + + if (!list_empty(&mad_agent_priv->backlog_list)) + return true; + + return mad_agent_priv->sol_fc_send_count + + mad_agent_priv->sol_fc_wait_count >= + mad_agent_priv->sol_fc_max; +} + /* * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated * with the registered client @@ -1272,9 +1281,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, if (ret) goto error; - if (!send_buf->mad_agent->send_handler || - (send_buf->timeout_ms && - !send_buf->mad_agent->recv_handler)) { + if (!send_buf->mad_agent->send_handler) { ret = -EINVAL; goto error; } @@ -1310,15 +1317,19 @@ int 
ib_post_send_mad(struct ib_mad_send_buf *send_buf, mad_send_wr->max_retries = send_buf->retries; mad_send_wr->retries_left = send_buf->retries; send_buf->retries = 0; - /* Reference for work request to QP + response */ - mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0); - mad_send_wr->status = IB_WC_SUCCESS; + change_mad_state(mad_send_wr, IB_MAD_STATE_INIT); /* Reference MAD agent until send completes */ - atomic_inc(&mad_agent_priv->refcount); + refcount_inc(&mad_agent_priv->refcount); spin_lock_irqsave(&mad_agent_priv->lock, flags); - list_add_tail(&mad_send_wr->agent_list, - &mad_agent_priv->send_list); + mad_send_wr->is_solicited_fc = is_solicited_fc_mad(mad_send_wr); + if (mad_is_for_backlog(mad_send_wr)) { + change_mad_state(mad_send_wr, IB_MAD_STATE_QUEUED); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + return 0; + } + + change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { @@ -1330,9 +1341,9 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, if (ret < 0) { /* Fail send request */ spin_lock_irqsave(&mad_agent_priv->lock, flags); - list_del(&mad_send_wr->agent_list); + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - atomic_dec(&mad_agent_priv->refcount); + deref_mad_agent(mad_agent_priv); goto error; } } @@ -1372,25 +1383,6 @@ void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc) } EXPORT_SYMBOL(ib_free_recv_mad); -struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp, - u8 rmpp_version, - ib_mad_send_handler send_handler, - ib_mad_recv_handler recv_handler, - void *context) -{ - return ERR_PTR(-EINVAL); /* XXX: for now */ -} -EXPORT_SYMBOL(ib_redirect_mad_qp); - -int ib_process_mad_wc(struct ib_mad_agent *mad_agent, - struct ib_wc *wc) -{ - dev_err(&mad_agent->device->dev, - "ib_process_mad_wc() not implemented yet\n"); - return 0; -} -EXPORT_SYMBOL(ib_process_mad_wc); - static int method_in_use(struct ib_mad_mgmt_method_table **method, struct ib_mad_reg_req *mad_reg_req) { @@ -1478,11 +1470,9 @@ static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method, int i; /* Remove any methods for this mad agent */ - for (i = 0; i < IB_MGMT_MAX_METHODS; i++) { - if (method->agent[i] == agent) { + for (i = 0; i < IB_MGMT_MAX_METHODS; i++) + if (method->agent[i] == agent) method->agent[i] = NULL; - } - } } static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, @@ -1657,9 +1647,8 @@ static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv) * Was MAD registration request supplied * with original registration ? 
*/ - if (!agent_priv->reg_req) { + if (!agent_priv->reg_req) goto out; - } port_priv = agent_priv->qp_info->port_priv; mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class); @@ -1756,8 +1745,8 @@ find_mad_agent(struct ib_mad_port_private *port_priv, */ hi_tid = be64_to_cpu(mad_hdr->tid) >> 32; rcu_read_lock(); - mad_agent = idr_find(&ib_mad_clients, hi_tid); - if (mad_agent && !atomic_inc_not_zero(&mad_agent->refcount)) + mad_agent = xa_load(&ib_mad_clients, hi_tid); + if (mad_agent && !refcount_inc_not_zero(&mad_agent->refcount)) mad_agent = NULL; rcu_read_unlock(); } else { @@ -1809,14 +1798,14 @@ find_mad_agent(struct ib_mad_port_private *port_priv, } } if (mad_agent) - atomic_inc(&mad_agent->refcount); + refcount_inc(&mad_agent->refcount); out: spin_unlock_irqrestore(&port_priv->reg_lock, flags); } if (mad_agent && !mad_agent->agent.recv_handler) { dev_notice(&port_priv->device->dev, - "No receive handler for client %p on port %d\n", + "No receive handler for client %p on port %u\n", &mad_agent->agent, port_priv->port_num); deref_mad_agent(mad_agent); mad_agent = NULL; @@ -1835,7 +1824,7 @@ static int validate_mad(const struct ib_mad_hdr *mad_hdr, /* Make sure MAD base version is understood */ if (mad_hdr->base_version != IB_MGMT_BASE_VERSION && (!opa || mad_hdr->base_version != OPA_MGMT_BASE_VERSION)) { - pr_err("MAD received with unsupported base version %d %s\n", + pr_err("MAD received with unsupported base version %u %s\n", mad_hdr->base_version, opa ? "(opa)" : ""); goto out; } @@ -1880,15 +1869,16 @@ static inline int rcv_has_same_class(const struct ib_mad_send_wr_private *wr, rwc->recv_buf.mad->mad_hdr.mgmt_class; } -static inline int rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_priv, - const struct ib_mad_send_wr_private *wr, - const struct ib_mad_recv_wc *rwc ) +static inline int +rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_send_wr_private *wr, + const struct ib_mad_recv_wc *rwc) { struct rdma_ah_attr attr; u8 send_resp, rcv_resp; union ib_gid sgid; struct ib_device *device = mad_agent_priv->agent.device; - u8 port_num = mad_agent_priv->agent.port_num; + u32 port_num = mad_agent_priv->agent.port_num; u8 lmc; bool has_grh; @@ -1959,7 +1949,19 @@ ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv, */ (is_direct(mad_hdr->mgmt_class) || rcv_has_same_gid(mad_agent_priv, wr, wc))) - return (wr->status == IB_WC_SUCCESS) ? wr : NULL; + return (wr->state != IB_MAD_STATE_CANCELED) ? wr : NULL; + } + + list_for_each_entry(wr, &mad_agent_priv->backlog_list, agent_list) { + if ((wr->tid == mad_hdr->tid) && + rcv_has_same_class(wr, wc) && + /* + * Don't check GID for direct routed MADs. + * These might have permissive LIDs. + */ + (is_direct(mad_hdr->mgmt_class) || + rcv_has_same_gid(mad_agent_priv, wr, wc))) + return (wr->state != IB_MAD_STATE_CANCELED) ? wr : NULL; } /* @@ -1978,17 +1980,55 @@ ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv, (is_direct(mad_hdr->mgmt_class) || rcv_has_same_gid(mad_agent_priv, wr, wc))) /* Verify request has not been canceled */ - return (wr->status == IB_WC_SUCCESS) ? wr : NULL; + return (wr->state != IB_MAD_STATE_CANCELED) ? 
wr : NULL; } return NULL; } +static void +process_backlog_mads(struct ib_mad_agent_private *mad_agent_priv) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct ib_mad_send_wc mad_send_wc = {}; + unsigned long flags; + int ret; + + spin_lock_irqsave(&mad_agent_priv->lock, flags); + while (!list_empty(&mad_agent_priv->backlog_list) && + (mad_agent_priv->sol_fc_send_count + + mad_agent_priv->sol_fc_wait_count < + mad_agent_priv->sol_fc_max)) { + mad_send_wr = list_entry(mad_agent_priv->backlog_list.next, + struct ib_mad_send_wr_private, + agent_list); + change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + ret = ib_send_mad(mad_send_wr); + if (ret) { + spin_lock_irqsave(&mad_agent_priv->lock, flags); + deref_mad_agent(mad_agent_priv); + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + mad_send_wc.send_buf = &mad_send_wr->send_buf; + mad_send_wc.status = IB_WC_LOC_QP_OP_ERR; + mad_agent_priv->agent.send_handler( + &mad_agent_priv->agent, &mad_send_wc); + } + + spin_lock_irqsave(&mad_agent_priv->lock, flags); + } + + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); +} + void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr) { mad_send_wr->timeout = 0; - if (mad_send_wr->refcount == 1) - list_move_tail(&mad_send_wr->agent_list, - &mad_send_wr->mad_agent_priv->done_list); + if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP || + mad_send_wr->state == IB_MAD_STATE_QUEUED) + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); + else + change_mad_state(mad_send_wr, IB_MAD_STATE_EARLY_RESP); } static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, @@ -1997,6 +2037,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_send_wc mad_send_wc; unsigned long flags; + bool is_mad_done; int ret; INIT_LIST_HEAD(&mad_recv_wc->rmpp_list); @@ -2034,16 +2075,18 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, mad_agent_priv->agent.recv_handler( &mad_agent_priv->agent, NULL, mad_recv_wc); - atomic_dec(&mad_agent_priv->refcount); + deref_mad_agent(mad_agent_priv); } else { /* not user rmpp, revert to normal behavior and - * drop the mad */ + * drop the mad + */ ib_free_recv_mad(mad_recv_wc); deref_mad_agent(mad_agent_priv); return; } } else { ib_mark_mad_done(mad_send_wr); + is_mad_done = (mad_send_wr->state == IB_MAD_STATE_DONE); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); /* Defined behavior is to complete response before request */ @@ -2051,32 +2094,35 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, &mad_agent_priv->agent, &mad_send_wr->send_buf, mad_recv_wc); - atomic_dec(&mad_agent_priv->refcount); + deref_mad_agent(mad_agent_priv); - mad_send_wc.status = IB_WC_SUCCESS; - mad_send_wc.vendor_err = 0; - mad_send_wc.send_buf = &mad_send_wr->send_buf; - ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); + if (is_mad_done) { + mad_send_wc.status = IB_WC_SUCCESS; + mad_send_wc.vendor_err = 0; + mad_send_wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, + &mad_send_wc); + } } } else { mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL, mad_recv_wc); deref_mad_agent(mad_agent_priv); } - - return; } static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv, const struct ib_mad_qp_info *qp_info, const struct ib_wc *wc, - int port_num, + u32 
port_num, struct ib_mad_private *recv, struct ib_mad_private *response) { enum smi_forward_action retsmi; struct ib_smp *smp = (struct ib_smp *)recv->mad; + trace_ib_mad_handle_ib_smi(smp); + if (smi_handle_dr_smp_recv(smp, rdma_cap_ib_switch(port_priv->device), port_num, @@ -2155,13 +2201,15 @@ static enum smi_action handle_opa_smi(struct ib_mad_port_private *port_priv, struct ib_mad_qp_info *qp_info, struct ib_wc *wc, - int port_num, + u32 port_num, struct ib_mad_private *recv, struct ib_mad_private *response) { enum smi_forward_action retsmi; struct opa_smp *smp = (struct opa_smp *)recv->mad; + trace_ib_mad_handle_opa_smi(smp); + if (opa_smi_handle_dr_smp_recv(smp, rdma_cap_ib_switch(port_priv->device), port_num, @@ -2209,7 +2257,7 @@ static enum smi_action handle_smi(struct ib_mad_port_private *port_priv, struct ib_mad_qp_info *qp_info, struct ib_wc *wc, - int port_num, + u32 port_num, struct ib_mad_private *recv, struct ib_mad_private *response, bool opa) @@ -2233,7 +2281,7 @@ static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc) struct ib_mad_private_header *mad_priv_hdr; struct ib_mad_private *recv, *response = NULL; struct ib_mad_agent_private *mad_agent; - int port_num; + u32 port_num; int ret = IB_MAD_RESULT_SUCCESS; size_t mad_size; u16 resp_mad_pkey_index = 0; @@ -2279,13 +2327,13 @@ static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc) recv->header.recv_wc.recv_buf.mad = (struct ib_mad *)recv->mad; recv->header.recv_wc.recv_buf.grh = &recv->grh; - if (atomic_read(&qp_info->snoop_count)) - snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS); - /* Validate MAD */ if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa)) goto out; + trace_ib_mad_recv_done_handler(qp_info, wc, + (struct ib_mad_hdr *)recv->mad); + mad_size = recv->mad_size; response = alloc_mad_private(mad_size, GFP_KERNEL); if (!response) @@ -2308,9 +2356,9 @@ static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc) if (port_priv->device->ops.process_mad) { ret = port_priv->device->ops.process_mad( port_priv->device, 0, port_priv->port_num, wc, - &recv->grh, (const struct ib_mad_hdr *)recv->mad, - recv->mad_size, (struct ib_mad_hdr *)response->mad, - &mad_size, &resp_mad_pkey_index); + &recv->grh, (const struct ib_mad *)recv->mad, + (struct ib_mad *)response->mad, &mad_size, + &resp_mad_pkey_index); if (opa) wc->pkey_index = resp_mad_pkey_index; @@ -2332,6 +2380,7 @@ static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc) mad_agent = find_mad_agent(port_priv, (const struct ib_mad_hdr *)recv->mad); if (mad_agent) { + trace_ib_mad_recv_done_agent(mad_agent); ib_mad_complete_recv(mad_agent, &recv->header.recv_wc); /* * recv is freed up in error cases in ib_mad_complete_recv @@ -2381,29 +2430,11 @@ static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv) static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_mad_agent_private *mad_agent_priv; - struct ib_mad_send_wr_private *temp_mad_send_wr; - struct list_head *list_item; unsigned long delay; mad_agent_priv = mad_send_wr->mad_agent_priv; - list_del(&mad_send_wr->agent_list); - delay = mad_send_wr->timeout; - mad_send_wr->timeout += jiffies; - - if (delay) { - list_for_each_prev(list_item, &mad_agent_priv->wait_list) { - temp_mad_send_wr = list_entry(list_item, - struct ib_mad_send_wr_private, - agent_list); - if (time_after(mad_send_wr->timeout, - temp_mad_send_wr->timeout)) - break; - } - } - else - list_item = &mad_agent_priv->wait_list; - 
list_add(&mad_send_wr->agent_list, list_item); + change_mad_state(mad_send_wr, IB_MAD_STATE_WAIT_RESP); /* Reschedule a work item if we have a shorter timeout */ if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list) @@ -2437,32 +2468,28 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, } else ret = IB_RMPP_RESULT_UNHANDLED; - if (mad_send_wc->status != IB_WC_SUCCESS && - mad_send_wr->status == IB_WC_SUCCESS) { - mad_send_wr->status = mad_send_wc->status; - mad_send_wr->refcount -= (mad_send_wr->timeout > 0); - } - - if (--mad_send_wr->refcount > 0) { - if (mad_send_wr->refcount == 1 && mad_send_wr->timeout && - mad_send_wr->status == IB_WC_SUCCESS) { - wait_for_response(mad_send_wr); - } + if (mad_send_wr->state == IB_MAD_STATE_CANCELED) + mad_send_wc->status = IB_WC_WR_FLUSH_ERR; + else if (mad_send_wr->state == IB_MAD_STATE_SEND_START && + mad_send_wr->timeout) { + wait_for_response(mad_send_wr); goto done; } /* Remove send from MAD agent and notify client of completion */ - list_del(&mad_send_wr->agent_list); + if (mad_send_wr->state != IB_MAD_STATE_DONE) + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); adjust_timeout(mad_agent_priv); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - if (mad_send_wr->status != IB_WC_SUCCESS ) - mad_send_wc->status = mad_send_wr->status; - if (ret == IB_RMPP_RESULT_INTERNAL) + if (ret == IB_RMPP_RESULT_INTERNAL) { ib_rmpp_send_handler(mad_send_wc); - else + } else { + if (mad_send_wr->is_solicited_fc) + process_backlog_mads(mad_agent_priv); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, mad_send_wc); + } /* Release reference on agent taken when sending */ deref_mad_agent(mad_agent_priv); @@ -2496,6 +2523,9 @@ static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc) send_queue = mad_list->mad_queue; qp_info = send_queue->qp_info; + trace_ib_mad_send_done_agent(mad_send_wr->mad_agent_priv); + trace_ib_mad_send_done_handler(mad_send_wr, wc); + retry: ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device, mad_send_wr->header_mapping, @@ -2521,12 +2551,10 @@ retry: mad_send_wc.send_buf = &mad_send_wr->send_buf; mad_send_wc.status = wc->status; mad_send_wc.vendor_err = wc->vendor_err; - if (atomic_read(&qp_info->snoop_count)) - snoop_send(qp_info, &mad_send_wr->send_buf, &mad_send_wc, - IB_MAD_SNOOP_SEND_COMPLETIONS); ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); if (queued_send_wr) { + trace_ib_mad_send_done_resend(queued_send_wr, qp_info); ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr.wr, NULL); if (ret) { @@ -2574,6 +2602,7 @@ static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, if (mad_send_wr->retry) { /* Repost send */ mad_send_wr->retry = 0; + trace_ib_mad_error_handler(mad_send_wr, qp_info); ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr, NULL); if (!ret) @@ -2602,40 +2631,53 @@ static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, return true; } +static void clear_mad_error_list(struct list_head *list, + enum ib_wc_status wc_status, + struct ib_mad_agent_private *mad_agent_priv) +{ + struct ib_mad_send_wr_private *mad_send_wr, *n; + struct ib_mad_send_wc mad_send_wc; + + mad_send_wc.status = wc_status; + mad_send_wc.vendor_err = 0; + + list_for_each_entry_safe(mad_send_wr, n, list, agent_list) { + mad_send_wc.send_buf = &mad_send_wr->send_buf; + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + &mad_send_wc); + deref_mad_agent(mad_agent_priv); + } +} + static void cancel_mads(struct ib_mad_agent_private 
*mad_agent_priv) { unsigned long flags; struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr; - struct ib_mad_send_wc mad_send_wc; struct list_head cancel_list; INIT_LIST_HEAD(&cancel_list); spin_lock_irqsave(&mad_agent_priv->lock, flags); list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, - &mad_agent_priv->send_list, agent_list) { - if (mad_send_wr->status == IB_WC_SUCCESS) { - mad_send_wr->status = IB_WC_WR_FLUSH_ERR; - mad_send_wr->refcount -= (mad_send_wr->timeout > 0); - } - } - - /* Empty wait list to prevent receives from finding a request */ - list_splice_init(&mad_agent_priv->wait_list, &cancel_list); - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + &mad_agent_priv->send_list, agent_list) + change_mad_state(mad_send_wr, IB_MAD_STATE_CANCELED); - /* Report all cancelled requests */ - mad_send_wc.status = IB_WC_WR_FLUSH_ERR; - mad_send_wc.vendor_err = 0; + /* Empty wait & backlog list to prevent receives from finding request */ + list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, + &mad_agent_priv->wait_list, agent_list) { + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); + list_add_tail(&mad_send_wr->agent_list, &cancel_list); + } list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, - &cancel_list, agent_list) { - mad_send_wc.send_buf = &mad_send_wr->send_buf; - list_del(&mad_send_wr->agent_list); - mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, - &mad_send_wc); - atomic_dec(&mad_agent_priv->refcount); + &mad_agent_priv->backlog_list, agent_list) { + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); + list_add_tail(&mad_send_wr->agent_list, &cancel_list); } + + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + /* Report all cancelled requests */ + clear_mad_error_list(&cancel_list, IB_WC_WR_FLUSH_ERR, mad_agent_priv); } static struct ib_mad_send_wr_private* @@ -2657,31 +2699,40 @@ find_send_wr(struct ib_mad_agent_private *mad_agent_priv, &mad_send_wr->send_buf == send_buf) return mad_send_wr; } + + list_for_each_entry(mad_send_wr, &mad_agent_priv->backlog_list, + agent_list) { + if (&mad_send_wr->send_buf == send_buf) + return mad_send_wr; + } + return NULL; } -int ib_modify_mad(struct ib_mad_agent *mad_agent, - struct ib_mad_send_buf *send_buf, u32 timeout_ms) +int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; unsigned long flags; int active; - mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private, - agent); + if (!send_buf) + return -EINVAL; + + mad_agent_priv = container_of(send_buf->mad_agent, + struct ib_mad_agent_private, agent); spin_lock_irqsave(&mad_agent_priv->lock, flags); mad_send_wr = find_send_wr(mad_agent_priv, send_buf); - if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) { + if (!mad_send_wr || mad_send_wr->state == IB_MAD_STATE_CANCELED) { spin_unlock_irqrestore(&mad_agent_priv->lock, flags); return -EINVAL; } - active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1); - if (!timeout_ms) { - mad_send_wr->status = IB_WC_WR_FLUSH_ERR; - mad_send_wr->refcount -= (mad_send_wr->timeout > 0); - } + active = ((mad_send_wr->state == IB_MAD_STATE_SEND_START) || + (mad_send_wr->state == IB_MAD_STATE_EARLY_RESP) || + (mad_send_wr->state == IB_MAD_STATE_QUEUED && timeout_ms)); + if (!timeout_ms) + change_mad_state(mad_send_wr, IB_MAD_STATE_CANCELED); mad_send_wr->send_buf.timeout_ms = timeout_ms; if (active) @@ -2694,13 +2745,6 @@ int ib_modify_mad(struct ib_mad_agent *mad_agent, } 
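[Editorial note, not part of the patch] The ib_modify_mad() hunk above drops the ib_mad_agent argument and keys everything off the send buffer: a zero timeout moves the work request to IB_MAD_STATE_CANCELED, and the completion path then reports IB_WC_WR_FLUSH_ERR; the exported ib_cancel_mad() wrapper is removed in the hunk that follows. A minimal, hypothetical caller sketch, assuming <rdma/ib_mad.h> exposes the reworked prototype shown above:

#include <linux/types.h>
#include <rdma/ib_mad.h>

/* Cancel an outstanding MAD send: a timeout of 0 marks the work request
 * IB_MAD_STATE_CANCELED, and the agent's send handler later observes
 * IB_WC_WR_FLUSH_ERR for this send_buf. */
static int example_cancel_mad(struct ib_mad_send_buf *send_buf)
{
	return ib_modify_mad(send_buf, 0);
}

/* Stretch the response timeout of an active send; a non-zero value keeps
 * the request alive and reschedules the response wait. */
static int example_extend_mad_timeout(struct ib_mad_send_buf *send_buf,
				      u32 timeout_ms)
{
	return ib_modify_mad(send_buf, timeout_ms);
}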
EXPORT_SYMBOL(ib_modify_mad); -void ib_cancel_mad(struct ib_mad_agent *mad_agent, - struct ib_mad_send_buf *send_buf) -{ - ib_modify_mad(mad_agent, send_buf, 0); -} -EXPORT_SYMBOL(ib_cancel_mad); - static void local_completions(struct work_struct *work) { struct ib_mad_agent_private *mad_agent_priv; @@ -2763,16 +2807,12 @@ static void local_completions(struct work_struct *work) local->mad_priv->header.recv_wc.recv_buf.grh = NULL; local->mad_priv->header.recv_wc.recv_buf.mad = (struct ib_mad *)local->mad_priv->mad; - if (atomic_read(&recv_mad_agent->qp_info->snoop_count)) - snoop_recv(recv_mad_agent->qp_info, - &local->mad_priv->header.recv_wc, - IB_MAD_SNOOP_RECVS); recv_mad_agent->agent.recv_handler( &recv_mad_agent->agent, &local->mad_send_wr->send_buf, &local->mad_priv->header.recv_wc); spin_lock_irqsave(&recv_mad_agent->lock, flags); - atomic_dec(&recv_mad_agent->refcount); + deref_mad_agent(recv_mad_agent); spin_unlock_irqrestore(&recv_mad_agent->lock, flags); } @@ -2781,15 +2821,11 @@ local_send_completion: mad_send_wc.status = IB_WC_SUCCESS; mad_send_wc.vendor_err = 0; mad_send_wc.send_buf = &local->mad_send_wr->send_buf; - if (atomic_read(&mad_agent_priv->qp_info->snoop_count)) - snoop_send(mad_agent_priv->qp_info, - &local->mad_send_wr->send_buf, - &mad_send_wc, IB_MAD_SNOOP_SEND_COMPLETIONS); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); spin_lock_irqsave(&mad_agent_priv->lock, flags); - atomic_dec(&mad_agent_priv->refcount); + deref_mad_agent(mad_agent_priv); if (free_mad) kfree(local->mad_priv); kfree(local); @@ -2808,6 +2844,11 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) mad_send_wr->send_buf.retries++; mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); + if (mad_send_wr->is_solicited_fc && + !list_empty(&mad_send_wr->mad_agent_priv->backlog_list)) { + change_mad_state(mad_send_wr, IB_MAD_STATE_QUEUED); + return 0; + } if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) { ret = ib_retry_rmpp(mad_send_wr); @@ -2825,24 +2866,25 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) } else ret = ib_send_mad(mad_send_wr); - if (!ret) { - mad_send_wr->refcount++; - list_add_tail(&mad_send_wr->agent_list, - &mad_send_wr->mad_agent_priv->send_list); - } + if (!ret) + change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START); + return ret; } static void timeout_sends(struct work_struct *work) { - struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; - struct ib_mad_send_wc mad_send_wc; + struct ib_mad_agent_private *mad_agent_priv; + struct list_head timeout_list; + struct list_head cancel_list; + struct list_head *list_item; unsigned long flags, delay; mad_agent_priv = container_of(work, struct ib_mad_agent_private, timed_work.work); - mad_send_wc.vendor_err = 0; + INIT_LIST_HEAD(&timeout_list); + INIT_LIST_HEAD(&cancel_list); spin_lock_irqsave(&mad_agent_priv->lock, flags); while (!list_empty(&mad_agent_priv->wait_list)) { @@ -2860,25 +2902,22 @@ static void timeout_sends(struct work_struct *work) break; } - list_del(&mad_send_wr->agent_list); - if (mad_send_wr->status == IB_WC_SUCCESS && - !retry_send(mad_send_wr)) - continue; - - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - - if (mad_send_wr->status == IB_WC_SUCCESS) - mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR; + if (mad_send_wr->state == IB_MAD_STATE_CANCELED) + list_item = &cancel_list; + else if (retry_send(mad_send_wr)) + list_item = &timeout_list; else - mad_send_wc.status = 
mad_send_wr->status; - mad_send_wc.send_buf = &mad_send_wr->send_buf; - mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, - &mad_send_wc); + continue; - atomic_dec(&mad_agent_priv->refcount); - spin_lock_irqsave(&mad_agent_priv->lock, flags); + change_mad_state(mad_send_wr, IB_MAD_STATE_DONE); + list_add_tail(&mad_send_wr->agent_list, list_item); } + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + process_backlog_mads(mad_agent_priv); + clear_mad_error_list(&timeout_list, IB_WC_RESP_TIMEOUT_ERR, + mad_agent_priv); + clear_mad_error_list(&cancel_list, IB_WC_WR_FLUSH_ERR, mad_agent_priv); } /* @@ -2888,11 +2927,11 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, struct ib_mad_private *mad) { unsigned long flags; - int post, ret; struct ib_mad_private *mad_priv; struct ib_sge sg_list; struct ib_recv_wr recv_wr; struct ib_mad_queue *recv_queue = &qp_info->recv_queue; + int ret = 0; /* Initialize common scatter list fields */ sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey; @@ -2902,7 +2941,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, recv_wr.sg_list = &sg_list; recv_wr.num_sge = 1; - do { + while (true) { /* Allocate and map receive buffer */ if (mad) { mad_priv = mad; @@ -2910,10 +2949,8 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, } else { mad_priv = alloc_mad_private(port_mad_size(qp_info->port_priv), GFP_ATOMIC); - if (!mad_priv) { - ret = -ENOMEM; - break; - } + if (!mad_priv) + return -ENOMEM; } sg_list.length = mad_priv_dma_size(mad_priv); sg_list.addr = ib_dma_map_single(qp_info->port_priv->device, @@ -2923,35 +2960,40 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device, sg_list.addr))) { ret = -ENOMEM; - break; + goto free_mad_priv; } mad_priv->header.mapping = sg_list.addr; mad_priv->header.mad_list.mad_queue = recv_queue; mad_priv->header.mad_list.cqe.done = ib_mad_recv_done; recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe; - - /* Post receive WR */ spin_lock_irqsave(&recv_queue->lock, flags); - post = (++recv_queue->count < recv_queue->max_active); - list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list); + if (recv_queue->count >= recv_queue->max_active) { + /* Fully populated the receive queue */ + spin_unlock_irqrestore(&recv_queue->lock, flags); + break; + } + recv_queue->count++; + list_add_tail(&mad_priv->header.mad_list.list, + &recv_queue->list); spin_unlock_irqrestore(&recv_queue->lock, flags); + ret = ib_post_recv(qp_info->qp, &recv_wr, NULL); if (ret) { spin_lock_irqsave(&recv_queue->lock, flags); list_del(&mad_priv->header.mad_list.list); recv_queue->count--; spin_unlock_irqrestore(&recv_queue->lock, flags); - ib_dma_unmap_single(qp_info->port_priv->device, - mad_priv->header.mapping, - mad_priv_dma_size(mad_priv), - DMA_FROM_DEVICE); - kfree(mad_priv); dev_err(&qp_info->port_priv->device->dev, "ib_post_recv failed: %d\n", ret); break; } - } while (post); + } + ib_dma_unmap_single(qp_info->port_priv->device, + mad_priv->header.mapping, + mad_priv_dma_size(mad_priv), DMA_FROM_DEVICE); +free_mad_priv: + kfree(mad_priv); return ret; } @@ -3080,7 +3122,7 @@ static void qp_event_handler(struct ib_event *event, void *qp_context) /* It's worse than that! He's dead, Jim! 
*/ dev_err(&qp_info->port_priv->device->dev, - "Fatal error (%d) on MAD QP (%d)\n", + "Fatal error (%d) on MAD QP (%u)\n", event->event, qp_info->qp->qp_num); } @@ -3100,10 +3142,6 @@ static void init_mad_qp(struct ib_mad_port_private *port_priv, init_mad_queue(qp_info, &qp_info->send_queue); init_mad_queue(qp_info, &qp_info->recv_queue); INIT_LIST_HEAD(&qp_info->overflow_list); - spin_lock_init(&qp_info->snoop_lock); - qp_info->snoop_table = NULL; - qp_info->snoop_table_size = 0; - atomic_set(&qp_info->snoop_count, 0); } static int create_mad_qp(struct ib_mad_qp_info *qp_info, @@ -3147,7 +3185,6 @@ static void destroy_mad_qp(struct ib_mad_qp_info *qp_info) return; ib_destroy_qp(qp_info->qp); - kfree(qp_info->snoop_table); } /* @@ -3155,12 +3192,11 @@ static void destroy_mad_qp(struct ib_mad_qp_info *qp_info) * Create the QP, PD, MR, and CQ if needed */ static int ib_mad_port_open(struct ib_device *device, - int port_num) + u32 port_num) { int ret, cq_size; struct ib_mad_port_private *port_priv; unsigned long flags; - char name[sizeof "ib_mad123"]; int has_smi; if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE)) @@ -3186,18 +3222,18 @@ static int ib_mad_port_open(struct ib_device *device, if (has_smi) cq_size *= 2; + port_priv->pd = ib_alloc_pd(device, 0); + if (IS_ERR(port_priv->pd)) { + dev_err(&device->dev, "Couldn't create ib_mad PD\n"); + ret = PTR_ERR(port_priv->pd); + goto error3; + } + port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0, IB_POLL_UNBOUND_WORKQUEUE); if (IS_ERR(port_priv->cq)) { dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); - goto error3; - } - - port_priv->pd = ib_alloc_pd(device, 0); - if (IS_ERR(port_priv->pd)) { - dev_err(&device->dev, "Couldn't create ib_mad PD\n"); - ret = PTR_ERR(port_priv->pd); goto error4; } @@ -3206,12 +3242,15 @@ static int ib_mad_port_open(struct ib_device *device, if (ret) goto error6; } - ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI); - if (ret) - goto error7; - snprintf(name, sizeof name, "ib_mad%d", port_num); - port_priv->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + if (rdma_cap_ib_cm(device, port_num)) { + ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI); + if (ret) + goto error7; + } + + port_priv->wq = alloc_ordered_workqueue("ib_mad%u", WQ_MEM_RECLAIM, + port_num); if (!port_priv->wq) { ret = -ENOMEM; goto error8; @@ -3240,11 +3279,11 @@ error8: error7: destroy_mad_qp(&port_priv->qp_info[0]); error6: - ib_dealloc_pd(port_priv->pd); -error4: ib_free_cq(port_priv->cq); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); +error4: + ib_dealloc_pd(port_priv->pd); error3: kfree(port_priv); @@ -3256,7 +3295,7 @@ error3: * If there are no classes using the port, free the port * resources (CQ, MR, PD, QP) and remove the port's info structure */ -static int ib_mad_port_close(struct ib_device *device, int port_num) +static int ib_mad_port_close(struct ib_device *device, u32 port_num) { struct ib_mad_port_private *port_priv; unsigned long flags; @@ -3265,7 +3304,7 @@ static int ib_mad_port_close(struct ib_device *device, int port_num) port_priv = __ib_get_mad_port(device, port_num); if (port_priv == NULL) { spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); - dev_err(&device->dev, "Port %d not found\n", port_num); + dev_err(&device->dev, "Port %u not found\n", port_num); return -ENODEV; } list_del_init(&port_priv->port_list); @@ -3274,8 +3313,8 @@ static int ib_mad_port_close(struct ib_device *device, int 
port_num) destroy_workqueue(port_priv->wq); destroy_mad_qp(&port_priv->qp_info[1]); destroy_mad_qp(&port_priv->qp_info[0]); - ib_dealloc_pd(port_priv->pd); ib_free_cq(port_priv->cq); + ib_dealloc_pd(port_priv->pd); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); /* XXX: Handle deallocation of MAD registration tables */ @@ -3285,9 +3324,11 @@ static int ib_mad_port_close(struct ib_device *device, int port_num) return 0; } -static void ib_mad_init_device(struct ib_device *device) +static int ib_mad_init_device(struct ib_device *device) { int start, i; + unsigned int count = 0; + int ret; start = rdma_start_port(device); @@ -3295,17 +3336,23 @@ static void ib_mad_init_device(struct ib_device *device) if (!rdma_cap_ib_mad(device, i)) continue; - if (ib_mad_port_open(device, i)) { + ret = ib_mad_port_open(device, i); + if (ret) { dev_err(&device->dev, "Couldn't open port %d\n", i); goto error; } - if (ib_agent_port_open(device, i)) { + ret = ib_agent_port_open(device, i); + if (ret) { dev_err(&device->dev, "Couldn't open port %d for agents\n", i); goto error_agent; } + count++; } - return; + if (!count) + return -EOPNOTSUPP; + + return 0; error_agent: if (ib_mad_port_close(device, i)) @@ -3322,21 +3369,22 @@ error: if (ib_mad_port_close(device, i)) dev_err(&device->dev, "Couldn't close port %d\n", i); } + return ret; } static void ib_mad_remove_device(struct ib_device *device, void *client_data) { - int i; + unsigned int i; - for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + rdma_for_each_port (device, i) { if (!rdma_cap_ib_mad(device, i)) continue; if (ib_agent_port_close(device, i)) dev_err(&device->dev, - "Couldn't close port %d for agents\n", i); + "Couldn't close port %u for agents\n", i); if (ib_mad_port_close(device, i)) - dev_err(&device->dev, "Couldn't close port %d\n", i); + dev_err(&device->dev, "Couldn't close port %u\n", i); } } @@ -3356,9 +3404,6 @@ int ib_mad_init(void) INIT_LIST_HEAD(&ib_mad_port_list); - /* Client ID 0 is used for snoop-only clients */ - idr_alloc(&ib_mad_clients, NULL, 0, 0, GFP_KERNEL); - if (ib_register_client(&mad_client)) { pr_err("Couldn't register ib_mad client\n"); return -EINVAL; diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index 216509036aa8..f444357d33f4 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -73,19 +73,19 @@ struct ib_mad_private_header { struct ib_mad_recv_wc recv_wc; struct ib_wc wc; u64 mapping; -} __attribute__ ((packed)); +} __packed; struct ib_mad_private { struct ib_mad_private_header header; size_t mad_size; struct ib_grh grh; - u8 mad[0]; -} __attribute__ ((packed)); + u8 mad[]; +} __packed; struct ib_rmpp_segment { struct list_head list; u32 num; - u8 data[0]; + u8 data[]; }; struct ib_mad_agent_private { @@ -95,15 +95,18 @@ struct ib_mad_agent_private { spinlock_t lock; struct list_head send_list; + unsigned int sol_fc_send_count; struct list_head wait_list; - struct list_head done_list; + unsigned int sol_fc_wait_count; struct delayed_work timed_work; unsigned long timeout; struct list_head local_list; struct work_struct local_work; struct list_head rmpp_list; + unsigned int sol_fc_max; + struct list_head backlog_list; - atomic_t refcount; + refcount_t refcount; union { struct completion comp; struct rcu_head rcu; @@ -115,10 +118,35 @@ struct ib_mad_snoop_private { struct ib_mad_qp_info *qp_info; int snoop_index; int mad_snoop_flags; - atomic_t refcount; struct completion comp; }; +enum 
ib_mad_state { + /* MAD is in the making and is not yet in any list */ + IB_MAD_STATE_INIT, + /* MAD is in backlog list */ + IB_MAD_STATE_QUEUED, + /* + * MAD was sent to the QP and is waiting for completion + * notification in send list. + */ + IB_MAD_STATE_SEND_START, + /* + * MAD send completed successfully, waiting for a response + * in wait list. + */ + IB_MAD_STATE_WAIT_RESP, + /* + * Response came early, before send completion notification, + * in send list. + */ + IB_MAD_STATE_EARLY_RESP, + /* MAD was canceled while in wait or send list */ + IB_MAD_STATE_CANCELED, + /* MAD processing completed, MAD in no list */ + IB_MAD_STATE_DONE +}; + struct ib_mad_send_wr_private { struct ib_mad_list_head mad_list; struct list_head agent_list; @@ -133,8 +161,6 @@ struct ib_mad_send_wr_private { int max_retries; int retries_left; int retry; - int refcount; - enum ib_wc_status status; /* RMPP control */ struct list_head rmpp_list; @@ -144,8 +170,48 @@ struct ib_mad_send_wr_private { int seg_num; int newwin; int pad; + + enum ib_mad_state state; + + /* Solicited MAD flow control */ + bool is_solicited_fc; }; +static inline void expect_mad_state(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_mad_state expected_state) +{ + if (IS_ENABLED(CONFIG_LOCKDEP)) + WARN_ON(mad_send_wr->state != expected_state); +} + +static inline void expect_mad_state2(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_mad_state expected_state1, + enum ib_mad_state expected_state2) +{ + if (IS_ENABLED(CONFIG_LOCKDEP)) + WARN_ON(mad_send_wr->state != expected_state1 && + mad_send_wr->state != expected_state2); +} + +static inline void expect_mad_state3(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_mad_state expected_state1, + enum ib_mad_state expected_state2, + enum ib_mad_state expected_state3) +{ + if (IS_ENABLED(CONFIG_LOCKDEP)) + WARN_ON(mad_send_wr->state != expected_state1 && + mad_send_wr->state != expected_state2 && + mad_send_wr->state != expected_state3); +} + +static inline void +not_expect_mad_state(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_mad_state wrong_state) +{ + if (IS_ENABLED(CONFIG_LOCKDEP)) + WARN_ON(mad_send_wr->state == wrong_state); +} + struct ib_mad_local_private { struct list_head completion_list; struct ib_mad_private *mad_priv; @@ -223,4 +289,7 @@ void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr); void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, unsigned long timeout_ms); +void change_mad_state(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_mad_state new_state); + #endif /* __IB_MAD_PRIV_H__ */ diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c index 5ec57abc0849..1c5e0eaf1c94 100644 --- a/drivers/infiniband/core/mad_rmpp.c +++ b/drivers/infiniband/core/mad_rmpp.c @@ -40,8 +40,7 @@ enum rmpp_state { RMPP_STATE_ACTIVE, RMPP_STATE_TIMEOUT, - RMPP_STATE_COMPLETE, - RMPP_STATE_CANCELING + RMPP_STATE_COMPLETE }; struct mad_rmpp_recv { @@ -52,7 +51,7 @@ struct mad_rmpp_recv { struct completion comp; enum rmpp_state state; spinlock_t lock; - atomic_t refcount; + refcount_t refcount; struct ib_ah *ah; struct ib_mad_recv_wc *rmpp_wc; @@ -73,7 +72,7 @@ struct mad_rmpp_recv { static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) { - if (atomic_dec_and_test(&rmpp_recv->refcount)) + if (refcount_dec_and_test(&rmpp_recv->refcount)) complete(&rmpp_recv->comp); } @@ -92,22 +91,18 @@ void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent) spin_lock_irqsave(&agent->lock, flags); 
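[Editorial note, not part of the patch] The mad_priv.h hunks above add a backlog_list plus the sol_fc_send_count/sol_fc_wait_count/sol_fc_max counters, and process_backlog_mads() earlier in this diff drains the backlog only while the two counters stay below the maximum. A stripped-down, self-contained model of that credit gate, with invented names, for illustration only; the real code additionally moves each dequeued request to IB_MAD_STATE_SEND_START and drops the lock around the actual post:

#include <linux/list.h>
#include <linux/spinlock.h>

struct fc_agent {
	spinlock_t lock;
	unsigned int send_count;	/* solicited sends in flight */
	unsigned int wait_count;	/* solicited sends awaiting a response */
	unsigned int max_outstanding;	/* analogue of sol_fc_max */
	struct list_head backlog;	/* queued sends, FIFO order */
};

struct fc_work {
	struct list_head node;
};

/* Pop backlogged work only while flow-control credits remain. */
static struct fc_work *fc_try_dequeue(struct fc_agent *agent)
{
	struct fc_work *work = NULL;

	spin_lock(&agent->lock);
	if (!list_empty(&agent->backlog) &&
	    agent->send_count + agent->wait_count < agent->max_outstanding) {
		work = list_first_entry(&agent->backlog, struct fc_work, node);
		list_del(&work->node);
		agent->send_count++;	/* consume one credit */
	}
	spin_unlock(&agent->lock);
	return work;
}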
list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { - if (rmpp_recv->state != RMPP_STATE_COMPLETE) - ib_free_recv_mad(rmpp_recv->rmpp_wc); - rmpp_recv->state = RMPP_STATE_CANCELING; - } - spin_unlock_irqrestore(&agent->lock, flags); - - list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { cancel_delayed_work(&rmpp_recv->timeout_work); cancel_delayed_work(&rmpp_recv->cleanup_work); } + spin_unlock_irqrestore(&agent->lock, flags); flush_workqueue(agent->qp_info->port_priv->wq); list_for_each_entry_safe(rmpp_recv, temp_rmpp_recv, &agent->rmpp_list, list) { list_del(&rmpp_recv->list); + if (rmpp_recv->state != RMPP_STATE_COMPLETE) + ib_free_recv_mad(rmpp_recv->rmpp_wc); destroy_rmpp_recv(rmpp_recv); } } @@ -163,7 +158,7 @@ static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent, ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc, recv_wc->recv_buf.grh, agent->port_num); if (IS_ERR(ah)) - return (void *) ah; + return ERR_CAST(ah); hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); msg = ib_create_send_mad(agent, recv_wc->wc->src_qp, @@ -272,10 +267,6 @@ static void recv_cleanup_handler(struct work_struct *work) unsigned long flags; spin_lock_irqsave(&rmpp_recv->agent->lock, flags); - if (rmpp_recv->state == RMPP_STATE_CANCELING) { - spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); - return; - } list_del(&rmpp_recv->list); spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); destroy_rmpp_recv(rmpp_recv); @@ -305,7 +296,7 @@ create_rmpp_recv(struct ib_mad_agent_private *agent, INIT_DELAYED_WORK(&rmpp_recv->cleanup_work, recv_cleanup_handler); spin_lock_init(&rmpp_recv->lock); rmpp_recv->state = RMPP_STATE_ACTIVE; - atomic_set(&rmpp_recv->refcount, 1); + refcount_set(&rmpp_recv->refcount, 1); rmpp_recv->rmpp_wc = mad_recv_wc; rmpp_recv->cur_seg_buf = &mad_recv_wc->recv_buf; @@ -357,7 +348,7 @@ acquire_rmpp_recv(struct ib_mad_agent_private *agent, spin_lock_irqsave(&agent->lock, flags); rmpp_recv = find_rmpp_recv(agent, mad_recv_wc); if (rmpp_recv) - atomic_inc(&rmpp_recv->refcount); + refcount_inc(&rmpp_recv->refcount); spin_unlock_irqrestore(&agent->lock, flags); return rmpp_recv; } @@ -391,8 +382,8 @@ static inline int get_seg_num(struct ib_mad_recv_buf *seg) return be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num); } -static inline struct ib_mad_recv_buf * get_next_seg(struct list_head *rmpp_list, - struct ib_mad_recv_buf *seg) +static inline struct ib_mad_recv_buf *get_next_seg(struct list_head *rmpp_list, + struct ib_mad_recv_buf *seg) { if (seg->list.next == rmpp_list) return NULL; @@ -405,8 +396,8 @@ static inline int window_size(struct ib_mad_agent_private *agent) return max(agent->qp_info->recv_queue.max_active >> 3, 1); } -static struct ib_mad_recv_buf * find_seg_location(struct list_head *rmpp_list, - int seg_num) +static struct ib_mad_recv_buf *find_seg_location(struct list_head *rmpp_list, + int seg_num) { struct ib_mad_recv_buf *seg_buf; int cur_seg_num; @@ -458,7 +449,7 @@ static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv) return hdr_size + rmpp_recv->seg_num * data_size - pad; } -static struct ib_mad_recv_wc * complete_rmpp(struct mad_rmpp_recv *rmpp_recv) +static struct ib_mad_recv_wc *complete_rmpp(struct mad_rmpp_recv *rmpp_recv) { struct ib_mad_recv_wc *rmpp_wc; @@ -553,7 +544,7 @@ start_rmpp(struct ib_mad_agent_private *agent, destroy_rmpp_recv(rmpp_recv); return continue_rmpp(agent, mad_recv_wc); } - atomic_inc(&rmpp_recv->refcount); + refcount_inc(&rmpp_recv->refcount); if 
(get_last_flag(&mad_recv_wc->recv_buf)) { rmpp_recv->state = RMPP_STATE_COMPLETE; @@ -617,16 +608,20 @@ static void abort_send(struct ib_mad_agent_private *agent, goto out; /* Unmatched send */ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) || - (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS)) + (!mad_send_wr->timeout) || + (mad_send_wr->state == IB_MAD_STATE_CANCELED)) goto out; /* Send is already done */ ib_mark_mad_done(mad_send_wr); + if (mad_send_wr->state == IB_MAD_STATE_DONE) { + spin_unlock_irqrestore(&agent->lock, flags); + wc.status = IB_WC_REM_ABORT_ERR; + wc.vendor_err = rmpp_status; + wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &wc); + return; + } spin_unlock_irqrestore(&agent->lock, flags); - - wc.status = IB_WC_REM_ABORT_ERR; - wc.vendor_err = rmpp_status; - wc.send_buf = &mad_send_wr->send_buf; - ib_mad_complete_send_wr(mad_send_wr, &wc); return; out: spin_unlock_irqrestore(&agent->lock, flags); @@ -693,7 +688,8 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent, } if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) || - (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS)) + (!mad_send_wr->timeout) || + (mad_send_wr->state == IB_MAD_STATE_CANCELED)) goto out; /* Send is already done */ if (seg_num > mad_send_wr->send_buf.seg_count || @@ -718,21 +714,24 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent, struct ib_mad_send_wc wc; ib_mark_mad_done(mad_send_wr); + if (mad_send_wr->state == IB_MAD_STATE_DONE) { + spin_unlock_irqrestore(&agent->lock, flags); + wc.status = IB_WC_SUCCESS; + wc.vendor_err = 0; + wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &wc); + return; + } spin_unlock_irqrestore(&agent->lock, flags); - - wc.status = IB_WC_SUCCESS; - wc.vendor_err = 0; - wc.send_buf = &mad_send_wr->send_buf; - ib_mad_complete_send_wr(mad_send_wr, &wc); return; } - if (mad_send_wr->refcount == 1) + if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) ib_reset_mad_timeout(mad_send_wr, mad_send_wr->send_buf.timeout_ms); spin_unlock_irqrestore(&agent->lock, flags); ack_ds_ack(agent, mad_recv_wc); return; - } else if (mad_send_wr->refcount == 1 && + } else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP && mad_send_wr->seg_num < mad_send_wr->newwin && mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) { /* Send failure will just result in a timeout/retry */ @@ -740,7 +739,7 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent, if (ret) goto out; - mad_send_wr->refcount++; + change_mad_state(mad_send_wr, IB_MAD_STATE_SEND_START); list_move_tail(&mad_send_wr->agent_list, &mad_send_wr->mad_agent_priv->send_list); } @@ -899,7 +898,6 @@ int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr) mad_send_wr->newwin = init_newwin(mad_send_wr); /* We need to wait for the final ACK even if there isn't a response */ - mad_send_wr->refcount += (mad_send_wr->timeout == 0); ret = send_next_seg(mad_send_wr); if (!ret) return IB_RMPP_RESULT_CONSUMED; @@ -921,7 +919,7 @@ int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr, return IB_RMPP_RESULT_INTERNAL; /* ACK, STOP, or ABORT */ if (mad_send_wc->status != IB_WC_SUCCESS || - mad_send_wr->status != IB_WC_SUCCESS) + mad_send_wr->state == IB_MAD_STATE_CANCELED) return IB_RMPP_RESULT_PROCESSED; /* Canceled or send error */ if (!mad_send_wr->timeout) diff --git a/drivers/infiniband/core/mr_pool.c b/drivers/infiniband/core/mr_pool.c index 
49d478b2ea94..c0e2df128b34 100644 --- a/drivers/infiniband/core/mr_pool.c +++ b/drivers/infiniband/core/mr_pool.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2016 HGST, a Western Digital Company. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include <rdma/ib_verbs.h> #include <rdma/mr_pool.h> @@ -42,14 +34,18 @@ void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr) EXPORT_SYMBOL(ib_mr_pool_put); int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr, - enum ib_mr_type type, u32 max_num_sg) + enum ib_mr_type type, u32 max_num_sg, u32 max_num_meta_sg) { struct ib_mr *mr; unsigned long flags; int ret, i; for (i = 0; i < nr; i++) { - mr = ib_alloc_mr(qp->pd, type, max_num_sg); + if (type == IB_MR_TYPE_INTEGRITY) + mr = ib_alloc_mr_integrity(qp->pd, max_num_sg, + max_num_meta_sg); + else + mr = ib_alloc_mr(qp->pd, type, max_num_sg); if (IS_ERR(mr)) { ret = PTR_ERR(mr); goto out; diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index d50ff70bb24b..a236532a9026 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -42,7 +42,7 @@ #include <rdma/ib_cache.h> #include "sa.h" -static void mcast_add_one(struct ib_device *device); +static int mcast_add_one(struct ib_device *device); static void mcast_remove_one(struct ib_device *device, void *client_data); static struct ib_client mcast_client = { @@ -61,9 +61,9 @@ struct mcast_port { struct mcast_device *dev; spinlock_t lock; struct rb_root table; - atomic_t refcount; + refcount_t refcount; struct completion comp; - u8 port_num; + u32 port_num; }; struct mcast_device { @@ -71,7 +71,7 @@ struct mcast_device { struct ib_event_handler event_handler; int start_port; int end_port; - struct mcast_port port[0]; + struct mcast_port port[]; }; enum mcast_state { @@ -117,7 +117,7 @@ struct mcast_member { struct mcast_group *group; struct list_head list; enum mcast_state state; - atomic_t refcount; + refcount_t refcount; struct completion comp; }; @@ -178,7 +178,7 @@ static struct mcast_group *mcast_insert(struct mcast_port *port, static void deref_port(struct mcast_port *port) { - if (atomic_dec_and_test(&port->refcount)) + if (refcount_dec_and_test(&port->refcount)) complete(&port->comp); } @@ -199,7 +199,7 @@ static void release_group(struct mcast_group *group) static void deref_member(struct mcast_member *member) { - if (atomic_dec_and_test(&member->refcount)) + if (refcount_dec_and_test(&member->refcount)) complete(&member->comp); } @@ -401,7 +401,7 @@ static void process_group_error(struct mcast_group *group) while (!list_empty(&group->active_list)) { member = list_entry(group->active_list.next, struct mcast_member, list); - atomic_inc(&member->refcount); + refcount_inc(&member->refcount); list_del_init(&member->list); adjust_membership(group, member->multicast.rec.join_state, -1); member->state = MCAST_ERROR; @@ -445,7 +445,7 @@ retest: struct mcast_member, list); multicast = &member->multicast; join_state = multicast->rec.join_state; - atomic_inc(&member->refcount); + 
refcount_inc(&member->refcount); if (join_state == (group->rec.join_state & join_state)) { status = cmp_rec(&group->rec, &multicast->rec, @@ -497,7 +497,7 @@ static void process_join_error(struct mcast_group *group, int status) member = list_entry(group->pending_list.next, struct mcast_member, list); if (group->last_join == member) { - atomic_inc(&member->refcount); + refcount_inc(&member->refcount); list_del_init(&member->list); spin_unlock_irq(&group->lock); ret = member->multicast.callback(status, &member->multicast); @@ -589,7 +589,7 @@ static struct mcast_group *acquire_group(struct mcast_port *port, kfree(group); group = cur_group; } else - atomic_inc(&port->refcount); + refcount_inc(&port->refcount); found: atomic_inc(&group->refcount); spin_unlock_irqrestore(&port->lock, flags); @@ -605,7 +605,7 @@ found: */ struct ib_sa_multicast * ib_sa_join_multicast(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, + struct ib_device *device, u32 port_num, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, gfp_t gfp_mask, int (*callback)(int status, @@ -632,7 +632,7 @@ ib_sa_join_multicast(struct ib_sa_client *client, member->multicast.callback = callback; member->multicast.context = context; init_completion(&member->comp); - atomic_set(&member->refcount, 1); + refcount_set(&member->refcount, 1); member->state = MCAST_JOINING; member->group = acquire_group(&dev->port[port_num - dev->start_port], @@ -690,7 +690,7 @@ void ib_sa_free_multicast(struct ib_sa_multicast *multicast) } EXPORT_SYMBOL(ib_sa_free_multicast); -int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num, +int ib_sa_get_mcmember_rec(struct ib_device *device, u32 port_num, union ib_gid *mgid, struct ib_sa_mcmember_rec *rec) { struct mcast_device *dev; @@ -721,6 +721,7 @@ EXPORT_SYMBOL(ib_sa_get_mcmember_rec); * member record and gid of the device. * @device: RDMA device * @port_num: Port of the rdma device to consider + * @rec: Multicast member record to use * @ndev: Optional netdevice, applicable only for RoCE * @gid_type: GID type to consider * @ah_attr: AH attribute to fillup on successful completion @@ -731,7 +732,7 @@ EXPORT_SYMBOL(ib_sa_get_mcmember_rec); * success or appropriate error code. 
* */ -int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, +int ib_init_ah_from_mcmember(struct ib_device *device, u32 port_num, struct ib_sa_mcmember_rec *rec, struct net_device *ndev, enum ib_gid_type gid_type, @@ -804,7 +805,6 @@ static void mcast_event_handler(struct ib_event_handler *handler, switch (event->event) { case IB_EVENT_PORT_ERR: case IB_EVENT_LID_CHANGE: - case IB_EVENT_SM_CHANGE: case IB_EVENT_CLIENT_REREGISTER: mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR); break; @@ -816,7 +816,7 @@ static void mcast_event_handler(struct ib_event_handler *handler, } } -static void mcast_add_one(struct ib_device *device) +static int mcast_add_one(struct ib_device *device) { struct mcast_device *dev; struct mcast_port *port; @@ -826,7 +826,7 @@ static void mcast_add_one(struct ib_device *device) dev = kmalloc(struct_size(dev, port, device->phys_port_cnt), GFP_KERNEL); if (!dev) - return; + return -ENOMEM; dev->start_port = rdma_start_port(device); dev->end_port = rdma_end_port(device); @@ -840,13 +840,13 @@ static void mcast_add_one(struct ib_device *device) spin_lock_init(&port->lock); port->table = RB_ROOT; init_completion(&port->comp); - atomic_set(&port->refcount, 1); + refcount_set(&port->refcount, 1); ++count; } if (!count) { kfree(dev); - return; + return -EOPNOTSUPP; } dev->device = device; @@ -854,6 +854,7 @@ static void mcast_add_one(struct ib_device *device) INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler); ib_register_event_handler(&dev->event_handler); + return 0; } static void mcast_remove_one(struct ib_device *device, void *client_data) @@ -862,9 +863,6 @@ static void mcast_remove_one(struct ib_device *device, void *client_data) struct mcast_port *port; int i; - if (!dev) - return; - ib_unregister_event_handler(&dev->event_handler); flush_workqueue(mcast_wq); diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 724f5a62e82f..def14c54b648 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -36,27 +36,31 @@ #include <linux/export.h> #include <net/netlink.h> #include <net/net_namespace.h> +#include <net/netns/generic.h> #include <net/sock.h> #include <rdma/rdma_netlink.h> #include <linux/module.h> #include "core_priv.h" -static DEFINE_MUTEX(rdma_nl_mutex); -static struct sock *nls; static struct { - const struct rdma_nl_cbs *cb_table; + const struct rdma_nl_cbs *cb_table; + /* Synchronizes between ongoing netlink commands and netlink client + * unregistration. + */ + struct rw_semaphore sem; } rdma_nl_types[RDMA_NL_NUM_CLIENTS]; bool rdma_nl_chk_listeners(unsigned int group) { - return netlink_has_listeners(nls, group); + struct rdma_dev_net *rnet = rdma_net_to_dev_net(&init_net); + + return netlink_has_listeners(rnet->nl_sock, group); } EXPORT_SYMBOL(rdma_nl_chk_listeners); static bool is_nl_msg_valid(unsigned int type, unsigned int op) { static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS] = { - [RDMA_NL_RDMA_CM] = RDMA_NL_RDMA_CM_NUM_OPS, [RDMA_NL_IWCM] = RDMA_NL_IWPM_NUM_OPS, [RDMA_NL_LS] = RDMA_NL_LS_NUM_OPS, [RDMA_NL_NLDEV] = RDMA_NLDEV_NUM_OPS, @@ -71,65 +75,56 @@ static bool is_nl_msg_valid(unsigned int type, unsigned int op) if (type >= RDMA_NL_NUM_CLIENTS) return false; - return (op < max_num_ops[type]) ? 
true : false; + return op < max_num_ops[type]; } -static bool is_nl_valid(unsigned int type, unsigned int op) +static const struct rdma_nl_cbs * +get_cb_table(const struct sk_buff *skb, unsigned int type, unsigned int op) { const struct rdma_nl_cbs *cb_table; - if (!is_nl_msg_valid(type, op)) - return false; + /* + * Currently only NLDEV client is supporting netlink commands in + * non init_net net namespace. + */ + if (sock_net(skb->sk) != &init_net && type != RDMA_NL_NLDEV) + return NULL; - if (!rdma_nl_types[type].cb_table) { - mutex_unlock(&rdma_nl_mutex); - request_module("rdma-netlink-subsys-%d", type); - mutex_lock(&rdma_nl_mutex); - } + cb_table = READ_ONCE(rdma_nl_types[type].cb_table); + if (!cb_table) { + /* + * Didn't get valid reference of the table, attempt module + * load once. + */ + up_read(&rdma_nl_types[type].sem); - cb_table = rdma_nl_types[type].cb_table; + request_module("rdma-netlink-subsys-%u", type); + down_read(&rdma_nl_types[type].sem); + cb_table = READ_ONCE(rdma_nl_types[type].cb_table); + } if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit)) - return false; - return true; + return NULL; + return cb_table; } void rdma_nl_register(unsigned int index, const struct rdma_nl_cbs cb_table[]) { - mutex_lock(&rdma_nl_mutex); - if (!is_nl_msg_valid(index, 0)) { - /* - * All clients are not interesting in success/failure of - * this call. They want to see the print to error log and - * continue their initialization. Print warning for them, - * because it is programmer's error to be here. - */ - mutex_unlock(&rdma_nl_mutex); - WARN(true, - "The not-valid %u index was supplied to RDMA netlink\n", - index); + if (WARN_ON(!is_nl_msg_valid(index, 0)) || + WARN_ON(READ_ONCE(rdma_nl_types[index].cb_table))) return; - } - if (rdma_nl_types[index].cb_table) { - mutex_unlock(&rdma_nl_mutex); - WARN(true, - "The %u index is already registered in RDMA netlink\n", - index); - return; - } - - rdma_nl_types[index].cb_table = cb_table; - mutex_unlock(&rdma_nl_mutex); + /* Pairs with the READ_ONCE in is_nl_valid() */ + smp_store_release(&rdma_nl_types[index].cb_table, cb_table); } EXPORT_SYMBOL(rdma_nl_register); void rdma_nl_unregister(unsigned int index) { - mutex_lock(&rdma_nl_mutex); + down_write(&rdma_nl_types[index].sem); rdma_nl_types[index].cb_table = NULL; - mutex_unlock(&rdma_nl_mutex); + up_write(&rdma_nl_types[index].sem); } EXPORT_SYMBOL(rdma_nl_unregister); @@ -161,15 +156,21 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, unsigned int index = RDMA_NL_GET_CLIENT(type); unsigned int op = RDMA_NL_GET_OP(type); const struct rdma_nl_cbs *cb_table; + int err = -EINVAL; - if (!is_nl_valid(index, op)) + if (!is_nl_msg_valid(index, op)) return -EINVAL; - cb_table = rdma_nl_types[index].cb_table; + down_read(&rdma_nl_types[index].sem); + cb_table = get_cb_table(skb, index, op); + if (!cb_table) + goto done; if ((cb_table[op].flags & RDMA_NL_ADMIN_PERM) && - !netlink_capable(skb, CAP_NET_ADMIN)) - return -EPERM; + !netlink_capable(skb, CAP_NET_ADMIN)) { + err = -EPERM; + goto done; + } /* * LS responses overload the 0x100 (NLM_F_ROOT) flag. 
Don't @@ -177,24 +178,24 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, */ if (index == RDMA_NL_LS) { if (cb_table[op].doit) - return cb_table[op].doit(skb, nlh, extack); - return -EINVAL; + err = cb_table[op].doit(skb, nlh, extack); + goto done; } /* FIXME: Convert IWCM to properly handle doit callbacks */ - if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_RDMA_CM || - index == RDMA_NL_IWCM) { + if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_IWCM) { struct netlink_dump_control c = { .dump = cb_table[op].dump, }; if (c.dump) - return netlink_dump_start(nls, skb, nlh, &c); - return -EINVAL; + err = netlink_dump_start(skb->sk, skb, nlh, &c); + goto done; } if (cb_table[op].doit) - return cb_table[op].doit(skb, nlh, extack); - - return 0; + err = cb_table[op].doit(skb, nlh, extack); +done: + up_read(&rdma_nl_types[index].sem); + return err; } /* @@ -255,57 +256,77 @@ skip: static void rdma_nl_rcv(struct sk_buff *skb) { - mutex_lock(&rdma_nl_mutex); rdma_nl_rcv_skb(skb, &rdma_nl_rcv_msg); - mutex_unlock(&rdma_nl_mutex); } -int rdma_nl_unicast(struct sk_buff *skb, u32 pid) +int rdma_nl_unicast(struct net *net, struct sk_buff *skb, u32 pid) { + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); int err; - err = netlink_unicast(nls, skb, pid, MSG_DONTWAIT); + err = netlink_unicast(rnet->nl_sock, skb, pid, MSG_DONTWAIT); return (err < 0) ? err : 0; } EXPORT_SYMBOL(rdma_nl_unicast); -int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid) +int rdma_nl_unicast_wait(struct net *net, struct sk_buff *skb, __u32 pid) { + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); int err; - err = netlink_unicast(nls, skb, pid, 0); + err = netlink_unicast(rnet->nl_sock, skb, pid, 0); return (err < 0) ? err : 0; } EXPORT_SYMBOL(rdma_nl_unicast_wait); -int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags) +int rdma_nl_multicast(struct net *net, struct sk_buff *skb, + unsigned int group, gfp_t flags) { - return nlmsg_multicast(nls, skb, 0, group, flags); + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); + + return nlmsg_multicast(rnet->nl_sock, skb, 0, group, flags); } EXPORT_SYMBOL(rdma_nl_multicast); -int __init rdma_nl_init(void) +void rdma_nl_init(void) { + int idx; + + for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) + init_rwsem(&rdma_nl_types[idx].sem); +} + +void rdma_nl_exit(void) +{ + int idx; + + for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) + WARN(rdma_nl_types[idx].cb_table, + "Netlink client %d wasn't released prior to unloading %s\n", + idx, KBUILD_MODNAME); +} + +int rdma_nl_net_init(struct rdma_dev_net *rnet) +{ + struct net *net = read_pnet(&rnet->net); struct netlink_kernel_cfg cfg = { .input = rdma_nl_rcv, + .flags = NL_CFG_F_NONROOT_RECV, }; + struct sock *nls; - nls = netlink_kernel_create(&init_net, NETLINK_RDMA, &cfg); + nls = netlink_kernel_create(net, NETLINK_RDMA, &cfg); if (!nls) return -ENOMEM; nls->sk_sndtimeo = 10 * HZ; + rnet->nl_sock = nls; return 0; } -void rdma_nl_exit(void) +void rdma_nl_net_exit(struct rdma_dev_net *rnet) { - int idx; - - for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) - rdma_nl_unregister(idx); - - netlink_kernel_release(nls); + netlink_kernel_release(rnet->nl_sock); } MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_RDMA); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 3c97a8b6bf1e..2220a2dfab24 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -33,80 +33,145 @@ #include <linux/module.h> #include <linux/pid.h> 
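[Editorial note, not part of the patch] The netlink.c rework above replaces the global rdma_nl_mutex with a per-client rw_semaphore and publishes cb_table via smp_store_release()/READ_ONCE(), so unregistration can wait out in-flight dispatch without serializing all clients; the socket itself also becomes per-net in rdma_nl_net_init(). A minimal, hypothetical model of just the publish/lookup pattern (names invented, kernel headers assumed):

#include <linux/rwsem.h>
#include <linux/atomic.h>

struct client_slot {
	struct rw_semaphore sem;	/* held for read across dispatch */
	const void *table;		/* published callback table */
};

static void slot_init(struct client_slot *slot)
{
	init_rwsem(&slot->sem);
	slot->table = NULL;
}

static void slot_register(struct client_slot *slot, const void *table)
{
	/* Pairs with the READ_ONCE() in slot_dispatch_begin(). */
	smp_store_release(&slot->table, table);
}

static void slot_unregister(struct client_slot *slot)
{
	/* Writer side: wait out in-flight dispatchers, then unpublish. */
	down_write(&slot->sem);
	slot->table = NULL;
	up_write(&slot->sem);
}

static const void *slot_dispatch_begin(struct client_slot *slot)
{
	down_read(&slot->sem);
	return READ_ONCE(slot->table);	/* NULL if nothing registered */
}

static void slot_dispatch_end(struct client_slot *slot)
{
	up_read(&slot->sem);
}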
#include <linux/pid_namespace.h> +#include <linux/mutex.h> #include <net/netlink.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_netlink.h> #include "core_priv.h" #include "cma_priv.h" +#include "restrack.h" +#include "uverbs.h" +/* + * This determines whether a non-privileged user is allowed to specify a + * controlled QKEY or not, when true non-privileged user is allowed to specify + * a controlled QKEY. + */ +static bool privileged_qkey; + +typedef int (*res_fill_func_t)(struct sk_buff*, bool, + struct rdma_restrack_entry*, uint32_t); + +/* + * Sort array elements by the netlink attribute name + */ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { - [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, - .len = IB_DEVICE_NAME_MAX - 1}, - [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, - .len = IB_FW_VERSION_NAME_MAX - 1}, - [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_RES_SUMMARY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME] = { .type = NLA_NUL_STRING, - .len = 16 }, - [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_RES_QP] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_QP_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_LQPN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_RQPN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_RQ_PSN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_SQ_PSN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_RES_STATE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, - .len = TASK_COMM_LEN }, + [RDMA_NLDEV_ATTR_CHARDEV] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_CHARDEV_ABI] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_CHARDEV_NAME] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_CHARDEV_TYPE] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE }, + [RDMA_NLDEV_ATTR_DEV_DIM] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, + .len = IB_DEVICE_NAME_MAX }, + [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DEV_PROTOCOL] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_DRIVER] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_DRIVER_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DRIVER_STRING] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_DRIVER_S32] = { .type = NLA_S32 }, + [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, + [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = 
NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ }, + [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_NDEV_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_NDEV_NAME] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ }, + [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_CM_ID] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_SRC_ADDR] = { - .len = sizeof(struct __kernel_sockaddr_storage) }, - [RDMA_NLDEV_ATTR_RES_DST_ADDR] = { - .len = sizeof(struct __kernel_sockaddr_storage) }, [RDMA_NLDEV_ATTR_RES_CQ] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_CQ_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_CQE] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_RES_POLL_CTX] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_RES_MR] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_MR_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_RKEY] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_LKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CQ_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CTX] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CTX_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_DST_ADDR] = { + .len = sizeof(struct __kernel_sockaddr_storage) }, [RDMA_NLDEV_ATTR_RES_IOVA] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_RES_LKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_LQPN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_MR] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_MRLEN] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_MR_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_PD] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_PDN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_PD_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_NDEV_INDEX] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_NDEV_NAME] = { .type = NLA_NUL_STRING, - .len = IFNAMSIZ }, - [RDMA_NLDEV_ATTR_DRIVER] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_DRIVER_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_DRIVER_STRING] = { .type = NLA_NUL_STRING, - .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, - [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_DRIVER_S32] = { .type = NLA_S32 }, - [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, - [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_POLL_CTX] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_QP] = { .type = NLA_NESTED }, + 
[RDMA_NLDEV_ATTR_RES_QP_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_RAW] = { .type = NLA_BINARY }, + [RDMA_NLDEV_ATTR_RES_RKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_RQPN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_RQ_PSN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_SQ_PSN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_SRC_ADDR] = { + .len = sizeof(struct __kernel_sockaddr_storage) }, + [RDMA_NLDEV_ATTR_RES_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_SUMMARY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR]= { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_SUBTYPE] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_SRQ] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SRQN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_SRQ_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_MIN_RANGE] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_MAX_RANGE] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_MODE] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_RES] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_COUNTER] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_COUNTER_ID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTERS] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME] = { .type = NLA_NUL_STRING }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID] = { .type = NLA_U32 }, + [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 }, + [RDMA_NLDEV_SYS_ATTR_NETNS_MODE] = { .type = NLA_U8 }, + [RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC] = { .type = NLA_U8 }, + [RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DRIVER_DETAILS] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DEV_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_PARENT_NAME] = { .type = NLA_NUL_STRING }, + [RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_EVENT_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED] = { .type = NLA_U8 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -146,6 +211,19 @@ static int _rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name, return 0; } +int rdma_nl_put_driver_string(struct sk_buff *msg, const char *name, + const char *str) +{ + if (put_driver_name_print_type(msg, name, + RDMA_NLDEV_PRINT_TYPE_UNSPEC)) + return -EMSGSIZE; + if (nla_put_string(msg, RDMA_NLDEV_ATTR_DRIVER_STRING, str)) + return -EMSGSIZE; + + return 0; +} +EXPORT_SYMBOL(rdma_nl_put_driver_string); + int rdma_nl_put_driver_u32(struct sk_buff *msg, const char *name, u32 value) { return _rdma_nl_put_driver_u32(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC, @@ -175,6 +253,12 
@@ int rdma_nl_put_driver_u64_hex(struct sk_buff *msg, const char *name, u64 value) } EXPORT_SYMBOL(rdma_nl_put_driver_u64_hex); +bool rdma_nl_get_privileged_qkey(void) +{ + return privileged_qkey; +} +EXPORT_SYMBOL(rdma_nl_get_privileged_qkey); + static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) @@ -189,6 +273,8 @@ static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) { char fw[IB_FW_VERSION_NAME_MAX]; + int ret = 0; + u32 port; if (fill_nldev_handle(msg, device)) return -EMSGSIZE; @@ -217,7 +303,40 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type)) return -EMSGSIZE; - return 0; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, device->use_cq_dim)) + return -EMSGSIZE; + + if (device->type && + nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_TYPE, device->type)) + return -EMSGSIZE; + + if (device->parent && + nla_put_string(msg, RDMA_NLDEV_ATTR_PARENT_NAME, + dev_name(&device->parent->dev))) + return -EMSGSIZE; + + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE, + device->name_assign_type)) + return -EMSGSIZE; + + /* + * The link type is determined from the first port. An mlx4 device, + * which can potentially have two different link types on the same + * IB device, is considered something better avoided in the future. + */ + port = rdma_start_port(device); + if (rdma_cap_opa_mad(device, port)) + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "opa"); + else if (rdma_protocol_ib(device, port)) + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "ib"); + else if (rdma_protocol_iwarp(device, port)) + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "iw"); + else if (rdma_protocol_roce(device, port)) + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "roce"); + else if (rdma_protocol_usnic(device, port)) + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, + "usnic"); + return ret; } static int fill_port_info(struct sk_buff *msg, @@ -262,9 +381,7 @@ static int fill_port_info(struct sk_buff *msg, if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state)) return -EMSGSIZE; - if (device->ops.get_netdev) - netdev = device->ops.get_netdev(device, port); - + netdev = ib_device_get_netdev(device, port); if (netdev && net_eq(dev_net(netdev), net)) { ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); @@ -275,8 +392,7 @@ static int fill_port_info(struct sk_buff *msg, } out: - if (netdev) - dev_put(netdev); + dev_put(netdev); return ret; } @@ -285,7 +401,8 @@ static int fill_res_info_entry(struct sk_buff *msg, { struct nlattr *entry_attr; - entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY); + entry_attr = nla_nest_start_noflag(msg, + RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY); if (!entry_attr) return -EMSGSIZE; @@ -303,7 +420,8 @@ err: return -EMSGSIZE; } -static int fill_res_info(struct sk_buff *msg, struct ib_device *device) +static int fill_res_info(struct sk_buff *msg, struct ib_device *device, + bool show_details) { static const char * const names[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_PD] = "pd", @@ -312,23 +430,23 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device) [RDMA_RESTRACK_CM_ID] = "cm_id", [RDMA_RESTRACK_MR] = "mr", [RDMA_RESTRACK_CTX] = "ctx", + [RDMA_RESTRACK_SRQ] = "srq", }; - struct rdma_restrack_root *res = 
&device->res; struct nlattr *table_attr; int ret, i, curr; if (fill_nldev_handle(msg, device)) return -EMSGSIZE; - table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY); + table_attr = nla_nest_start_noflag(msg, RDMA_NLDEV_ATTR_RES_SUMMARY); if (!table_attr) return -EMSGSIZE; for (i = 0; i < RDMA_RESTRACK_MAX; i++) { if (!names[i]) continue; - curr = rdma_restrack_count(res, i, task_active_pid_ns(current)); + curr = rdma_restrack_count(device, i, show_details); ret = fill_res_info_entry(msg, names[i], curr); if (ret) goto err; @@ -345,29 +463,42 @@ err: static int fill_res_name_pid(struct sk_buff *msg, struct rdma_restrack_entry *res) { + int err = 0; + /* * For user resources, user space should read /proc/PID/comm to get * the name of the task. */ if (rdma_is_kernel_res(res)) { - if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME, - res->kern_name)) - return -EMSGSIZE; + err = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME, + res->kern_name); } else { - if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID, - task_pid_vnr(res->task))) - return -EMSGSIZE; + pid_t pid; + + pid = task_pid_vnr(res->task); + /* + * Task is dead and in zombie state. + * There is no need to print PID anymore. + */ + if (pid) + /* + * This part is racy, task can be killed and PID will + * be zero right here but it is ok, next query won't + * return PID. We don't promise real-time reflection + * of SW objects. + */ + err = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID, pid); } - return 0; + + return err ? -EMSGSIZE : 0; } -static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb, - struct rdma_restrack_entry *res, uint32_t port) +static int fill_res_qp_entry_query(struct sk_buff *msg, + struct rdma_restrack_entry *res, + struct ib_device *dev, + struct ib_qp *qp) { - struct ib_qp *qp = container_of(res, struct ib_qp, res); - struct rdma_restrack_root *resroot = &qp->device->res; struct ib_qp_init_attr qp_init_attr; - struct nlattr *entry_attr; struct ib_qp_attr qp_attr; int ret; @@ -375,20 +506,6 @@ static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb, if (ret) return ret; - if (port && port != qp_attr.port_num) - return 0; - - entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); - if (!entry_attr) - goto out; - - /* In create_qp() port is not set yet */ - if (qp_attr.port_num && - nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp_attr.port_num)) - goto err; - - if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num)) - goto err; if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQPN, qp_attr.dest_qp_num)) @@ -412,37 +529,65 @@ static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb, if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state)) goto err; - if (fill_res_name_pid(msg, res)) - goto err; + if (dev->ops.fill_res_qp_entry) + return dev->ops.fill_res_qp_entry(msg, qp); + return 0; - if (resroot->fill_res_entry(msg, res)) - goto err; +err: return -EMSGSIZE; +} - nla_nest_end(msg, entry_attr); - return 0; +static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_qp *qp = container_of(res, struct ib_qp, res); + struct ib_device *dev = qp->device; + int ret; -err: - nla_nest_cancel(msg, entry_attr); -out: - return -EMSGSIZE; + if (port && port != qp->port) + return -EAGAIN; + + /* In create_qp() port is not set yet */ + if (qp->port && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp->port)) + 
return -EMSGSIZE; + + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num); + if (ret) + return -EMSGSIZE; + + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id)) + return -EMSGSIZE; + + ret = fill_res_name_pid(msg, res); + if (ret) + return -EMSGSIZE; + + return fill_res_qp_entry_query(msg, res, dev, qp); } -static int fill_res_cm_id_entry(struct sk_buff *msg, - struct netlink_callback *cb, +static int fill_res_qp_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_qp *qp = container_of(res, struct ib_qp, res); + struct ib_device *dev = qp->device; + + if (port && port != qp->port) + return -EAGAIN; + if (!dev->ops.fill_res_qp_entry_raw) + return -EINVAL; + return dev->ops.fill_res_qp_entry_raw(msg, qp); +} + +static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct rdma_id_private *id_priv = container_of(res, struct rdma_id_private, res); - struct rdma_restrack_root *resroot = &id_priv->id.device->res; + struct ib_device *dev = id_priv->id.device; struct rdma_cm_id *cm_id = &id_priv->id; - struct nlattr *entry_attr; if (port && port != cm_id->port_num) - return 0; - - entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY); - if (!entry_attr) - goto out; + return -EAGAIN; if (cm_id->port_num && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num)) @@ -472,107 +617,113 @@ static int fill_res_cm_id_entry(struct sk_buff *msg, &cm_id->route.addr.dst_addr)) goto err; - if (fill_res_name_pid(msg, res)) + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CM_IDN, res->id)) goto err; - if (resroot->fill_res_entry(msg, res)) + if (fill_res_name_pid(msg, res)) goto err; - nla_nest_end(msg, entry_attr); + if (dev->ops.fill_res_cm_id_entry) + return dev->ops.fill_res_cm_id_entry(msg, cm_id); return 0; -err: - nla_nest_cancel(msg, entry_attr); -out: - return -EMSGSIZE; +err: return -EMSGSIZE; } -static int fill_res_cq_entry(struct sk_buff *msg, struct netlink_callback *cb, +static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_cq *cq = container_of(res, struct ib_cq, res); - struct rdma_restrack_root *resroot = &cq->device->res; - struct nlattr *entry_attr; - - entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CQ_ENTRY); - if (!entry_attr) - goto out; + struct ib_device *dev = cq->device; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe)) - goto err; + return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, atomic_read(&cq->usecnt), RDMA_NLDEV_ATTR_PAD)) - goto err; + return -EMSGSIZE; /* Poll context is only valid for kernel CQs */ if (rdma_is_kernel_res(res) && nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx)) - goto err; + return -EMSGSIZE; + + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, (cq->dim != NULL))) + return -EMSGSIZE; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id)) + return -EMSGSIZE; + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, + cq->uobject->uevent.uobject.context->res.id)) + return -EMSGSIZE; if (fill_res_name_pid(msg, res)) - goto err; + return -EMSGSIZE; - if (resroot->fill_res_entry(msg, res)) - goto err; + return (dev->ops.fill_res_cq_entry) ? 
+ dev->ops.fill_res_cq_entry(msg, cq) : 0; +} - nla_nest_end(msg, entry_attr); - return 0; +static int fill_res_cq_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_cq *cq = container_of(res, struct ib_cq, res); + struct ib_device *dev = cq->device; -err: - nla_nest_cancel(msg, entry_attr); -out: - return -EMSGSIZE; + if (!dev->ops.fill_res_cq_entry_raw) + return -EINVAL; + return dev->ops.fill_res_cq_entry_raw(msg, cq); } -static int fill_res_mr_entry(struct sk_buff *msg, struct netlink_callback *cb, +static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_mr *mr = container_of(res, struct ib_mr, res); - struct rdma_restrack_root *resroot = &mr->pd->device->res; - struct nlattr *entry_attr; - - entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_MR_ENTRY); - if (!entry_attr) - goto out; + struct ib_device *dev = mr->pd->device; - if (netlink_capable(cb->skb, CAP_NET_ADMIN)) { + if (has_cap_net_admin) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RKEY, mr->rkey)) - goto err; + return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey)) - goto err; + return -EMSGSIZE; } if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_MRLEN, mr->length, RDMA_NLDEV_ATTR_PAD)) - goto err; + return -EMSGSIZE; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) + return -EMSGSIZE; + + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id)) + return -EMSGSIZE; if (fill_res_name_pid(msg, res)) - goto err; + return -EMSGSIZE; - if (resroot->fill_res_entry(msg, res)) - goto err; + return (dev->ops.fill_res_mr_entry) ? + dev->ops.fill_res_mr_entry(msg, mr) : + 0; +} - nla_nest_end(msg, entry_attr); - return 0; +static int fill_res_mr_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_mr *mr = container_of(res, struct ib_mr, res); + struct ib_device *dev = mr->pd->device; -err: - nla_nest_cancel(msg, entry_attr); -out: - return -EMSGSIZE; + if (!dev->ops.fill_res_mr_entry_raw) + return -EINVAL; + return dev->ops.fill_res_mr_entry_raw(msg, mr); } -static int fill_res_pd_entry(struct sk_buff *msg, struct netlink_callback *cb, +static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_pd *pd = container_of(res, struct ib_pd, res); - struct rdma_restrack_root *resroot = &pd->device->res; - struct nlattr *entry_attr; - - entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_PD_ENTRY); - if (!entry_attr) - goto out; - if (netlink_capable(cb->skb, CAP_NET_ADMIN)) { + if (has_cap_net_admin) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, pd->local_dma_lkey)) goto err; @@ -585,10 +736,256 @@ static int fill_res_pd_entry(struct sk_buff *msg, struct netlink_callback *cb, atomic_read(&pd->usecnt), RDMA_NLDEV_ATTR_PAD)) goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id)) + goto err; + + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, + pd->uobject->context->res.id)) + goto err; + + return fill_res_name_pid(msg, res); + +err: return -EMSGSIZE; +} + +static int fill_res_ctx_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_ucontext *ctx = container_of(res, struct ib_ucontext, res); + + if (rdma_is_kernel_res(res)) + return 0; + + if (nla_put_u32(msg, 
RDMA_NLDEV_ATTR_RES_CTXN, ctx->res.id)) + return -EMSGSIZE; + + return fill_res_name_pid(msg, res); +} + +static int fill_res_range_qp_entry(struct sk_buff *msg, uint32_t min_range, + uint32_t max_range) +{ + struct nlattr *entry_attr; + + if (!min_range) + return 0; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (min_range == max_range) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, min_range)) + goto err; + } else { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_MIN_RANGE, min_range)) + goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_MAX_RANGE, max_range)) + goto err; + } + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); + return -EMSGSIZE; +} + +static int fill_res_srq_qps(struct sk_buff *msg, struct ib_srq *srq) +{ + uint32_t min_range = 0, prev = 0; + struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; + struct nlattr *table_attr; + struct ib_qp *qp = NULL; + unsigned long id = 0; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP); + if (!table_attr) + return -EMSGSIZE; + + rt = &srq->device->res[RDMA_RESTRACK_QP]; + xa_lock(&rt->xa); + xa_for_each(&rt->xa, id, res) { + if (!rdma_restrack_get(res)) + continue; + + qp = container_of(res, struct ib_qp, res); + if (!qp->srq || (qp->srq->res.id != srq->res.id)) { + rdma_restrack_put(res); + continue; + } + + if (qp->qp_num < prev) + /* qp_num should be ascending */ + goto err_loop; + + if (min_range == 0) { + min_range = qp->qp_num; + } else if (qp->qp_num > (prev + 1)) { + if (fill_res_range_qp_entry(msg, min_range, prev)) + goto err_loop; + + min_range = qp->qp_num; + } + prev = qp->qp_num; + rdma_restrack_put(res); + } + + xa_unlock(&rt->xa); + + if (fill_res_range_qp_entry(msg, min_range, prev)) + goto err; + + nla_nest_end(msg, table_attr); + return 0; + +err_loop: + rdma_restrack_put(res); + xa_unlock(&rt->xa); +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int fill_res_srq_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_srq *srq = container_of(res, struct ib_srq, res); + struct ib_device *dev = srq->device; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SRQN, srq->res.id)) + goto err; + + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, srq->srq_type)) + goto err; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, srq->pd->res.id)) + goto err; + + if (ib_srq_has_cq(srq->srq_type)) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, + srq->ext.cq->res.id)) + goto err; + } + + if (fill_res_srq_qps(msg, srq)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; - if (resroot->fill_res_entry(msg, res)) + if (dev->ops.fill_res_srq_entry) + return dev->ops.fill_res_srq_entry(msg, srq); + + return 0; + +err: + return -EMSGSIZE; +} + +static int fill_res_srq_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_srq *srq = container_of(res, struct ib_srq, res); + struct ib_device *dev = srq->device; + + if (!dev->ops.fill_res_srq_entry_raw) + return -EINVAL; + return dev->ops.fill_res_srq_entry_raw(msg, srq); +} + +static int fill_stat_counter_mode(struct sk_buff *msg, + struct rdma_counter *counter) +{ + struct rdma_counter_mode *m = &counter->mode; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, m->mode)) + return -EMSGSIZE; + + if (m->mode == RDMA_COUNTER_MODE_AUTO) { + if ((m->mask & RDMA_COUNTER_MASK_QP_TYPE) && + nla_put_u8(msg, 
RDMA_NLDEV_ATTR_RES_TYPE, m->param.qp_type)) + return -EMSGSIZE; + + if ((m->mask & RDMA_COUNTER_MASK_PID) && + fill_res_name_pid(msg, &counter->res)) + return -EMSGSIZE; + } + + return 0; +} + +static int fill_stat_counter_qp_entry(struct sk_buff *msg, u32 qpn) +{ + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); + return -EMSGSIZE; +} + +static int fill_stat_counter_qps(struct sk_buff *msg, + struct rdma_counter *counter) +{ + struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; + struct nlattr *table_attr; + struct ib_qp *qp = NULL; + unsigned long id = 0; + int ret = 0; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP); + if (!table_attr) + return -EMSGSIZE; + + rt = &counter->device->res[RDMA_RESTRACK_QP]; + xa_lock(&rt->xa); + xa_for_each(&rt->xa, id, res) { + qp = container_of(res, struct ib_qp, res); + if (!qp->counter || (qp->counter->id != counter->id)) + continue; + + ret = fill_stat_counter_qp_entry(msg, qp->qp_num); + if (ret) + goto err; + } + + xa_unlock(&rt->xa); + nla_nest_end(msg, table_attr); + return 0; + +err: + xa_unlock(&rt->xa); + nla_nest_cancel(msg, table_attr); + return ret; +} + +int rdma_nl_stat_hwcounter_entry(struct sk_buff *msg, const char *name, + u64 value) +{ + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_string(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, + name)) + goto err; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE, + value, RDMA_NLDEV_ATTR_PAD)) goto err; nla_nest_end(msg, entry_attr); @@ -596,9 +993,79 @@ static int fill_res_pd_entry(struct sk_buff *msg, struct netlink_callback *cb, err: nla_nest_cancel(msg, entry_attr); -out: return -EMSGSIZE; } +EXPORT_SYMBOL(rdma_nl_stat_hwcounter_entry); + +static int fill_stat_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_mr *mr = container_of(res, struct ib_mr, res); + struct ib_device *dev = mr->pd->device; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) + goto err; + + if (dev->ops.fill_stat_mr_entry) + return dev->ops.fill_stat_mr_entry(msg, mr); + return 0; + +err: + return -EMSGSIZE; +} + +static int fill_stat_counter_hwcounters(struct sk_buff *msg, + struct rdma_counter *counter) +{ + struct rdma_hw_stats *st = counter->stats; + struct nlattr *table_attr; + int i; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); + if (!table_attr) + return -EMSGSIZE; + + mutex_lock(&st->lock); + for (i = 0; i < st->num_counters; i++) { + if (test_bit(i, st->is_disabled)) + continue; + if (rdma_nl_stat_hwcounter_entry(msg, st->descs[i].name, + st->value[i])) + goto err; + } + mutex_unlock(&st->lock); + + nla_nest_end(msg, table_attr); + return 0; + +err: + mutex_unlock(&st->lock); + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int fill_res_counter_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, + uint32_t port) +{ + struct rdma_counter *counter = + container_of(res, struct rdma_counter, res); + + if (port && port != counter->port) + return -EAGAIN; + + /* Dump it even if the query failed */ + rdma_counter_query_stats(counter); + + if (nla_put_u32(msg, 
RDMA_NLDEV_ATTR_PORT_INDEX, counter->port) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, counter->id) || + fill_stat_counter_mode(msg, counter) || + fill_stat_counter_qps(msg, counter) || + fill_stat_counter_hwcounters(msg, counter)) + return -EMSGSIZE; + + return 0; +} static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) @@ -609,14 +1076,14 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, u32 index; int err; - err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); - device = ib_device_get_by_index(index); + device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; @@ -629,6 +1096,10 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, 0); + if (!nlh) { + err = -EMSGSIZE; + goto err_free; + } err = fill_dev_info(msg, device); if (err) @@ -637,7 +1108,7 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); @@ -654,25 +1125,48 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, u32 index; int err; - err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, - extack); + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); - device = ib_device_get_by_index(index); + device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; if (tb[RDMA_NLDEV_ATTR_DEV_NAME]) { char name[IB_DEVICE_NAME_MAX] = {}; - nla_strlcpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + nla_strscpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], IB_DEVICE_NAME_MAX); + if (strlen(name) == 0) { + err = -EINVAL; + goto done; + } err = ib_device_rename(device, name); + goto done; + } + + if (tb[RDMA_NLDEV_NET_NS_FD]) { + u32 ns_fd; + + ns_fd = nla_get_u32(tb[RDMA_NLDEV_NET_NS_FD]); + err = ib_device_set_netns_put(skb, device, ns_fd); + goto put_done; + } + + if (tb[RDMA_NLDEV_ATTR_DEV_DIM]) { + u8 use_dim; + + use_dim = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_DIM]); + err = ib_device_set_dim(device, use_dim); + goto done; } +done: ib_device_put(device); +put_done: return err; } @@ -691,7 +1185,7 @@ static int _nldev_get_dumpit(struct ib_device *device, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, NLM_F_MULTI); - if (fill_dev_info(skb, device)) { + if (!nlh || fill_dev_info(skb, device)) { nlmsg_cancel(skb, nlh); goto out; } @@ -708,7 +1202,7 @@ static int nldev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { /* * There is no need to take lock, because - * we are relying on ib_core's lists_rwsem + * we are relying on ib_core's locking. 
*/ return ib_enum_all_devs(_nldev_get_dumpit, skb, cb); } @@ -723,15 +1217,15 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, u32 port; int err; - err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); - device = ib_device_get_by_index(index); + device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; @@ -750,6 +1244,10 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, 0); + if (!nlh) { + err = -EMSGSIZE; + goto err_free; + } err = fill_port_info(msg, device, port, sock_net(skb->sk)); if (err) @@ -758,7 +1256,7 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); @@ -777,19 +1275,19 @@ static int nldev_port_get_dumpit(struct sk_buff *skb, u32 idx = 0; u32 ifindex; int err; - u32 p; + unsigned int p; - err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, NULL); + err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, NULL); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); - device = ib_device_get_by_index(ifindex); + device = ib_device_get_by_index(sock_net(skb->sk), ifindex); if (!device) return -EINVAL; - for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) { + rdma_for_each_port (device, p) { /* * The dumpit function returns all information from specific * index. 
This specific index is taken from the netlink @@ -811,7 +1309,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb, RDMA_NLDEV_CMD_PORT_GET), 0, NLM_F_MULTI); - if (fill_port_info(skb, device, p, sock_net(skb->sk))) { + if (!nlh || fill_port_info(skb, device, p, sock_net(skb->sk))) { nlmsg_cancel(skb, nlh); goto out; } @@ -829,21 +1327,25 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + bool show_details = false; struct ib_device *device; struct sk_buff *msg; u32 index; int ret; - ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); - device = ib_device_get_by_index(index); + device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; + if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]) + show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; @@ -853,14 +1355,18 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), 0, 0); + if (!nlh) { + ret = -EMSGSIZE; + goto err_free; + } - ret = fill_res_info(msg, device); + ret = fill_res_info(msg, device, show_details); if (ret) goto err_free; nlmsg_end(msg, nlh); ib_device_put(device); - return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); err_free: nlmsg_free(msg); @@ -884,11 +1390,10 @@ static int _nldev_res_get_dumpit(struct ib_device *device, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), 0, NLM_F_MULTI); - if (fill_res_info(skb, device)) { + if (!nlh || fill_res_info(skb, device, false)) { nlmsg_cancel(skb, nlh); goto out; } - nlmsg_end(skb, nlh); idx++; @@ -905,57 +1410,170 @@ static int nldev_res_get_dumpit(struct sk_buff *skb, } struct nldev_fill_res_entry { - int (*fill_res_func)(struct sk_buff *msg, struct netlink_callback *cb, - struct rdma_restrack_entry *res, u32 port); enum rdma_nldev_attr nldev_attr; - enum rdma_nldev_command nldev_cmd; + u8 flags; + u32 entry; + u32 id; +}; + +enum nldev_res_flags { + NLDEV_PER_DEV = 1 << 0, }; static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_QP] = { - .fill_res_func = fill_res_qp_entry, - .nldev_cmd = RDMA_NLDEV_CMD_RES_QP_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_QP, + .entry = RDMA_NLDEV_ATTR_RES_QP_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_LQPN, }, [RDMA_RESTRACK_CM_ID] = { - .fill_res_func = fill_res_cm_id_entry, - .nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID, + .entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_CM_IDN, }, [RDMA_RESTRACK_CQ] = { - .fill_res_func = fill_res_cq_entry, - .nldev_cmd = RDMA_NLDEV_CMD_RES_CQ_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_CQ, + .flags = NLDEV_PER_DEV, + .entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_CQN, }, [RDMA_RESTRACK_MR] = { - .fill_res_func = fill_res_mr_entry, - .nldev_cmd = RDMA_NLDEV_CMD_RES_MR_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_MR, + .flags = NLDEV_PER_DEV, + .entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_MRN, }, [RDMA_RESTRACK_PD] = { - .fill_res_func = 
fill_res_pd_entry, - .nldev_cmd = RDMA_NLDEV_CMD_RES_PD_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_PD, + .flags = NLDEV_PER_DEV, + .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_PDN, }, + [RDMA_RESTRACK_COUNTER] = { + .nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER, + .entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY, + .id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID, + }, + [RDMA_RESTRACK_CTX] = { + .nldev_attr = RDMA_NLDEV_ATTR_RES_CTX, + .flags = NLDEV_PER_DEV, + .entry = RDMA_NLDEV_ATTR_RES_CTX_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_CTXN, + }, + [RDMA_RESTRACK_SRQ] = { + .nldev_attr = RDMA_NLDEV_ATTR_RES_SRQ, + .flags = NLDEV_PER_DEV, + .entry = RDMA_NLDEV_ATTR_RES_SRQ_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_SRQN, + }, + }; +static noinline_for_stack int +res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + enum rdma_restrack_type res_type, + res_fill_func_t fill_func) +{ + const struct nldev_fill_res_entry *fe = &fill_entries[res_type]; + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct rdma_restrack_entry *res; + struct ib_device *device; + u32 index, id, port = 0; + bool has_cap_net_admin; + struct sk_buff *msg; + int ret; + + ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !fe->id || !tb[fe->id]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + } + + if ((port && fe->flags & NLDEV_PER_DEV) || + (!port && ~fe->flags & NLDEV_PER_DEV)) { + ret = -EINVAL; + goto err; + } + + id = nla_get_u32(tb[fe->id]); + res = rdma_restrack_get_byid(device, res_type, id); + if (IS_ERR(res)) { + ret = PTR_ERR(res); + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err_get; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NL_GET_OP(nlh->nlmsg_type)), + 0, 0); + + if (!nlh || fill_nldev_handle(msg, device)) { + ret = -EMSGSIZE; + goto err_free; + } + + has_cap_net_admin = netlink_capable(skb, CAP_NET_ADMIN); + + ret = fill_func(msg, has_cap_net_admin, res, port); + if (ret) + goto err_free; + + rdma_restrack_put(res); + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +err_free: + nlmsg_free(msg); +err_get: + rdma_restrack_put(res); +err: + ib_device_put(device); + return ret; +} + static int res_get_common_dumpit(struct sk_buff *skb, struct netlink_callback *cb, - enum rdma_restrack_type res_type) + enum rdma_restrack_type res_type, + res_fill_func_t fill_func) { const struct nldev_fill_res_entry *fe = &fill_entries[res_type]; struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; int err, ret = 0, idx = 0; + bool show_details = false; struct nlattr *table_attr; + struct nlattr *entry_attr; struct ib_device *device; int start = cb->args[0]; + bool has_cap_net_admin; struct nlmsghdr *nlh; + unsigned long id; u32 index, port = 0; bool filled = false; - err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, NULL); + err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, NULL); /* * Right 
now, we are expecting the device index to get res information, * but it is possible to extend this code to return all devices in @@ -968,10 +1586,13 @@ static int res_get_common_dumpit(struct sk_buff *skb, return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); - device = ib_device_get_by_index(index); + device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; + if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]) + show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]); + /* * If no PORT_INDEX is supplied, we will return all QPs from that device */ @@ -984,69 +1605,67 @@ static int res_get_common_dumpit(struct sk_buff *skb, } nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, fe->nldev_cmd), + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NL_GET_OP(cb->nlh->nlmsg_type)), 0, NLM_F_MULTI); - if (fill_nldev_handle(skb, device)) { + if (!nlh || fill_nldev_handle(skb, device)) { ret = -EMSGSIZE; goto err; } - table_attr = nla_nest_start(skb, fe->nldev_attr); + table_attr = nla_nest_start_noflag(skb, fe->nldev_attr); if (!table_attr) { ret = -EMSGSIZE; goto err; } - down_read(&device->res.rwsem); - hash_for_each_possible(device->res.hash, res, node, res_type) { - if (idx < start) - goto next; + has_cap_net_admin = netlink_capable(cb->skb, CAP_NET_ADMIN); - if ((rdma_is_kernel_res(res) && - task_active_pid_ns(current) != &init_pid_ns) || - (!rdma_is_kernel_res(res) && task_active_pid_ns(current) != - task_active_pid_ns(res->task))) - /* - * 1. Kern resources should be visible in init - * namspace only - * 2. Present only resources visible in the current - * namespace - */ + rt = &device->res[res_type]; + xa_lock(&rt->xa); + /* + * FIXME: if the skip ahead is something common this loop should + * use xas_for_each & xas_pause to optimize, we can have a lot of + * objects. + */ + xa_for_each(&rt->xa, id, res) { + if (xa_get_mark(&rt->xa, res->id, RESTRACK_DD) && !show_details) goto next; - if (!rdma_restrack_get(res)) - /* - * Resource is under release now, but we are not - * relesing lock now, so it will be released in - * our next pass, once we will get ->next pointer. - */ + if (idx < start || !rdma_restrack_get(res)) goto next; + xa_unlock(&rt->xa); + filled = true; - up_read(&device->res.rwsem); - ret = fe->fill_res_func(skb, cb, res, port); - down_read(&device->res.rwsem); - /* - * Return resource back, but it won't be released till - * the &device->res.rwsem will be released for write. - */ + entry_attr = nla_nest_start_noflag(skb, fe->entry); + if (!entry_attr) { + ret = -EMSGSIZE; + rdma_restrack_put(res); + goto msg_full; + } + + ret = fill_func(skb, has_cap_net_admin, res, port); + rdma_restrack_put(res); - if (ret == -EMSGSIZE) - /* - * There is a chance to optimize here. - * It can be done by using list_prepare_entry - * and list_for_each_entry_continue afterwards. 
- */ - break; - if (ret) + if (ret) { + nla_nest_cancel(skb, entry_attr); + if (ret == -EMSGSIZE) + goto msg_full; + if (ret == -EAGAIN) + goto again; goto res_err; + } + nla_nest_end(skb, entry_attr); +again: xa_lock(&rt->xa); next: idx++; } - up_read(&device->res.rwsem); + xa_unlock(&rt->xa); +msg_full: nla_nest_end(skb, table_attr); nlmsg_end(skb, nlh); cb->args[0] = idx; @@ -1063,7 +1682,6 @@ next: idx++; res_err: nla_nest_cancel(skb, table_attr); - up_read(&device->res.rwsem); err: nlmsg_cancel(skb, nlh); @@ -1073,34 +1691,950 @@ err_index: return ret; } -static int nldev_res_get_qp_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) +#define RES_GET_FUNCS(name, type) \ + static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \ + struct netlink_callback *cb) \ + { \ + return res_get_common_dumpit(skb, cb, type, \ + fill_res_##name##_entry); \ + } \ + static int nldev_res_get_##name##_doit(struct sk_buff *skb, \ + struct nlmsghdr *nlh, \ + struct netlink_ext_ack *extack) \ + { \ + return res_get_common_doit(skb, nlh, extack, type, \ + fill_res_##name##_entry); \ + } + +RES_GET_FUNCS(qp, RDMA_RESTRACK_QP); +RES_GET_FUNCS(qp_raw, RDMA_RESTRACK_QP); +RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID); +RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ); +RES_GET_FUNCS(cq_raw, RDMA_RESTRACK_CQ); +RES_GET_FUNCS(pd, RDMA_RESTRACK_PD); +RES_GET_FUNCS(mr, RDMA_RESTRACK_MR); +RES_GET_FUNCS(mr_raw, RDMA_RESTRACK_MR); +RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER); +RES_GET_FUNCS(ctx, RDMA_RESTRACK_CTX); +RES_GET_FUNCS(srq, RDMA_RESTRACK_SRQ); +RES_GET_FUNCS(srq_raw, RDMA_RESTRACK_SRQ); + +static LIST_HEAD(link_ops); +static DECLARE_RWSEM(link_ops_rwsem); + +static const struct rdma_link_ops *link_ops_get(const char *type) { - return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_QP); + const struct rdma_link_ops *ops; + + list_for_each_entry(ops, &link_ops, list) { + if (!strcmp(ops->type, type)) + goto out; + } + ops = NULL; +out: + return ops; } -static int nldev_res_get_cm_id_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) +void rdma_link_register(struct rdma_link_ops *ops) { - return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CM_ID); + down_write(&link_ops_rwsem); + if (WARN_ON_ONCE(link_ops_get(ops->type))) + goto out; + list_add(&ops->list, &link_ops); +out: + up_write(&link_ops_rwsem); } +EXPORT_SYMBOL(rdma_link_register); -static int nldev_res_get_cq_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) +void rdma_link_unregister(struct rdma_link_ops *ops) { - return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CQ); + down_write(&link_ops_rwsem); + list_del(&ops->list); + up_write(&link_ops_rwsem); } +EXPORT_SYMBOL(rdma_link_unregister); -static int nldev_res_get_mr_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) +static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_MR); + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + char ibdev_name[IB_DEVICE_NAME_MAX]; + const struct rdma_link_ops *ops; + char ndev_name[IFNAMSIZ]; + struct net_device *ndev; + char type[IFNAMSIZ]; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || + !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME]) + return -EINVAL; + + nla_strscpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + sizeof(ibdev_name)); + if (strchr(ibdev_name, '%') || strlen(ibdev_name) == 0) + return -EINVAL; + + nla_strscpy(type, 
tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); + nla_strscpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], + sizeof(ndev_name)); + + ndev = dev_get_by_name(sock_net(skb->sk), ndev_name); + if (!ndev) + return -ENODEV; + + down_read(&link_ops_rwsem); + ops = link_ops_get(type); +#ifdef CONFIG_MODULES + if (!ops) { + up_read(&link_ops_rwsem); + request_module("rdma-link-%s", type); + down_read(&link_ops_rwsem); + ops = link_ops_get(type); + } +#endif + err = ops ? ops->newlink(ibdev_name, ndev) : -EINVAL; + up_read(&link_ops_rwsem); + dev_put(ndev); + + return err; } -static int nldev_res_get_pd_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) +static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD); + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + u32 index; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + if (!(device->attrs.kernel_cap_flags & IBK_ALLOW_USER_UNREG)) { + ib_device_put(device); + return -EINVAL; + } + + ib_unregister_device_and_put(device); + return 0; +} + +static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + char client_name[RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE]; + struct ib_client_nl_info data = {}; + struct ib_device *ibdev = NULL; + struct sk_buff *msg; + u32 index; + int err; + + err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, + NL_VALIDATE_LIBERAL, extack); + if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE]) + return -EINVAL; + + nla_strscpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], + sizeof(client_name)); + + if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) { + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + ibdev = ib_device_get_by_index(sock_net(skb->sk), index); + if (!ibdev) + return -EINVAL; + + if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + data.port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(ibdev, data.port)) { + err = -EINVAL; + goto out_put; + } + } else { + data.port = -1; + } + } else if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + return -EINVAL; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + err = -ENOMEM; + goto out_put; + } + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_GET_CHARDEV), + 0, 0); + if (!nlh) { + err = -EMSGSIZE; + goto out_nlmsg; + } + + data.nl_msg = msg; + err = ib_get_client_nl_info(ibdev, client_name, &data); + if (err) + goto out_nlmsg; + + err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV, + huge_encode_dev(data.cdev->devt), + RDMA_NLDEV_ATTR_PAD); + if (err) + goto out_data; + err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV_ABI, data.abi, + RDMA_NLDEV_ATTR_PAD); + if (err) + goto out_data; + if (nla_put_string(msg, RDMA_NLDEV_ATTR_CHARDEV_NAME, + dev_name(data.cdev))) { + err = -EMSGSIZE; + goto out_data; + } + + nlmsg_end(msg, nlh); + put_device(data.cdev); + if (ibdev) + ib_device_put(ibdev); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +out_data: + put_device(data.cdev); +out_nlmsg: + nlmsg_free(msg); +out_put: + if (ibdev) + ib_device_put(ibdev); + return err; +} + +static int 
nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct sk_buff *msg; + int err; + + err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); + if (err) + return err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_SYS_GET), + 0, 0); + if (!nlh) { + nlmsg_free(msg); + return -EMSGSIZE; + } + + err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_NETNS_MODE, + (u8)ib_devices_shared_netns); + if (err) { + nlmsg_free(msg); + return err; + } + + err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE, + (u8)privileged_qkey); + if (err) { + nlmsg_free(msg); + return err; + } + + err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_MONITOR_MODE, 1); + if (err) { + nlmsg_free(msg); + return err; + } + /* + * Copy-on-fork is supported. + * See commits: + * 70e806e4e645 ("mm: Do early cow for pinned pages during fork() for ptes") + * 4eae4efa2c29 ("hugetlb: do early cow when page pinned on src mm") + * for more details. Don't backport this without them. + * + * Return value ignored on purpose, assume copy-on-fork is not + * supported in case of failure. + */ + nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK, 1); + + nlmsg_end(msg, nlh); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); +} + +static int nldev_set_sys_set_netns_doit(struct nlattr *tb[]) +{ + u8 enable; + int err; + + enable = nla_get_u8(tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]); + /* Only 0 and 1 are supported */ + if (enable > 1) + return -EINVAL; + + err = rdma_compatdev_set(enable); + return err; +} + +static int nldev_set_sys_set_pqkey_doit(struct nlattr *tb[]) +{ + u8 enable; + + enable = nla_get_u8(tb[RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE]); + /* Only 0 and 1 are supported */ + if (enable > 1) + return -EINVAL; + + privileged_qkey = enable; + return 0; +} + +static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err) + return -EINVAL; + + if (tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]) + return nldev_set_sys_set_netns_doit(tb); + + if (tb[RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE]) + return nldev_set_sys_set_pqkey_doit(tb); + + return -EINVAL; +} + + +static int nldev_stat_set_mode_doit(struct sk_buff *msg, + struct netlink_ext_ack *extack, + struct nlattr *tb[], + struct ib_device *device, u32 port) +{ + u32 mode, mask = 0, qpn, cntn = 0; + bool opcnt = false; + int ret; + + /* Currently only counter for QP is supported */ + if (!tb[RDMA_NLDEV_ATTR_STAT_RES] || + nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) + return -EINVAL; + + if (tb[RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED]) + opcnt = !!nla_get_u8( + tb[RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED]); + + mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]); + if (mode == RDMA_COUNTER_MODE_AUTO) { + if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]) + mask = nla_get_u32( + tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]); + return rdma_counter_set_auto_mode(device, port, mask, opcnt, + extack); + } + + if (!tb[RDMA_NLDEV_ATTR_RES_LQPN]) + return -EINVAL; + + qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); + if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) { + cntn = 
nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); + ret = rdma_counter_bind_qpn(device, port, qpn, cntn); + if (ret) + return ret; + } else { + ret = rdma_counter_bind_qpn_alloc(device, port, qpn, &cntn); + if (ret) + return ret; + } + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) { + ret = -EMSGSIZE; + goto err_fill; + } + + return 0; + +err_fill: + rdma_counter_unbind_qpn(device, port, qpn, cntn); + return ret; +} + +static int nldev_stat_set_counter_dynamic_doit(struct nlattr *tb[], + struct ib_device *device, + u32 port) +{ + struct rdma_hw_stats *stats; + struct nlattr *entry_attr; + unsigned long *target; + int rem, i, ret = 0; + u32 index; + + stats = ib_get_hw_stats_port(device, port); + if (!stats) + return -EINVAL; + + target = kcalloc(BITS_TO_LONGS(stats->num_counters), + sizeof(*stats->is_disabled), GFP_KERNEL); + if (!target) + return -ENOMEM; + + nla_for_each_nested(entry_attr, tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS], + rem) { + index = nla_get_u32(entry_attr); + if ((index >= stats->num_counters) || + !(stats->descs[index].flags & IB_STAT_FLAG_OPTIONAL)) { + ret = -EINVAL; + goto out; + } + + set_bit(index, target); + } + + for (i = 0; i < stats->num_counters; i++) { + if (!(stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL)) + continue; + + ret = rdma_counter_modify(device, port, i, test_bit(i, target)); + if (ret) + goto out; + } + +out: + kfree(target); + return ret; +} + +static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index, port; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, + extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err_put_device; + } + + if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] && + !tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]) { + ret = -EINVAL; + goto err_put_device; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err_put_device; + } + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_STAT_SET), + 0, 0); + if (!nlh || fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) { + ret = -EMSGSIZE; + goto err_free_msg; + } + + if (tb[RDMA_NLDEV_ATTR_STAT_MODE]) { + ret = nldev_stat_set_mode_doit(msg, extack, tb, device, port); + if (ret) + goto err_free_msg; + } + + if (tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]) { + ret = nldev_stat_set_counter_dynamic_doit(tb, device, port); + if (ret) + goto err_free_msg; + } + + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +err_free_msg: + nlmsg_free(msg); +err_put_device: + ib_device_put(device); + return ret; +} + +static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index, port, qpn, cntn; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if 
(ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] || + !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX] || + !tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID] || + !tb[RDMA_NLDEV_ATTR_RES_LQPN]) + return -EINVAL; + + if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_STAT_SET), + 0, 0); + if (!nlh) { + ret = -EMSGSIZE; + goto err_fill; + } + + cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); + qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) { + ret = -EMSGSIZE; + goto err_fill; + } + + ret = rdma_counter_unbind_qpn(device, port, qpn, cntn); + if (ret) + goto err_fill; + + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +err_fill: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + +static noinline_for_stack int +stat_get_doit_default_counter(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + struct nlattr *tb[]) +{ + struct rdma_hw_stats *stats; + struct nlattr *table_attr; + struct ib_device *device; + int ret, num_cnts, i; + struct sk_buff *msg; + u32 index, port; + u64 v; + + if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + if (!device->ops.alloc_hw_port_stats || !device->ops.get_hw_stats) { + ret = -EINVAL; + goto err; + } + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + stats = ib_get_hw_stats_port(device, port); + if (!stats) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_STAT_GET), + 0, 0); + + if (!nlh || fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) { + ret = -EMSGSIZE; + goto err_msg; + } + + mutex_lock(&stats->lock); + + num_cnts = device->ops.get_hw_stats(device, stats, port, 0); + if (num_cnts < 0) { + ret = -EINVAL; + goto err_stats; + } + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); + if (!table_attr) { + ret = -EMSGSIZE; + goto err_stats; + } + for (i = 0; i < num_cnts; i++) { + if (test_bit(i, stats->is_disabled)) + continue; + + v = stats->value[i] + + rdma_counter_get_hwstat_value(device, port, i); + if (rdma_nl_stat_hwcounter_entry(msg, + stats->descs[i].name, v)) { + ret = -EMSGSIZE; + goto err_table; + } + } + nla_nest_end(msg, table_attr); + + mutex_unlock(&stats->lock); + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +err_table: + nla_nest_cancel(msg, table_attr); +err_stats: + 
mutex_unlock(&stats->lock); +err_msg: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + +static noinline_for_stack int +stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, struct nlattr *tb[]) + +{ + static enum rdma_nl_counter_mode mode; + static enum rdma_nl_counter_mask mask; + struct ib_device *device; + struct sk_buff *msg; + u32 index, port; + bool opcnt; + int ret; + + if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) + return nldev_res_get_counter_doit(skb, nlh, extack); + + if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] || + !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_STAT_GET), + 0, 0); + if (!nlh) { + ret = -EMSGSIZE; + goto err_msg; + } + + ret = rdma_counter_get_mode(device, port, &mode, &mask, &opcnt); + if (ret) + goto err_msg; + + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode)) { + ret = -EMSGSIZE; + goto err_msg; + } + + if ((mode == RDMA_COUNTER_MODE_AUTO) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) { + ret = -EMSGSIZE; + goto err_msg; + } + + if ((mode == RDMA_COUNTER_MODE_AUTO) && + nla_put_u8(msg, RDMA_NLDEV_ATTR_STAT_OPCOUNTER_ENABLED, opcnt)) { + ret = -EMSGSIZE; + goto err_msg; + } + + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +err_msg: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + +static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + int ret; + + ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); + if (ret) + return -EINVAL; + + if (!tb[RDMA_NLDEV_ATTR_STAT_RES]) + return stat_get_doit_default_counter(skb, nlh, extack, tb); + + switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { + case RDMA_NLDEV_ATTR_RES_QP: + ret = stat_get_doit_qp(skb, nlh, extack, tb); + break; + case RDMA_NLDEV_ATTR_RES_MR: + ret = res_get_common_doit(skb, nlh, extack, RDMA_RESTRACK_MR, + fill_stat_mr_entry); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int nldev_stat_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + int ret; + + ret = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, NULL); + if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES]) + return -EINVAL; + + switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { + case RDMA_NLDEV_ATTR_RES_QP: + ret = nldev_res_get_counter_dumpit(skb, cb); + break; + case RDMA_NLDEV_ATTR_RES_MR: + ret = res_get_common_dumpit(skb, cb, RDMA_RESTRACK_MR, + fill_stat_mr_entry); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int nldev_stat_get_counter_status_doit(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr 
*tb[RDMA_NLDEV_ATTR_MAX], *table, *entry; + struct rdma_hw_stats *stats; + struct ib_device *device; + struct sk_buff *msg; + u32 devid, port; + int ret, i; + + ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + devid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), devid); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + + stats = ib_get_hw_stats_port(device, port); + if (!stats) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + nlh = nlmsg_put( + msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_GET_STATUS), + 0, 0); + + ret = -EMSGSIZE; + if (!nlh || fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) + goto err_msg; + + table = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); + if (!table) + goto err_msg; + + mutex_lock(&stats->lock); + for (i = 0; i < stats->num_counters; i++) { + entry = nla_nest_start(msg, + RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY); + if (!entry) + goto err_msg_table; + + if (nla_put_string(msg, + RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, + stats->descs[i].name) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX, i)) + goto err_msg_entry; + + if ((stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) && + (nla_put_u8(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC, + !test_bit(i, stats->is_disabled)))) + goto err_msg_entry; + + nla_nest_end(msg, entry); + } + mutex_unlock(&stats->lock); + + nla_nest_end(msg, table); + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +err_msg_entry: + nla_nest_cancel(msg, entry); +err_msg_table: + mutex_unlock(&stats->lock); + nla_nest_cancel(msg, table); +err_msg: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + +static int nldev_newdev(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + enum rdma_nl_dev_type type; + struct ib_device *parent; + char name[IFNAMSIZ] = {}; + u32 parentid; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_DEV_TYPE]) + return -EINVAL; + + nla_strscpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], sizeof(name)); + type = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_TYPE]); + parentid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + parent = ib_device_get_by_index(sock_net(skb->sk), parentid); + if (!parent) + return -EINVAL; + + ret = ib_add_sub_device(parent, type, name); + ib_device_put(parent); + + return ret; +} + +static int nldev_deldev(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + u32 devid; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + devid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), devid); + if (!device) + return -EINVAL; + + return 
ib_del_sub_device_and_put(device); } static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { @@ -1108,10 +2642,21 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_get_doit, .dump = nldev_get_dumpit, }, + [RDMA_NLDEV_CMD_GET_CHARDEV] = { + .doit = nldev_get_chardev, + }, [RDMA_NLDEV_CMD_SET] = { .doit = nldev_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, + [RDMA_NLDEV_CMD_NEWLINK] = { + .doit = nldev_newlink, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_DELLINK] = { + .doit = nldev_dellink, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NLDEV_CMD_PORT_GET] = { .doit = nldev_port_get_doit, .dump = nldev_port_get_dumpit, @@ -1121,38 +2666,253 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .dump = nldev_res_get_dumpit, }, [RDMA_NLDEV_CMD_RES_QP_GET] = { + .doit = nldev_res_get_qp_doit, .dump = nldev_res_get_qp_dumpit, - /* - * .doit is not implemented yet for two reasons: - * 1. It is not needed yet. - * 2. There is a need to provide identifier, while it is easy - * for the QPs (device index + port index + LQPN), it is not - * the case for the rest of resources (PD and CQ). Because it - * is better to provide similar interface for all resources, - * let's wait till we will have other resources implemented - * too. - */ }, [RDMA_NLDEV_CMD_RES_CM_ID_GET] = { + .doit = nldev_res_get_cm_id_doit, .dump = nldev_res_get_cm_id_dumpit, }, [RDMA_NLDEV_CMD_RES_CQ_GET] = { + .doit = nldev_res_get_cq_doit, .dump = nldev_res_get_cq_dumpit, }, [RDMA_NLDEV_CMD_RES_MR_GET] = { + .doit = nldev_res_get_mr_doit, .dump = nldev_res_get_mr_dumpit, }, [RDMA_NLDEV_CMD_RES_PD_GET] = { + .doit = nldev_res_get_pd_doit, .dump = nldev_res_get_pd_dumpit, }, + [RDMA_NLDEV_CMD_RES_CTX_GET] = { + .doit = nldev_res_get_ctx_doit, + .dump = nldev_res_get_ctx_dumpit, + }, + [RDMA_NLDEV_CMD_RES_SRQ_GET] = { + .doit = nldev_res_get_srq_doit, + .dump = nldev_res_get_srq_dumpit, + }, + [RDMA_NLDEV_CMD_SYS_GET] = { + .doit = nldev_sys_get_doit, + }, + [RDMA_NLDEV_CMD_SYS_SET] = { + .doit = nldev_set_sys_set_doit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_STAT_SET] = { + .doit = nldev_stat_set_doit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_STAT_GET] = { + .doit = nldev_stat_get_doit, + .dump = nldev_stat_get_dumpit, + }, + [RDMA_NLDEV_CMD_STAT_DEL] = { + .doit = nldev_stat_del_doit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_RES_QP_GET_RAW] = { + .doit = nldev_res_get_qp_raw_doit, + .dump = nldev_res_get_qp_raw_dumpit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_RES_CQ_GET_RAW] = { + .doit = nldev_res_get_cq_raw_doit, + .dump = nldev_res_get_cq_raw_dumpit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_RES_MR_GET_RAW] = { + .doit = nldev_res_get_mr_raw_doit, + .dump = nldev_res_get_mr_raw_dumpit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_RES_SRQ_GET_RAW] = { + .doit = nldev_res_get_srq_raw_doit, + .dump = nldev_res_get_srq_raw_dumpit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_STAT_GET_STATUS] = { + .doit = nldev_stat_get_counter_status_doit, + }, + [RDMA_NLDEV_CMD_NEWDEV] = { + .doit = nldev_newdev, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_DELDEV] = { + .doit = nldev_deldev, + .flags = RDMA_NL_ADMIN_PERM, + }, }; +static int fill_mon_netdev_rename(struct sk_buff *msg, + struct ib_device *device, u32 port, + const struct net *net) +{ + struct net_device *netdev = ib_device_get_netdev(device, port); + int ret = 0; + + if (!netdev || !net_eq(dev_net(netdev), net)) + goto 
out; + + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); + if (ret) + goto out; + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); +out: + dev_put(netdev); + return ret; +} + +static int fill_mon_netdev_association(struct sk_buff *msg, + struct ib_device *device, u32 port, + const struct net *net) +{ + struct net_device *netdev = ib_device_get_netdev(device, port); + int ret = 0; + + if (netdev && !net_eq(dev_net(netdev), net)) + goto out; + + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index); + if (ret) + goto out; + + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, + dev_name(&device->dev)); + if (ret) + goto out; + + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port); + if (ret) + goto out; + + if (netdev) { + ret = nla_put_u32(msg, + RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); + if (ret) + goto out; + + ret = nla_put_string(msg, + RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); + } + +out: + dev_put(netdev); + return ret; +} + +static void rdma_nl_notify_err_msg(struct ib_device *device, u32 port_num, + enum rdma_nl_notify_event_type type) +{ + struct net_device *netdev; + + switch (type) { + case RDMA_REGISTER_EVENT: + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor register device event\n"); + break; + case RDMA_UNREGISTER_EVENT: + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor unregister device event\n"); + break; + case RDMA_NETDEV_ATTACH_EVENT: + netdev = ib_device_get_netdev(device, port_num); + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor netdev attach event: port %d netdev %d\n", + port_num, netdev->ifindex); + dev_put(netdev); + break; + case RDMA_NETDEV_DETACH_EVENT: + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor netdev detach event: port %d\n", + port_num); + break; + case RDMA_RENAME_EVENT: + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor rename device event\n"); + break; + + case RDMA_NETDEV_RENAME_EVENT: + netdev = ib_device_get_netdev(device, port_num); + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor netdev rename event: port %d netdev %d\n", + port_num, netdev->ifindex); + dev_put(netdev); + break; + default: + break; + } +} + +int rdma_nl_notify_event(struct ib_device *device, u32 port_num, + enum rdma_nl_notify_event_type type) +{ + struct sk_buff *skb; + int ret = -EMSGSIZE; + struct net *net; + void *nlh; + + net = read_pnet(&device->coredev.rdma_net); + if (!net) + return -EINVAL; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + nlh = nlmsg_put(skb, 0, 0, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_MONITOR), + 0, 0); + if (!nlh) + goto err_free; + + switch (type) { + case RDMA_REGISTER_EVENT: + case RDMA_UNREGISTER_EVENT: + case RDMA_RENAME_EVENT: + ret = fill_nldev_handle(skb, device); + if (ret) + goto err_free; + break; + case RDMA_NETDEV_ATTACH_EVENT: + case RDMA_NETDEV_DETACH_EVENT: + ret = fill_mon_netdev_association(skb, device, port_num, net); + if (ret) + goto err_free; + break; + case RDMA_NETDEV_RENAME_EVENT: + ret = fill_mon_netdev_rename(skb, device, port_num, net); + if (ret) + goto err_free; + break; + default: + break; + } + + ret = nla_put_u8(skb, RDMA_NLDEV_ATTR_EVENT_TYPE, type); + if (ret) + goto err_free; + + nlmsg_end(skb, nlh); + ret = rdma_nl_multicast(net, skb, RDMA_NL_GROUP_NOTIFY, GFP_KERNEL); + if (ret && ret != -ESRCH) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + goto err_free; + } + return 0; + 
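/*
 * Editor's aside -- an illustrative sketch only, not part of this patch.
 * It condenses the reply-construction pattern shared by the stat and
 * monitor handlers in this file: allocate a reply, fill the device handle
 * plus a nested counter table, then unicast it, unwinding with
 * nla_nest_cancel()/nlmsg_free() on failure. It is written as if it sat
 * alongside the handlers in nldev.c; the function name is invented, and
 * device refcounting, stats->lock handling and the per-counter
 * is_disabled filtering done by the real handlers are omitted for
 * brevity.
 */
static int example_stat_reply(struct sk_buff *req, struct nlmsghdr *req_nlh,
			      struct ib_device *device, u32 port,
			      const struct rdma_hw_stats *stats)
{
	struct nlattr *table;
	struct nlmsghdr *nlh;
	struct sk_buff *msg;
	int ret, i;

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	nlh = nlmsg_put(msg, NETLINK_CB(req).portid, req_nlh->nlmsg_seq,
			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_GET),
			0, 0);
	if (!nlh || fill_nldev_handle(msg, device) ||
	    nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) {
		ret = -EMSGSIZE;
		goto err_msg;
	}

	/* One nested table, one entry per hardware counter. */
	table = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS);
	if (!table) {
		ret = -EMSGSIZE;
		goto err_msg;
	}
	for (i = 0; i < stats->num_counters; i++) {
		if (rdma_nl_stat_hwcounter_entry(msg, stats->descs[i].name,
						 stats->value[i])) {
			ret = -EMSGSIZE;
			goto err_table;
		}
	}
	nla_nest_end(msg, table);

	nlmsg_end(msg, nlh);
	return rdma_nl_unicast(sock_net(req->sk), msg, NETLINK_CB(req).portid);

err_table:
	nla_nest_cancel(msg, table);
err_msg:
	nlmsg_free(msg);
	return ret;
}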
+err_free: + rdma_nl_notify_err_msg(device, port_num, type); + nlmsg_free(skb); + return ret; +} + void __init nldev_init(void) { rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table); } -void __exit nldev_exit(void) +void nldev_exit(void) { rdma_nl_unregister(RDMA_NL_NLDEV); } diff --git a/drivers/infiniband/core/opa_smi.h b/drivers/infiniband/core/opa_smi.h index af4879bdf3d6..64e2822af70f 100644 --- a/drivers/infiniband/core/opa_smi.h +++ b/drivers/infiniband/core/opa_smi.h @@ -40,11 +40,11 @@ #include "smi.h" enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch, - int port_num, int phys_port_cnt); + u32 port_num, int phys_port_cnt); int opa_smi_get_fwd_port(struct opa_smp *smp); extern enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp); extern enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, - bool is_switch, int port_num); + bool is_switch, u32 port_num); /* * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 6c4747e61d2b..18918f463361 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -42,29 +42,24 @@ #include "core_priv.h" #include "rdma_core.h" -void uverbs_uobject_get(struct ib_uobject *uobject) -{ - kref_get(&uobject->ref); -} - static void uverbs_uobject_free(struct kref *ref) { - struct ib_uobject *uobj = - container_of(ref, struct ib_uobject, ref); - - if (uobj->uapi_object->type_class->needs_kfree_rcu) - kfree_rcu(uobj, rcu); - else - kfree(uobj); + kfree_rcu(container_of(ref, struct ib_uobject, ref), rcu); } +/* + * In order to indicate we no longer needs this uobject, uverbs_uobject_put + * is called. When the reference count is decreased, the uobject is freed. + * For example, this is used when attaching a completion channel to a CQ. + */ void uverbs_uobject_put(struct ib_uobject *uobject) { kref_put(&uobject->ref, uverbs_uobject_free); } +EXPORT_SYMBOL(uverbs_uobject_put); -static int uverbs_try_lock_object(struct ib_uobject *uobj, - enum rdma_lookup_mode mode) +int uverbs_try_lock_object(struct ib_uobject *uobj, + enum rdma_lookup_mode mode) { /* * When a shared access is required, we use a positive counter. Each @@ -73,7 +68,7 @@ static int uverbs_try_lock_object(struct ib_uobject *uobj, * In exclusive access mode, we check that the counter is zero (nobody * claimed this object) and we set it to -1. Releasing a shared access * lock is done simply by decreasing the counter. As for exclusive - * access locks, since only a single one of them is is allowed + * access locks, since only a single one of them is allowed * concurrently, setting the counter to zero is enough for releasing * this lock. */ @@ -89,6 +84,7 @@ static int uverbs_try_lock_object(struct ib_uobject *uobj, } return 0; } +EXPORT_SYMBOL(uverbs_try_lock_object); static void assert_uverbs_usecnt(struct ib_uobject *uobj, enum rdma_lookup_mode mode) @@ -117,7 +113,7 @@ static void assert_uverbs_usecnt(struct ib_uobject *uobj, * however the type's allocat_commit function cannot have been called and the * uobject cannot be on the uobjects_lists * - * For RDMA_REMOVE_DESTROY the caller shold be holding a kref (eg via + * For RDMA_REMOVE_DESTROY the caller should be holding a kref (eg via * rdma_lookup_get_uobject) and the object is left in a state where the caller * needs to call rdma_lookup_put_uobject. 
* @@ -125,42 +121,36 @@ static void assert_uverbs_usecnt(struct ib_uobject *uobj, * and consumes the kref on the uobj. */ static int uverbs_destroy_uobject(struct ib_uobject *uobj, - enum rdma_remove_reason reason) + enum rdma_remove_reason reason, + struct uverbs_attr_bundle *attrs) { - struct ib_uverbs_file *ufile = uobj->ufile; + struct ib_uverbs_file *ufile = attrs->ufile; unsigned long flags; int ret; lockdep_assert_held(&ufile->hw_destroy_rwsem); assert_uverbs_usecnt(uobj, UVERBS_LOOKUP_WRITE); - if (uobj->object) { - ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason); - if (ret) { - if (ib_is_destroy_retryable(ret, reason, uobj)) - return ret; - - /* Nothing to be done, dangle the memory and move on */ - WARN(true, - "ib_uverbs: failed to remove uobject id %d, driver err=%d", - uobj->id, ret); - } - - uobj->object = NULL; - } - if (reason == RDMA_REMOVE_ABORT) { WARN_ON(!list_empty(&uobj->list)); WARN_ON(!uobj->context); uobj->uapi_object->type_class->alloc_abort(uobj); + } else if (uobj->object) { + ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason, + attrs); + if (ret) + /* Nothing to be done, wait till ucontext will clean it */ + return ret; + + uobj->object = NULL; } uobj->context = NULL; /* - * For DESTROY the usecnt is held write locked, the caller is expected - * to put it unlock and put the object when done with it. Only DESTROY - * can remove the IDR handle. + * For DESTROY the usecnt is not changed, the caller is expected to + * manage it via uobj_put_destroy(). Only DESTROY can remove the IDR + * handle. */ if (reason != RDMA_REMOVE_DESTROY) atomic_set(&uobj->usecnt, 0); @@ -192,22 +182,29 @@ static int uverbs_destroy_uobject(struct ib_uobject *uobj, /* * This calls uverbs_destroy_uobject() using the RDMA_REMOVE_DESTROY * sequence. It should only be used from command callbacks. On success the - * caller must pair this with rdma_lookup_put_uobject(LOOKUP_WRITE). This + * caller must pair this with uobj_put_destroy(). This * version requires the caller to have already obtained an * LOOKUP_DESTROY uobject kref. */ -int uobj_destroy(struct ib_uobject *uobj) +int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs) { - struct ib_uverbs_file *ufile = uobj->ufile; + struct ib_uverbs_file *ufile = attrs->ufile; int ret; down_read(&ufile->hw_destroy_rwsem); + /* + * Once the uobject is destroyed by RDMA_REMOVE_DESTROY then it is left + * write locked as the callers put it back with UVERBS_LOOKUP_DESTROY. + * This is because any other concurrent thread can still see the object + * in the xarray due to RCU. Leaving it locked ensures nothing else will + * touch it. + */ ret = uverbs_try_lock_object(uobj, UVERBS_LOOKUP_WRITE); if (ret) goto out_unlock; - ret = uverbs_destroy_uobject(uobj, RDMA_REMOVE_DESTROY); + ret = uverbs_destroy_uobject(uobj, RDMA_REMOVE_DESTROY, attrs); if (ret) { atomic_set(&uobj->usecnt, 0); goto out_unlock; @@ -221,21 +218,20 @@ out_unlock: /* * uobj_get_destroy destroys the HW object and returns a handle to the uobj * with a NULL object pointer. The caller must pair this with - * uverbs_put_destroy. + * uobj_put_destroy(). 
*/ struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj, - u32 id, - const struct uverbs_attr_bundle *attrs) + u32 id, struct uverbs_attr_bundle *attrs) { struct ib_uobject *uobj; int ret; uobj = rdma_lookup_get_uobject(obj, attrs->ufile, id, - UVERBS_LOOKUP_DESTROY); + UVERBS_LOOKUP_DESTROY, attrs); if (IS_ERR(uobj)) return uobj; - ret = uobj_destroy(uobj); + ret = uobj_destroy(uobj, attrs); if (ret) { rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_DESTROY); return ERR_PTR(ret); @@ -249,28 +245,32 @@ struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj, * (negative errno on failure). For use by callers that do not need the uobj. */ int __uobj_perform_destroy(const struct uverbs_api_object *obj, u32 id, - const struct uverbs_attr_bundle *attrs) + struct uverbs_attr_bundle *attrs) { struct ib_uobject *uobj; uobj = __uobj_get_destroy(obj, id, attrs); if (IS_ERR(uobj)) return PTR_ERR(uobj); - - rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_WRITE); + uobj_put_destroy(uobj); return 0; } /* alloc_uobj must be undone by uverbs_destroy_uobject() */ -static struct ib_uobject *alloc_uobj(struct ib_uverbs_file *ufile, +static struct ib_uobject *alloc_uobj(struct uverbs_attr_bundle *attrs, const struct uverbs_api_object *obj) { + struct ib_uverbs_file *ufile = attrs->ufile; struct ib_uobject *uobj; - struct ib_ucontext *ucontext; - ucontext = ib_uverbs_get_ucontext_file(ufile); - if (IS_ERR(ucontext)) - return ERR_CAST(ucontext); + if (!attrs->context) { + struct ib_ucontext *ucontext = + ib_uverbs_get_ucontext_file(ufile); + + if (IS_ERR(ucontext)) + return ERR_CAST(ucontext); + attrs->context = ucontext; + } uobj = kzalloc(obj->type_attrs->obj_size, GFP_KERNEL); if (!uobj) @@ -280,7 +280,7 @@ static struct ib_uobject *alloc_uobj(struct ib_uverbs_file *ufile, * The object is added to the list in the commit stage. */ uobj->ufile = ufile; - uobj->context = ucontext; + uobj->context = attrs->context; INIT_LIST_HEAD(&uobj->list); uobj->uapi_object = obj; /* @@ -296,25 +296,13 @@ static struct ib_uobject *alloc_uobj(struct ib_uverbs_file *ufile, static int idr_add_uobj(struct ib_uobject *uobj) { - int ret; - - idr_preload(GFP_KERNEL); - spin_lock(&uobj->ufile->idr_lock); - - /* - * We start with allocating an idr pointing to NULL. This represents an - * object which isn't initialized yet. We'll replace it later on with - * the real object once we commit. - */ - ret = idr_alloc(&uobj->ufile->idr, NULL, 0, - min_t(unsigned long, U32_MAX - 1, INT_MAX), GFP_NOWAIT); - if (ret >= 0) - uobj->id = ret; - - spin_unlock(&uobj->ufile->idr_lock); - idr_preload_end(); - - return ret < 0 ? ret : 0; + /* + * We start with allocating an idr pointing to NULL. This represents an + * object which isn't initialized yet. We'll replace it later on with + * the real object once we commit. + */ + return xa_alloc(&uobj->ufile->idr, &uobj->id, NULL, xa_limit_32b, + GFP_KERNEL); } /* Returns the ib_uobject or an error. The caller should check for IS_ERR. 
*/ @@ -324,29 +312,20 @@ lookup_get_idr_uobject(const struct uverbs_api_object *obj, enum rdma_lookup_mode mode) { struct ib_uobject *uobj; - unsigned long idrno = id; if (id < 0 || id > ULONG_MAX) return ERR_PTR(-EINVAL); rcu_read_lock(); - /* object won't be released as we're protected in rcu */ - uobj = idr_find(&ufile->idr, idrno); - if (!uobj) { - uobj = ERR_PTR(-ENOENT); - goto free; - } - /* * The idr_find is guaranteed to return a pointer to something that * isn't freed yet, or NULL, as the free after idr_remove goes through * kfree_rcu(). However the object may still have been released and * kfree() could be called at any time. */ - if (!kref_get_unless_zero(&uobj->ref)) + uobj = xa_load(&ufile->idr, id); + if (!uobj || !kref_get_unless_zero(&uobj->ref)) uobj = ERR_PTR(-ENOENT); - -free: rcu_read_unlock(); return uobj; } @@ -378,11 +357,11 @@ lookup_get_fd_uobject(const struct uverbs_api_object *obj, uobject = f->private_data; /* - * fget(id) ensures we are not currently running uverbs_close_fd, - * and the caller is expected to ensure that uverbs_close_fd is never - * done while a call top lookup is possible. + * fget(id) ensures we are not currently running + * uverbs_uobject_fd_release(), and the caller is expected to ensure + * that release is never done while a call to lookup is possible. */ - if (f->f_op != fd_type->fops) { + if (f->f_op != fd_type->fops || uobject->ufile != ufile) { fput(f); return ERR_PTR(-EBADF); } @@ -393,12 +372,13 @@ lookup_get_fd_uobject(const struct uverbs_api_object *obj, struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj, struct ib_uverbs_file *ufile, s64 id, - enum rdma_lookup_mode mode) + enum rdma_lookup_mode mode, + struct uverbs_attr_bundle *attrs) { struct ib_uobject *uobj; int ret; - if (IS_ERR(obj) && PTR_ERR(obj) == -ENOMSG) { + if (obj == ERR_PTR(-ENOMSG)) { /* must be UVERBS_IDR_ANY_OBJECT, see uapi_get_object() */ uobj = lookup_get_idr_uobject(NULL, ufile, id, mode); if (IS_ERR(uobj)) @@ -431,6 +411,8 @@ struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj, ret = uverbs_try_lock_object(uobj, mode); if (ret) goto free; + if (attrs) + attrs->context = uobj->context; return uobj; free: @@ -441,12 +423,12 @@ free: static struct ib_uobject * alloc_begin_idr_uobject(const struct uverbs_api_object *obj, - struct ib_uverbs_file *ufile) + struct uverbs_attr_bundle *attrs) { int ret; struct ib_uobject *uobj; - uobj = alloc_uobj(ufile, obj); + uobj = alloc_uobj(attrs, obj); if (IS_ERR(uobj)) return uobj; @@ -457,14 +439,12 @@ alloc_begin_idr_uobject(const struct uverbs_api_object *obj, ret = ib_rdmacg_try_charge(&uobj->cg_obj, uobj->context->device, RDMACG_RESOURCE_HCA_OBJECT); if (ret) - goto idr_remove; + goto remove; return uobj; -idr_remove: - spin_lock(&ufile->idr_lock); - idr_remove(&ufile->idr, uobj->id); - spin_unlock(&ufile->idr_lock); +remove: + xa_erase(&attrs->ufile->idr, uobj->id); uobj_put: uverbs_uobject_put(uobj); return ERR_PTR(ret); @@ -472,30 +452,54 @@ uobj_put: static struct ib_uobject * alloc_begin_fd_uobject(const struct uverbs_api_object *obj, - struct ib_uverbs_file *ufile) + struct uverbs_attr_bundle *attrs) { + const struct uverbs_obj_fd_type *fd_type; int new_fd; - struct ib_uobject *uobj; + struct ib_uobject *uobj, *ret; + struct file *filp; + + uobj = alloc_uobj(attrs, obj); + if (IS_ERR(uobj)) + return uobj; + + fd_type = + container_of(obj->type_attrs, struct uverbs_obj_fd_type, type); + if (WARN_ON(fd_type->fops->release != &uverbs_uobject_fd_release && + 
fd_type->fops->release != &uverbs_async_event_release)) { + ret = ERR_PTR(-EINVAL); + goto err_fd; + } new_fd = get_unused_fd_flags(O_CLOEXEC); - if (new_fd < 0) - return ERR_PTR(new_fd); + if (new_fd < 0) { + ret = ERR_PTR(new_fd); + goto err_fd; + } - uobj = alloc_uobj(ufile, obj); - if (IS_ERR(uobj)) { - put_unused_fd(new_fd); - return uobj; + /* Note that uverbs_uobject_fd_release() is called during abort */ + filp = anon_inode_getfile(fd_type->name, fd_type->fops, NULL, + fd_type->flags); + if (IS_ERR(filp)) { + ret = ERR_CAST(filp); + goto err_getfile; } + uobj->object = filp; uobj->id = new_fd; - uobj->ufile = ufile; - return uobj; + +err_getfile: + put_unused_fd(new_fd); +err_fd: + uverbs_uobject_put(uobj); + return ret; } struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj, - struct ib_uverbs_file *ufile) + struct uverbs_attr_bundle *attrs) { + struct ib_uverbs_file *ufile = attrs->ufile; struct ib_uobject *ret; if (IS_ERR(obj)) @@ -509,7 +513,7 @@ struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj, if (!down_read_trylock(&ufile->hw_destroy_rwsem)) return ERR_PTR(-EIO); - ret = obj->type_class->alloc_begin(obj, ufile); + ret = obj->type_class->alloc_begin(obj, attrs); if (IS_ERR(ret)) { up_read(&ufile->hw_destroy_rwsem); return ret; @@ -522,25 +526,19 @@ static void alloc_abort_idr_uobject(struct ib_uobject *uobj) ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device, RDMACG_RESOURCE_HCA_OBJECT); - spin_lock(&uobj->ufile->idr_lock); - idr_remove(&uobj->ufile->idr, uobj->id); - spin_unlock(&uobj->ufile->idr_lock); + xa_erase(&uobj->ufile->idr, uobj->id); } static int __must_check destroy_hw_idr_uobject(struct ib_uobject *uobj, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { const struct uverbs_obj_idr_type *idr_type = container_of(uobj->uapi_object->type_attrs, struct uverbs_obj_idr_type, type); - int ret = idr_type->destroy_object(uobj, why); + int ret = idr_type->destroy_object(uobj, why, attrs); - /* - * We can only fail gracefully if the user requested to destroy the - * object or when a retry may be called upon an error. - * In the rest of the cases, just remove whatever you can. 
- */ - if (ib_is_destroy_retryable(ret, why, uobj)) + if (ret) return ret; if (why == RDMA_REMOVE_ABORT) @@ -554,28 +552,27 @@ static int __must_check destroy_hw_idr_uobject(struct ib_uobject *uobj, static void remove_handle_idr_uobject(struct ib_uobject *uobj) { - spin_lock(&uobj->ufile->idr_lock); - idr_remove(&uobj->ufile->idr, uobj->id); - spin_unlock(&uobj->ufile->idr_lock); + xa_erase(&uobj->ufile->idr, uobj->id); /* Matches the kref in alloc_commit_idr_uobject */ uverbs_uobject_put(uobj); } static void alloc_abort_fd_uobject(struct ib_uobject *uobj) { + struct file *filp = uobj->object; + + fput(filp); put_unused_fd(uobj->id); } static int __must_check destroy_hw_fd_uobject(struct ib_uobject *uobj, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { const struct uverbs_obj_fd_type *fd_type = container_of( uobj->uapi_object->type_attrs, struct uverbs_obj_fd_type, type); - int ret = fd_type->context_closed(uobj, why); - - if (ib_is_destroy_retryable(ret, why, uobj)) - return ret; + fd_type->destroy_object(uobj, why); return 0; } @@ -583,47 +580,49 @@ static void remove_handle_fd_uobject(struct ib_uobject *uobj) { } -static int alloc_commit_idr_uobject(struct ib_uobject *uobj) +static void alloc_commit_idr_uobject(struct ib_uobject *uobj) { struct ib_uverbs_file *ufile = uobj->ufile; + void *old; - spin_lock(&ufile->idr_lock); /* * We already allocated this IDR with a NULL object, so * this shouldn't fail. * - * NOTE: Once we set the IDR we loose ownership of our kref on uobj. + * NOTE: Storing the uobj transfers our kref on uobj to the XArray. * It will be put by remove_commit_idr_uobject() */ - WARN_ON(idr_replace(&ufile->idr, uobj, uobj->id)); - spin_unlock(&ufile->idr_lock); - - return 0; + old = xa_store(&ufile->idr, uobj->id, uobj, GFP_KERNEL); + WARN_ON(old != NULL); } -static int alloc_commit_fd_uobject(struct ib_uobject *uobj) +static void swap_idr_uobjects(struct ib_uobject *obj_old, + struct ib_uobject *obj_new) { - const struct uverbs_obj_fd_type *fd_type = container_of( - uobj->uapi_object->type_attrs, struct uverbs_obj_fd_type, type); - int fd = uobj->id; - struct file *filp; + struct ib_uverbs_file *ufile = obj_old->ufile; + void *old; /* - * The kref for uobj is moved into filp->private data and put in - * uverbs_close_fd(). Once alloc_commit() succeeds uverbs_close_fd() - * must be guaranteed to be called from the provided fops release - * callback. + * New must be an object that been allocated but not yet committed, this + * moves the pre-committed state to obj_old, new still must be comitted. */ - filp = anon_inode_getfile(fd_type->name, - fd_type->fops, - uobj, - fd_type->flags); - if (IS_ERR(filp)) - return PTR_ERR(filp); + old = xa_cmpxchg(&ufile->idr, obj_old->id, obj_old, XA_ZERO_ENTRY, + GFP_KERNEL); + if (WARN_ON(old != obj_old)) + return; - uobj->object = filp; + swap(obj_old->id, obj_new->id); + + old = xa_cmpxchg(&ufile->idr, obj_old->id, NULL, obj_old, GFP_KERNEL); + WARN_ON(old != NULL); +} + +static void alloc_commit_fd_uobject(struct ib_uobject *uobj) +{ + int fd = uobj->id; + struct file *filp = uobj->object; - /* Matching put will be done in uverbs_close_fd() */ + /* Matching put will be done in uverbs_uobject_fd_release() */ kref_get(&uobj->ufile->ref); /* This shouldn't be used anymore. Use the file object instead */ @@ -631,11 +630,10 @@ static int alloc_commit_fd_uobject(struct ib_uobject *uobj) /* * NOTE: Once we install the file we loose ownership of our kref on - * uobj. 
It will be put by uverbs_close_fd() + * uobj. It will be put by uverbs_uobject_fd_release() */ + filp->private_data = uobj; fd_install(fd, filp); - - return 0; } /* @@ -643,18 +641,10 @@ static int alloc_commit_fd_uobject(struct ib_uobject *uobj) * caller can no longer assume uobj is valid. If this function fails it * destroys the uboject, including the attached HW object. */ -int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj) +void rdma_alloc_commit_uobject(struct ib_uobject *uobj, + struct uverbs_attr_bundle *attrs) { - struct ib_uverbs_file *ufile = uobj->ufile; - int ret; - - /* alloc_commit consumes the uobj kref */ - ret = uobj->uapi_object->type_class->alloc_commit(uobj); - if (ret) { - uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT); - up_read(&ufile->hw_destroy_rwsem); - return ret; - } + struct ib_uverbs_file *ufile = attrs->ufile; /* kref is held so long as the uobj is on the uobj list. */ uverbs_uobject_get(uobj); @@ -665,22 +655,67 @@ int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj) /* matches atomic_set(-1) in alloc_uobj */ atomic_set(&uobj->usecnt, 0); + /* alloc_commit consumes the uobj kref */ + uobj->uapi_object->type_class->alloc_commit(uobj); + /* Matches the down_read in rdma_alloc_begin_uobject */ up_read(&ufile->hw_destroy_rwsem); +} - return 0; +/* + * new_uobj will be assigned to the handle currently used by to_uobj, and + * to_uobj will be destroyed. + * + * Upon return the caller must do: + * rdma_alloc_commit_uobject(new_uobj) + * uobj_put_destroy(to_uobj) + * + * to_uobj must have a write get but the put mode switches to destroy once + * this is called. + */ +void rdma_assign_uobject(struct ib_uobject *to_uobj, struct ib_uobject *new_uobj, + struct uverbs_attr_bundle *attrs) +{ + assert_uverbs_usecnt(new_uobj, UVERBS_LOOKUP_WRITE); + + if (WARN_ON(to_uobj->uapi_object != new_uobj->uapi_object || + !to_uobj->uapi_object->type_class->swap_uobjects)) + return; + + to_uobj->uapi_object->type_class->swap_uobjects(to_uobj, new_uobj); + + /* + * If this fails then the uobject is still completely valid (though with + * a new ID) and we leak it until context close. + */ + uverbs_destroy_uobject(to_uobj, RDMA_REMOVE_DESTROY, attrs); } /* * This consumes the kref for uobj. It is up to the caller to unwind the HW * object and anything else connected to uobj before calling this. */ -void rdma_alloc_abort_uobject(struct ib_uobject *uobj) +void rdma_alloc_abort_uobject(struct ib_uobject *uobj, + struct uverbs_attr_bundle *attrs, + bool hw_obj_valid) { struct ib_uverbs_file *ufile = uobj->ufile; + int ret; - uobj->object = NULL; - uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT); + if (hw_obj_valid) { + ret = uobj->uapi_object->type_class->destroy_hw( + uobj, RDMA_REMOVE_ABORT, attrs); + /* + * If the driver couldn't destroy the object then go ahead and + * commit it. Leaking objects that can't be destroyed is only + * done during FD close after the driver has a few more tries to + * destroy it. 
+ */ + if (WARN_ON(ret)) + return rdma_alloc_commit_uobject(uobj, attrs); + } + + uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT, attrs); /* Matches the down_read in rdma_alloc_begin_uobject */ up_read(&ufile->hw_destroy_rwsem); @@ -697,7 +732,10 @@ static void lookup_put_fd_uobject(struct ib_uobject *uobj, struct file *filp = uobj->object; WARN_ON(mode != UVERBS_LOOKUP_READ); - /* This indirectly calls uverbs_close_fd and free the object */ + /* + * This indirectly calls uverbs_uobject_fd_release() and free the + * object + */ fput(filp); } @@ -705,7 +743,6 @@ void rdma_lookup_put_uobject(struct ib_uobject *uobj, enum rdma_lookup_mode mode) { assert_uverbs_usecnt(uobj, mode); - uobj->uapi_object->type_class->lookup_put(uobj, mode); /* * In order to unlock an object, either decrease its usecnt for * read access or zero it in case of exclusive access. See @@ -722,35 +759,35 @@ void rdma_lookup_put_uobject(struct ib_uobject *uobj, break; } + uobj->uapi_object->type_class->lookup_put(uobj, mode); /* Pairs with the kref obtained by type->lookup_get */ uverbs_uobject_put(uobj); } void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile) { - spin_lock_init(&ufile->idr_lock); - idr_init(&ufile->idr); + xa_init_flags(&ufile->idr, XA_FLAGS_ALLOC); } void release_ufile_idr_uobject(struct ib_uverbs_file *ufile) { struct ib_uobject *entry; - int id; + unsigned long id; /* * At this point uverbs_cleanup_ufile() is guaranteed to have run, and - * there are no HW objects left, however the IDR is still populated + * there are no HW objects left, however the xarray is still populated * with anything that has not been cleaned up by userspace. Since the * kref on ufile is 0, nothing is allowed to call lookup_get. * * This is an optimized equivalent to remove_handle_idr_uobject */ - idr_for_each_entry(&ufile->idr, entry, id) { + xa_for_each(&ufile->idr, id, entry) { WARN_ON(entry->object); uverbs_uobject_put(entry); } - idr_destroy(&ufile->idr); + xa_destroy(&ufile->idr); } const struct uverbs_obj_type_class uverbs_idr_class = { @@ -761,29 +798,33 @@ const struct uverbs_obj_type_class uverbs_idr_class = { .lookup_put = lookup_put_idr_uobject, .destroy_hw = destroy_hw_idr_uobject, .remove_handle = remove_handle_idr_uobject, - /* - * When we destroy an object, we first just lock it for WRITE and - * actually DESTROY it in the finalize stage. So, the problematic - * scenario is when we just started the finalize stage of the - * destruction (nothing was executed yet). Now, the other thread - * fetched the object for READ access, but it didn't lock it yet. - * The DESTROY thread continues and starts destroying the object. - * When the other thread continue - without the RCU, it would - * access freed memory. However, the rcu_read_lock delays the free - * until the rcu_read_lock of the READ operation quits. Since the - * exclusive lock of the object is still taken by the DESTROY flow, the - * READ operation will get -EBUSY and it'll just bail out. - */ - .needs_kfree_rcu = true, + .swap_uobjects = swap_idr_uobjects, }; EXPORT_SYMBOL(uverbs_idr_class); -void uverbs_close_fd(struct file *f) +/* + * Users of UVERBS_TYPE_ALLOC_FD should set this function as the struct + * file_operations release method. 
+ */ +int uverbs_uobject_fd_release(struct inode *inode, struct file *filp) { - struct ib_uobject *uobj = f->private_data; - struct ib_uverbs_file *ufile = uobj->ufile; + struct ib_uverbs_file *ufile; + struct ib_uobject *uobj; + + /* + * This can only happen if the fput came from alloc_abort_fd_uobject() + */ + if (!filp->private_data) + return 0; + uobj = filp->private_data; + ufile = uobj->ufile; if (down_read_trylock(&ufile->hw_destroy_rwsem)) { + struct uverbs_attr_bundle attrs = { + .context = uobj->context, + .ufile = ufile, + }; + /* * lookup_get_fd_uobject holds the kref on the struct file any * time a FD uobj is locked, which prevents this release @@ -791,16 +832,18 @@ void uverbs_close_fd(struct file *f) * write lock here, or we have a kernel bug. */ WARN_ON(uverbs_try_lock_object(uobj, UVERBS_LOOKUP_WRITE)); - uverbs_destroy_uobject(uobj, RDMA_REMOVE_CLOSE); + uverbs_destroy_uobject(uobj, RDMA_REMOVE_CLOSE, &attrs); up_read(&ufile->hw_destroy_rwsem); } - /* Matches the get in alloc_begin_fd_uobject */ + /* Matches the get in alloc_commit_fd_uobject() */ kref_put(&ufile->ref, ib_uverbs_release_file); /* Pairs with filp->private_data in alloc_begin_fd_uobject */ uverbs_uobject_put(uobj); + return 0; } +EXPORT_SYMBOL(uverbs_uobject_fd_release); /* * Drop the ucontext off the ufile and completely disconnect it from the @@ -811,7 +854,6 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, { struct ib_ucontext *ucontext = ufile->ucontext; struct ib_device *ib_dev = ucontext->device; - int ret; /* * If we are closing the FD then the user mmap VMAs must have @@ -829,12 +871,9 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, rdma_restrack_del(&ucontext->res); - /* - * FIXME: Drivers are not permitted to fail dealloc_ucontext, remove - * the error return. - */ - ret = ib_dev->ops.dealloc_ucontext(ucontext); - WARN_ON(ret); + ib_dev->ops.dealloc_ucontext(ucontext); + WARN_ON(!xa_empty(&ucontext->mmap_xa)); + kfree(ucontext); ufile->ucontext = NULL; } @@ -842,9 +881,15 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile, enum rdma_remove_reason reason) { + struct uverbs_attr_bundle attrs = { .ufile = ufile }; + struct ib_ucontext *ucontext = ufile->ucontext; + struct ib_device *ib_dev = ucontext->device; struct ib_uobject *obj, *next_obj; int ret = -EINVAL; + if (ib_dev->ops.ufile_hw_cleanup) + ib_dev->ops.ufile_hw_cleanup(ufile); + /* * This shouldn't run while executing other commands on this * context. Thus, the only thing we should take care of is @@ -855,23 +900,29 @@ static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile, * other threads (which might still use the FDs) chance to run. */ list_for_each_entry_safe(obj, next_obj, &ufile->uobjects, list) { + attrs.context = obj->context; /* * if we hit this WARN_ON, that means we are * racing with a lookup_get. */ WARN_ON(uverbs_try_lock_object(obj, UVERBS_LOOKUP_WRITE)); - if (!uverbs_destroy_uobject(obj, reason)) + if (reason == RDMA_REMOVE_DRIVER_FAILURE) + obj->object = NULL; + if (!uverbs_destroy_uobject(obj, reason, &attrs)) ret = 0; else atomic_set(&obj->usecnt, 0); } + + if (reason == RDMA_REMOVE_DRIVER_FAILURE) { + WARN_ON(!list_empty(&ufile->uobjects)); + return 0; + } return ret; } /* - * Destroy the uncontext and every uobject associated with it. If called with - * reason != RDMA_REMOVE_CLOSE this will not return until the destruction has - * been completed and ufile->ucontext is NULL. 
+ * Destroy the ucontext and every uobject associated with it. * * This is internally locked and can be called in parallel from multiple * contexts. @@ -879,22 +930,6 @@ static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile, void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile, enum rdma_remove_reason reason) { - if (reason == RDMA_REMOVE_CLOSE) { - /* - * During destruction we might trigger something that - * synchronously calls release on any file descriptor. For - * this reason all paths that come from file_operations - * release must use try_lock. They can progress knowing that - * there is an ongoing uverbs_destroy_ufile_hw that will clean - * up the driver resources. - */ - if (!mutex_trylock(&ufile->ucontext_lock)) - return; - - } else { - mutex_lock(&ufile->ucontext_lock); - } - down_write(&ufile->hw_destroy_rwsem); /* @@ -904,26 +939,16 @@ void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile, if (!ufile->ucontext) goto done; - ufile->ucontext->closing = true; - ufile->ucontext->cleanup_retryable = true; - while (!list_empty(&ufile->uobjects)) - if (__uverbs_cleanup_ufile(ufile, reason)) { - /* - * No entry was cleaned-up successfully during this - * iteration - */ - break; - } - - ufile->ucontext->cleanup_retryable = false; - if (!list_empty(&ufile->uobjects)) - __uverbs_cleanup_ufile(ufile, reason); + while (!list_empty(&ufile->uobjects) && + !__uverbs_cleanup_ufile(ufile, reason)) { + } + if (WARN_ON(!list_empty(&ufile->uobjects))) + __uverbs_cleanup_ufile(ufile, RDMA_REMOVE_DRIVER_FAILURE); ufile_destroy_ucontext(ufile, reason); done: up_write(&ufile->hw_destroy_rwsem); - mutex_unlock(&ufile->ucontext_lock); } const struct uverbs_obj_type_class uverbs_fd_class = { @@ -934,43 +959,39 @@ const struct uverbs_obj_type_class uverbs_fd_class = { .lookup_put = lookup_put_fd_uobject, .destroy_hw = destroy_hw_fd_uobject, .remove_handle = remove_handle_fd_uobject, - .needs_kfree_rcu = false, }; EXPORT_SYMBOL(uverbs_fd_class); struct ib_uobject * -uverbs_get_uobject_from_file(u16 object_id, - struct ib_uverbs_file *ufile, - enum uverbs_obj_access access, s64 id) +uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access, + s64 id, struct uverbs_attr_bundle *attrs) { const struct uverbs_api_object *obj = - uapi_get_object(ufile->device->uapi, object_id); + uapi_get_object(attrs->ufile->device->uapi, object_id); switch (access) { case UVERBS_ACCESS_READ: - return rdma_lookup_get_uobject(obj, ufile, id, - UVERBS_LOOKUP_READ); + return rdma_lookup_get_uobject(obj, attrs->ufile, id, + UVERBS_LOOKUP_READ, attrs); case UVERBS_ACCESS_DESTROY: /* Actual destruction is done inside uverbs_handle_method */ - return rdma_lookup_get_uobject(obj, ufile, id, - UVERBS_LOOKUP_DESTROY); + return rdma_lookup_get_uobject(obj, attrs->ufile, id, + UVERBS_LOOKUP_DESTROY, attrs); case UVERBS_ACCESS_WRITE: - return rdma_lookup_get_uobject(obj, ufile, id, - UVERBS_LOOKUP_WRITE); + return rdma_lookup_get_uobject(obj, attrs->ufile, id, + UVERBS_LOOKUP_WRITE, attrs); case UVERBS_ACCESS_NEW: - return rdma_alloc_begin_uobject(obj, ufile); + return rdma_alloc_begin_uobject(obj, attrs); default: WARN_ON(true); return ERR_PTR(-EOPNOTSUPP); } } -int uverbs_finalize_object(struct ib_uobject *uobj, - enum uverbs_obj_access access, - bool commit) +void uverbs_finalize_object(struct ib_uobject *uobj, + enum uverbs_obj_access access, bool hw_obj_valid, + bool commit, struct uverbs_attr_bundle *attrs) { - int ret = 0; - /* * refcounts should be handled at the object level and not at the * 
uobject level. Refcounts of the objects themselves are done in @@ -990,14 +1011,40 @@ int uverbs_finalize_object(struct ib_uobject *uobj, break; case UVERBS_ACCESS_NEW: if (commit) - ret = rdma_alloc_commit_uobject(uobj); + rdma_alloc_commit_uobject(uobj, attrs); else - rdma_alloc_abort_uobject(uobj); + rdma_alloc_abort_uobject(uobj, attrs, hw_obj_valid); break; default: WARN_ON(true); - ret = -EOPNOTSUPP; } +} - return ret; +/** + * rdma_uattrs_has_raw_cap() - Returns whether a rdma device linked to the + * uverbs attributes file has CAP_NET_RAW + * capability or not. + * + * @attrs: Pointer to uverbs attributes + * + * Returns true if a rdma device's owning user namespace has CAP_NET_RAW + * capability, otherwise false. + */ +bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_file *ufile = attrs->ufile; + struct ib_ucontext *ucontext; + bool has_cap = false; + int srcu_key; + + srcu_key = srcu_read_lock(&ufile->device->disassociate_srcu); + ucontext = ib_uverbs_get_ucontext_file(ufile); + if (IS_ERR(ucontext)) + goto out; + has_cap = rdma_dev_has_raw_cap(ucontext->device); + +out: + srcu_read_unlock(&ufile->device->disassociate_srcu, srcu_key); + return has_cap; } +EXPORT_SYMBOL(rdma_uattrs_has_raw_cap); diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 69f8db66925e..a59b087611cb 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -48,30 +48,7 @@ struct ib_uverbs_device; void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile, enum rdma_remove_reason reason); -int uobj_destroy(struct ib_uobject *uobj); - -/* - * uverbs_uobject_get is called in order to increase the reference count on - * an uobject. This is useful when a handler wants to keep the uobject's memory - * alive, regardless if this uobject is still alive in the context's objects - * repository. Objects are put via uverbs_uobject_put. - */ -void uverbs_uobject_get(struct ib_uobject *uobject); - -/* - * In order to indicate we no longer needs this uobject, uverbs_uobject_put - * is called. When the reference count is decreased, the uobject is freed. - * For example, this is used when attaching a completion channel to a CQ. - */ -void uverbs_uobject_put(struct ib_uobject *uobject); - -/* Indicate this fd is no longer used by this consumer, but its memory isn't - * necessarily released yet. When the last reference is put, we release the - * memory. After this call is executed, calling uverbs_uobject_get isn't - * allowed. - * This must be called from the release file_operations of the file! - */ -void uverbs_close_fd(struct file *f); +int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs); /* * Get an ib_uobject that corresponds to the given id from ufile, assuming @@ -83,34 +60,20 @@ void uverbs_close_fd(struct file *f); * uverbs_finalize_objects are called. */ struct ib_uobject * -uverbs_get_uobject_from_file(u16 object_id, - struct ib_uverbs_file *ufile, - enum uverbs_obj_access access, s64 id); +uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access, + s64 id, struct uverbs_attr_bundle *attrs); -/* - * Note that certain finalize stages could return a status: - * (a) alloc_commit could return a failure if the object is committed at the - * same time when the context is destroyed. - * (b) remove_commit could fail if the object wasn't destroyed successfully. 
- * Since multiple objects could be finalized in one transaction, it is very NOT - * recommended to have several finalize actions which have side effects. - * For example, it's NOT recommended to have a certain action which has both - * a commit action and a destroy action or two destroy objects in the same - * action. The rule of thumb is to have one destroy or commit action with - * multiple lookups. - * The first non zero return value of finalize_object is returned from this - * function. For example, this could happen when we couldn't destroy an - * object. - */ -int uverbs_finalize_object(struct ib_uobject *uobj, - enum uverbs_obj_access access, - bool commit); +void uverbs_finalize_object(struct ib_uobject *uobj, + enum uverbs_obj_access access, bool hw_obj_valid, + bool commit, struct uverbs_attr_bundle *attrs); int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx); void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile); void release_ufile_idr_uobject(struct ib_uverbs_file *ufile); +struct ib_udata *uverbs_get_cleared_udata(struct uverbs_attr_bundle *attrs); + /* * This is the runtime description of the uverbs API, used by the syscall * machinery to validate and dispatch calls. @@ -188,13 +151,18 @@ void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm, unsigned int num_attrs); void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile); +extern const struct uapi_definition uverbs_def_obj_async_fd[]; extern const struct uapi_definition uverbs_def_obj_counters[]; extern const struct uapi_definition uverbs_def_obj_cq[]; extern const struct uapi_definition uverbs_def_obj_device[]; extern const struct uapi_definition uverbs_def_obj_dm[]; +extern const struct uapi_definition uverbs_def_obj_dmah[]; extern const struct uapi_definition uverbs_def_obj_flow_action[]; extern const struct uapi_definition uverbs_def_obj_intf[]; extern const struct uapi_definition uverbs_def_obj_mr[]; +extern const struct uapi_definition uverbs_def_obj_qp[]; +extern const struct uapi_definition uverbs_def_obj_srq[]; +extern const struct uapi_definition uverbs_def_obj_wq[]; extern const struct uapi_definition uverbs_def_write_intf[]; static inline const struct uverbs_api_write_method * diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 46a5c553c624..b097cfcade1c 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -6,117 +6,80 @@ #include <rdma/rdma_cm.h> #include <rdma/ib_verbs.h> #include <rdma/restrack.h> +#include <rdma/rdma_counter.h> #include <linux/mutex.h> #include <linux/sched/task.h> #include <linux/pid_namespace.h> #include "cma_priv.h" +#include "restrack.h" -static int fill_res_noop(struct sk_buff *msg, - struct rdma_restrack_entry *entry) +/** + * rdma_restrack_init() - initialize and allocate resource tracking + * @dev: IB device + * + * Return: 0 on success + */ +int rdma_restrack_init(struct ib_device *dev) { - return 0; -} + struct rdma_restrack_root *rt; + int i; -void rdma_restrack_init(struct rdma_restrack_root *res) -{ - init_rwsem(&res->rwsem); - res->fill_res_entry = fill_res_noop; + dev->res = kcalloc(RDMA_RESTRACK_MAX, sizeof(*rt), GFP_KERNEL); + if (!dev->res) + return -ENOMEM; + + rt = dev->res; + + for (i = 0; i < RDMA_RESTRACK_MAX; i++) + xa_init_flags(&rt[i].xa, XA_FLAGS_ALLOC); + + return 0; } -static const char *type2str(enum rdma_restrack_type type) -{ - static const char * const names[RDMA_RESTRACK_MAX] = { - [RDMA_RESTRACK_PD] = "PD", - 
[RDMA_RESTRACK_CQ] = "CQ", - [RDMA_RESTRACK_QP] = "QP", - [RDMA_RESTRACK_CM_ID] = "CM_ID", - [RDMA_RESTRACK_MR] = "MR", - [RDMA_RESTRACK_CTX] = "CTX", - }; - - return names[type]; -}; - -void rdma_restrack_clean(struct rdma_restrack_root *res) +/** + * rdma_restrack_clean() - clean resource tracking + * @dev: IB device + */ +void rdma_restrack_clean(struct ib_device *dev) { - struct rdma_restrack_entry *e; - char buf[TASK_COMM_LEN]; - struct ib_device *dev; - const char *owner; - int bkt; - - if (hash_empty(res->hash)) - return; + struct rdma_restrack_root *rt = dev->res; + int i; - dev = container_of(res, struct ib_device, res); - pr_err("restrack: %s", CUT_HERE); - dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n"); - hash_for_each(res->hash, bkt, e, node) { - if (rdma_is_kernel_res(e)) { - owner = e->kern_name; - } else { - /* - * There is no need to call get_task_struct here, - * because we can be here only if there are more - * get_task_struct() call than put_task_struct(). - */ - get_task_comm(buf, e->task); - owner = buf; - } + for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) { + struct xarray *xa = &dev->res[i].xa; - pr_err("restrack: %s %s object allocated by %s is not freed\n", - rdma_is_kernel_res(e) ? "Kernel" : "User", - type2str(e->type), owner); + WARN_ON(!xa_empty(xa)); + xa_destroy(xa); } - pr_err("restrack: %s", CUT_HERE); + kfree(rt); } -int rdma_restrack_count(struct rdma_restrack_root *res, - enum rdma_restrack_type type, - struct pid_namespace *ns) +/** + * rdma_restrack_count() - the current usage of specific object + * @dev: IB device + * @type: actual type of object to operate + * @show_details: count driver specific objects + */ +int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type, + bool show_details) { + struct rdma_restrack_root *rt = &dev->res[type]; struct rdma_restrack_entry *e; + XA_STATE(xas, &rt->xa, 0); u32 cnt = 0; - down_read(&res->rwsem); - hash_for_each_possible(res->hash, e, node, type) { - if (ns == &init_pid_ns || - (!rdma_is_kernel_res(e) && - ns == task_active_pid_ns(e->task))) - cnt++; + xa_lock(&rt->xa); + xas_for_each(&xas, e, U32_MAX) { + if (xa_get_mark(&rt->xa, e->id, RESTRACK_DD) && !show_details) + continue; + cnt++; } - up_read(&res->rwsem); + xa_unlock(&rt->xa); return cnt; } EXPORT_SYMBOL(rdma_restrack_count); -static void set_kern_name(struct rdma_restrack_entry *res) -{ - struct ib_pd *pd; - - switch (res->type) { - case RDMA_RESTRACK_QP: - pd = container_of(res, struct ib_qp, res)->pd; - if (!pd) { - WARN_ONCE(true, "XRC QPs are not supported\n"); - /* Survive, despite the programmer's error */ - res->kern_name = " "; - } - break; - case RDMA_RESTRACK_MR: - pd = container_of(res, struct ib_mr, res)->pd; - break; - default: - /* Other types set kern_name directly */ - pd = NULL; - break; - } - - if (pd) - res->kern_name = pd->res.kern_name; -} - static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) { switch (res->type) { @@ -133,75 +96,136 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) return container_of(res, struct ib_mr, res)->device; case RDMA_RESTRACK_CTX: return container_of(res, struct ib_ucontext, res)->device; + case RDMA_RESTRACK_COUNTER: + return container_of(res, struct rdma_counter, res)->device; + case RDMA_RESTRACK_SRQ: + return container_of(res, struct ib_srq, res)->device; + case RDMA_RESTRACK_DMAH: + return container_of(res, struct ib_dmah, res)->device; default: WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type); return NULL; } } 
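/*
 * Editor's aside -- a minimal, self-contained sketch (not part of this
 * patch) of the XArray + kref lifecycle the reworked restrack code builds
 * on: cyclic ID allocation at add time, a lookup that only hands out an
 * entry while its refcount is still non-zero, and xa_erase() at delete
 * time. The "demo_*" names are invented for illustration; only the
 * xarray/kref primitives are real kernel API, and the xarray is assumed
 * to have been set up with xa_init_flags(xa, XA_FLAGS_ALLOC).
 */
#include <linux/bug.h>
#include <linux/kref.h>
#include <linux/xarray.h>

struct demo_entry {
	struct kref kref;
	u32 id;
};

static int demo_add(struct xarray *xa, u32 *next_id, struct demo_entry *e)
{
	int ret;

	kref_init(&e->kref);
	/* xa_alloc_cyclic() returns 1 when the ID space wraps; treat as ok. */
	ret = xa_alloc_cyclic(xa, &e->id, e, xa_limit_32b, next_id,
			      GFP_KERNEL);
	return ret < 0 ? ret : 0;
}

static struct demo_entry *demo_get(struct xarray *xa, u32 id)
{
	struct demo_entry *e;

	xa_lock(xa);
	e = xa_load(xa, id);
	/* The entry may be mid-teardown; only return it if a ref is taken. */
	if (e && !kref_get_unless_zero(&e->kref))
		e = NULL;
	xa_unlock(xa);
	return e;
}

static void demo_del(struct xarray *xa, struct demo_entry *e)
{
	/* Mirrors rdma_restrack_del(): the stored pointer must be ours. */
	WARN_ON(xa_erase(xa, e->id) != e);
}

/*
 * Note that in the patch itself QP and counter entries bypass the cyclic
 * allocator and use xa_insert() with a caller-chosen key (qp_num or
 * counter->id) so that the restrack ID matches the object's own number.
 */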
-void rdma_restrack_set_task(struct rdma_restrack_entry *res, - const char *caller) +/** + * rdma_restrack_attach_task() - attach the task onto this resource, + * valid for user space restrack entries. + * @res: resource entry + * @task: the task to attach + */ +static void rdma_restrack_attach_task(struct rdma_restrack_entry *res, + struct task_struct *task) { - if (caller) { - res->kern_name = caller; + if (WARN_ON_ONCE(!task)) return; - } if (res->task) put_task_struct(res->task); - get_task_struct(current); - res->task = current; + get_task_struct(task); + res->task = task; + res->user = true; } -EXPORT_SYMBOL(rdma_restrack_set_task); -static void rdma_restrack_add(struct rdma_restrack_entry *res) +/** + * rdma_restrack_set_name() - set the task for this resource + * @res: resource entry + * @caller: kernel name, the current task will be used if the caller is NULL. + */ +void rdma_restrack_set_name(struct rdma_restrack_entry *res, const char *caller) { - struct ib_device *dev = res_to_dev(res); - - if (!dev) + if (caller) { + res->kern_name = caller; return; - - if (res->type != RDMA_RESTRACK_CM_ID || rdma_is_kernel_res(res)) - res->task = NULL; - - if (!rdma_is_kernel_res(res)) { - if (!res->task) - rdma_restrack_set_task(res, NULL); - res->kern_name = NULL; - } else { - set_kern_name(res); } - kref_init(&res->kref); - init_completion(&res->comp); - res->valid = true; + rdma_restrack_attach_task(res, current); +} +EXPORT_SYMBOL(rdma_restrack_set_name); - down_write(&dev->res.rwsem); - hash_add(dev->res.hash, &res->node, res->type); - up_write(&dev->res.rwsem); +/** + * rdma_restrack_parent_name() - set the restrack name properties based + * on parent restrack + * @dst: destination resource entry + * @parent: parent resource entry + */ +void rdma_restrack_parent_name(struct rdma_restrack_entry *dst, + const struct rdma_restrack_entry *parent) +{ + if (rdma_is_kernel_res(parent)) + dst->kern_name = parent->kern_name; + else + rdma_restrack_attach_task(dst, parent->task); } +EXPORT_SYMBOL(rdma_restrack_parent_name); /** - * rdma_restrack_kadd() - add kernel object to the reource tracking database - * @res: resource entry + * rdma_restrack_new() - Initializes new restrack entry to allow _put() interface + * to release memory in fully automatic way. 
+ * @res: Entry to initialize + * @type: REstrack type */ -void rdma_restrack_kadd(struct rdma_restrack_entry *res) +void rdma_restrack_new(struct rdma_restrack_entry *res, + enum rdma_restrack_type type) { - res->user = false; - rdma_restrack_add(res); + kref_init(&res->kref); + init_completion(&res->comp); + res->type = type; } -EXPORT_SYMBOL(rdma_restrack_kadd); +EXPORT_SYMBOL(rdma_restrack_new); /** - * rdma_restrack_uadd() - add user object to the reource tracking database + * rdma_restrack_add() - add object to the resource tracking database * @res: resource entry */ -void rdma_restrack_uadd(struct rdma_restrack_entry *res) +void rdma_restrack_add(struct rdma_restrack_entry *res) { - res->user = true; - rdma_restrack_add(res); + struct ib_device *dev = res_to_dev(res); + struct rdma_restrack_root *rt; + int ret = 0; + + if (!dev) + return; + + if (res->no_track) + goto out; + + rt = &dev->res[res->type]; + + if (res->type == RDMA_RESTRACK_QP) { + /* Special case to ensure that LQPN points to right QP */ + struct ib_qp *qp = container_of(res, struct ib_qp, res); + + WARN_ONCE(qp->qp_num >> 24 || qp->port >> 8, + "QP number 0x%0X and port 0x%0X", qp->qp_num, + qp->port); + res->id = qp->qp_num; + if (qp->qp_type == IB_QPT_SMI || qp->qp_type == IB_QPT_GSI) + res->id |= qp->port << 24; + ret = xa_insert(&rt->xa, res->id, res, GFP_KERNEL); + if (ret) + res->id = 0; + + if (qp->qp_type >= IB_QPT_DRIVER) + xa_set_mark(&rt->xa, res->id, RESTRACK_DD); + } else if (res->type == RDMA_RESTRACK_COUNTER) { + /* Special case to ensure that cntn points to right counter */ + struct rdma_counter *counter; + + counter = container_of(res, struct rdma_counter, res); + ret = xa_insert(&rt->xa, counter->id, res, GFP_KERNEL); + res->id = ret ? 0 : counter->id; + } else { + ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b, + &rt->next_id, GFP_KERNEL); + ret = (ret < 0) ? ret : 0; + } + +out: + if (!ret) + res->valid = true; } -EXPORT_SYMBOL(rdma_restrack_uadd); +EXPORT_SYMBOL(rdma_restrack_add); int __must_check rdma_restrack_get(struct rdma_restrack_entry *res) { @@ -209,11 +233,40 @@ int __must_check rdma_restrack_get(struct rdma_restrack_entry *res) } EXPORT_SYMBOL(rdma_restrack_get); +/** + * rdma_restrack_get_byid() - translate from ID to restrack object + * @dev: IB device + * @type: resource track type + * @id: ID to take a look + * + * Return: Pointer to restrack entry or -ENOENT in case of error. 
+ */ +struct rdma_restrack_entry * +rdma_restrack_get_byid(struct ib_device *dev, + enum rdma_restrack_type type, u32 id) +{ + struct rdma_restrack_root *rt = &dev->res[type]; + struct rdma_restrack_entry *res; + + xa_lock(&rt->xa); + res = xa_load(&rt->xa, id); + if (!res || !rdma_restrack_get(res)) + res = ERR_PTR(-ENOENT); + xa_unlock(&rt->xa); + + return res; +} +EXPORT_SYMBOL(rdma_restrack_get_byid); + static void restrack_release(struct kref *kref) { struct rdma_restrack_entry *res; res = container_of(kref, struct rdma_restrack_entry, kref); + if (res->task) { + put_task_struct(res->task); + res->task = NULL; + } complete(&res->comp); } @@ -223,30 +276,39 @@ int rdma_restrack_put(struct rdma_restrack_entry *res) } EXPORT_SYMBOL(rdma_restrack_put); +/** + * rdma_restrack_del() - delete object from the resource tracking database + * @res: resource entry + */ void rdma_restrack_del(struct rdma_restrack_entry *res) { + struct rdma_restrack_entry *old; + struct rdma_restrack_root *rt; struct ib_device *dev; - if (!res->valid) + if (!res->valid) { + if (res->task) { + put_task_struct(res->task); + res->task = NULL; + } + return; + } + + if (res->no_track) goto out; dev = res_to_dev(res); - if (!dev) + if (WARN_ON(!dev)) return; - rdma_restrack_put(res); - - wait_for_completion(&res->comp); + rt = &dev->res[res->type]; - down_write(&dev->res.rwsem); - hash_del(&res->node); - res->valid = false; - up_write(&dev->res.rwsem); + old = xa_erase(&rt->xa, res->id); + WARN_ON(old != res); out: - if (res->task) { - put_task_struct(res->task); - res->task = NULL; - } + res->valid = false; + rdma_restrack_put(res); + wait_for_completion(&res->comp); } EXPORT_SYMBOL(rdma_restrack_del); diff --git a/drivers/infiniband/core/restrack.h b/drivers/infiniband/core/restrack.h new file mode 100644 index 000000000000..6a04fc41f738 --- /dev/null +++ b/drivers/infiniband/core/restrack.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2017-2019 Mellanox Technologies. All rights reserved. + */ + +#ifndef _RDMA_CORE_RESTRACK_H_ +#define _RDMA_CORE_RESTRACK_H_ + +#include <linux/mutex.h> + +/** + * struct rdma_restrack_root - main resource tracking management + * entity, per-device + */ +struct rdma_restrack_root { + /** + * @xa: Array of XArray structure to hold restrack entries. 
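rdma_restrack_get_byid() above returns a referenced entry or ERR_PTR(-ENOENT). A sketch of the expected caller pattern, with the reference dropped via rdma_restrack_put(), is shown below; the helper name and printout are illustrative.

#include <rdma/ib_verbs.h>
#include <rdma/restrack.h>

/* Sketch only: look up a QP by its restrack ID and use it while the
 * reference taken by the lookup is held.
 */
static int example_report_qp(struct ib_device *dev, u32 qpn)
{
	struct rdma_restrack_entry *res;
	struct ib_qp *qp;

	res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_QP, qpn);
	if (IS_ERR(res))
		return PTR_ERR(res);

	qp = container_of(res, struct ib_qp, res);
	pr_info("QP %u type %d\n", qp->qp_num, qp->qp_type);

	/* Drop the reference taken by the lookup. */
	rdma_restrack_put(res);
	return 0;
}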
+ */ + struct xarray xa; + /** + * @next_id: Next ID to support cyclic allocation + */ + u32 next_id; +}; + +int rdma_restrack_init(struct ib_device *dev); +void rdma_restrack_clean(struct ib_device *dev); +void rdma_restrack_add(struct rdma_restrack_entry *res); +void rdma_restrack_del(struct rdma_restrack_entry *res); +void rdma_restrack_new(struct rdma_restrack_entry *res, + enum rdma_restrack_type type); +void rdma_restrack_set_name(struct rdma_restrack_entry *res, + const char *caller); +void rdma_restrack_parent_name(struct rdma_restrack_entry *dst, + const struct rdma_restrack_entry *parent); +#endif /* _RDMA_CORE_RESTRACK_H_ */ diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 558de0b9895c..a9f2c6b1b29e 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -70,7 +70,7 @@ struct netdev_event_work { }; static const struct { - bool (*is_supported)(const struct ib_device *device, u8 port_num); + bool (*is_supported)(const struct ib_device *device, u32 port_num); enum ib_gid_type gid_type; } PORT_CAP_TO_GID_TYPE[] = { {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, @@ -79,7 +79,7 @@ static const struct { #define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) -unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port) +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port) { int i; unsigned int ret_flags = 0; @@ -96,7 +96,7 @@ unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port) EXPORT_SYMBOL(roce_gid_type_mask_support); static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, - u8 port, union ib_gid *gid, + u32 port, union ib_gid *gid, struct ib_gid_attr *gid_attr) { int i; @@ -144,7 +144,7 @@ static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_de #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static bool -is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u8 port, +is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *real_dev; @@ -168,7 +168,7 @@ is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u8 port, } static bool -is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u8 port, +is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_dev; @@ -186,18 +186,19 @@ is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u8 port, return res; } -/** is_ndev_for_default_gid_filter - Check if a given netdevice +/** + * is_ndev_for_default_gid_filter - Check if a given netdevice * can be considered for default GIDs or not. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: rdma netdevice pointer - * @cookie_ndev: Netdevice to consider to form a default GID + * @cookie: Netdevice to consider to form a default GID * * is_ndev_for_default_gid_filter() returns true if a given netdevice can be * considered for deriving default RoCE GID, returns false otherwise. 
*/ static bool -is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u8 port, +is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; @@ -223,13 +224,13 @@ is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u8 port, return res; } -static bool pass_all_filter(struct ib_device *ib_dev, u8 port, +static bool pass_all_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { return true; } -static bool upper_device_filter(struct ib_device *ib_dev, u8 port, +static bool upper_device_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { bool res; @@ -249,7 +250,7 @@ static bool upper_device_filter(struct ib_device *ib_dev, u8 port, /** * is_upper_ndev_bond_master_filter - Check if a given netdevice - * is bond master device of netdevice of the the RDMA device of port. + * is bond master device of netdevice of the RDMA device of port. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: Pointer to rdma netdevice @@ -260,7 +261,7 @@ static bool upper_device_filter(struct ib_device *ib_dev, u8 port, * not have been established as slave device yet. */ static bool -is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u8 port, +is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { @@ -280,7 +281,7 @@ is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u8 port, static void update_gid_ip(enum gid_op_type gid_op, struct ib_device *ib_dev, - u8 port, struct net_device *ndev, + u32 port, struct net_device *ndev, struct sockaddr *addr) { union ib_gid gid; @@ -294,7 +295,7 @@ static void update_gid_ip(enum gid_op_type gid_op, } static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, - u8 port, + u32 port, struct net_device *rdma_ndev, struct net_device *event_ndev) { @@ -328,8 +329,9 @@ static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, } static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, - u8 port, struct net_device *ndev) + u32 port, struct net_device *ndev) { + const struct in_ifaddr *ifa; struct in_device *in_dev; struct sin_list { struct list_head list; @@ -349,7 +351,7 @@ static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, return; } - for_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) @@ -359,7 +361,7 @@ static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, entry->ip.sin_addr.s_addr = ifa->ifa_address; list_add_tail(&entry->list, &sin_list); } - endfor_ifa(in_dev); + rcu_read_unlock(); list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) { @@ -371,7 +373,7 @@ static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, } static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, - u8 port, struct net_device *ndev) + u32 port, struct net_device *ndev) { struct inet6_ifaddr *ifp; struct inet6_dev *in6_dev; @@ -416,7 +418,7 @@ static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, } } -static void _add_netdev_ips(struct ib_device *ib_dev, u8 port, +static void _add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { enum_netdev_ipv4_ips(ib_dev, port, ndev); @@ -424,13 +426,13 @@ static void _add_netdev_ips(struct ib_device *ib_dev, u8 port, enum_netdev_ipv6_ips(ib_dev, port, ndev); } -static void add_netdev_ips(struct ib_device *ib_dev, u8 port, +static 
void add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { _add_netdev_ips(ib_dev, port, cookie); } -static void del_netdev_ips(struct ib_device *ib_dev, u8 port, +static void del_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); @@ -445,7 +447,7 @@ static void del_netdev_ips(struct ib_device *ib_dev, u8 port, * * del_default_gids() deletes the default GIDs of the event/cookie netdevice. */ -static void del_default_gids(struct ib_device *ib_dev, u8 port, +static void del_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; @@ -457,7 +459,7 @@ static void del_default_gids(struct ib_device *ib_dev, u8 port, IB_CACHE_GID_DEFAULT_MODE_DELETE); } -static void add_default_gids(struct ib_device *ib_dev, u8 port, +static void add_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *event_ndev = cookie; @@ -469,7 +471,7 @@ static void add_default_gids(struct ib_device *ib_dev, u8 port, } static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, - u8 port, + u32 port, struct net_device *rdma_ndev, void *cookie) { @@ -504,7 +506,7 @@ static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. * - * @device: the rdma device + * @ib_dev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ib_dev) { @@ -513,8 +515,29 @@ void rdma_roce_rescan_device(struct ib_device *ib_dev) } EXPORT_SYMBOL(rdma_roce_rescan_device); +/** + * rdma_roce_rescan_port - Rescan all of the network devices in the system + * and add their gids if relevant to the port of the RoCE device. 
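The conversion of enum_netdev_ipv4_ips() above from the removed for_ifa()/endfor_ifa() pair to in_dev_for_each_ifa_rcu() can be exercised on its own; a hedged, self-contained sketch of that RCU iteration pattern follows. The helper and callback names are illustrative, not part of this patch.

#include <linux/inetdevice.h>
#include <linux/rcupdate.h>

/* Sketch: walk the IPv4 addresses of @ndev the way the hunk above now does,
 * using in_dev_for_each_ifa_rcu() under rcu_read_lock(). The callback stands
 * in for the GID update work done in enum_netdev_ipv4_ips().
 */
static void example_walk_ipv4(struct net_device *ndev,
			      void (*cb)(__be32 addr, void *arg), void *arg)
{
	const struct in_ifaddr *ifa;
	struct in_device *in_dev;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(ndev);
	if (!in_dev) {
		rcu_read_unlock();
		return;
	}
	in_dev_for_each_ifa_rcu(ifa, in_dev)
		cb(ifa->ifa_address, arg);
	rcu_read_unlock();
}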
+ * + * @ib_dev: IB device + * @port: Port number + */ +void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port) +{ + struct net_device *ndev = NULL; + + if (rdma_protocol_roce(ib_dev, port)) { + ndev = ib_device_get_netdev(ib_dev, port); + if (!ndev) + return; + enum_all_gids_of_dev_cb(ib_dev, port, ndev, ndev); + dev_put(ndev); + } +} +EXPORT_SYMBOL(rdma_roce_rescan_port); + static void callback_for_addr_gid_device_scan(struct ib_device *device, - u8 port, + u32 port, struct net_device *rdma_ndev, void *cookie) { @@ -530,10 +553,11 @@ struct upper_list { struct net_device *upper; }; -static int netdev_upper_walk(struct net_device *upper, void *data) +static int netdev_upper_walk(struct net_device *upper, + struct netdev_nested_priv *priv) { struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); - struct list_head *upper_list = data; + struct list_head *upper_list = (struct list_head *)priv->data; if (!entry) return 0; @@ -545,19 +569,21 @@ static int netdev_upper_walk(struct net_device *upper, void *data) return 0; } -static void handle_netdev_upper(struct ib_device *ib_dev, u8 port, +static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, void *cookie, void (*handle_netdev)(struct ib_device *ib_dev, - u8 port, + u32 port, struct net_device *ndev)) { struct net_device *ndev = cookie; + struct netdev_nested_priv priv; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); + priv.data = &upper_list; rcu_read_lock(); - netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &upper_list); + netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); rcu_read_unlock(); handle_netdev(ib_dev, port, ndev); @@ -570,25 +596,26 @@ static void handle_netdev_upper(struct ib_device *ib_dev, u8 port, } } -static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, - struct net_device *event_ndev) +void roce_del_all_netdev_gids(struct ib_device *ib_dev, + u32 port, struct net_device *ndev) { - ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev); + ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev); } +EXPORT_SYMBOL(roce_del_all_netdev_gids); -static void del_netdev_upper_ips(struct ib_device *ib_dev, u8 port, +static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { - handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids); + handle_netdev_upper(ib_dev, port, cookie, roce_del_all_netdev_gids); } -static void add_netdev_upper_ips(struct ib_device *ib_dev, u8 port, +static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); } -static void del_netdev_default_ips_join(struct ib_device *ib_dev, u8 port, +static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { @@ -596,8 +623,7 @@ static void del_netdev_default_ips_join(struct ib_device *ib_dev, u8 port, rcu_read_lock(); master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); - if (master_ndev) - dev_hold(master_ndev); + dev_hold(master_ndev); rcu_read_unlock(); if (master_ndev) { diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index d22c4a2ebac6..6354ddf2a274 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -1,15 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2016 HGST, a Western Digital Company. 
- * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ +#include <linux/memremap.h> #include <linux/moduleparam.h> #include <linux/slab.h> #include <linux/pci-p2pdma.h> @@ -28,14 +21,17 @@ module_param_named(force_mr, rdma_rw_force_mr, bool, 0); MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations"); /* - * Check if the device might use memory registration. This is currently only - * true for iWarp devices. In the future we can hopefully fine tune this based - * on HCA driver input. + * Report whether memory registration should be used. Memory registration must + * be used for iWarp devices because of iWARP-specific limitations. Memory + * registration is also enabled if registering memory might yield better + * performance than using multiple SGE entries, see rdma_rw_io_needs_mr() */ -static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u8 port_num) +static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u32 port_num) { if (rdma_protocol_iwarp(dev, port_num)) return true; + if (dev->attrs.max_sgl_rd) + return true; if (unlikely(rdma_rw_force_mr)) return true; return false; @@ -43,34 +39,61 @@ static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u8 port_num) /* * Check if the device will use memory registration for this RW operation. - * We currently always use memory registrations for iWarp RDMA READs, and - * have a debug option to force usage of MRs. - * - * XXX: In the future we can hopefully fine tune this based on HCA driver - * input. + * For RDMA READs we must use MRs on iWarp and can optionally use them as an + * optimization otherwise. Additionally we have a debug option to force usage + * of MRs to help testing this code path. */ -static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num, +static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u32 port_num, enum dma_data_direction dir, int dma_nents) { - if (rdma_protocol_iwarp(dev, port_num) && dir == DMA_FROM_DEVICE) - return true; + if (dir == DMA_FROM_DEVICE) { + if (rdma_protocol_iwarp(dev, port_num)) + return true; + if (dev->attrs.max_sgl_rd && dma_nents > dev->attrs.max_sgl_rd) + return true; + } if (unlikely(rdma_rw_force_mr)) return true; return false; } -static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev) +static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev, + bool pi_support) { + u32 max_pages; + + if (pi_support) + max_pages = dev->attrs.max_pi_fast_reg_page_list_len; + else + max_pages = dev->attrs.max_fast_reg_page_list_len; + /* arbitrary limit to avoid allocating gigantic resources */ - return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256); + return min_t(u32, max_pages, 256); +} + +static inline int rdma_rw_inv_key(struct rdma_rw_reg_ctx *reg) +{ + int count = 0; + + if (reg->mr->need_inval) { + reg->inv_wr.opcode = IB_WR_LOCAL_INV; + reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey; + reg->inv_wr.next = ®->reg_wr.wr; + count++; + } else { + reg->inv_wr.next = NULL; + } + + return count; } /* Caller must have zero-initialized *reg. 
*/ -static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num, +static int rdma_rw_init_one_mr(struct ib_qp *qp, u32 port_num, struct rdma_rw_reg_ctx *reg, struct scatterlist *sg, u32 sg_cnt, u32 offset) { - u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device, + qp->integrity_en); u32 nents = min(sg_cnt, pages_per_mr); int count = 0, ret; @@ -78,14 +101,7 @@ static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num, if (!reg->mr) return -EAGAIN; - if (reg->mr->need_inval) { - reg->inv_wr.opcode = IB_WR_LOCAL_INV; - reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey; - reg->inv_wr.next = ®->reg_wr.wr; - count++; - } else { - reg->inv_wr.next = NULL; - } + count += rdma_rw_inv_key(reg); ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE); if (ret < 0 || ret < nents) { @@ -106,14 +122,15 @@ static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num, } static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, - u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset, + u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { struct rdma_rw_reg_ctx *prev = NULL; - u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device, + qp->integrity_en); int i, j, ret = 0, count = 0; - ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr; + ctx->nr_ops = DIV_ROUND_UP(sg_cnt, pages_per_mr); ctx->reg = kcalloc(ctx->nr_ops, sizeof(*ctx->reg), GFP_KERNEL); if (!ctx->reg) { ret = -ENOMEM; @@ -179,7 +196,6 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, struct scatterlist *sg, u32 sg_cnt, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { - struct ib_device *dev = qp->pd->device; u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge : qp->max_read_sge; struct ib_sge *sge; @@ -209,8 +225,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, rdma_wr->wr.sg_list = sge; for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) { - sge->addr = ib_sg_dma_address(dev, sg) + offset; - sge->length = ib_sg_dma_len(dev, sg) - offset; + sge->addr = sg_dma_address(sg) + offset; + sge->length = sg_dma_len(sg) - offset; sge->lkey = qp->pd->local_dma_lkey; total_len += sge->length; @@ -236,14 +252,13 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp, struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { - struct ib_device *dev = qp->pd->device; struct ib_rdma_wr *rdma_wr = &ctx->single.wr; ctx->nr_ops = 1; ctx->single.sge.lkey = qp->pd->local_dma_lkey; - ctx->single.sge.addr = ib_sg_dma_address(dev, sg) + offset; - ctx->single.sge.length = ib_sg_dma_len(dev, sg) - offset; + ctx->single.sge.addr = sg_dma_address(sg) + offset; + ctx->single.sge.length = sg_dma_len(sg) - offset; memset(rdma_wr, 0, sizeof(*rdma_wr)); if (dir == DMA_TO_DEVICE) @@ -274,27 +289,27 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp, * Returns the number of WQEs that will be needed on the workqueue if * successful, or a negative error code. 
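For context on the rdma_rw_ctx_init() contract documented above (now taking a u32 port_num and mapping via ib_dma_map_sgtable_attrs()), here is a hedged sketch of a ULP RDMA READ path. The queue wiring, completion cqe and error handling are assumptions for illustration only.

#include <rdma/rw.h>
#include <linux/dma-mapping.h>

/* Sketch of a consumer of the rdma_rw_ctx API touched above. */
static int example_rdma_read(struct ib_qp *qp, u32 port_num,
			     struct scatterlist *sgl, u32 sg_cnt,
			     u64 remote_addr, u32 rkey, struct ib_cqe *cqe)
{
	struct rdma_rw_ctx ctx;
	int ret;

	/* Returns the number of WQEs needed, or a negative error. */
	ret = rdma_rw_ctx_init(&ctx, qp, port_num, sgl, sg_cnt, 0,
			       remote_addr, rkey, DMA_FROM_DEVICE);
	if (ret < 0)
		return ret;

	/* Post the (possibly MR-based) WR chain; completion lands on @cqe. */
	ret = rdma_rw_ctx_post(&ctx, qp, port_num, cqe, NULL);
	if (ret) {
		rdma_rw_ctx_destroy(&ctx, qp, port_num, sgl, sg_cnt,
				    DMA_FROM_DEVICE);
		return ret;
	}

	/* On completion the ULP calls rdma_rw_ctx_destroy() to unmap. */
	return 0;
}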
*/ -int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, +int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 sg_offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { struct ib_device *dev = qp->pd->device; + struct sg_table sgt = { + .sgl = sg, + .orig_nents = sg_cnt, + }; int ret; - if (is_pci_p2pdma_page(sg_page(sg))) - ret = pci_p2pdma_map_sg(dev->dma_device, sg, sg_cnt, dir); - else - ret = ib_dma_map_sg(dev, sg, sg_cnt, dir); - - if (!ret) - return -ENOMEM; - sg_cnt = ret; + ret = ib_dma_map_sgtable_attrs(dev, &sgt, dir, 0); + if (ret) + return ret; + sg_cnt = sgt.nents; /* * Skip to the S/G entry that sg_offset falls into: */ for (;;) { - u32 len = ib_sg_dma_len(dev, sg); + u32 len = sg_dma_len(sg); if (sg_offset < len) break; @@ -324,7 +339,7 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, return ret; out_unmap_sg: - ib_dma_unmap_sg(dev, sg, sg_cnt, dir); + ib_dma_unmap_sgtable_attrs(dev, &sgt, dir, 0); return ret; } EXPORT_SYMBOL(rdma_rw_ctx_init); @@ -347,96 +362,85 @@ EXPORT_SYMBOL(rdma_rw_ctx_init); * successful, or a negative error code. */ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, - u8 port_num, struct scatterlist *sg, u32 sg_cnt, + u32 port_num, struct scatterlist *sg, u32 sg_cnt, struct scatterlist *prot_sg, u32 prot_sg_cnt, struct ib_sig_attrs *sig_attrs, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { struct ib_device *dev = qp->pd->device; - u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device, + qp->integrity_en); + struct sg_table sgt = { + .sgl = sg, + .orig_nents = sg_cnt, + }; + struct sg_table prot_sgt = { + .sgl = prot_sg, + .orig_nents = prot_sg_cnt, + }; struct ib_rdma_wr *rdma_wr; - struct ib_send_wr *prev_wr = NULL; int count = 0, ret; if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) { - pr_err("SG count too large\n"); + pr_err("SG count too large: sg_cnt=%u, prot_sg_cnt=%u, pages_per_mr=%u\n", + sg_cnt, prot_sg_cnt, pages_per_mr); return -EINVAL; } - ret = ib_dma_map_sg(dev, sg, sg_cnt, dir); - if (!ret) - return -ENOMEM; - sg_cnt = ret; + ret = ib_dma_map_sgtable_attrs(dev, &sgt, dir, 0); + if (ret) + return ret; - ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir); - if (!ret) { - ret = -ENOMEM; - goto out_unmap_sg; + if (prot_sg_cnt) { + ret = ib_dma_map_sgtable_attrs(dev, &prot_sgt, dir, 0); + if (ret) + goto out_unmap_sg; } - prot_sg_cnt = ret; ctx->type = RDMA_RW_SIG_MR; ctx->nr_ops = 1; - ctx->sig = kcalloc(1, sizeof(*ctx->sig), GFP_KERNEL); - if (!ctx->sig) { + ctx->reg = kzalloc(sizeof(*ctx->reg), GFP_KERNEL); + if (!ctx->reg) { ret = -ENOMEM; goto out_unmap_prot_sg; } - ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->data, sg, sg_cnt, 0); - if (ret < 0) - goto out_free_ctx; - count += ret; - prev_wr = &ctx->sig->data.reg_wr.wr; - - ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->prot, - prot_sg, prot_sg_cnt, 0); - if (ret < 0) - goto out_destroy_data_mr; - count += ret; - - if (ctx->sig->prot.inv_wr.next) - prev_wr->next = &ctx->sig->prot.inv_wr; - else - prev_wr->next = &ctx->sig->prot.reg_wr.wr; - prev_wr = &ctx->sig->prot.reg_wr.wr; - - ctx->sig->sig_mr = ib_mr_pool_get(qp, &qp->sig_mrs); - if (!ctx->sig->sig_mr) { + ctx->reg->mr = ib_mr_pool_get(qp, &qp->sig_mrs); + if (!ctx->reg->mr) { ret = -EAGAIN; - goto out_destroy_prot_mr; + goto out_free_ctx; } - if (ctx->sig->sig_mr->need_inval) { - 
memset(&ctx->sig->sig_inv_wr, 0, sizeof(ctx->sig->sig_inv_wr)); + count += rdma_rw_inv_key(ctx->reg); - ctx->sig->sig_inv_wr.opcode = IB_WR_LOCAL_INV; - ctx->sig->sig_inv_wr.ex.invalidate_rkey = ctx->sig->sig_mr->rkey; + memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs)); - prev_wr->next = &ctx->sig->sig_inv_wr; - prev_wr = &ctx->sig->sig_inv_wr; + ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sgt.nents, NULL, prot_sg, + prot_sgt.nents, NULL, SZ_4K); + if (unlikely(ret)) { + pr_err("failed to map PI sg (%u)\n", + sgt.nents + prot_sgt.nents); + goto out_destroy_sig_mr; } - ctx->sig->sig_wr.wr.opcode = IB_WR_REG_SIG_MR; - ctx->sig->sig_wr.wr.wr_cqe = NULL; - ctx->sig->sig_wr.wr.sg_list = &ctx->sig->data.sge; - ctx->sig->sig_wr.wr.num_sge = 1; - ctx->sig->sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE; - ctx->sig->sig_wr.sig_attrs = sig_attrs; - ctx->sig->sig_wr.sig_mr = ctx->sig->sig_mr; - if (prot_sg_cnt) - ctx->sig->sig_wr.prot = &ctx->sig->prot.sge; - prev_wr->next = &ctx->sig->sig_wr.wr; - prev_wr = &ctx->sig->sig_wr.wr; + ctx->reg->reg_wr.wr.opcode = IB_WR_REG_MR_INTEGRITY; + ctx->reg->reg_wr.wr.wr_cqe = NULL; + ctx->reg->reg_wr.wr.num_sge = 0; + ctx->reg->reg_wr.wr.send_flags = 0; + ctx->reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE; + if (rdma_protocol_iwarp(qp->device, port_num)) + ctx->reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE; + ctx->reg->reg_wr.mr = ctx->reg->mr; + ctx->reg->reg_wr.key = ctx->reg->mr->lkey; count++; - ctx->sig->sig_sge.addr = 0; - ctx->sig->sig_sge.length = ctx->sig->data.sge.length; - if (sig_attrs->wire.sig_type != IB_SIG_TYPE_NONE) - ctx->sig->sig_sge.length += ctx->sig->prot.sge.length; + ctx->reg->sge.addr = ctx->reg->mr->iova; + ctx->reg->sge.length = ctx->reg->mr->length; + if (sig_attrs->wire.sig_type == IB_SIG_TYPE_NONE) + ctx->reg->sge.length -= ctx->reg->mr->sig_attrs->meta_length; - rdma_wr = &ctx->sig->data.wr; - rdma_wr->wr.sg_list = &ctx->sig->sig_sge; + rdma_wr = &ctx->reg->wr; + rdma_wr->wr.sg_list = &ctx->reg->sge; rdma_wr->wr.num_sge = 1; rdma_wr->remote_addr = remote_addr; rdma_wr->rkey = rkey; @@ -444,23 +448,20 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; else rdma_wr->wr.opcode = IB_WR_RDMA_READ; - prev_wr->next = &rdma_wr->wr; - prev_wr = &rdma_wr->wr; + ctx->reg->reg_wr.wr.next = &rdma_wr->wr; count++; return count; -out_destroy_prot_mr: - if (prot_sg_cnt) - ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr); -out_destroy_data_mr: - ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr); +out_destroy_sig_mr: + ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr); out_free_ctx: - kfree(ctx->sig); + kfree(ctx->reg); out_unmap_prot_sg: - ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir); + if (prot_sgt.nents) + ib_dma_unmap_sgtable_attrs(dev, &prot_sgt, dir, 0); out_unmap_sg: - ib_dma_unmap_sg(dev, sg, sg_cnt, dir); + ib_dma_unmap_sgtable_attrs(dev, &sgt, dir, 0); return ret; } EXPORT_SYMBOL(rdma_rw_ctx_signature_init); @@ -494,28 +495,13 @@ static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval) * completion notification. 
*/ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, - u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr) + u32 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr) { struct ib_send_wr *first_wr, *last_wr; int i; switch (ctx->type) { case RDMA_RW_SIG_MR: - rdma_rw_update_lkey(&ctx->sig->data, true); - if (ctx->sig->prot.mr) - rdma_rw_update_lkey(&ctx->sig->prot, true); - - ctx->sig->sig_mr->need_inval = true; - ib_update_fast_reg_key(ctx->sig->sig_mr, - ib_inc_rkey(ctx->sig->sig_mr->lkey)); - ctx->sig->sig_sge.lkey = ctx->sig->sig_mr->lkey; - - if (ctx->sig->data.inv_wr.next) - first_wr = &ctx->sig->data.inv_wr; - else - first_wr = &ctx->sig->data.reg_wr.wr; - last_wr = &ctx->sig->data.wr.wr; - break; case RDMA_RW_MR: for (i = 0; i < ctx->nr_ops; i++) { rdma_rw_update_lkey(&ctx->reg[i], @@ -566,7 +552,7 @@ EXPORT_SYMBOL(rdma_rw_ctx_wrs); * is not set @cqe must be set so that the caller gets a completion * notification. */ -int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, +int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr) { struct ib_send_wr *first_wr; @@ -585,8 +571,9 @@ EXPORT_SYMBOL(rdma_rw_ctx_post); * @sg_cnt: number of entries in @sg * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ */ -void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, - struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir) +void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 port_num, struct scatterlist *sg, u32 sg_cnt, + enum dma_data_direction dir) { int i; @@ -607,15 +594,13 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, break; } - /* P2PDMA contexts do not need to be unmapped */ - if (!is_pci_p2pdma_page(sg_page(sg))) - ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); + ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); } EXPORT_SYMBOL(rdma_rw_ctx_destroy); /** * rdma_rw_ctx_destroy_signature - release all resources allocated by - * rdma_rw_ctx_init_signature + * rdma_rw_ctx_signature_init * @ctx: context to release * @qp: queue pair to operate on * @port_num: port num to which the connection is bound @@ -626,23 +611,19 @@ EXPORT_SYMBOL(rdma_rw_ctx_destroy); * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ */ void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp, - u8 port_num, struct scatterlist *sg, u32 sg_cnt, + u32 port_num, struct scatterlist *sg, u32 sg_cnt, struct scatterlist *prot_sg, u32 prot_sg_cnt, enum dma_data_direction dir) { if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR)) return; - ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr); - ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); + ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr); + kfree(ctx->reg); - if (ctx->sig->prot.mr) { - ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr); + if (prot_sg_cnt) ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir); - } - - ib_mr_pool_put(qp, &qp->sig_mrs, ctx->sig->sig_mr); - kfree(ctx->sig); + ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); } EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature); @@ -657,13 +638,13 @@ EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature); * compute max_rdma_ctxts and the size of the transport's Send and * Send Completion Queues. 
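rdma_rw_mr_factor(), whose kernel-doc ends above, is typically used by ULPs to size max_rdma_ctxs before QP creation. The following sketch shows that sizing step; the parameter names are illustrative.

#include <rdma/rw.h>

/* Sketch: derive max_rdma_ctxs from the per-I/O page count and queue depth,
 * then let rdma_rw_init_qp() grow max_send_wr accordingly.
 */
static void example_size_qp(struct ib_device *dev, u32 port_num,
			    u32 queue_size, u32 max_data_pages,
			    struct ib_qp_init_attr *attr)
{
	attr->cap.max_rdma_ctxs =
		queue_size * rdma_rw_mr_factor(dev, port_num, max_data_pages);
}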
*/ -unsigned int rdma_rw_mr_factor(struct ib_device *device, u8 port_num, +unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num, unsigned int maxpages) { unsigned int mr_pages; if (rdma_rw_can_use_mr(device, port_num)) - mr_pages = rdma_rw_fr_page_list_len(device); + mr_pages = rdma_rw_fr_page_list_len(device, false); else mr_pages = device->attrs.max_sge_rd; return DIV_ROUND_UP(maxpages, mr_pages); @@ -685,13 +666,12 @@ void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr) factor = 1; /* - * If the devices needs MRs to perform RDMA READ or WRITE operations, + * If the device needs MRs to perform RDMA READ or WRITE operations, * we'll need two additional MRs for the registrations and the * invalidation. */ - if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) - factor += 6; /* (inv + reg) * (data + prot + sig) */ - else if (rdma_rw_can_use_mr(dev, attr->port_num)) + if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN || + rdma_rw_can_use_mr(dev, attr->port_num)) factor += 2; /* inv + reg */ attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs; @@ -707,22 +687,24 @@ void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr) int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr) { struct ib_device *dev = qp->pd->device; - u32 nr_mrs = 0, nr_sig_mrs = 0; + u32 nr_mrs = 0, nr_sig_mrs = 0, max_num_sg = 0; int ret = 0; - if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) { + if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) { nr_sig_mrs = attr->cap.max_rdma_ctxs; - nr_mrs = attr->cap.max_rdma_ctxs * 2; + nr_mrs = attr->cap.max_rdma_ctxs; + max_num_sg = rdma_rw_fr_page_list_len(dev, true); } else if (rdma_rw_can_use_mr(dev, attr->port_num)) { nr_mrs = attr->cap.max_rdma_ctxs; + max_num_sg = rdma_rw_fr_page_list_len(dev, false); } if (nr_mrs) { ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs, IB_MR_TYPE_MEM_REG, - rdma_rw_fr_page_list_len(dev)); + max_num_sg, 0); if (ret) { - pr_err("%s: failed to allocated %d MRs\n", + pr_err("%s: failed to allocated %u MRs\n", __func__, nr_mrs); return ret; } @@ -730,10 +712,10 @@ int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr) if (nr_sig_mrs) { ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs, - IB_MR_TYPE_SIGNATURE, 2); + IB_MR_TYPE_INTEGRITY, max_num_sg, max_num_sg); if (ret) { - pr_err("%s: failed to allocated %d SIG MRs\n", - __func__, nr_mrs); + pr_err("%s: failed to allocated %u SIG MRs\n", + __func__, nr_sig_mrs); goto out_free_rdma_mrs; } } diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h index cbaaaa92fff3..143de37ae598 100644 --- a/drivers/infiniband/core/sa.h +++ b/drivers/infiniband/core/sa.h @@ -49,7 +49,7 @@ static inline void ib_sa_client_put(struct ib_sa_client *client) } int ib_sa_mcmember_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, u8 method, + struct ib_device *device, u32 port_num, u8 method, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, unsigned long timeout_ms, gfp_t gfp_mask, diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 97e6d7b69abf..c23e9c847314 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -32,7 +32,6 @@ * SOFTWARE. 
*/ -#include <linux/module.h> #include <linux/init.h> #include <linux/err.h> #include <linux/random.h> @@ -40,7 +39,7 @@ #include <linux/slab.h> #include <linux/dma-mapping.h> #include <linux/kref.h> -#include <linux/idr.h> +#include <linux/xarray.h> #include <linux/workqueue.h> #include <uapi/linux/if_ether.h> #include <rdma/ib_pack.h> @@ -51,6 +50,7 @@ #include <rdma/ib_marshall.h> #include <rdma/ib_addr.h> #include <rdma/opa_addr.h> +#include <rdma/rdma_cm.h> #include "sa.h" #include "core_priv.h" @@ -95,17 +95,20 @@ struct ib_sa_port { struct delayed_work ib_cpi_work; spinlock_t classport_lock; /* protects class port info set */ spinlock_t ah_lock; - u8 port_num; + u32 port_num; }; struct ib_sa_device { int start_port, end_port; struct ib_event_handler event_handler; - struct ib_sa_port port[0]; + struct ib_sa_port port[]; }; struct ib_sa_query { - void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *); + void (*callback)(struct ib_sa_query *sa_query, int status, + struct ib_sa_mad *mad); + void (*rmpp_callback)(struct ib_sa_query *sa_query, int status, + struct ib_mad_recv_wc *mad); void (*release)(struct ib_sa_query *); struct ib_sa_client *client; struct ib_sa_port *port; @@ -123,14 +126,9 @@ struct ib_sa_query { #define IB_SA_CANCEL 0x00000002 #define IB_SA_QUERY_OPA 0x00000004 -struct ib_sa_service_query { - void (*callback)(int, struct ib_sa_service_rec *, void *); - void *context; - struct ib_sa_query sa_query; -}; - struct ib_sa_path_query { - void (*callback)(int, struct sa_path_rec *, void *); + void (*callback)(int status, struct sa_path_rec *rec, + unsigned int num_paths, void *context); void *context; struct ib_sa_query sa_query; struct sa_path_rec *conv_pr; @@ -154,6 +152,13 @@ struct ib_sa_mcmember_query { struct ib_sa_query sa_query; }; +struct ib_sa_service_query { + void (*callback)(int status, struct sa_service_rec *rec, + unsigned int num_services, void *context); + void *context; + struct ib_sa_query sa_query; +}; + static LIST_HEAD(ib_nl_request_list); static DEFINE_SPINLOCK(ib_nl_request_lock); static atomic_t ib_nl_sa_request_seq; @@ -174,7 +179,7 @@ static const struct nla_policy ib_nl_policy[LS_NLA_TYPE_MAX] = { }; -static void ib_sa_add_one(struct ib_device *device); +static int ib_sa_add_one(struct ib_device *device); static void ib_sa_remove_one(struct ib_device *device, void *client_data); static struct ib_client sa_client = { @@ -183,15 +188,14 @@ static struct ib_client sa_client = { .remove = ib_sa_remove_one }; -static DEFINE_SPINLOCK(idr_lock); -static DEFINE_IDR(query_idr); +static DEFINE_XARRAY_FLAGS(queries, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ); static DEFINE_SPINLOCK(tid_lock); static u32 tid; #define PATH_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct sa_path_rec, field), \ - .struct_size_bytes = sizeof((struct sa_path_rec *)0)->field, \ + .struct_size_bytes = sizeof_field(struct sa_path_rec, field), \ .field_name = "sa_path_rec:" #field static const struct ib_field path_rec_table[] = { @@ -293,7 +297,7 @@ static const struct ib_field path_rec_table[] = { .struct_offset_bytes = \ offsetof(struct sa_path_rec, field), \ .struct_size_bytes = \ - sizeof((struct sa_path_rec *)0)->field, \ + sizeof_field(struct sa_path_rec, field), \ .field_name = "sa_path_rec:" #field static const struct ib_field opa_path_rec_table[] = { @@ -421,7 +425,7 @@ static const struct ib_field opa_path_rec_table[] = { #define MCMEMBER_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field), \ - .struct_size_bytes = sizeof 
((struct ib_sa_mcmember_rec *) 0)->field, \ + .struct_size_bytes = sizeof_field(struct ib_sa_mcmember_rec, field), \ .field_name = "sa_mcmember_rec:" #field static const struct ib_field mcmember_rec_table[] = { @@ -503,57 +507,9 @@ static const struct ib_field mcmember_rec_table[] = { .size_bits = 23 }, }; -#define SERVICE_REC_FIELD(field) \ - .struct_offset_bytes = offsetof(struct ib_sa_service_rec, field), \ - .struct_size_bytes = sizeof ((struct ib_sa_service_rec *) 0)->field, \ - .field_name = "sa_service_rec:" #field - -static const struct ib_field service_rec_table[] = { - { SERVICE_REC_FIELD(id), - .offset_words = 0, - .offset_bits = 0, - .size_bits = 64 }, - { SERVICE_REC_FIELD(gid), - .offset_words = 2, - .offset_bits = 0, - .size_bits = 128 }, - { SERVICE_REC_FIELD(pkey), - .offset_words = 6, - .offset_bits = 0, - .size_bits = 16 }, - { SERVICE_REC_FIELD(lease), - .offset_words = 7, - .offset_bits = 0, - .size_bits = 32 }, - { SERVICE_REC_FIELD(key), - .offset_words = 8, - .offset_bits = 0, - .size_bits = 128 }, - { SERVICE_REC_FIELD(name), - .offset_words = 12, - .offset_bits = 0, - .size_bits = 64*8 }, - { SERVICE_REC_FIELD(data8), - .offset_words = 28, - .offset_bits = 0, - .size_bits = 16*8 }, - { SERVICE_REC_FIELD(data16), - .offset_words = 32, - .offset_bits = 0, - .size_bits = 8*16 }, - { SERVICE_REC_FIELD(data32), - .offset_words = 36, - .offset_bits = 0, - .size_bits = 4*32 }, - { SERVICE_REC_FIELD(data64), - .offset_words = 40, - .offset_bits = 0, - .size_bits = 2*64 }, -}; - #define CLASSPORTINFO_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_class_port_info, field), \ - .struct_size_bytes = sizeof((struct ib_class_port_info *)0)->field, \ + .struct_size_bytes = sizeof_field(struct ib_class_port_info, field), \ .field_name = "ib_class_port_info:" #field static const struct ib_field ib_classport_info_rec_table[] = { @@ -631,7 +587,7 @@ static const struct ib_field ib_classport_info_rec_table[] = { .struct_offset_bytes =\ offsetof(struct opa_class_port_info, field), \ .struct_size_bytes = \ - sizeof((struct opa_class_port_info *)0)->field, \ + sizeof_field(struct opa_class_port_info, field), \ .field_name = "opa_class_port_info:" #field static const struct ib_field opa_classport_info_rec_table[] = { @@ -711,7 +667,7 @@ static const struct ib_field opa_classport_info_rec_table[] = { #define GUIDINFO_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \ - .struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \ + .struct_size_bytes = sizeof_field(struct ib_sa_guidinfo_rec, field), \ .field_name = "sa_guidinfo_rec:" #field static const struct ib_field guidinfo_rec_table[] = { @@ -737,6 +693,60 @@ static const struct ib_field guidinfo_rec_table[] = { .size_bits = 512 }, }; +#define SERVICE_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct sa_service_rec, field), \ + .struct_size_bytes = sizeof_field(struct sa_service_rec, field), \ + .field_name = "sa_service_rec:" #field + +static const struct ib_field service_rec_table[] = { + { SERVICE_REC_FIELD(id), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 64 }, + { SERVICE_REC_FIELD(gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(pkey), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 16 }, + { RESERVED, + .offset_words = 6, + .offset_bits = 16, + .size_bits = 16 }, + { SERVICE_REC_FIELD(lease), + .offset_words = 7, + .offset_bits = 0, + .size_bits = 32 }, + { SERVICE_REC_FIELD(key), + .offset_words = 
8, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(name), + .offset_words = 12, + .offset_bits = 0, + .size_bits = 512 }, + { SERVICE_REC_FIELD(data_8), + .offset_words = 28, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(data_16), + .offset_words = 32, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(data_32), + .offset_words = 36, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(data_64), + .offset_words = 40, + .offset_bits = 0, + .size_bits = 128 }, +}; + +#define RDMA_PRIMARY_PATH_MAX_REC_NUM 3 + static inline void ib_sa_disable_local_svc(struct ib_sa_query *query) { query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE; @@ -761,13 +771,14 @@ static void ib_nl_set_path_rec_attrs(struct sk_buff *skb, /* Construct the family header first */ header = skb_put(skb, NLMSG_ALIGN(sizeof(*header))); - memcpy(header->device_name, dev_name(&query->port->agent->device->dev), - LS_DEVICE_NAME_MAX); + strscpy_pad(header->device_name, + dev_name(&query->port->agent->device->dev), + LS_DEVICE_NAME_MAX); header->port_num = query->port->port_num; if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) && sa_rec->reversible != 0) - query->path_use = LS_RESOLVE_PATH_USE_GMP; + query->path_use = LS_RESOLVE_PATH_USE_ALL; else query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL; header->path_use = query->path_use; @@ -830,13 +841,20 @@ static int ib_nl_get_path_rec_attrs_len(ib_sa_comp_mask comp_mask) return len; } -static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) +static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask) { struct sk_buff *skb = NULL; struct nlmsghdr *nlh; void *data; struct ib_sa_mad *mad; int len; + unsigned long flags; + unsigned long delay; + gfp_t gfp_flag; + int ret; + + INIT_LIST_HEAD(&query->list); + query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq); mad = query->mad_buf->mad; len = ib_nl_get_path_rec_attrs_len(mad->sa_hdr.comp_mask); @@ -861,36 +879,25 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - return rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask); -} + gfp_flag = ((gfp_mask & GFP_ATOMIC) == GFP_ATOMIC) ? 
GFP_ATOMIC : + GFP_NOWAIT; -static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask) -{ - unsigned long flags; - unsigned long delay; - int ret; + spin_lock_irqsave(&ib_nl_request_lock, flags); + ret = rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_LS, gfp_flag); - INIT_LIST_HEAD(&query->list); - query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq); + if (ret) + goto out; - /* Put the request on the list first.*/ - spin_lock_irqsave(&ib_nl_request_lock, flags); + /* Put the request on the list.*/ delay = msecs_to_jiffies(sa_local_svc_timeout_ms); query->timeout = delay + jiffies; list_add_tail(&query->list, &ib_nl_request_list); /* Start the timeout if this is the only request */ if (ib_nl_request_list.next == &query->list) queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay); - spin_unlock_irqrestore(&ib_nl_request_lock, flags); - ret = ib_nl_send_msg(query, gfp_mask); - if (ret) { - ret = -EIO; - /* Remove the request */ - spin_lock_irqsave(&ib_nl_request_lock, flags); - list_del(&query->list); - spin_unlock_irqrestore(&ib_nl_request_lock, flags); - } +out: + spin_unlock_irqrestore(&ib_nl_request_lock, flags); return ret; } @@ -924,50 +931,77 @@ static void send_handler(struct ib_mad_agent *agent, static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query, const struct nlmsghdr *nlh) { + struct sa_path_rec recs[RDMA_PRIMARY_PATH_MAX_REC_NUM]; + struct ib_sa_path_query *path_query; + struct ib_path_rec_data *rec_data; struct ib_mad_send_wc mad_send_wc; - struct ib_sa_mad *mad = NULL; const struct nlattr *head, *curr; - struct ib_path_rec_data *rec; - int len, rem; + struct ib_sa_mad *mad = NULL; + int len, rem, status = -EIO; + unsigned int num_prs = 0; u32 mask = 0; - int status = -EIO; - - if (query->callback) { - head = (const struct nlattr *) nlmsg_data(nlh); - len = nlmsg_len(nlh); - switch (query->path_use) { - case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL: - mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND; - break; - case LS_RESOLVE_PATH_USE_ALL: - case LS_RESOLVE_PATH_USE_GMP: - default: - mask = IB_PATH_PRIMARY | IB_PATH_GMP | - IB_PATH_BIDIRECTIONAL; - break; - } - nla_for_each_attr(curr, head, len, rem) { - if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) { - rec = nla_data(curr); - /* - * Get the first one. In the future, we may - * need to get up to 6 pathrecords. 
- */ - if ((rec->flags & mask) == mask) { - mad = query->mad_buf->mad; - mad->mad_hdr.method |= - IB_MGMT_METHOD_RESP; - memcpy(mad->data, rec->path_rec, - sizeof(rec->path_rec)); - status = 0; - break; - } - } + if (!query->callback) + goto out; + + path_query = container_of(query, struct ib_sa_path_query, sa_query); + mad = query->mad_buf->mad; + + head = (const struct nlattr *) nlmsg_data(nlh); + len = nlmsg_len(nlh); + switch (query->path_use) { + case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL: + mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND; + break; + + case LS_RESOLVE_PATH_USE_ALL: + mask = IB_PATH_PRIMARY; + break; + + case LS_RESOLVE_PATH_USE_GMP: + default: + mask = IB_PATH_PRIMARY | IB_PATH_GMP | + IB_PATH_BIDIRECTIONAL; + break; + } + + nla_for_each_attr(curr, head, len, rem) { + if (curr->nla_type != LS_NLA_TYPE_PATH_RECORD) + continue; + + rec_data = nla_data(curr); + if ((rec_data->flags & mask) != mask) + continue; + + if ((query->flags & IB_SA_QUERY_OPA) || + path_query->conv_pr) { + mad->mad_hdr.method |= IB_MGMT_METHOD_RESP; + memcpy(mad->data, rec_data->path_rec, + sizeof(rec_data->path_rec)); + query->callback(query, 0, mad); + goto out; } - query->callback(query, status, mad); + + status = 0; + ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), + rec_data->path_rec, &recs[num_prs]); + recs[num_prs].flags = rec_data->flags; + recs[num_prs].rec_type = SA_PATH_REC_TYPE_IB; + sa_path_set_dmac_zero(&recs[num_prs]); + + num_prs++; + if (num_prs >= RDMA_PRIMARY_PATH_MAX_REC_NUM) + break; } + if (!status) { + mad->mad_hdr.method |= IB_MGMT_METHOD_RESP; + path_query->callback(status, recs, num_prs, + path_query->context); + } else + query->callback(query, status, mad); + +out: mad_send_wc.send_buf = query->mad_buf; mad_send_wc.status = IB_WC_SUCCESS; send_handler(query->mad_buf->mad_agent, &mad_send_wc); @@ -1028,8 +1062,8 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb, !(NETLINK_CB(skb).sk)) return -EPERM; - ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), - nlmsg_len(nlh), ib_nl_policy, NULL); + ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), + nlmsg_len(nlh), ib_nl_policy, NULL); attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT]; if (ret || !attr) goto settimeout_out; @@ -1040,6 +1074,8 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb, if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX) timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX; + spin_lock_irqsave(&ib_nl_request_lock, flags); + delta = timeout - sa_local_svc_timeout_ms; if (delta < 0) abs_delta = -delta; @@ -1047,7 +1083,6 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb, abs_delta = delta; if (delta != 0) { - spin_lock_irqsave(&ib_nl_request_lock, flags); sa_local_svc_timeout_ms = timeout; list_for_each_entry(query, &ib_nl_request_list, list) { if (delta < 0 && abs_delta > query->timeout) @@ -1065,11 +1100,12 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb, if (delay) mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, (unsigned long)delay); - spin_unlock_irqrestore(&ib_nl_request_lock, flags); } + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + settimeout_out: - return skb->len; + return 0; } static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh) @@ -1080,8 +1116,8 @@ static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh) if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR) return 0; - ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), - nlmsg_len(nlh), ib_nl_policy, NULL); + ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), + nlmsg_len(nlh), 
ib_nl_policy, NULL); if (ret) return 0; @@ -1093,10 +1129,9 @@ int ib_nl_handle_resolve_resp(struct sk_buff *skb, struct netlink_ext_ack *extack) { unsigned long flags; - struct ib_sa_query *query; + struct ib_sa_query *query = NULL, *iter; struct ib_mad_send_buf *send_buf; struct ib_mad_send_wc mad_send_wc; - int found = 0; int ret; if ((nlh->nlmsg_flags & NLM_F_REQUEST) || @@ -1104,20 +1139,21 @@ int ib_nl_handle_resolve_resp(struct sk_buff *skb, return -EPERM; spin_lock_irqsave(&ib_nl_request_lock, flags); - list_for_each_entry(query, &ib_nl_request_list, list) { + list_for_each_entry(iter, &ib_nl_request_list, list) { /* * If the query is cancelled, let the timeout routine * take care of it. */ - if (nlh->nlmsg_seq == query->seq) { - found = !ib_sa_query_cancelled(query); - if (found) - list_del(&query->list); + if (nlh->nlmsg_seq == iter->seq) { + if (!ib_sa_query_cancelled(iter)) { + list_del(&iter->list); + query = iter; + } break; } } - if (!found) { + if (!query) { spin_unlock_irqrestore(&ib_nl_request_lock, flags); goto resp_out; } @@ -1140,7 +1176,7 @@ int ib_nl_handle_resolve_resp(struct sk_buff *skb, } resp_out: - return skb->len; + return 0; } static void free_sm_ah(struct kref *kref) @@ -1177,17 +1213,15 @@ EXPORT_SYMBOL(ib_sa_unregister_client); void ib_sa_cancel_query(int id, struct ib_sa_query *query) { unsigned long flags; - struct ib_mad_agent *agent; struct ib_mad_send_buf *mad_buf; - spin_lock_irqsave(&idr_lock, flags); - if (idr_find(&query_idr, id) != query) { - spin_unlock_irqrestore(&idr_lock, flags); + xa_lock_irqsave(&queries, flags); + if (xa_load(&queries, id) != query) { + xa_unlock_irqrestore(&queries, flags); return; } - agent = query->port->agent; mad_buf = query->mad_buf; - spin_unlock_irqrestore(&idr_lock, flags); + xa_unlock_irqrestore(&queries, flags); /* * If the query is still on the netlink request list, schedule @@ -1195,11 +1229,11 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query) * sent to the MAD layer and has to be cancelled from there. */ if (!ib_nl_cancel_request(query)) - ib_cancel_mad(agent, mad_buf); + ib_cancel_mad(mad_buf); } EXPORT_SYMBOL(ib_sa_cancel_query); -static u8 get_src_path_mask(struct ib_device *device, u8 port_num) +static u8 get_src_path_mask(struct ib_device *device, u32 port_num) { struct ib_sa_device *sa_dev; struct ib_sa_port *port; @@ -1218,7 +1252,7 @@ static u8 get_src_path_mask(struct ib_device *device, u8 port_num) return src_path_mask; } -static int init_ah_attr_grh_fields(struct ib_device *device, u8 port_num, +static int init_ah_attr_grh_fields(struct ib_device *device, u32 port_num, struct sa_path_rec *rec, struct rdma_ah_attr *ah_attr, const struct ib_gid_attr *gid_attr) @@ -1247,7 +1281,7 @@ static int init_ah_attr_grh_fields(struct ib_device *device, u8 port_num, * @port_num: Port on the specified device. * @rec: path record entry to use for ah attributes initialization. * @ah_attr: address handle attributes to initialization from path record. - * @sgid_attr: SGID attribute to consider during initialization. + * @gid_attr: SGID attribute to consider during initialization. * * When ib_init_ah_attr_from_path() returns success, * (a) for IB link layer it optionally contains a reference to SGID attribute @@ -1256,7 +1290,7 @@ static int init_ah_attr_grh_fields(struct ib_device *device, u8 port_num, * User must invoke rdma_destroy_ah_attr() to release reference to SGID * attributes which are initialized using ib_init_ah_attr_from_path(). 
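The kernel-doc above states that a successful ib_init_ah_attr_from_path() may leave a reference to an SGID attribute that the caller must drop with rdma_destroy_ah_attr(). A minimal sketch of that pairing follows; the wrapper function is illustrative.

#include <rdma/ib_sa.h>
#include <rdma/ib_verbs.h>

/* Sketch: build AH attributes from a path record and release the SGID
 * reference once the attributes are no longer needed.
 */
static int example_path_to_ah(struct ib_device *device, u32 port_num,
			      struct sa_path_rec *rec,
			      const struct ib_gid_attr *sgid_attr)
{
	struct rdma_ah_attr ah_attr;
	int ret;

	ret = ib_init_ah_attr_from_path(device, port_num, rec, &ah_attr,
					sgid_attr);
	if (ret)
		return ret;

	/* ... use ah_attr, e.g. to create an AH or modify a QP ... */

	rdma_destroy_ah_attr(&ah_attr);
	return 0;
}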
*/ -int ib_init_ah_attr_from_path(struct ib_device *device, u8 port_num, +int ib_init_ah_attr_from_path(struct ib_device *device, u32 port_num, struct sa_path_rec *rec, struct rdma_ah_attr *ah_attr, const struct ib_gid_attr *gid_attr) @@ -1363,23 +1397,23 @@ static void init_mad(struct ib_sa_query *query, struct ib_mad_agent *agent) static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms, gfp_t gfp_mask) { - bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; int ret, id; + const int nmbr_sa_query_retries = 10; - if (preload) - idr_preload(gfp_mask); - spin_lock_irqsave(&idr_lock, flags); - - id = idr_alloc(&query_idr, query, 0, 0, GFP_NOWAIT); - - spin_unlock_irqrestore(&idr_lock, flags); - if (preload) - idr_preload_end(); - if (id < 0) - return id; + xa_lock_irqsave(&queries, flags); + ret = __xa_alloc(&queries, &id, query, xa_limit_32b, gfp_mask); + xa_unlock_irqrestore(&queries, flags); + if (ret < 0) + return ret; - query->mad_buf->timeout_ms = timeout_ms; + query->mad_buf->timeout_ms = timeout_ms / nmbr_sa_query_retries; + query->mad_buf->retries = nmbr_sa_query_retries; + if (!query->mad_buf->timeout_ms) { + /* Special case, very small timeout_ms */ + query->mad_buf->timeout_ms = 1; + query->mad_buf->retries = timeout_ms; + } query->mad_buf->context[0] = query; query->id = id; @@ -1394,9 +1428,9 @@ static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms, ret = ib_post_send_mad(query->mad_buf, NULL); if (ret) { - spin_lock_irqsave(&idr_lock, flags); - idr_remove(&query_idr, id); - spin_unlock_irqrestore(&idr_lock, flags); + xa_lock_irqsave(&queries, flags); + __xa_erase(&queries, id); + xa_unlock_irqrestore(&queries, flags); } /* @@ -1419,18 +1453,28 @@ void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute) } EXPORT_SYMBOL(ib_sa_pack_path); +void ib_sa_pack_service(struct sa_service_rec *rec, void *attribute) +{ + ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table), rec, + attribute); +} +EXPORT_SYMBOL(ib_sa_pack_service); + +void ib_sa_unpack_service(void *attribute, struct sa_service_rec *rec) +{ + ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table), attribute, + rec); +} +EXPORT_SYMBOL(ib_sa_unpack_service); + static bool ib_sa_opa_pathrecord_support(struct ib_sa_client *client, - struct ib_device *device, - u8 port_num) + struct ib_sa_device *sa_dev, + u32 port_num) { - struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); struct ib_sa_port *port; unsigned long flags; bool ret = false; - if (!sa_dev) - return ret; - port = &sa_dev->port[port_num - sa_dev->start_port]; spin_lock_irqsave(&port->classport_lock, flags); if (!port->classport_info.valid) @@ -1450,24 +1494,24 @@ enum opa_pr_supported { PR_IB_SUPPORTED }; -/** - * Check if current PR query can be an OPA query. - * Retuns PR_NOT_SUPPORTED if a path record query is not +/* + * opa_pr_query_possible - Check if current PR query can be an OPA query. + * + * Returns PR_NOT_SUPPORTED if a path record query is not * possible, PR_OPA_SUPPORTED if an OPA path record query * is possible and PR_IB_SUPPORTED if an IB path record * query is possible. 
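The ib_sa_path_rec_get() hunk further below in this file changes the completion callback to carry a num_paths count and widens port numbers to u32. Below is a hedged sketch of a consumer written against that signature; the completion-based plumbing and comp_mask choice are assumptions, not taken from this patch.

#include <rdma/ib_sa.h>
#include <linux/completion.h>

/* Sketch: callback matching the updated signature (status, records,
 * number of records, context).
 */
static void example_path_done(int status, struct sa_path_rec *resp,
			      unsigned int num_paths, void *context)
{
	struct completion *done = context;

	if (!status && num_paths)
		pr_info("resolved %u path record(s), dlid 0x%x\n",
			num_paths, be32_to_cpu(sa_path_get_dlid(resp)));
	complete(done);
}

/* Sketch: kick off the query; @rec is assumed to have dgid/sgid filled in. */
static int example_resolve(struct ib_sa_client *client,
			   struct ib_device *device, u32 port_num,
			   struct sa_path_rec *rec, struct completion *done,
			   struct ib_sa_query **query)
{
	return ib_sa_path_rec_get(client, device, port_num, rec,
				  IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID,
				  1000, GFP_KERNEL,
				  example_path_done, done, query);
}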
*/ static int opa_pr_query_possible(struct ib_sa_client *client, - struct ib_device *device, - u8 port_num, - struct sa_path_rec *rec) + struct ib_sa_device *sa_dev, + struct ib_device *device, u32 port_num) { struct ib_port_attr port_attr; if (ib_query_port(device, port_num, &port_attr)) return PR_NOT_SUPPORTED; - if (ib_sa_opa_pathrecord_support(client, device, port_num)) + if (ib_sa_opa_pathrecord_support(client, sa_dev, port_num)) return PR_OPA_SUPPORTED; if (port_attr.lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) @@ -1477,40 +1521,101 @@ static int opa_pr_query_possible(struct ib_sa_client *client, } static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, - int status, - struct ib_sa_mad *mad) + int status, struct ib_sa_mad *mad) { struct ib_sa_path_query *query = container_of(sa_query, struct ib_sa_path_query, sa_query); + struct sa_path_rec rec = {}; - if (mad) { - struct sa_path_rec rec; + if (!mad) { + query->callback(status, NULL, 0, query->context); + return; + } - if (sa_query->flags & IB_SA_QUERY_OPA) { - ib_unpack(opa_path_rec_table, - ARRAY_SIZE(opa_path_rec_table), - mad->data, &rec); - rec.rec_type = SA_PATH_REC_TYPE_OPA; - query->callback(status, &rec, query->context); - } else { - ib_unpack(path_rec_table, - ARRAY_SIZE(path_rec_table), - mad->data, &rec); - rec.rec_type = SA_PATH_REC_TYPE_IB; - sa_path_set_dmac_zero(&rec); + if (sa_query->flags & IB_SA_QUERY_OPA) { + ib_unpack(opa_path_rec_table, ARRAY_SIZE(opa_path_rec_table), + mad->data, &rec); + rec.rec_type = SA_PATH_REC_TYPE_OPA; + query->callback(status, &rec, 1, query->context); + return; + } - if (query->conv_pr) { - struct sa_path_rec opa; + ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), + mad->data, &rec); + rec.rec_type = SA_PATH_REC_TYPE_IB; + sa_path_set_dmac_zero(&rec); - memset(&opa, 0, sizeof(struct sa_path_rec)); - sa_convert_path_ib_to_opa(&opa, &rec); - query->callback(status, &opa, query->context); - } else { - query->callback(status, &rec, query->context); + if (query->conv_pr) { + struct sa_path_rec opa; + + memset(&opa, 0, sizeof(struct sa_path_rec)); + sa_convert_path_ib_to_opa(&opa, &rec); + query->callback(status, &opa, 1, query->context); + } else { + query->callback(status, &rec, 1, query->context); + } +} + +#define IB_SA_DATA_OFFS 56 +#define IB_SERVICE_REC_SZ 176 + +static void ib_unpack_service_rmpp(struct sa_service_rec *rec, + struct ib_mad_recv_wc *mad_wc, + int num_services) +{ + unsigned int cp_sz, data_i, data_size, rec_i = 0, buf_i = 0; + struct ib_mad_recv_buf *mad_buf; + u8 buf[IB_SERVICE_REC_SZ]; + u8 *data; + + data_size = sizeof(((struct ib_sa_mad *) mad_buf->mad)->data); + + list_for_each_entry(mad_buf, &mad_wc->rmpp_list, list) { + data = ((struct ib_sa_mad *) mad_buf->mad)->data; + data_i = 0; + while (data_i < data_size && rec_i < num_services) { + cp_sz = min(IB_SERVICE_REC_SZ - buf_i, + data_size - data_i); + memcpy(buf + buf_i, data + data_i, cp_sz); + data_i += cp_sz; + buf_i += cp_sz; + if (buf_i == IB_SERVICE_REC_SZ) { + ib_sa_unpack_service(buf, rec + rec_i); + buf_i = 0; + rec_i++; } } - } else - query->callback(status, NULL, query->context); + } +} + +static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query, int status, + struct ib_mad_recv_wc *mad_wc) +{ + struct ib_sa_service_query *query = + container_of(sa_query, struct ib_sa_service_query, sa_query); + struct sa_service_rec *rec; + int num_services; + + if (!mad_wc || !mad_wc->recv_buf.mad) { + query->callback(status, NULL, 0, query->context); + return; + } + + num_services = 
(mad_wc->mad_len - IB_SA_DATA_OFFS) / IB_SERVICE_REC_SZ; + if (!num_services) { + query->callback(-ENODATA, NULL, 0, query->context); + return; + } + + rec = kmalloc_array(num_services, sizeof(*rec), GFP_KERNEL); + if (!rec) { + query->callback(-ENOMEM, NULL, 0, query->context); + return; + } + + ib_unpack_service_rmpp(rec, mad_wc, num_services); + query->callback(status, rec, num_services, query->context); + kfree(rec); } static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) @@ -1522,6 +1627,14 @@ static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) kfree(query); } +static void ib_sa_service_rec_release(struct ib_sa_query *sa_query) +{ + struct ib_sa_service_query *query = + container_of(sa_query, struct ib_sa_service_query, sa_query); + + kfree(query); +} + /** * ib_sa_path_rec_get - Start a Path get query * @client:SA client @@ -1548,13 +1661,13 @@ static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) * the query. */ int ib_sa_path_rec_get(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, + struct ib_device *device, u32 port_num, struct sa_path_rec *rec, ib_sa_comp_mask comp_mask, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct sa_path_rec *resp, - void *context), + unsigned int num_paths, void *context), void *context, struct ib_sa_query **sa_query) { @@ -1582,7 +1695,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, query->sa_query.port = port; if (rec->rec_type == SA_PATH_REC_TYPE_OPA) { - status = opa_pr_query_possible(client, device, port_num, rec); + status = opa_pr_query_possible(client, sa_dev, device, port_num); if (status == PR_NOT_SUPPORTED) { ret = -EINVAL; goto err1; @@ -1652,89 +1765,61 @@ err1: } EXPORT_SYMBOL(ib_sa_path_rec_get); -static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query, - int status, - struct ib_sa_mad *mad) -{ - struct ib_sa_service_query *query = - container_of(sa_query, struct ib_sa_service_query, sa_query); - - if (mad) { - struct ib_sa_service_rec rec; - - ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table), - mad->data, &rec); - query->callback(status, &rec, query->context); - } else - query->callback(status, NULL, query->context); -} - -static void ib_sa_service_rec_release(struct ib_sa_query *sa_query) -{ - kfree(container_of(sa_query, struct ib_sa_service_query, sa_query)); -} - /** - * ib_sa_service_rec_query - Start Service Record operation - * @client:SA client - * @device:device to send request on - * @port_num: port number to send request on - * @method:SA method - should be get, set, or delete - * @rec:Service Record to send in request - * @comp_mask:component mask to send in request - * @timeout_ms:time to wait for response - * @gfp_mask:GFP mask to use for internal allocations - * @callback:function called when request completes, times out or is + * ib_sa_service_rec_get - Start a Service get query + * @client: SA client + * @device: device to send query on + * @port_num: port number to send query on + * @rec: Service Record to send in query + * @comp_mask: component mask to send in query + * @timeout_ms: time to wait for response + * @gfp_mask: GFP mask to use for internal allocations + * @callback: function called when query completes, times out or is * canceled - * @context:opaque user context passed to callback - * @sa_query:request context, used to cancel request + * @context: opaque user context passed to callback + * @sa_query: query context, used to cancel query * - * Send a Service Record set/get/delete to the SA to 
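The ib_sa_path_rec_get() kernel-doc and prototype above show the callback gaining an unsigned int num_paths argument. A hedged sketch of a caller against the new prototype (my_sa_client, my_path_done() and the fixed 3000 ms timeout are illustrative; the component mask is just the usual DGID|SGID pair and a real consumer may need more bits):

#include <linux/printk.h>
#include <rdma/ib_sa.h>

static struct ib_sa_client my_sa_client;        /* registered elsewhere via ib_sa_register_client() */

static void my_path_done(int status, struct sa_path_rec *resp,
                         unsigned int num_paths, void *context)
{
        /* resp is only valid when status == 0, and only during the callback */
        if (!status)
                pr_info("resolved %u path record(s)\n", num_paths);
}

static int my_resolve_path(struct ib_device *dev, u32 port_num,
                           struct sa_path_rec *rec, struct ib_sa_query **query)
{
        int id;

        id = ib_sa_path_rec_get(&my_sa_client, dev, port_num, rec,
                                IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID,
                                3000, GFP_KERNEL, my_path_done, NULL, query);
        return id < 0 ? id : 0;         /* the non-negative id can cancel the query */
}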
register, - * unregister or query a service record. - * The callback function will be called when the request completes (or + * Send a Service Record Get query to the SA to look up a path. The + * callback function will be called when the query completes (or * fails); status is 0 for a successful response, -EINTR if the query * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error * occurred sending the query. The resp parameter of the callback is * only valid if status is 0. * - * If the return value of ib_sa_service_rec_query() is negative, it is an - * error code. Otherwise it is a request ID that can be used to cancel + * If the return value of ib_sa_service_rec_get() is negative, it is an + * error code. Otherwise it is a query ID that can be used to cancel * the query. */ -int ib_sa_service_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, u8 method, - struct ib_sa_service_rec *rec, - ib_sa_comp_mask comp_mask, - unsigned long timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_sa_service_rec *resp, - void *context), - void *context, - struct ib_sa_query **sa_query) +int ib_sa_service_rec_get(struct ib_sa_client *client, + struct ib_device *device, u32 port_num, + struct sa_service_rec *rec, + ib_sa_comp_mask comp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct sa_service_rec *resp, + unsigned int num_services, + void *context), + void *context, struct ib_sa_query **sa_query) { - struct ib_sa_service_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); - struct ib_sa_port *port; + struct ib_sa_service_query *query; struct ib_mad_agent *agent; + struct ib_sa_port *port; struct ib_sa_mad *mad; int ret; if (!sa_dev) return -ENODEV; - port = &sa_dev->port[port_num - sa_dev->start_port]; + port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; - if (method != IB_MGMT_METHOD_GET && - method != IB_MGMT_METHOD_SET && - method != IB_SA_METHOD_DELETE) - return -EINVAL; - query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; - query->sa_query.port = port; + query->sa_query.port = port; + ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) goto err1; @@ -1747,16 +1832,17 @@ int ib_sa_service_rec_query(struct ib_sa_client *client, mad = query->sa_query.mad_buf->mad; init_mad(&query->sa_query, agent); - query->sa_query.callback = callback ? ib_sa_service_rec_callback : NULL; - query->sa_query.release = ib_sa_service_rec_release; - mad->mad_hdr.method = method; - mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_SERVICE_REC); - mad->sa_hdr.comp_mask = comp_mask; + query->sa_query.rmpp_callback = callback ? 
ib_sa_service_rec_callback : + NULL; + query->sa_query.release = ib_sa_service_rec_release; + mad->mad_hdr.method = IB_MGMT_METHOD_GET_TABLE; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_SERVICE_REC); + mad->sa_hdr.comp_mask = comp_mask; - ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table), - rec, mad->data); + ib_sa_pack_service(rec, mad->data); *sa_query = &query->sa_query; + query->sa_query.mad_buf->context[1] = rec; ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) @@ -1768,16 +1854,14 @@ err2: *sa_query = NULL; ib_sa_client_put(query->sa_query.client); free_mad(&query->sa_query); - err1: kfree(query); return ret; } -EXPORT_SYMBOL(ib_sa_service_rec_query); +EXPORT_SYMBOL(ib_sa_service_rec_get); static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query, - int status, - struct ib_sa_mad *mad) + int status, struct ib_sa_mad *mad) { struct ib_sa_mcmember_query *query = container_of(sa_query, struct ib_sa_mcmember_query, sa_query); @@ -1798,7 +1882,7 @@ static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query) } int ib_sa_mcmember_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, + struct ib_device *device, u32 port_num, u8 method, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, @@ -1868,8 +1952,7 @@ err1: /* Support GuidInfoRecord */ static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query, - int status, - struct ib_sa_mad *mad) + int status, struct ib_sa_mad *mad) { struct ib_sa_guidinfo_query *query = container_of(sa_query, struct ib_sa_guidinfo_query, sa_query); @@ -1890,7 +1973,7 @@ static void ib_sa_guidinfo_rec_release(struct ib_sa_query *sa_query) } int ib_sa_guid_info_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, + struct ib_device *device, u32 port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, unsigned long timeout_ms, gfp_t gfp_mask, @@ -1965,30 +2048,6 @@ err1: } EXPORT_SYMBOL(ib_sa_guid_info_rec_query); -bool ib_sa_sendonly_fullmem_support(struct ib_sa_client *client, - struct ib_device *device, - u8 port_num) -{ - struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); - struct ib_sa_port *port; - bool ret = false; - unsigned long flags; - - if (!sa_dev) - return ret; - - port = &sa_dev->port[port_num - sa_dev->start_port]; - - spin_lock_irqsave(&port->classport_lock, flags); - if ((port->classport_info.valid) && - (port->classport_info.data.type == RDMA_CLASS_PORT_INFO_IB)) - ret = ib_get_cpi_capmask2(&port->classport_info.data.ib) - & IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT; - spin_unlock_irqrestore(&port->classport_lock, flags); - return ret; -} -EXPORT_SYMBOL(ib_sa_sendonly_fullmem_support); - struct ib_classport_info_context { struct completion done; struct ib_sa_query *sa_query; @@ -2002,8 +2061,7 @@ static void ib_classportinfo_cb(void *context) } static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query, - int status, - struct ib_sa_mad *mad) + int status, struct ib_sa_mad *mad) { unsigned long flags; struct ib_sa_classport_info_query *query = @@ -2171,26 +2229,32 @@ static void send_handler(struct ib_mad_agent *agent, { struct ib_sa_query *query = mad_send_wc->send_buf->context[0]; unsigned long flags; + int status = 0; - if (query->callback) + if (query->callback || query->rmpp_callback) { switch (mad_send_wc->status) { case IB_WC_SUCCESS: /* No callback -- already got recv */ break; case IB_WC_RESP_TIMEOUT_ERR: - query->callback(query, -ETIMEDOUT, NULL); + status = 
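With ib_sa_service_rec_get() above replacing the old set/get/delete entry point, lookups go out as a single GetTable and the results arrive in one callback together with a count. A sketch of a consumer under the same assumptions as the path example (my_service_done() and my_lookup_services() are illustrative; the comp_mask is left to the caller rather than guessing at component-bit names):

#include <linux/printk.h>
#include <rdma/ib_sa.h>

static void my_service_done(int status, struct sa_service_rec *resp,
                            unsigned int num_services, void *context)
{
        /* resp is freed by the core right after this callback returns */
        if (!status)
                pr_info("SA returned %u service record(s)\n", num_services);
}

static int my_lookup_services(struct ib_sa_client *client, struct ib_device *dev,
                              u32 port_num, struct sa_service_rec *rec,
                              ib_sa_comp_mask comp_mask,
                              struct ib_sa_query **query)
{
        int id;

        id = ib_sa_service_rec_get(client, dev, port_num, rec, comp_mask,
                                   2000, GFP_KERNEL, my_service_done, NULL,
                                   query);
        if (id < 0)
                return id;
        /*
         * The positive id can later be handed to ib_sa_cancel_query(id, *query)
         * if the caller gives up first; the callback then runs with -EINTR.
         */
        return 0;
}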
-ETIMEDOUT; break; case IB_WC_WR_FLUSH_ERR: - query->callback(query, -EINTR, NULL); + status = -EINTR; break; default: - query->callback(query, -EIO, NULL); + status = -EIO; break; } - spin_lock_irqsave(&idr_lock, flags); - idr_remove(&query_idr, query->id); - spin_unlock_irqrestore(&idr_lock, flags); + if (status) + query->callback ? query->callback(query, status, NULL) : + query->rmpp_callback(query, status, NULL); + } + + xa_lock_irqsave(&queries, flags); + __xa_erase(&queries, query->id); + xa_unlock_irqrestore(&queries, flags); free_mad(query); if (query->client) @@ -2203,17 +2267,25 @@ static void recv_handler(struct ib_mad_agent *mad_agent, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_sa_query *query; + struct ib_mad *mad; + if (!send_buf) return; query = send_buf->context[0]; - if (query->callback) { + mad = mad_recv_wc->recv_buf.mad; + + if (query->rmpp_callback) { + if (mad_recv_wc->wc->status == IB_WC_SUCCESS) + query->rmpp_callback(query, mad->mad_hdr.status ? + -EINVAL : 0, mad_recv_wc); + else + query->rmpp_callback(query, -EIO, NULL); + } else if (query->callback) { if (mad_recv_wc->wc->status == IB_WC_SUCCESS) - query->callback(query, - mad_recv_wc->recv_buf.mad->mad_hdr.status ? - -EINVAL : 0, - (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad); + query->callback(query, mad->mad_hdr.status ? + -EINVAL : 0, (struct ib_sa_mad *)mad); else query->callback(query, -EIO, NULL); } @@ -2303,7 +2375,7 @@ static void ib_sa_event(struct ib_event_handler *handler, unsigned long flags; struct ib_sa_device *sa_dev = container_of(handler, typeof(*sa_dev), event_handler); - u8 port_num = event->element.port_num - sa_dev->start_port; + u32 port_num = event->element.port_num - sa_dev->start_port; struct ib_sa_port *port = &sa_dev->port[port_num]; if (!rdma_cap_ib_sa(handler->device, port->port_num)) @@ -2333,20 +2405,21 @@ static void ib_sa_event(struct ib_event_handler *handler, } } -static void ib_sa_add_one(struct ib_device *device) +static int ib_sa_add_one(struct ib_device *device) { struct ib_sa_device *sa_dev; int s, e, i; int count = 0; + int ret; s = rdma_start_port(device); e = rdma_end_port(device); - sa_dev = kzalloc(sizeof *sa_dev + - (e - s + 1) * sizeof (struct ib_sa_port), + sa_dev = kzalloc(struct_size(sa_dev, port, + size_add(size_sub(e, s), 1)), GFP_KERNEL); if (!sa_dev) - return; + return -ENOMEM; sa_dev->start_port = s; sa_dev->end_port = e; @@ -2364,10 +2437,13 @@ static void ib_sa_add_one(struct ib_device *device) sa_dev->port[i].agent = ib_register_mad_agent(device, i + s, IB_QPT_GSI, - NULL, 0, send_handler, - recv_handler, sa_dev, 0); - if (IS_ERR(sa_dev->port[i].agent)) + NULL, IB_MGMT_RMPP_VERSION, + send_handler, recv_handler, + sa_dev, 0); + if (IS_ERR(sa_dev->port[i].agent)) { + ret = PTR_ERR(sa_dev->port[i].agent); goto err; + } INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah); INIT_DELAYED_WORK(&sa_dev->port[i].ib_cpi_work, @@ -2376,8 +2452,10 @@ static void ib_sa_add_one(struct ib_device *device) count++; } - if (!count) + if (!count) { + ret = -EOPNOTSUPP; goto free; + } ib_set_client_data(device, &sa_client, sa_dev); @@ -2396,7 +2474,7 @@ static void ib_sa_add_one(struct ib_device *device) update_sm_ah(&sa_dev->port[i].update_task); } - return; + return 0; err: while (--i >= 0) { @@ -2405,7 +2483,7 @@ err: } free: kfree(sa_dev); - return; + return ret; } static void ib_sa_remove_one(struct ib_device *device, void *client_data) @@ -2413,9 +2491,6 @@ static void ib_sa_remove_one(struct ib_device *device, void *client_data) struct ib_sa_device *sa_dev 
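ib_sa_add_one() above also switches its flexible-array allocation to the overflow-checked helpers from <linux/overflow.h>. A small sketch of that pattern with a stand-in structure (struct my_sa_dev and my_sa_dev_alloc() are illustrative):

#include <linux/overflow.h>
#include <linux/slab.h>

struct my_sa_dev {
        u32 start_port;
        u32 end_port;
        struct { void *agent; } port[];         /* one trailing slot per port */
};

static struct my_sa_dev *my_sa_dev_alloc(u32 s, u32 e)
{
        struct my_sa_dev *dev;

        /*
         * struct_size()/size_add()/size_sub() saturate instead of wrapping,
         * so a bogus port range cannot silently under-allocate the array.
         */
        dev = kzalloc(struct_size(dev, port, size_add(size_sub(e, s), 1)),
                      GFP_KERNEL);
        if (!dev)
                return NULL;
        dev->start_port = s;
        dev->end_port = e;
        return dev;
}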
= client_data; int i; - if (!sa_dev) - return; - ib_unregister_event_handler(&sa_dev->event_handler); flush_workqueue(ib_wq); @@ -2473,9 +2548,8 @@ err1: void ib_sa_cleanup(void) { cancel_delayed_work(&ib_nl_timed_work); - flush_workqueue(ib_nl_wq); destroy_workqueue(ib_nl_wq); mcast_cleanup(); ib_unregister_client(&sa_client); - idr_destroy(&query_idr); + WARN_ON(!xa_empty(&queries)); } diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 1efadbccf394..3512c2e54efc 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -39,22 +39,25 @@ #include "core_priv.h" #include "mad_priv.h" +static LIST_HEAD(mad_agent_list); +/* Lock to protect mad_agent_list */ +static DEFINE_SPINLOCK(mad_agent_list_lock); + static struct pkey_index_qp_list *get_pkey_idx_qp_list(struct ib_port_pkey *pp) { struct pkey_index_qp_list *pkey = NULL; struct pkey_index_qp_list *tmp_pkey; struct ib_device *dev = pp->sec->dev; - spin_lock(&dev->port_pkey_list[pp->port_num].list_lock); - list_for_each_entry(tmp_pkey, - &dev->port_pkey_list[pp->port_num].pkey_list, - pkey_index_list) { + spin_lock(&dev->port_data[pp->port_num].pkey_list_lock); + list_for_each_entry (tmp_pkey, &dev->port_data[pp->port_num].pkey_list, + pkey_index_list) { if (tmp_pkey->pkey_index == pp->pkey_index) { pkey = tmp_pkey; break; } } - spin_unlock(&dev->port_pkey_list[pp->port_num].list_lock); + spin_unlock(&dev->port_data[pp->port_num].pkey_list_lock); return pkey; } @@ -69,7 +72,7 @@ static int get_pkey_and_subnet_prefix(struct ib_port_pkey *pp, if (ret) return ret; - ret = ib_get_cached_subnet_prefix(dev, pp->port_num, subnet_prefix); + ib_get_cached_subnet_prefix(dev, pp->port_num, subnet_prefix); return ret; } @@ -190,7 +193,7 @@ static void qp_to_error(struct ib_qp_security *sec) static inline void check_pkey_qps(struct pkey_index_qp_list *pkey, struct ib_device *device, - u8 port_num, + u32 port_num, u64 subnet_prefix) { struct ib_port_pkey *pp, *tmp_pp; @@ -242,7 +245,7 @@ static int port_pkey_list_insert(struct ib_port_pkey *pp) struct pkey_index_qp_list *tmp_pkey; struct pkey_index_qp_list *pkey; struct ib_device *dev; - u8 port_num = pp->port_num; + u32 port_num = pp->port_num; int ret = 0; if (pp->state != IB_PORT_PKEY_VALID) @@ -259,12 +262,12 @@ static int port_pkey_list_insert(struct ib_port_pkey *pp) if (!pkey) return -ENOMEM; - spin_lock(&dev->port_pkey_list[port_num].list_lock); + spin_lock(&dev->port_data[port_num].pkey_list_lock); /* Check for the PKey again. A racing process may * have created it. */ list_for_each_entry(tmp_pkey, - &dev->port_pkey_list[port_num].pkey_list, + &dev->port_data[port_num].pkey_list, pkey_index_list) { if (tmp_pkey->pkey_index == pp->pkey_index) { kfree(pkey); @@ -279,9 +282,9 @@ static int port_pkey_list_insert(struct ib_port_pkey *pp) spin_lock_init(&pkey->qp_list_lock); INIT_LIST_HEAD(&pkey->qp_list); list_add(&pkey->pkey_index_list, - &dev->port_pkey_list[port_num].pkey_list); + &dev->port_data[port_num].pkey_list); } - spin_unlock(&dev->port_pkey_list[port_num].list_lock); + spin_unlock(&dev->port_data[port_num].pkey_list_lock); } spin_lock(&pkey->qp_list_lock); @@ -336,27 +339,20 @@ static struct ib_ports_pkeys *get_new_pps(const struct ib_qp *qp, if (!new_pps) return NULL; - if (qp_attr_mask & (IB_QP_PKEY_INDEX | IB_QP_PORT)) { - if (!qp_pps) { - new_pps->main.port_num = qp_attr->port_num; - new_pps->main.pkey_index = qp_attr->pkey_index; - } else { - new_pps->main.port_num = (qp_attr_mask & IB_QP_PORT) ? 
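The new mad_agent_list/mad_agent_list_lock pair above keeps every SMI MAD agent on one global list so a security-policy change can be fanned out in a single walk (replacing the per-agent LSM notifier removed further down). A minimal sketch of that idiom with stand-in types (struct my_agent and the helpers are illustrative):

#include <linux/list.h>
#include <linux/spinlock.h>

struct my_agent {
        struct list_head node;
        bool allowed;
};

static LIST_HEAD(my_agent_list);
/* Lock protecting my_agent_list, mirroring mad_agent_list_lock above */
static DEFINE_SPINLOCK(my_agent_list_lock);

static void my_agent_register(struct my_agent *ag)
{
        spin_lock(&my_agent_list_lock);
        list_add(&ag->node, &my_agent_list);
        spin_unlock(&my_agent_list_lock);
}

static void my_agent_unregister(struct my_agent *ag)
{
        spin_lock(&my_agent_list_lock);
        list_del(&ag->node);
        spin_unlock(&my_agent_list_lock);
}

static void my_policy_changed(bool allow)
{
        struct my_agent *ag;

        spin_lock(&my_agent_list_lock);
        list_for_each_entry(ag, &my_agent_list, node)
                ag->allowed = allow;
        spin_unlock(&my_agent_list_lock);
}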
- qp_attr->port_num : - qp_pps->main.port_num; - - new_pps->main.pkey_index = - (qp_attr_mask & IB_QP_PKEY_INDEX) ? - qp_attr->pkey_index : - qp_pps->main.pkey_index; - } - new_pps->main.state = IB_PORT_PKEY_VALID; - } else if (qp_pps) { + if (qp_attr_mask & IB_QP_PORT) + new_pps->main.port_num = qp_attr->port_num; + else if (qp_pps) new_pps->main.port_num = qp_pps->main.port_num; + + if (qp_attr_mask & IB_QP_PKEY_INDEX) + new_pps->main.pkey_index = qp_attr->pkey_index; + else if (qp_pps) new_pps->main.pkey_index = qp_pps->main.pkey_index; - if (qp_pps->main.state != IB_PORT_PKEY_NOT_VALID) - new_pps->main.state = IB_PORT_PKEY_VALID; - } + + if (((qp_attr_mask & IB_QP_PKEY_INDEX) && + (qp_attr_mask & IB_QP_PORT)) || + (qp_pps && qp_pps->main.state != IB_PORT_PKEY_NOT_VALID)) + new_pps->main.state = IB_PORT_PKEY_VALID; if (qp_attr_mask & IB_QP_ALT_PATH) { new_pps->alt.port_num = qp_attr->alt_port_num; @@ -418,12 +414,15 @@ void ib_close_shared_qp_security(struct ib_qp_security *sec) int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev) { - u8 i = rdma_start_port(dev); + unsigned int i; bool is_ib = false; int ret; - while (i <= rdma_end_port(dev) && !is_ib) - is_ib = rdma_protocol_ib(dev, i++); + rdma_for_each_port (dev, i) { + is_ib = rdma_protocol_ib(dev, i); + if (is_ib) + break; + } /* If this isn't an IB device don't create the security context */ if (!is_ib) @@ -539,14 +538,13 @@ void ib_destroy_qp_security_end(struct ib_qp_security *sec) } void ib_security_cache_change(struct ib_device *device, - u8 port_num, + u32 port_num, u64 subnet_prefix) { struct pkey_index_qp_list *pkey; - list_for_each_entry(pkey, - &device->port_pkey_list[port_num].pkey_list, - pkey_index_list) { + list_for_each_entry (pkey, &device->port_data[port_num].pkey_list, + pkey_index_list) { check_pkey_qps(pkey, device, port_num, @@ -554,21 +552,19 @@ void ib_security_cache_change(struct ib_device *device, } } -void ib_security_destroy_port_pkey_list(struct ib_device *device) +void ib_security_release_port_pkey_list(struct ib_device *device) { struct pkey_index_qp_list *pkey, *tmp_pkey; - int i; + unsigned int i; - for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { - spin_lock(&device->port_pkey_list[i].list_lock); + rdma_for_each_port (device, i) { list_for_each_entry_safe(pkey, tmp_pkey, - &device->port_pkey_list[i].pkey_list, + &device->port_data[i].pkey_list, pkey_index_list) { list_del(&pkey->pkey_index_list); kfree(pkey); } - spin_unlock(&device->port_pkey_list[i].list_lock); } } @@ -590,7 +586,7 @@ int ib_security_modify_qp(struct ib_qp *qp, WARN_ONCE((qp_attr_mask & IB_QP_PORT && rdma_protocol_ib(real_qp->device, qp_attr->port_num) && !real_qp->qp_sec), - "%s: QP security is not initialized for IB QP: %d\n", + "%s: QP security is not initialized for IB QP: %u\n", __func__, real_qp->qp_num); /* The port/pkey settings are maintained only for the real QP. 
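ib_create_qp_security() above trades the manual rdma_start_port()/rdma_end_port() walk for the rdma_for_each_port() iterator. The same conversion applied to a small stand-alone helper (my_dev_has_ib_port() is illustrative):

#include <rdma/ib_verbs.h>

/* True if any port of the device runs native InfiniBand. */
static bool my_dev_has_ib_port(struct ib_device *dev)
{
        unsigned int i;

        rdma_for_each_port(dev, i) {    /* covers rdma_start_port()..rdma_end_port() */
                if (rdma_protocol_ib(dev, i))
                        return true;
        }
        return false;
}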
Open @@ -653,7 +649,7 @@ int ib_security_modify_qp(struct ib_qp *qp, } static int ib_security_pkey_access(struct ib_device *dev, - u8 port_num, + u32 port_num, u16 pkey_index, void *sec) { @@ -668,27 +664,23 @@ static int ib_security_pkey_access(struct ib_device *dev, if (ret) return ret; - ret = ib_get_cached_subnet_prefix(dev, port_num, &subnet_prefix); - - if (ret) - return ret; + ib_get_cached_subnet_prefix(dev, port_num, &subnet_prefix); return security_ib_pkey_access(sec, subnet_prefix, pkey); } -static int ib_mad_agent_security_change(struct notifier_block *nb, - unsigned long event, - void *data) +void ib_mad_agent_security_change(void) { - struct ib_mad_agent *ag = container_of(nb, struct ib_mad_agent, lsm_nb); - - if (event != LSM_POLICY_CHANGE) - return NOTIFY_DONE; - - ag->smp_allowed = !security_ib_endport_manage_subnet( - ag->security, dev_name(&ag->device->dev), ag->port_num); - - return NOTIFY_OK; + struct ib_mad_agent *ag; + + spin_lock(&mad_agent_list_lock); + list_for_each_entry(ag, + &mad_agent_list, + mad_agent_sec_list) + WRITE_ONCE(ag->smp_allowed, + !security_ib_endport_manage_subnet(ag->security, + dev_name(&ag->device->dev), ag->port_num)); + spin_unlock(&mad_agent_list_lock); } int ib_mad_agent_security_setup(struct ib_mad_agent *agent, @@ -699,6 +691,8 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, if (!rdma_protocol_ib(agent->device, agent->port_num)) return 0; + INIT_LIST_HEAD(&agent->mad_agent_sec_list); + ret = security_ib_alloc_security(&agent->security); if (ret) return ret; @@ -706,20 +700,22 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, if (qp_type != IB_QPT_SMI) return 0; + spin_lock(&mad_agent_list_lock); ret = security_ib_endport_manage_subnet(agent->security, dev_name(&agent->device->dev), agent->port_num); if (ret) - return ret; - - agent->lsm_nb.notifier_call = ib_mad_agent_security_change; - ret = register_lsm_notifier(&agent->lsm_nb); - if (ret) - return ret; + goto free_security; - agent->smp_allowed = true; - agent->lsm_nb_reg = true; + WRITE_ONCE(agent->smp_allowed, true); + list_add(&agent->mad_agent_sec_list, &mad_agent_list); + spin_unlock(&mad_agent_list_lock); return 0; + +free_security: + spin_unlock(&mad_agent_list_lock); + security_ib_free_security(agent->security); + return ret; } void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent) @@ -727,9 +723,13 @@ void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent) if (!rdma_protocol_ib(agent->device, agent->port_num)) return; + if (agent->qp->qp_type == IB_QPT_SMI) { + spin_lock(&mad_agent_list_lock); + list_del(&agent->mad_agent_sec_list); + spin_unlock(&mad_agent_list_lock); + } + security_ib_free_security(agent->security); - if (agent->lsm_nb_reg) - unregister_lsm_notifier(&agent->lsm_nb); } int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index) @@ -738,7 +738,7 @@ int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index) return 0; if (map->agent.qp->qp_type == IB_QPT_SMI) { - if (!map->agent.smp_allowed) + if (!READ_ONCE(map->agent.smp_allowed)) return -EACCES; return 0; } diff --git a/drivers/infiniband/core/smi.c b/drivers/infiniband/core/smi.c index f19b23817c2b..45f09b75c893 100644 --- a/drivers/infiniband/core/smi.c +++ b/drivers/infiniband/core/smi.c @@ -41,7 +41,7 @@ #include "smi.h" #include "opa_smi.h" -static enum smi_action __smi_handle_dr_smp_send(bool is_switch, int port_num, +static enum smi_action __smi_handle_dr_smp_send(bool is_switch, u32 port_num, u8 *hop_ptr, u8 hop_cnt, 
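smp_allowed above is written under mad_agent_list_lock but read locklessly on the MAD send path, hence the WRITE_ONCE()/READ_ONCE() pairing. A tiny sketch of that one-writer, lockless-reader flag pattern (my_flag and the helpers are stand-ins):

#include <linux/compiler.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_flag_lock);
static bool my_flag;

static void my_flag_update(bool v)
{
        spin_lock(&my_flag_lock);       /* serialises writers only */
        WRITE_ONCE(my_flag, v);
        spin_unlock(&my_flag_lock);
}

static bool my_flag_check(void)
{
        /* READ_ONCE() stops the compiler from caching or tearing the load. */
        return READ_ONCE(my_flag);
}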
const u8 *initial_path, const u8 *return_path, @@ -127,7 +127,7 @@ static enum smi_action __smi_handle_dr_smp_send(bool is_switch, int port_num, * Return IB_SMI_DISCARD if the SMP should be discarded */ enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, - bool is_switch, int port_num) + bool is_switch, u32 port_num) { return __smi_handle_dr_smp_send(is_switch, port_num, &smp->hop_ptr, smp->hop_cnt, @@ -139,7 +139,7 @@ enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, } enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, - bool is_switch, int port_num) + bool is_switch, u32 port_num) { return __smi_handle_dr_smp_send(is_switch, port_num, &smp->hop_ptr, smp->hop_cnt, @@ -152,7 +152,7 @@ enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, OPA_LID_PERMISSIVE); } -static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, int port_num, +static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, u32 port_num, int phys_port_cnt, u8 *hop_ptr, u8 hop_cnt, const u8 *initial_path, @@ -238,7 +238,7 @@ static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, int port_num, * Return IB_SMI_DISCARD if the SMP should be dropped */ enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch, - int port_num, int phys_port_cnt) + u32 port_num, int phys_port_cnt) { return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt, &smp->hop_ptr, smp->hop_cnt, @@ -254,7 +254,7 @@ enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch, * Return IB_SMI_DISCARD if the SMP should be dropped */ enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch, - int port_num, int phys_port_cnt) + u32 port_num, int phys_port_cnt) { return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt, &smp->hop_ptr, smp->hop_cnt, diff --git a/drivers/infiniband/core/smi.h b/drivers/infiniband/core/smi.h index 91d9b353ab85..e350ed623c45 100644 --- a/drivers/infiniband/core/smi.h +++ b/drivers/infiniband/core/smi.h @@ -52,11 +52,11 @@ enum smi_forward_action { }; enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch, - int port_num, int phys_port_cnt); + u32 port_num, int phys_port_cnt); int smi_get_fwd_port(struct ib_smp *smp); extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp); extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, - bool is_switch, int port_num); + bool is_switch, u32 port_num); /* * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 80f68eb0ba5c..0ed862b38b44 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -43,207 +43,261 @@ #include <rdma/ib_mad.h> #include <rdma/ib_pma.h> #include <rdma/ib_cache.h> +#include <rdma/rdma_counter.h> +#include <rdma/ib_sysfs.h> -struct ib_port; +struct port_table_attribute { + struct ib_port_attribute attr; + char name[8]; + int index; + __be16 attr_id; +}; struct gid_attr_group { - struct ib_port *port; - struct kobject kobj; - struct attribute_group ndev; - struct attribute_group type; + struct ib_port *port; + struct kobject kobj; + struct attribute_group groups[2]; + const struct attribute_group *groups_list[3]; + struct port_table_attribute attrs_list[]; }; + struct ib_port { - struct kobject kobj; - struct ib_device *ibdev; + struct kobject kobj; + struct ib_device *ibdev; struct gid_attr_group *gid_attr_group; - struct attribute_group gid_group; - struct 
attribute_group pkey_group; - struct attribute_group *pma_table; - struct attribute_group *hw_stats_ag; - struct rdma_hw_stats *hw_stats; - u8 port_num; + struct hw_stats_port_data *hw_stats_data; + + struct attribute_group groups[3]; + const struct attribute_group *groups_list[5]; + u32 port_num; + struct port_table_attribute attrs_list[]; }; -struct port_attribute { - struct attribute attr; - ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf); - ssize_t (*store)(struct ib_port *, struct port_attribute *, +struct hw_stats_device_attribute { + struct device_attribute attr; + ssize_t (*show)(struct ib_device *ibdev, struct rdma_hw_stats *stats, + unsigned int index, unsigned int port_num, char *buf); + ssize_t (*store)(struct ib_device *ibdev, struct rdma_hw_stats *stats, + unsigned int index, unsigned int port_num, const char *buf, size_t count); }; -#define PORT_ATTR(_name, _mode, _show, _store) \ -struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store) - -#define PORT_ATTR_RO(_name) \ -struct port_attribute port_attr_##_name = __ATTR_RO(_name) +struct hw_stats_port_attribute { + struct ib_port_attribute attr; + ssize_t (*show)(struct ib_device *ibdev, struct rdma_hw_stats *stats, + unsigned int index, unsigned int port_num, char *buf); + ssize_t (*store)(struct ib_device *ibdev, struct rdma_hw_stats *stats, + unsigned int index, unsigned int port_num, + const char *buf, size_t count); +}; -struct port_table_attribute { - struct port_attribute attr; - char name[8]; - int index; - __be16 attr_id; +struct hw_stats_device_data { + struct attribute_group group; + struct rdma_hw_stats *stats; + struct hw_stats_device_attribute attrs[]; }; -struct hw_stats_attribute { - struct attribute attr; - ssize_t (*show)(struct kobject *kobj, - struct attribute *attr, char *buf); - ssize_t (*store)(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t count); - int index; - u8 port_num; +struct hw_stats_port_data { + struct rdma_hw_stats *stats; + struct hw_stats_port_attribute attrs[]; }; static ssize_t port_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { - struct port_attribute *port_attr = - container_of(attr, struct port_attribute, attr); + struct ib_port_attribute *port_attr = + container_of(attr, struct ib_port_attribute, attr); struct ib_port *p = container_of(kobj, struct ib_port, kobj); if (!port_attr->show) return -EIO; - return port_attr->show(p, port_attr, buf); + return port_attr->show(p->ibdev, p->port_num, port_attr, buf); } static ssize_t port_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { - struct port_attribute *port_attr = - container_of(attr, struct port_attribute, attr); + struct ib_port_attribute *port_attr = + container_of(attr, struct ib_port_attribute, attr); struct ib_port *p = container_of(kobj, struct ib_port, kobj); if (!port_attr->store) return -EIO; - return port_attr->store(p, port_attr, buf, count); + return port_attr->store(p->ibdev, p->port_num, port_attr, buf, count); } +struct ib_device *ib_port_sysfs_get_ibdev_kobj(struct kobject *kobj, + u32 *port_num) +{ + struct ib_port *port = container_of(kobj, struct ib_port, kobj); + + *port_num = port->port_num; + return port->ibdev; +} +EXPORT_SYMBOL(ib_port_sysfs_get_ibdev_kobj); + static const struct sysfs_ops port_sysfs_ops = { .show = port_attr_show, .store = port_attr_store }; +static ssize_t hw_stat_device_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct 
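hw_stat_device_show() above derives the counter index purely from the attribute's position in the attrs[] array (stat_attr - ibdev->hw_stats_data->attrs), so no per-attribute index field is carried around. A reduced sketch of that pointer-arithmetic trick with stand-in types (struct my_stat_attr, my_stats and my_stat_show() are illustrative):

#include <linux/device.h>
#include <linux/sysfs.h>

struct my_stat_attr {
        struct device_attribute attr;
};

static struct my_stats {
        u64 value[8];
        struct my_stat_attr attrs[8];   /* attrs[i] exposes value[i] */
} my_stats;

static ssize_t my_stat_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        struct my_stat_attr *sa = container_of(attr, struct my_stat_attr, attr);
        unsigned int index = sa - my_stats.attrs;       /* array position */

        return sysfs_emit(buf, "%llu\n", my_stats.value[index]);
}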
hw_stats_device_attribute *stat_attr = + container_of(attr, struct hw_stats_device_attribute, attr); + struct ib_device *ibdev = container_of(dev, struct ib_device, dev); + + return stat_attr->show(ibdev, ibdev->hw_stats_data->stats, + stat_attr - ibdev->hw_stats_data->attrs, 0, buf); +} + +static ssize_t hw_stat_device_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct hw_stats_device_attribute *stat_attr = + container_of(attr, struct hw_stats_device_attribute, attr); + struct ib_device *ibdev = container_of(dev, struct ib_device, dev); + + return stat_attr->store(ibdev, ibdev->hw_stats_data->stats, + stat_attr - ibdev->hw_stats_data->attrs, 0, buf, + count); +} + +static ssize_t hw_stat_port_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) +{ + struct hw_stats_port_attribute *stat_attr = + container_of(attr, struct hw_stats_port_attribute, attr); + struct ib_port *port = ibdev->port_data[port_num].sysfs; + + return stat_attr->show(ibdev, port->hw_stats_data->stats, + stat_attr - port->hw_stats_data->attrs, + port->port_num, buf); +} + +static ssize_t hw_stat_port_store(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, + const char *buf, size_t count) +{ + struct hw_stats_port_attribute *stat_attr = + container_of(attr, struct hw_stats_port_attribute, attr); + struct ib_port *port = ibdev->port_data[port_num].sysfs; + + return stat_attr->store(ibdev, port->hw_stats_data->stats, + stat_attr - port->hw_stats_data->attrs, + port->port_num, buf, count); +} + static ssize_t gid_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { - struct port_attribute *port_attr = - container_of(attr, struct port_attribute, attr); + struct ib_port_attribute *port_attr = + container_of(attr, struct ib_port_attribute, attr); struct ib_port *p = container_of(kobj, struct gid_attr_group, kobj)->port; if (!port_attr->show) return -EIO; - return port_attr->show(p, port_attr, buf); + return port_attr->show(p->ibdev, p->port_num, port_attr, buf); } static const struct sysfs_ops gid_attr_sysfs_ops = { .show = gid_attr_show }; -static ssize_t state_show(struct ib_port *p, struct port_attribute *unused, - char *buf) +static ssize_t state_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; - static const char *state_name[] = { - [IB_PORT_NOP] = "NOP", - [IB_PORT_DOWN] = "DOWN", - [IB_PORT_INIT] = "INIT", - [IB_PORT_ARMED] = "ARMED", - [IB_PORT_ACTIVE] = "ACTIVE", - [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER" - }; - - ret = ib_query_port(p->ibdev, p->port_num, &attr); + ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; - return sprintf(buf, "%d: %s\n", attr.state, - attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ? 
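Every per-port attribute above is being converted to the new ib_port_attribute show signature, which hands the callback the ib_device and port number directly instead of the private struct ib_port. A hedged sketch of what one additional read-only port attribute could look like under that scheme (max_mtu is hypothetical and not part of this patch):

#include <rdma/ib_sysfs.h>
#include <rdma/ib_verbs.h>

static ssize_t max_mtu_show(struct ib_device *ibdev, u32 port_num,
                            struct ib_port_attribute *unused, char *buf)
{
        struct ib_port_attr attr;
        int ret;

        ret = ib_query_port(ibdev, port_num, &attr);
        if (ret)
                return ret;
        return sysfs_emit(buf, "%d\n", attr.max_mtu);
}
static IB_PORT_ATTR_RO(max_mtu);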
- state_name[attr.state] : "UNKNOWN"); + return sysfs_emit(buf, "%d: %s\n", attr.state, + ib_port_state_to_str(attr.state)); } -static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused, - char *buf) +static ssize_t lid_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; - ret = ib_query_port(p->ibdev, p->port_num, &attr); + ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; - return sprintf(buf, "0x%x\n", attr.lid); + return sysfs_emit(buf, "0x%x\n", attr.lid); } -static ssize_t lid_mask_count_show(struct ib_port *p, - struct port_attribute *unused, - char *buf) +static ssize_t lid_mask_count_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; - ret = ib_query_port(p->ibdev, p->port_num, &attr); + ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; - return sprintf(buf, "%d\n", attr.lmc); + return sysfs_emit(buf, "%u\n", attr.lmc); } -static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused, - char *buf) +static ssize_t sm_lid_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; - ret = ib_query_port(p->ibdev, p->port_num, &attr); + ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; - return sprintf(buf, "0x%x\n", attr.sm_lid); + return sysfs_emit(buf, "0x%x\n", attr.sm_lid); } -static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused, - char *buf) +static ssize_t sm_sl_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; - ret = ib_query_port(p->ibdev, p->port_num, &attr); + ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; - return sprintf(buf, "%d\n", attr.sm_sl); + return sysfs_emit(buf, "%u\n", attr.sm_sl); } -static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused, - char *buf) +static ssize_t cap_mask_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; - ret = ib_query_port(p->ibdev, p->port_num, &attr); + ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; - return sprintf(buf, "0x%08x\n", attr.port_cap_flags); + return sysfs_emit(buf, "0x%08x\n", attr.port_cap_flags); } -static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, - char *buf) +static ssize_t rate_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; char *speed = ""; int rate; /* in deci-Gb/sec */ ssize_t ret; - ret = ib_query_port(p->ibdev, p->port_num, &attr); + ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; @@ -272,6 +326,14 @@ static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, speed = " HDR"; rate = 500; break; + case IB_SPEED_NDR: + speed = " NDR"; + rate = 1000; + break; + case IB_SPEED_XDR: + speed = " XDR"; + rate = 2000; + break; case IB_SPEED_SDR: default: /* default to SDR for invalid rates */ speed = " SDR"; @@ -283,110 +345,136 @@ static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, if (rate < 0) return -EINVAL; - return sprintf(buf, "%d%s Gb/sec (%dX%s)\n", - rate / 10, rate % 10 ? ".5" : "", - ib_width_enum_to_int(attr.active_width), speed); + return sysfs_emit(buf, "%d%s Gb/sec (%dX%s)\n", rate / 10, + rate % 10 ? 
".5" : "", + ib_width_enum_to_int(attr.active_width), speed); +} + +static const char *phys_state_to_str(enum ib_port_phys_state phys_state) +{ + static const char *phys_state_str[] = { + "<unknown>", + "Sleep", + "Polling", + "Disabled", + "PortConfigurationTraining", + "LinkUp", + "LinkErrorRecovery", + "Phy Test", + }; + + if (phys_state < ARRAY_SIZE(phys_state_str)) + return phys_state_str[phys_state]; + return "<unknown>"; } -static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused, - char *buf) +static ssize_t phys_state_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; - ret = ib_query_port(p->ibdev, p->port_num, &attr); + ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; - switch (attr.phys_state) { - case 1: return sprintf(buf, "1: Sleep\n"); - case 2: return sprintf(buf, "2: Polling\n"); - case 3: return sprintf(buf, "3: Disabled\n"); - case 4: return sprintf(buf, "4: PortConfigurationTraining\n"); - case 5: return sprintf(buf, "5: LinkUp\n"); - case 6: return sprintf(buf, "6: LinkErrorRecovery\n"); - case 7: return sprintf(buf, "7: Phy Test\n"); - default: return sprintf(buf, "%d: <unknown>\n", attr.phys_state); - } + return sysfs_emit(buf, "%u: %s\n", attr.phys_state, + phys_state_to_str(attr.phys_state)); } -static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused, - char *buf) +static ssize_t link_layer_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) { - switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) { + const char *output; + + switch (rdma_port_get_link_layer(ibdev, port_num)) { case IB_LINK_LAYER_INFINIBAND: - return sprintf(buf, "%s\n", "InfiniBand"); + output = "InfiniBand"; + break; case IB_LINK_LAYER_ETHERNET: - return sprintf(buf, "%s\n", "Ethernet"); + output = "Ethernet"; + break; default: - return sprintf(buf, "%s\n", "Unknown"); + output = "Unknown"; + break; } + + return sysfs_emit(buf, "%s\n", output); } -static PORT_ATTR_RO(state); -static PORT_ATTR_RO(lid); -static PORT_ATTR_RO(lid_mask_count); -static PORT_ATTR_RO(sm_lid); -static PORT_ATTR_RO(sm_sl); -static PORT_ATTR_RO(cap_mask); -static PORT_ATTR_RO(rate); -static PORT_ATTR_RO(phys_state); -static PORT_ATTR_RO(link_layer); +static IB_PORT_ATTR_RO(state); +static IB_PORT_ATTR_RO(lid); +static IB_PORT_ATTR_RO(lid_mask_count); +static IB_PORT_ATTR_RO(sm_lid); +static IB_PORT_ATTR_RO(sm_sl); +static IB_PORT_ATTR_RO(cap_mask); +static IB_PORT_ATTR_RO(rate); +static IB_PORT_ATTR_RO(phys_state); +static IB_PORT_ATTR_RO(link_layer); static struct attribute *port_default_attrs[] = { - &port_attr_state.attr, - &port_attr_lid.attr, - &port_attr_lid_mask_count.attr, - &port_attr_sm_lid.attr, - &port_attr_sm_sl.attr, - &port_attr_cap_mask.attr, - &port_attr_rate.attr, - &port_attr_phys_state.attr, - &port_attr_link_layer.attr, + &ib_port_attr_state.attr, + &ib_port_attr_lid.attr, + &ib_port_attr_lid_mask_count.attr, + &ib_port_attr_sm_lid.attr, + &ib_port_attr_sm_sl.attr, + &ib_port_attr_cap_mask.attr, + &ib_port_attr_rate.attr, + &ib_port_attr_phys_state.attr, + &ib_port_attr_link_layer.attr, NULL }; +ATTRIBUTE_GROUPS(port_default); -static size_t print_ndev(const struct ib_gid_attr *gid_attr, char *buf) +static ssize_t print_ndev(const struct ib_gid_attr *gid_attr, char *buf) { - if (!gid_attr->ndev) - return -EINVAL; - - return sprintf(buf, "%s\n", gid_attr->ndev->name); + struct net_device *ndev; + int ret = -EINVAL; + + 
rcu_read_lock(); + ndev = rcu_dereference(gid_attr->ndev); + if (ndev) + ret = sysfs_emit(buf, "%s\n", ndev->name); + rcu_read_unlock(); + return ret; } -static size_t print_gid_type(const struct ib_gid_attr *gid_attr, char *buf) +static ssize_t print_gid_type(const struct ib_gid_attr *gid_attr, char *buf) { - return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_attr->gid_type)); + return sysfs_emit(buf, "%s\n", + ib_cache_gid_type_str(gid_attr->gid_type)); } static ssize_t _show_port_gid_attr( - struct ib_port *p, struct port_attribute *attr, char *buf, - size_t (*print)(const struct ib_gid_attr *gid_attr, char *buf)) + struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *attr, + char *buf, + ssize_t (*print)(const struct ib_gid_attr *gid_attr, char *buf)) { struct port_table_attribute *tab_attr = container_of(attr, struct port_table_attribute, attr); const struct ib_gid_attr *gid_attr; ssize_t ret; - gid_attr = rdma_get_gid_attr(p->ibdev, p->port_num, tab_attr->index); + gid_attr = rdma_get_gid_attr(ibdev, port_num, tab_attr->index); if (IS_ERR(gid_attr)) - return PTR_ERR(gid_attr); + /* -EINVAL is returned for user space compatibility reasons. */ + return -EINVAL; ret = print(gid_attr, buf); rdma_put_gid_attr(gid_attr); return ret; } -static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, - char *buf) +static ssize_t show_port_gid(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) { struct port_table_attribute *tab_attr = container_of(attr, struct port_table_attribute, attr); const struct ib_gid_attr *gid_attr; - ssize_t ret; + int len; - gid_attr = rdma_get_gid_attr(p->ibdev, p->port_num, tab_attr->index); + gid_attr = rdma_get_gid_attr(ibdev, port_num, tab_attr->index); if (IS_ERR(gid_attr)) { const union ib_gid zgid = {}; @@ -399,54 +487,56 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, * space throwing such error on fail to read gid, return zero * GID as before. This maintains backward compatibility. 
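print_ndev() above stops dereferencing gid_attr->ndev directly and reads it under rcu_read_lock(), since the netdev can disappear while the sysfs file is being read. A compact sketch of the pattern with a stand-in structure (struct my_gid_attr and my_print_ifname() are illustrative):

#include <linux/netdevice.h>
#include <linux/rcupdate.h>
#include <linux/sysfs.h>

struct my_gid_attr {
        struct net_device __rcu *ndev;
};

static ssize_t my_print_ifname(const struct my_gid_attr *ga, char *buf)
{
        struct net_device *ndev;
        ssize_t ret = -EINVAL;

        rcu_read_lock();
        ndev = rcu_dereference(ga->ndev);       /* may be NULL or being removed */
        if (ndev)
                ret = sysfs_emit(buf, "%s\n", ndev->name);
        rcu_read_unlock();                      /* the name was already copied */
        return ret;
}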
*/ - return sprintf(buf, "%pI6\n", zgid.raw); + return sysfs_emit(buf, "%pI6\n", zgid.raw); } - ret = sprintf(buf, "%pI6\n", gid_attr->gid.raw); + len = sysfs_emit(buf, "%pI6\n", gid_attr->gid.raw); rdma_put_gid_attr(gid_attr); - return ret; + return len; } -static ssize_t show_port_gid_attr_ndev(struct ib_port *p, - struct port_attribute *attr, char *buf) +static ssize_t show_port_gid_attr_ndev(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, + char *buf) { - return _show_port_gid_attr(p, attr, buf, print_ndev); + return _show_port_gid_attr(ibdev, port_num, attr, buf, print_ndev); } -static ssize_t show_port_gid_attr_gid_type(struct ib_port *p, - struct port_attribute *attr, +static ssize_t show_port_gid_attr_gid_type(struct ib_device *ibdev, + u32 port_num, + struct ib_port_attribute *attr, char *buf) { - return _show_port_gid_attr(p, attr, buf, print_gid_type); + return _show_port_gid_attr(ibdev, port_num, attr, buf, print_gid_type); } -static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, - char *buf) +static ssize_t show_port_pkey(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) { struct port_table_attribute *tab_attr = container_of(attr, struct port_table_attribute, attr); u16 pkey; - ssize_t ret; + int ret; - ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey); + ret = ib_query_pkey(ibdev, port_num, tab_attr->index, &pkey); if (ret) return ret; - return sprintf(buf, "0x%04x\n", pkey); + return sysfs_emit(buf, "0x%04x\n", pkey); } #define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ struct port_table_attribute port_pma_attr_##_name = { \ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ .index = (_offset) | ((_width) << 16) | ((_counter) << 24), \ - .attr_id = IB_PMA_PORT_COUNTERS , \ + .attr_id = IB_PMA_PORT_COUNTERS, \ } #define PORT_PMA_ATTR_EXT(_name, _width, _offset) \ struct port_table_attribute port_pma_attr_ext_##_name = { \ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ .index = (_offset) | ((_width) << 16), \ - .attr_id = IB_PMA_PORT_COUNTERS_EXT , \ + .attr_id = IB_PMA_PORT_COUNTERS_EXT, \ } /* @@ -465,8 +555,8 @@ static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr, if (!dev->ops.process_mad) return -ENOSYS; - in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); - out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kzalloc(sizeof(*out_mad), GFP_KERNEL); if (!in_mad || !out_mad) { ret = -ENOMEM; goto out; @@ -481,10 +571,8 @@ static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr, if (attr != IB_PMA_CLASS_PORT_INFO) in_mad->data[41] = port_num; /* PortSelect field */ - if ((dev->ops.process_mad(dev, IB_MAD_IGNORE_MKEY, - port_num, NULL, NULL, - (const struct ib_mad_hdr *)in_mad, mad_size, - (struct ib_mad_hdr *)out_mad, &mad_size, + if ((dev->ops.process_mad(dev, IB_MAD_IGNORE_MKEY, port_num, NULL, NULL, + in_mad, out_mad, &mad_size, &out_mad_pkey_index) & (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) != (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) { @@ -499,47 +587,45 @@ out: return ret; } -static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, - char *buf) +static ssize_t show_pma_counter(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) { struct port_table_attribute *tab_attr = container_of(attr, struct port_table_attribute, attr); int offset = tab_attr->index & 0xffff; int width = (tab_attr->index >> 
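The PORT_PMA_ATTR()/PORT_PMA_ATTR_EXT() macros above pack the counter's byte offset, bit width and counter id into a single int, and show_pma_counter() recovers them with shifts and masks. A sketch of just that packing, with illustrative names:

#include <linux/types.h>

/* offset: bits 0..15, width: bits 16..23, counter: bits 24..31 */
#define MY_PMA_INDEX(counter, width, offset) \
        ((offset) | ((width) << 16) | ((counter) << 24))

static inline void my_pma_unpack(u32 index, unsigned int *offset,
                                 unsigned int *width, unsigned int *counter)
{
        *offset  = index & 0xffff;
        *width   = (index >> 16) & 0xff;
        *counter = (index >> 24) & 0xff;
}

/* MY_PMA_INDEX(0, 16, 32) corresponds to PORT_PMA_ATTR(symbol_error, 0, 16, 32). */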
16) & 0xff; - ssize_t ret; + int ret; u8 data[8]; + int len; - ret = get_perf_mad(p->ibdev, p->port_num, tab_attr->attr_id, &data, + ret = get_perf_mad(ibdev, port_num, tab_attr->attr_id, &data, 40 + offset / 8, sizeof(data)); if (ret < 0) return ret; switch (width) { case 4: - ret = sprintf(buf, "%u\n", (*data >> - (4 - (offset % 8))) & 0xf); + len = sysfs_emit(buf, "%d\n", + (*data >> (4 - (offset % 8))) & 0xf); break; case 8: - ret = sprintf(buf, "%u\n", *data); + len = sysfs_emit(buf, "%u\n", *data); break; case 16: - ret = sprintf(buf, "%u\n", - be16_to_cpup((__be16 *)data)); + len = sysfs_emit(buf, "%u\n", be16_to_cpup((__be16 *)data)); break; case 32: - ret = sprintf(buf, "%u\n", - be32_to_cpup((__be32 *)data)); + len = sysfs_emit(buf, "%u\n", be32_to_cpup((__be32 *)data)); break; case 64: - ret = sprintf(buf, "%llu\n", - be64_to_cpup((__be64 *)data)); + len = sysfs_emit(buf, "%llu\n", be64_to_cpup((__be64 *)data)); break; - default: - ret = 0; + len = 0; + break; } - return ret; + return len; } static PORT_PMA_ATTR(symbol_error , 0, 16, 32); @@ -639,72 +725,49 @@ static struct attribute *pma_attrs_noietf[] = { NULL }; -static struct attribute_group pma_group = { +static const struct attribute_group pma_group = { .name = "counters", .attrs = pma_attrs }; -static struct attribute_group pma_group_ext = { +static const struct attribute_group pma_group_ext = { .name = "counters", .attrs = pma_attrs_ext }; -static struct attribute_group pma_group_noietf = { +static const struct attribute_group pma_group_noietf = { .name = "counters", .attrs = pma_attrs_noietf }; static void ib_port_release(struct kobject *kobj) { - struct ib_port *p = container_of(kobj, struct ib_port, kobj); - struct attribute *a; + struct ib_port *port = container_of(kobj, struct ib_port, kobj); int i; - if (p->gid_group.attrs) { - for (i = 0; (a = p->gid_group.attrs[i]); ++i) - kfree(a); - - kfree(p->gid_group.attrs); - } - - if (p->pkey_group.attrs) { - for (i = 0; (a = p->pkey_group.attrs[i]); ++i) - kfree(a); - - kfree(p->pkey_group.attrs); - } - - kfree(p); + for (i = 0; i != ARRAY_SIZE(port->groups); i++) + kfree(port->groups[i].attrs); + if (port->hw_stats_data) + rdma_free_hw_stats_struct(port->hw_stats_data->stats); + kfree(port->hw_stats_data); + kvfree(port); } static void ib_port_gid_attr_release(struct kobject *kobj) { - struct gid_attr_group *g = container_of(kobj, struct gid_attr_group, - kobj); - struct attribute *a; + struct gid_attr_group *gid_attr_group = + container_of(kobj, struct gid_attr_group, kobj); int i; - if (g->ndev.attrs) { - for (i = 0; (a = g->ndev.attrs[i]); ++i) - kfree(a); - - kfree(g->ndev.attrs); - } - - if (g->type.attrs) { - for (i = 0; (a = g->type.attrs[i]); ++i) - kfree(a); - - kfree(g->type.attrs); - } - - kfree(g); + for (i = 0; i != ARRAY_SIZE(gid_attr_group->groups); i++) + kfree(gid_attr_group->groups[i].attrs); + kfree(gid_attr_group); } static struct kobj_type port_type = { .release = ib_port_release, .sysfs_ops = &port_sysfs_ops, - .default_attrs = port_default_attrs + .default_groups = port_default_groups, }; static struct kobj_type gid_attr_type = { @@ -712,55 +775,12 @@ static struct kobj_type gid_attr_type = { .release = ib_port_gid_attr_release }; -static struct attribute ** -alloc_group_attrs(ssize_t (*show)(struct ib_port *, - struct port_attribute *, char *buf), - int len) -{ - struct attribute **tab_attr; - struct port_table_attribute *element; - int i; - - tab_attr = kcalloc(1 + len, sizeof(struct attribute *), GFP_KERNEL); - if (!tab_attr) - return NULL; - - 
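port_type above moves from the removed .default_attrs field to .default_groups, with ATTRIBUTE_GROUPS(port_default) generating the NULL-terminated group array from port_default_attrs. The same conversion on a stand-in kobj_type (the my_* and version_show() names are illustrative):

#include <linux/kobject.h>
#include <linux/sysfs.h>

static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
                            char *buf)
{
        return sysfs_emit(buf, "1\n");
}
static struct kobj_attribute my_version_attr = __ATTR_RO(version);

static struct attribute *my_default_attrs[] = {
        &my_version_attr.attr,
        NULL
};
/* Expands to my_default_group plus a NULL-terminated my_default_groups[]. */
ATTRIBUTE_GROUPS(my_default);

static struct kobj_type my_ktype = {
        .sysfs_ops      = &kobj_sysfs_ops,
        .default_groups = my_default_groups,
        /* .release omitted in this sketch; a real kobj_type needs one */
};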
for (i = 0; i < len; i++) { - element = kzalloc(sizeof(struct port_table_attribute), - GFP_KERNEL); - if (!element) - goto err; - - if (snprintf(element->name, sizeof(element->name), - "%d", i) >= sizeof(element->name)) { - kfree(element); - goto err; - } - - element->attr.attr.name = element->name; - element->attr.attr.mode = S_IRUGO; - element->attr.show = show; - element->index = i; - sysfs_attr_init(&element->attr.attr); - - tab_attr[i] = &element->attr.attr; - } - - return tab_attr; - -err: - while (--i >= 0) - kfree(tab_attr[i]); - kfree(tab_attr); - return NULL; -} - /* * Figure out which counter table to use depending on * the device capabilities. */ -static struct attribute_group *get_counter_table(struct ib_device *dev, - int port_num) +static const struct attribute_group *get_counter_table(struct ib_device *dev, + int port_num) { struct ib_class_port_info cpi; @@ -780,7 +800,7 @@ static struct attribute_group *get_counter_table(struct ib_device *dev, } static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats, - u8 port_num, int index) + u32 port_num, int index) { int ret; @@ -795,74 +815,50 @@ static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats, return 0; } -static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf) +static int print_hw_stat(struct ib_device *dev, int port_num, + struct rdma_hw_stats *stats, int index, char *buf) { - return sprintf(buf, "%llu\n", stats->value[index]); + u64 v = rdma_counter_get_hwstat_value(dev, port_num, index); + + return sysfs_emit(buf, "%llu\n", stats->value[index] + v); } -static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr, - char *buf) +static ssize_t show_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, unsigned int index, + unsigned int port_num, char *buf) { - struct ib_device *dev; - struct ib_port *port; - struct hw_stats_attribute *hsa; - struct rdma_hw_stats *stats; int ret; - hsa = container_of(attr, struct hw_stats_attribute, attr); - if (!hsa->port_num) { - dev = container_of((struct device *)kobj, - struct ib_device, dev); - stats = dev->hw_stats; - } else { - port = container_of(kobj, struct ib_port, kobj); - dev = port->ibdev; - stats = port->hw_stats; - } mutex_lock(&stats->lock); - ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index); + ret = update_hw_stats(ibdev, stats, port_num, index); if (ret) goto unlock; - ret = print_hw_stat(stats, hsa->index, buf); + ret = print_hw_stat(ibdev, port_num, stats, index, buf); unlock: mutex_unlock(&stats->lock); return ret; } -static ssize_t show_stats_lifespan(struct kobject *kobj, - struct attribute *attr, +static ssize_t show_stats_lifespan(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + unsigned int index, unsigned int port_num, char *buf) { - struct hw_stats_attribute *hsa; - struct rdma_hw_stats *stats; int msecs; - hsa = container_of(attr, struct hw_stats_attribute, attr); - if (!hsa->port_num) { - struct ib_device *dev = container_of((struct device *)kobj, - struct ib_device, dev); - - stats = dev->hw_stats; - } else { - struct ib_port *p = container_of(kobj, struct ib_port, kobj); - - stats = p->hw_stats; - } - mutex_lock(&stats->lock); msecs = jiffies_to_msecs(stats->lifespan); mutex_unlock(&stats->lock); - return sprintf(buf, "%d\n", msecs); + return sysfs_emit(buf, "%d\n", msecs); } -static ssize_t set_stats_lifespan(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t count) +static ssize_t set_stats_lifespan(struct ib_device 
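The lifespan attribute handled above is stored as a jiffies interval and bounds how often show_hw_stats() lets the driver's get_hw_stats() be re-invoked (the freshness check itself sits in update_hw_stats(), whose body is not part of this hunk). A sketch of that time-based cache, assuming only the standard jiffies helpers (my_* names and the hw_read() callback are stand-ins):

#include <linux/jiffies.h>

struct my_cached_stat {
        unsigned long timestamp;        /* jiffies of the last hardware read */
        unsigned long lifespan;         /* how long the cached value stays fresh */
        u64 value;
};

static int my_stat_maybe_refresh(struct my_cached_stat *s, int (*hw_read)(u64 *))
{
        int ret;

        /* Still fresh: serve the cached value without touching hardware. */
        if (time_before(jiffies, s->timestamp + s->lifespan))
                return 0;

        ret = hw_read(&s->value);
        if (ret)
                return ret;
        s->timestamp = jiffies;
        return 0;
}

/* Userspace writes milliseconds; keep them as jiffies like set_stats_lifespan(). */
static void my_stat_set_lifespan(struct my_cached_stat *s, int msecs)
{
        s->lifespan = msecs_to_jiffies(msecs);
}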
*ibdev, + struct rdma_hw_stats *stats, + unsigned int index, unsigned int port_num, + const char *buf, size_t count) { - struct hw_stats_attribute *hsa; - struct rdma_hw_stats *stats; int msecs; int jiffies; int ret; @@ -873,17 +869,6 @@ static ssize_t set_stats_lifespan(struct kobject *kobj, if (msecs < 0 || msecs > 10000) return -EINVAL; jiffies = msecs_to_jiffies(msecs); - hsa = container_of(attr, struct hw_stats_attribute, attr); - if (!hsa->port_num) { - struct ib_device *dev = container_of((struct device *)kobj, - struct ib_device, dev); - - stats = dev->hw_stats; - } else { - struct ib_port *p = container_of(kobj, struct ib_port, kobj); - - stats = p->hw_stats; - } mutex_lock(&stats->lock); stats->lifespan = jiffies; @@ -892,361 +877,491 @@ static ssize_t set_stats_lifespan(struct kobject *kobj, return count; } -static void free_hsag(struct kobject *kobj, struct attribute_group *attr_group) +static struct hw_stats_device_data * +alloc_hw_stats_device(struct ib_device *ibdev) { - struct attribute **attr; - - sysfs_remove_group(kobj, attr_group); + struct hw_stats_device_data *data; + struct rdma_hw_stats *stats; - for (attr = attr_group->attrs; *attr; attr++) - kfree(*attr); - kfree(attr_group); -} + if (!ibdev->ops.alloc_hw_device_stats) + return ERR_PTR(-EOPNOTSUPP); + stats = ibdev->ops.alloc_hw_device_stats(ibdev); + if (!stats) + return ERR_PTR(-ENOMEM); + if (!stats->descs || stats->num_counters <= 0) + goto err_free_stats; -static struct attribute *alloc_hsa(int index, u8 port_num, const char *name) -{ - struct hw_stats_attribute *hsa; + /* + * Two extra attribue elements here, one for the lifespan entry and + * one to NULL terminate the list for the sysfs core code + */ + data = kzalloc(struct_size(data, attrs, size_add(stats->num_counters, 1)), + GFP_KERNEL); + if (!data) + goto err_free_stats; + data->group.attrs = kcalloc(stats->num_counters + 2, + sizeof(*data->group.attrs), GFP_KERNEL); + if (!data->group.attrs) + goto err_free_data; - hsa = kmalloc(sizeof(*hsa), GFP_KERNEL); - if (!hsa) - return NULL; + data->group.name = "hw_counters"; + data->stats = stats; + return data; - hsa->attr.name = (char *)name; - hsa->attr.mode = S_IRUGO; - hsa->show = show_hw_stats; - hsa->store = NULL; - hsa->index = index; - hsa->port_num = port_num; +err_free_data: + kfree(data); +err_free_stats: + rdma_free_hw_stats_struct(stats); + return ERR_PTR(-ENOMEM); +} - return &hsa->attr; +void ib_device_release_hw_stats(struct hw_stats_device_data *data) +{ + kfree(data->group.attrs); + rdma_free_hw_stats_struct(data->stats); + kfree(data); } -static struct attribute *alloc_hsa_lifespan(char *name, u8 port_num) +int ib_setup_device_attrs(struct ib_device *ibdev) { - struct hw_stats_attribute *hsa; + struct hw_stats_device_attribute *attr; + struct hw_stats_device_data *data; + bool opstat_skipped = false; + int i, ret, pos = 0; + + data = alloc_hw_stats_device(ibdev); + if (IS_ERR(data)) { + if (PTR_ERR(data) == -EOPNOTSUPP) + return 0; + return PTR_ERR(data); + } + ibdev->hw_stats_data = data; - hsa = kmalloc(sizeof(*hsa), GFP_KERNEL); - if (!hsa) - return NULL; + ret = ibdev->ops.get_hw_stats(ibdev, data->stats, 0, + data->stats->num_counters); + if (ret != data->stats->num_counters) { + if (WARN_ON(ret >= 0)) + return -EINVAL; + return ret; + } + + data->stats->timestamp = jiffies; - hsa->attr.name = name; - hsa->attr.mode = S_IWUSR | S_IRUGO; - hsa->show = show_stats_lifespan; - hsa->store = set_stats_lifespan; - hsa->index = 0; - hsa->port_num = port_num; + for (i = 0; i < 
data->stats->num_counters; i++) { + if (data->stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) { + opstat_skipped = true; + continue; + } - return &hsa->attr; + WARN_ON(opstat_skipped); + attr = &data->attrs[pos]; + sysfs_attr_init(&attr->attr.attr); + attr->attr.attr.name = data->stats->descs[i].name; + attr->attr.attr.mode = 0444; + attr->attr.show = hw_stat_device_show; + attr->show = show_hw_stats; + data->group.attrs[pos] = &attr->attr.attr; + pos++; + } + + attr = &data->attrs[pos]; + sysfs_attr_init(&attr->attr.attr); + attr->attr.attr.name = "lifespan"; + attr->attr.attr.mode = 0644; + attr->attr.show = hw_stat_device_show; + attr->show = show_stats_lifespan; + attr->attr.store = hw_stat_device_store; + attr->store = set_stats_lifespan; + data->group.attrs[pos] = &attr->attr.attr; + for (i = 0; i != ARRAY_SIZE(ibdev->groups); i++) + if (!ibdev->groups[i]) { + ibdev->groups[i] = &data->group; + ibdev->hw_stats_attr_index = i; + return 0; + } + WARN(true, "struct ib_device->groups is too small"); + return -EINVAL; } -static void setup_hw_stats(struct ib_device *device, struct ib_port *port, - u8 port_num) +static struct hw_stats_port_data * +alloc_hw_stats_port(struct ib_port *port, struct attribute_group *group) { - struct attribute_group *hsag; + struct ib_device *ibdev = port->ibdev; + struct hw_stats_port_data *data; struct rdma_hw_stats *stats; - int i, ret; - - stats = device->ops.alloc_hw_stats(device, port_num); + if (!ibdev->ops.alloc_hw_port_stats) + return ERR_PTR(-EOPNOTSUPP); + stats = ibdev->ops.alloc_hw_port_stats(port->ibdev, port->port_num); if (!stats) - return; - - if (!stats->names || stats->num_counters <= 0) + return ERR_PTR(-ENOMEM); + if (!stats->descs || stats->num_counters <= 0) goto err_free_stats; /* * Two extra attribue elements here, one for the lifespan entry and * one to NULL terminate the list for the sysfs core code */ - hsag = kzalloc(sizeof(*hsag) + - sizeof(void *) * (stats->num_counters + 2), + data = kzalloc(struct_size(data, attrs, size_add(stats->num_counters, 1)), GFP_KERNEL); - if (!hsag) + if (!data) goto err_free_stats; + group->attrs = kcalloc(stats->num_counters + 2, + sizeof(*group->attrs), GFP_KERNEL); + if (!group->attrs) + goto err_free_data; - ret = device->ops.get_hw_stats(device, stats, port_num, - stats->num_counters); - if (ret != stats->num_counters) - goto err_free_hsag; + group->name = "hw_counters"; + data->stats = stats; + return data; - stats->timestamp = jiffies; - - hsag->name = "hw_counters"; - hsag->attrs = (void *)hsag + sizeof(*hsag); +err_free_data: + kfree(data); +err_free_stats: + rdma_free_hw_stats_struct(stats); + return ERR_PTR(-ENOMEM); +} - for (i = 0; i < stats->num_counters; i++) { - hsag->attrs[i] = alloc_hsa(i, port_num, stats->names[i]); - if (!hsag->attrs[i]) - goto err; - sysfs_attr_init(hsag->attrs[i]); +static int setup_hw_port_stats(struct ib_port *port, + struct attribute_group *group) +{ + struct hw_stats_port_attribute *attr; + struct hw_stats_port_data *data; + bool opstat_skipped = false; + int i, ret, pos = 0; + + data = alloc_hw_stats_port(port, group); + if (IS_ERR(data)) + return PTR_ERR(data); + + ret = port->ibdev->ops.get_hw_stats(port->ibdev, data->stats, + port->port_num, + data->stats->num_counters); + if (ret != data->stats->num_counters) { + if (WARN_ON(ret >= 0)) + return -EINVAL; + return ret; } - mutex_init(&stats->lock); - /* treat an error here as non-fatal */ - hsag->attrs[i] = alloc_hsa_lifespan("lifespan", port_num); - if (hsag->attrs[i]) - sysfs_attr_init(hsag->attrs[i]); + 
data->stats->timestamp = jiffies; - if (port) { - struct kobject *kobj = &port->kobj; - ret = sysfs_create_group(kobj, hsag); - if (ret) - goto err; - port->hw_stats_ag = hsag; - port->hw_stats = stats; - } else { - struct kobject *kobj = &device->dev.kobj; - ret = sysfs_create_group(kobj, hsag); - if (ret) - goto err; - device->hw_stats_ag = hsag; - device->hw_stats = stats; + for (i = 0; i < data->stats->num_counters; i++) { + if (data->stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) { + opstat_skipped = true; + continue; + } + + WARN_ON(opstat_skipped); + attr = &data->attrs[pos]; + sysfs_attr_init(&attr->attr.attr); + attr->attr.attr.name = data->stats->descs[i].name; + attr->attr.attr.mode = 0444; + attr->attr.show = hw_stat_port_show; + attr->show = show_hw_stats; + group->attrs[pos] = &attr->attr.attr; + pos++; } - return; + attr = &data->attrs[pos]; + sysfs_attr_init(&attr->attr.attr); + attr->attr.attr.name = "lifespan"; + attr->attr.attr.mode = 0644; + attr->attr.show = hw_stat_port_show; + attr->show = show_stats_lifespan; + attr->attr.store = hw_stat_port_store; + attr->store = set_stats_lifespan; + group->attrs[pos] = &attr->attr.attr; + + port->hw_stats_data = data; + return 0; +} -err: - for (; i >= 0; i--) - kfree(hsag->attrs[i]); -err_free_hsag: - kfree(hsag); -err_free_stats: - kfree(stats); - return; +struct rdma_hw_stats *ib_get_hw_stats_port(struct ib_device *ibdev, + u32 port_num) +{ + if (!ibdev->port_data || !rdma_is_port_valid(ibdev, port_num) || + !ibdev->port_data[port_num].sysfs->hw_stats_data) + return NULL; + return ibdev->port_data[port_num].sysfs->hw_stats_data->stats; } -static int add_port(struct ib_device *device, int port_num, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) +static int +alloc_port_table_group(const char *name, struct attribute_group *group, + struct port_table_attribute *attrs, size_t num, + ssize_t (*show)(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *, char *buf)) { - struct ib_port *p; - struct ib_port_attr attr; + struct attribute **attr_list; int i; - int ret; - ret = ib_query_port(device, port_num, &attr); - if (ret) - return ret; - - p = kzalloc(sizeof *p, GFP_KERNEL); - if (!p) + attr_list = kcalloc(num + 1, sizeof(*attr_list), GFP_KERNEL); + if (!attr_list) return -ENOMEM; - p->ibdev = device; - p->port_num = port_num; + for (i = 0; i < num; i++) { + struct port_table_attribute *element = &attrs[i]; - ret = kobject_init_and_add(&p->kobj, &port_type, - device->ports_kobj, - "%d", port_num); - if (ret) { - kfree(p); - return ret; - } + if (snprintf(element->name, sizeof(element->name), "%d", i) >= + sizeof(element->name)) + goto err; - p->gid_attr_group = kzalloc(sizeof(*p->gid_attr_group), GFP_KERNEL); - if (!p->gid_attr_group) { - ret = -ENOMEM; - goto err_put; - } + sysfs_attr_init(&element->attr.attr); + element->attr.attr.name = element->name; + element->attr.attr.mode = 0444; + element->attr.show = show; + element->index = i; - p->gid_attr_group->port = p; - ret = kobject_init_and_add(&p->gid_attr_group->kobj, &gid_attr_type, - &p->kobj, "gid_attrs"); - if (ret) { - kfree(p->gid_attr_group); - goto err_put; + attr_list[i] = &element->attr.attr; } + group->name = name; + group->attrs = attr_list; + return 0; +err: + kfree(attr_list); + return -EINVAL; +} - if (device->ops.process_mad) { - p->pma_table = get_counter_table(device, port_num); - ret = sysfs_create_group(&p->kobj, p->pma_table); - if (ret) - goto err_put_gid_attrs; - } +/* + * Create the sysfs: + * 
ibp0s9/ports/XX/gid_attrs/{ndevs,types}/YYY + * YYY is the gid table index in decimal + */ +static int setup_gid_attrs(struct ib_port *port, + const struct ib_port_attr *attr) +{ + struct gid_attr_group *gid_attr_group; + int ret; - p->gid_group.name = "gids"; - p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); - if (!p->gid_group.attrs) { - ret = -ENOMEM; - goto err_remove_pma; - } + gid_attr_group = kzalloc(struct_size(gid_attr_group, attrs_list, + size_mul(attr->gid_tbl_len, 2)), + GFP_KERNEL); + if (!gid_attr_group) + return -ENOMEM; + gid_attr_group->port = port; + kobject_init(&gid_attr_group->kobj, &gid_attr_type); - ret = sysfs_create_group(&p->kobj, &p->gid_group); + ret = alloc_port_table_group("ndevs", &gid_attr_group->groups[0], + gid_attr_group->attrs_list, + attr->gid_tbl_len, + show_port_gid_attr_ndev); if (ret) - goto err_free_gid; + goto err_put; + gid_attr_group->groups_list[0] = &gid_attr_group->groups[0]; - p->gid_attr_group->ndev.name = "ndevs"; - p->gid_attr_group->ndev.attrs = alloc_group_attrs(show_port_gid_attr_ndev, - attr.gid_tbl_len); - if (!p->gid_attr_group->ndev.attrs) { - ret = -ENOMEM; - goto err_remove_gid; - } + ret = alloc_port_table_group( + "types", &gid_attr_group->groups[1], + gid_attr_group->attrs_list + attr->gid_tbl_len, + attr->gid_tbl_len, show_port_gid_attr_gid_type); + if (ret) + goto err_put; + gid_attr_group->groups_list[1] = &gid_attr_group->groups[1]; - ret = sysfs_create_group(&p->gid_attr_group->kobj, - &p->gid_attr_group->ndev); + ret = kobject_add(&gid_attr_group->kobj, &port->kobj, "gid_attrs"); if (ret) - goto err_free_gid_ndev; + goto err_put; + ret = sysfs_create_groups(&gid_attr_group->kobj, + gid_attr_group->groups_list); + if (ret) + goto err_del; + port->gid_attr_group = gid_attr_group; + return 0; - p->gid_attr_group->type.name = "types"; - p->gid_attr_group->type.attrs = alloc_group_attrs(show_port_gid_attr_gid_type, - attr.gid_tbl_len); - if (!p->gid_attr_group->type.attrs) { - ret = -ENOMEM; - goto err_remove_gid_ndev; - } +err_del: + kobject_del(&gid_attr_group->kobj); +err_put: + kobject_put(&gid_attr_group->kobj); + return ret; +} - ret = sysfs_create_group(&p->gid_attr_group->kobj, - &p->gid_attr_group->type); - if (ret) - goto err_free_gid_type; +static void destroy_gid_attrs(struct ib_port *port) +{ + struct gid_attr_group *gid_attr_group = port->gid_attr_group; - p->pkey_group.name = "pkeys"; - p->pkey_group.attrs = alloc_group_attrs(show_port_pkey, - attr.pkey_tbl_len); - if (!p->pkey_group.attrs) { - ret = -ENOMEM; - goto err_remove_gid_type; - } + if (!gid_attr_group) + return; + sysfs_remove_groups(&gid_attr_group->kobj, gid_attr_group->groups_list); + kobject_del(&gid_attr_group->kobj); + kobject_put(&gid_attr_group->kobj); +} - ret = sysfs_create_group(&p->kobj, &p->pkey_group); +/* + * Create the sysfs: + * ibp0s9/ports/XX/{gids,pkeys,counters}/YYY + */ +static struct ib_port *setup_port(struct ib_core_device *coredev, int port_num, + const struct ib_port_attr *attr) +{ + struct ib_device *device = rdma_device_to_ibdev(&coredev->dev); + bool is_full_dev = &device->coredev == coredev; + const struct attribute_group **cur_group; + struct ib_port *p; + int ret; + + p = kvzalloc(struct_size(p, attrs_list, + size_add(attr->gid_tbl_len, attr->pkey_tbl_len)), + GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); + p->ibdev = device; + p->port_num = port_num; + kobject_init(&p->kobj, &port_type); + + if (device->port_data && is_full_dev) + device->port_data[port_num].sysfs = p; + + cur_group = 
p->groups_list; + ret = alloc_port_table_group("gids", &p->groups[0], p->attrs_list, + attr->gid_tbl_len, show_port_gid); if (ret) - goto err_free_pkey; + goto err_put; + *cur_group++ = &p->groups[0]; - if (port_callback) { - ret = port_callback(device, port_num, &p->kobj); + if (attr->pkey_tbl_len) { + ret = alloc_port_table_group("pkeys", &p->groups[1], + p->attrs_list + attr->gid_tbl_len, + attr->pkey_tbl_len, show_port_pkey); if (ret) - goto err_remove_pkey; + goto err_put; + *cur_group++ = &p->groups[1]; } /* * If port == 0, it means hw_counters are per device and not per - * port, so holder should be device. Therefore skip per port conunter - * initialization. + * port, so holder should be device. Therefore skip per port + * counter initialization. */ - if (device->ops.alloc_hw_stats && port_num) - setup_hw_stats(device, p, port_num); - - list_add_tail(&p->kobj.entry, &device->port_list); - - kobject_uevent(&p->kobj, KOBJ_ADD); - return 0; - -err_remove_pkey: - sysfs_remove_group(&p->kobj, &p->pkey_group); - -err_free_pkey: - for (i = 0; i < attr.pkey_tbl_len; ++i) - kfree(p->pkey_group.attrs[i]); - - kfree(p->pkey_group.attrs); - p->pkey_group.attrs = NULL; - -err_remove_gid_type: - sysfs_remove_group(&p->gid_attr_group->kobj, - &p->gid_attr_group->type); - -err_free_gid_type: - for (i = 0; i < attr.gid_tbl_len; ++i) - kfree(p->gid_attr_group->type.attrs[i]); + if (port_num && is_full_dev) { + ret = setup_hw_port_stats(p, &p->groups[2]); + if (ret && ret != -EOPNOTSUPP) + goto err_put; + if (!ret) + *cur_group++ = &p->groups[2]; + } - kfree(p->gid_attr_group->type.attrs); - p->gid_attr_group->type.attrs = NULL; + if (device->ops.process_mad && is_full_dev) + *cur_group++ = get_counter_table(device, port_num); -err_remove_gid_ndev: - sysfs_remove_group(&p->gid_attr_group->kobj, - &p->gid_attr_group->ndev); + ret = kobject_add(&p->kobj, coredev->ports_kobj, "%d", port_num); + if (ret) + goto err_put; + ret = sysfs_create_groups(&p->kobj, p->groups_list); + if (ret) + goto err_del; + if (is_full_dev) { + ret = sysfs_create_groups(&p->kobj, device->ops.port_groups); + if (ret) + goto err_groups; + } -err_free_gid_ndev: - for (i = 0; i < attr.gid_tbl_len; ++i) - kfree(p->gid_attr_group->ndev.attrs[i]); + list_add_tail(&p->kobj.entry, &coredev->port_list); + return p; - kfree(p->gid_attr_group->ndev.attrs); - p->gid_attr_group->ndev.attrs = NULL; +err_groups: + sysfs_remove_groups(&p->kobj, p->groups_list); +err_del: + kobject_del(&p->kobj); +err_put: + if (device->port_data && is_full_dev) + device->port_data[port_num].sysfs = NULL; + kobject_put(&p->kobj); + return ERR_PTR(ret); +} -err_remove_gid: - sysfs_remove_group(&p->kobj, &p->gid_group); +static void destroy_port(struct ib_core_device *coredev, struct ib_port *port) +{ + bool is_full_dev = &port->ibdev->coredev == coredev; -err_free_gid: - for (i = 0; i < attr.gid_tbl_len; ++i) - kfree(p->gid_group.attrs[i]); + list_del(&port->kobj.entry); + if (is_full_dev) + sysfs_remove_groups(&port->kobj, port->ibdev->ops.port_groups); - kfree(p->gid_group.attrs); - p->gid_group.attrs = NULL; + sysfs_remove_groups(&port->kobj, port->groups_list); + kobject_del(&port->kobj); -err_remove_pma: - if (p->pma_table) - sysfs_remove_group(&p->kobj, p->pma_table); + if (port->ibdev->port_data && + port->ibdev->port_data[port->port_num].sysfs == port) + port->ibdev->port_data[port->port_num].sysfs = NULL; -err_put_gid_attrs: - kobject_put(&p->gid_attr_group->kobj); + kobject_put(&port->kobj); +} -err_put: - kobject_put(&p->kobj); - return ret; +static 
const char *node_type_string(int node_type) +{ + switch (node_type) { + case RDMA_NODE_IB_CA: + return "CA"; + case RDMA_NODE_IB_SWITCH: + return "switch"; + case RDMA_NODE_IB_ROUTER: + return "router"; + case RDMA_NODE_RNIC: + return "RNIC"; + case RDMA_NODE_USNIC: + return "usNIC"; + case RDMA_NODE_USNIC_UDP: + return "usNIC UDP"; + case RDMA_NODE_UNSPECIFIED: + return "unspecified"; + } + return "<unknown>"; } static ssize_t node_type_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); - - switch (dev->node_type) { - case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type); - case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type); - case RDMA_NODE_USNIC: return sprintf(buf, "%d: usNIC\n", dev->node_type); - case RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type); - case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type); - case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type); - default: return sprintf(buf, "%d: <unknown>\n", dev->node_type); - } + struct ib_device *dev = rdma_device_to_ibdev(device); + + return sysfs_emit(buf, "%u: %s\n", dev->node_type, + node_type_string(dev->node_type)); } static DEVICE_ATTR_RO(node_type); static ssize_t sys_image_guid_show(struct device *device, struct device_attribute *dev_attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); - - return sprintf(buf, "%04x:%04x:%04x:%04x\n", - be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]), - be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[1]), - be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]), - be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3])); + struct ib_device *dev = rdma_device_to_ibdev(device); + __be16 *guid = (__be16 *)&dev->attrs.sys_image_guid; + + return sysfs_emit(buf, "%04x:%04x:%04x:%04x\n", + be16_to_cpu(guid[0]), + be16_to_cpu(guid[1]), + be16_to_cpu(guid[2]), + be16_to_cpu(guid[3])); } static DEVICE_ATTR_RO(sys_image_guid); static ssize_t node_guid_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); - - return sprintf(buf, "%04x:%04x:%04x:%04x\n", - be16_to_cpu(((__be16 *) &dev->node_guid)[0]), - be16_to_cpu(((__be16 *) &dev->node_guid)[1]), - be16_to_cpu(((__be16 *) &dev->node_guid)[2]), - be16_to_cpu(((__be16 *) &dev->node_guid)[3])); + struct ib_device *dev = rdma_device_to_ibdev(device); + __be16 *node_guid = (__be16 *)&dev->node_guid; + + return sysfs_emit(buf, "%04x:%04x:%04x:%04x\n", + be16_to_cpu(node_guid[0]), + be16_to_cpu(node_guid[1]), + be16_to_cpu(node_guid[2]), + be16_to_cpu(node_guid[3])); } static DEVICE_ATTR_RO(node_guid); static ssize_t node_desc_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); - return sprintf(buf, "%.64s\n", dev->node_desc); + return sysfs_emit(buf, "%.64s\n", dev->node_desc); } static ssize_t node_desc_store(struct device *device, struct device_attribute *attr, const char *buf, size_t count) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); struct ib_device_modify desc = {}; int ret; if (!dev->ops.modify_device) - return -EIO; + return -EOPNOTSUPP; memcpy(desc.node_desc, buf, min_t(int, count, 
IB_DEVICE_NODE_DESC_MAX)); ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc); @@ -1260,11 +1375,12 @@ static DEVICE_ATTR_RW(node_desc); static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); + char version[IB_FW_VERSION_NAME_MAX] = {}; - ib_get_device_fw_str(dev, buf); - strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX); - return strlen(buf); + ib_get_device_fw_str(dev, version); + + return sysfs_emit(buf, "%s\n", version); } static DEVICE_ATTR_RO(fw_ver); @@ -1277,93 +1393,81 @@ static struct attribute *ib_dev_attrs[] = { NULL, }; -static const struct attribute_group dev_attr_group = { +const struct attribute_group ib_dev_attr_group = { .attrs = ib_dev_attrs, }; -static void free_port_list_attributes(struct ib_device *device) +void ib_free_port_attrs(struct ib_core_device *coredev) { struct kobject *p, *t; - list_for_each_entry_safe(p, t, &device->port_list, entry) { + list_for_each_entry_safe(p, t, &coredev->port_list, entry) { struct ib_port *port = container_of(p, struct ib_port, kobj); - list_del(&p->entry); - if (port->hw_stats) { - kfree(port->hw_stats); - free_hsag(&port->kobj, port->hw_stats_ag); - } - if (port->pma_table) - sysfs_remove_group(p, port->pma_table); - sysfs_remove_group(p, &port->pkey_group); - sysfs_remove_group(p, &port->gid_group); - sysfs_remove_group(&port->gid_attr_group->kobj, - &port->gid_attr_group->ndev); - sysfs_remove_group(&port->gid_attr_group->kobj, - &port->gid_attr_group->type); - kobject_put(&port->gid_attr_group->kobj); - kobject_put(p); + destroy_gid_attrs(port); + destroy_port(coredev, port); } - kobject_put(device->ports_kobj); + kobject_put(coredev->ports_kobj); } -int ib_device_register_sysfs(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) +int ib_setup_port_attrs(struct ib_core_device *coredev) { - struct device *class_dev = &device->dev; + struct ib_device *device = rdma_device_to_ibdev(&coredev->dev); + u32 port_num; int ret; - int i; - - device->groups[0] = &dev_attr_group; - class_dev->groups = device->groups; - ret = device_add(class_dev); - if (ret) - goto err; + coredev->ports_kobj = kobject_create_and_add("ports", + &coredev->dev.kobj); + if (!coredev->ports_kobj) + return -ENOMEM; - device->ports_kobj = kobject_create_and_add("ports", &class_dev->kobj); - if (!device->ports_kobj) { - ret = -ENOMEM; - goto err_put; - } + rdma_for_each_port (device, port_num) { + struct ib_port_attr attr; + struct ib_port *port; - if (rdma_cap_ib_switch(device)) { - ret = add_port(device, 0, port_callback); + ret = ib_query_port(device, port_num, &attr); if (ret) goto err_put; - } else { - for (i = 1; i <= device->phys_port_cnt; ++i) { - ret = add_port(device, i, port_callback); - if (ret) - goto err_put; - } - } - if (device->ops.alloc_hw_stats) - setup_hw_stats(device, NULL, 0); + port = setup_port(coredev, port_num, &attr); + if (IS_ERR(port)) { + ret = PTR_ERR(port); + goto err_put; + } + ret = setup_gid_attrs(port, &attr); + if (ret) + goto err_put; + } return 0; err_put: - free_port_list_attributes(device); - device_del(class_dev); -err: + ib_free_port_attrs(coredev); return ret; } -void ib_device_unregister_sysfs(struct ib_device *device) +/** + * ib_port_register_client_groups - Add an ib_client's attributes to the port + * + * @ibdev: IB device to add counters + * @port_num: valid port number + * @groups: Group list of 
attributes + * + * Do not use. Only for legacy sysfs compatibility. + */ +int ib_port_register_client_groups(struct ib_device *ibdev, u32 port_num, + const struct attribute_group **groups) { - /* Hold device until ib_dealloc_device() */ - get_device(&device->dev); - - free_port_list_attributes(device); - - if (device->hw_stats) { - kfree(device->hw_stats); - free_hsag(&device->dev.kobj, device->hw_stats_ag); - } + return sysfs_create_groups(&ibdev->port_data[port_num].sysfs->kobj, + groups); +} +EXPORT_SYMBOL(ib_port_register_client_groups); - device_unregister(&device->dev); +void ib_port_unregister_client_groups(struct ib_device *ibdev, u32 port_num, + const struct attribute_group **groups) +{ + return sysfs_remove_groups(&ibdev->port_data[port_num].sysfs->kobj, + groups); } +EXPORT_SYMBOL(ib_port_unregister_client_groups); diff --git a/drivers/infiniband/core/trace.c b/drivers/infiniband/core/trace.c new file mode 100644 index 000000000000..31e7860d35bf --- /dev/null +++ b/drivers/infiniband/core/trace.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Trace points for core RDMA functions. + * + * Author: Chuck Lever <chuck.lever@oracle.com> + * + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. + */ + +#define CREATE_TRACE_POINTS + +#include <trace/events/rdma_core.h> diff --git a/drivers/infiniband/core/ucaps.c b/drivers/infiniband/core/ucaps.c new file mode 100644 index 000000000000..de5cb8bf0a61 --- /dev/null +++ b/drivers/infiniband/core/ucaps.c @@ -0,0 +1,267 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include <linux/kref.h> +#include <linux/cdev.h> +#include <linux/mutex.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <rdma/ib_ucaps.h> + +#define RDMA_UCAP_FIRST RDMA_UCAP_MLX5_CTRL_LOCAL + +static DEFINE_MUTEX(ucaps_mutex); +static struct ib_ucap *ucaps_list[RDMA_UCAP_MAX]; +static bool ucaps_class_is_registered; +static dev_t ucaps_base_dev; + +struct ib_ucap { + struct cdev cdev; + struct device dev; + struct kref ref; +}; + +static const char *ucap_names[RDMA_UCAP_MAX] = { + [RDMA_UCAP_MLX5_CTRL_LOCAL] = "mlx5_perm_ctrl_local", + [RDMA_UCAP_MLX5_CTRL_OTHER_VHCA] = "mlx5_perm_ctrl_other_vhca" +}; + +static char *ucaps_devnode(const struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0600; + + return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); +} + +static const struct class ucaps_class = { + .name = "infiniband_ucaps", + .devnode = ucaps_devnode, +}; + +static const struct file_operations ucaps_cdev_fops = { + .owner = THIS_MODULE, + .open = simple_open, +}; + +/** + * ib_cleanup_ucaps - cleanup all API resources and class. + * + * This is called once, when removing the ib_uverbs module. 
+ */ +void ib_cleanup_ucaps(void) +{ + mutex_lock(&ucaps_mutex); + if (!ucaps_class_is_registered) { + mutex_unlock(&ucaps_mutex); + return; + } + + for (int i = RDMA_UCAP_FIRST; i < RDMA_UCAP_MAX; i++) + WARN_ON(ucaps_list[i]); + + class_unregister(&ucaps_class); + ucaps_class_is_registered = false; + unregister_chrdev_region(ucaps_base_dev, RDMA_UCAP_MAX); + mutex_unlock(&ucaps_mutex); +} + +static int get_ucap_from_devt(dev_t devt, u64 *idx_mask) +{ + for (int type = RDMA_UCAP_FIRST; type < RDMA_UCAP_MAX; type++) { + if (ucaps_list[type] && ucaps_list[type]->dev.devt == devt) { + *idx_mask |= 1 << type; + return 0; + } + } + + return -EINVAL; +} + +static int get_devt_from_fd(unsigned int fd, dev_t *ret_dev) +{ + struct file *file; + + file = fget(fd); + if (!file) + return -EBADF; + + *ret_dev = file_inode(file)->i_rdev; + fput(file); + return 0; +} + +/** + * ib_ucaps_init - Initialization required before ucap creation. + * + * Return: 0 on success, or a negative errno value on failure + */ +static int ib_ucaps_init(void) +{ + int ret = 0; + + if (ucaps_class_is_registered) + return ret; + + ret = class_register(&ucaps_class); + if (ret) + return ret; + + ret = alloc_chrdev_region(&ucaps_base_dev, 0, RDMA_UCAP_MAX, + ucaps_class.name); + if (ret < 0) { + class_unregister(&ucaps_class); + return ret; + } + + ucaps_class_is_registered = true; + + return 0; +} + +static void ucap_dev_release(struct device *device) +{ + struct ib_ucap *ucap = container_of(device, struct ib_ucap, dev); + + kfree(ucap); +} + +/** + * ib_create_ucap - Add a ucap character device + * @type: UCAP type + * + * Creates a ucap character device in the /dev/infiniband directory. By default, + * the device has root-only read-write access. + * + * A driver may call this multiple times with the same UCAP type. A reference + * count tracks creations and deletions. + * + * Return: 0 on success, or a negative errno value on failure + */ +int ib_create_ucap(enum rdma_user_cap type) +{ + struct ib_ucap *ucap; + int ret; + + if (type >= RDMA_UCAP_MAX) + return -EINVAL; + + mutex_lock(&ucaps_mutex); + ret = ib_ucaps_init(); + if (ret) + goto unlock; + + ucap = ucaps_list[type]; + if (ucap) { + kref_get(&ucap->ref); + mutex_unlock(&ucaps_mutex); + return 0; + } + + ucap = kzalloc(sizeof(*ucap), GFP_KERNEL); + if (!ucap) { + ret = -ENOMEM; + goto unlock; + } + + device_initialize(&ucap->dev); + ucap->dev.class = &ucaps_class; + ucap->dev.devt = MKDEV(MAJOR(ucaps_base_dev), type); + ucap->dev.release = ucap_dev_release; + ret = dev_set_name(&ucap->dev, "%s", ucap_names[type]); + if (ret) + goto err_device; + + cdev_init(&ucap->cdev, &ucaps_cdev_fops); + ucap->cdev.owner = THIS_MODULE; + + ret = cdev_device_add(&ucap->cdev, &ucap->dev); + if (ret) + goto err_device; + + kref_init(&ucap->ref); + ucaps_list[type] = ucap; + mutex_unlock(&ucaps_mutex); + + return 0; + +err_device: + put_device(&ucap->dev); +unlock: + mutex_unlock(&ucaps_mutex); + return ret; +} +EXPORT_SYMBOL(ib_create_ucap); + +static void ib_release_ucap(struct kref *ref) +{ + struct ib_ucap *ucap = container_of(ref, struct ib_ucap, ref); + enum rdma_user_cap type; + + for (type = RDMA_UCAP_FIRST; type < RDMA_UCAP_MAX; type++) { + if (ucaps_list[type] == ucap) + break; + } + WARN_ON(type == RDMA_UCAP_MAX); + + ucaps_list[type] = NULL; + cdev_device_del(&ucap->cdev, &ucap->dev); + put_device(&ucap->dev); +} + +/** + * ib_remove_ucap - Remove a ucap character device + * @type: User cap type + * + * Removes the ucap character device according to type. 
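Taken together, ib_create_ucap() and ib_remove_ucap() give a provider driver a reference-counted way to expose and later retire a capability node under /dev/infiniband. A minimal driver-side sketch of that pairing follows; the function names and the choice of RDMA_UCAP_MLX5_CTRL_LOCAL are illustrative assumptions, not part of this patch.

#include <rdma/ib_ucaps.h>

/* Hypothetical provider hook-up; real drivers add their own error unwinding. */
static int example_enable_local_ctrl(void)
{
	/*
	 * Creates /dev/infiniband/mlx5_perm_ctrl_local (mode 0600). Repeated
	 * calls for the same type only bump the internal reference count.
	 */
	return ib_create_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL);
}

static void example_disable_local_ctrl(void)
{
	/* Drops one reference; the cdev is deleted when the count reaches zero. */
	ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL);
}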
The device is completely + * removed from the filesystem when its reference count reaches 0. + */ +void ib_remove_ucap(enum rdma_user_cap type) +{ + struct ib_ucap *ucap; + + mutex_lock(&ucaps_mutex); + ucap = ucaps_list[type]; + if (WARN_ON(!ucap)) + goto end; + + kref_put(&ucap->ref, ib_release_ucap); +end: + mutex_unlock(&ucaps_mutex); +} +EXPORT_SYMBOL(ib_remove_ucap); + +/** + * ib_get_ucaps - Get bitmask of ucap types from file descriptors + * @fds: Array of file descriptors + * @fd_count: Number of file descriptors in the array + * @idx_mask: Bitmask to be updated based on the ucaps in the fd list + * + * Given an array of file descriptors, this function returns a bitmask of + * the ucaps where a bit is set if an FD for that ucap type was in the array. + * + * Return: 0 on success, or a negative errno value on failure + */ +int ib_get_ucaps(int *fds, int fd_count, uint64_t *idx_mask) +{ + int ret = 0; + dev_t dev; + + *idx_mask = 0; + mutex_lock(&ucaps_mutex); + for (int i = 0; i < fd_count; i++) { + ret = get_devt_from_fd(fds[i], &dev); + if (ret) + goto end; + + ret = get_ucap_from_devt(dev, idx_mask); + if (ret) + goto end; + } + +end: + mutex_unlock(&ucaps_mutex); + return ret; +} diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c deleted file mode 100644 index 7541fbaf58a3..000000000000 --- a/drivers/infiniband/core/ucm.c +++ /dev/null @@ -1,1359 +0,0 @@ -/* - * Copyright (c) 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2005 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
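ib_get_ucaps() lets the caller turn a set of capability file descriptors handed in from userspace into a bitmask indexed by ucap type. A hedged sketch of how a caller might gate a privileged request on that mask; check_required_ucaps() and its required_mask parameter are made up for illustration and are not defined by this patch.

#include <rdma/ib_ucaps.h>

/* Hypothetical caller: fail unless every required capability fd was supplied. */
static int check_required_ucaps(int *fds, int fd_count, uint64_t required_mask)
{
	uint64_t provided = 0;
	int ret;

	ret = ib_get_ucaps(fds, fd_count, &provided);
	if (ret)
		return ret;	/* bad fd, or an fd that is not a ucap device */

	/* Assumption for this sketch: a missing capability is reported as -EPERM. */
	if ((provided & required_mask) != required_mask)
		return -EPERM;
	return 0;
}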
- */ - -#include <linux/completion.h> -#include <linux/init.h> -#include <linux/fs.h> -#include <linux/module.h> -#include <linux/device.h> -#include <linux/err.h> -#include <linux/poll.h> -#include <linux/sched.h> -#include <linux/file.h> -#include <linux/mount.h> -#include <linux/cdev.h> -#include <linux/idr.h> -#include <linux/mutex.h> -#include <linux/slab.h> - -#include <linux/nospec.h> - -#include <linux/uaccess.h> - -#include <rdma/ib.h> -#include <rdma/ib_cm.h> -#include <rdma/ib_user_cm.h> -#include <rdma/ib_marshall.h> - -#include "core_priv.h" - -MODULE_AUTHOR("Libor Michalek"); -MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access"); -MODULE_LICENSE("Dual BSD/GPL"); - -struct ib_ucm_device { - int devnum; - struct cdev cdev; - struct device dev; - struct ib_device *ib_dev; -}; - -struct ib_ucm_file { - struct mutex file_mutex; - struct file *filp; - struct ib_ucm_device *device; - - struct list_head ctxs; - struct list_head events; - wait_queue_head_t poll_wait; -}; - -struct ib_ucm_context { - int id; - struct completion comp; - atomic_t ref; - int events_reported; - - struct ib_ucm_file *file; - struct ib_cm_id *cm_id; - __u64 uid; - - struct list_head events; /* list of pending events. */ - struct list_head file_list; /* member in file ctx list */ -}; - -struct ib_ucm_event { - struct ib_ucm_context *ctx; - struct list_head file_list; /* member in file event list */ - struct list_head ctx_list; /* member in ctx event list */ - - struct ib_cm_id *cm_id; - struct ib_ucm_event_resp resp; - void *data; - void *info; - int data_len; - int info_len; -}; - -enum { - IB_UCM_MAJOR = 231, - IB_UCM_BASE_MINOR = 224, - IB_UCM_MAX_DEVICES = RDMA_MAX_PORTS, - IB_UCM_NUM_FIXED_MINOR = 32, - IB_UCM_NUM_DYNAMIC_MINOR = IB_UCM_MAX_DEVICES - IB_UCM_NUM_FIXED_MINOR, -}; - -#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR) -static dev_t dynamic_ucm_dev; - -static void ib_ucm_add_one(struct ib_device *device); -static void ib_ucm_remove_one(struct ib_device *device, void *client_data); - -static struct ib_client ucm_client = { - .name = "ucm", - .add = ib_ucm_add_one, - .remove = ib_ucm_remove_one -}; - -static DEFINE_MUTEX(ctx_id_mutex); -static DEFINE_IDR(ctx_id_table); -static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES); - -static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id) -{ - struct ib_ucm_context *ctx; - - mutex_lock(&ctx_id_mutex); - ctx = idr_find(&ctx_id_table, id); - if (!ctx) - ctx = ERR_PTR(-ENOENT); - else if (ctx->file != file) - ctx = ERR_PTR(-EINVAL); - else - atomic_inc(&ctx->ref); - mutex_unlock(&ctx_id_mutex); - - return ctx; -} - -static void ib_ucm_ctx_put(struct ib_ucm_context *ctx) -{ - if (atomic_dec_and_test(&ctx->ref)) - complete(&ctx->comp); -} - -static inline int ib_ucm_new_cm_id(int event) -{ - return event == IB_CM_REQ_RECEIVED || event == IB_CM_SIDR_REQ_RECEIVED; -} - -static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx) -{ - struct ib_ucm_event *uevent; - - mutex_lock(&ctx->file->file_mutex); - list_del(&ctx->file_list); - while (!list_empty(&ctx->events)) { - - uevent = list_entry(ctx->events.next, - struct ib_ucm_event, ctx_list); - list_del(&uevent->file_list); - list_del(&uevent->ctx_list); - mutex_unlock(&ctx->file->file_mutex); - - /* clear incoming connections. 
*/ - if (ib_ucm_new_cm_id(uevent->resp.event)) - ib_destroy_cm_id(uevent->cm_id); - - kfree(uevent); - mutex_lock(&ctx->file->file_mutex); - } - mutex_unlock(&ctx->file->file_mutex); -} - -static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file) -{ - struct ib_ucm_context *ctx; - - ctx = kzalloc(sizeof *ctx, GFP_KERNEL); - if (!ctx) - return NULL; - - atomic_set(&ctx->ref, 1); - init_completion(&ctx->comp); - ctx->file = file; - INIT_LIST_HEAD(&ctx->events); - - mutex_lock(&ctx_id_mutex); - ctx->id = idr_alloc(&ctx_id_table, ctx, 0, 0, GFP_KERNEL); - mutex_unlock(&ctx_id_mutex); - if (ctx->id < 0) - goto error; - - list_add_tail(&ctx->file_list, &file->ctxs); - return ctx; - -error: - kfree(ctx); - return NULL; -} - -static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq, - const struct ib_cm_req_event_param *kreq) -{ - ureq->remote_ca_guid = kreq->remote_ca_guid; - ureq->remote_qkey = kreq->remote_qkey; - ureq->remote_qpn = kreq->remote_qpn; - ureq->qp_type = kreq->qp_type; - ureq->starting_psn = kreq->starting_psn; - ureq->responder_resources = kreq->responder_resources; - ureq->initiator_depth = kreq->initiator_depth; - ureq->local_cm_response_timeout = kreq->local_cm_response_timeout; - ureq->flow_control = kreq->flow_control; - ureq->remote_cm_response_timeout = kreq->remote_cm_response_timeout; - ureq->retry_count = kreq->retry_count; - ureq->rnr_retry_count = kreq->rnr_retry_count; - ureq->srq = kreq->srq; - ureq->port = kreq->port; - - ib_copy_path_rec_to_user(&ureq->primary_path, kreq->primary_path); - if (kreq->alternate_path) - ib_copy_path_rec_to_user(&ureq->alternate_path, - kreq->alternate_path); -} - -static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep, - const struct ib_cm_rep_event_param *krep) -{ - urep->remote_ca_guid = krep->remote_ca_guid; - urep->remote_qkey = krep->remote_qkey; - urep->remote_qpn = krep->remote_qpn; - urep->starting_psn = krep->starting_psn; - urep->responder_resources = krep->responder_resources; - urep->initiator_depth = krep->initiator_depth; - urep->target_ack_delay = krep->target_ack_delay; - urep->failover_accepted = krep->failover_accepted; - urep->flow_control = krep->flow_control; - urep->rnr_retry_count = krep->rnr_retry_count; - urep->srq = krep->srq; -} - -static void ib_ucm_event_sidr_rep_get(struct ib_ucm_sidr_rep_event_resp *urep, - const struct ib_cm_sidr_rep_event_param *krep) -{ - urep->status = krep->status; - urep->qkey = krep->qkey; - urep->qpn = krep->qpn; -}; - -static int ib_ucm_event_process(const struct ib_cm_event *evt, - struct ib_ucm_event *uvt) -{ - void *info = NULL; - - switch (evt->event) { - case IB_CM_REQ_RECEIVED: - ib_ucm_event_req_get(&uvt->resp.u.req_resp, - &evt->param.req_rcvd); - uvt->data_len = IB_CM_REQ_PRIVATE_DATA_SIZE; - uvt->resp.present = IB_UCM_PRES_PRIMARY; - uvt->resp.present |= (evt->param.req_rcvd.alternate_path ? 
- IB_UCM_PRES_ALTERNATE : 0); - break; - case IB_CM_REP_RECEIVED: - ib_ucm_event_rep_get(&uvt->resp.u.rep_resp, - &evt->param.rep_rcvd); - uvt->data_len = IB_CM_REP_PRIVATE_DATA_SIZE; - break; - case IB_CM_RTU_RECEIVED: - uvt->data_len = IB_CM_RTU_PRIVATE_DATA_SIZE; - uvt->resp.u.send_status = evt->param.send_status; - break; - case IB_CM_DREQ_RECEIVED: - uvt->data_len = IB_CM_DREQ_PRIVATE_DATA_SIZE; - uvt->resp.u.send_status = evt->param.send_status; - break; - case IB_CM_DREP_RECEIVED: - uvt->data_len = IB_CM_DREP_PRIVATE_DATA_SIZE; - uvt->resp.u.send_status = evt->param.send_status; - break; - case IB_CM_MRA_RECEIVED: - uvt->resp.u.mra_resp.timeout = - evt->param.mra_rcvd.service_timeout; - uvt->data_len = IB_CM_MRA_PRIVATE_DATA_SIZE; - break; - case IB_CM_REJ_RECEIVED: - uvt->resp.u.rej_resp.reason = evt->param.rej_rcvd.reason; - uvt->data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; - uvt->info_len = evt->param.rej_rcvd.ari_length; - info = evt->param.rej_rcvd.ari; - break; - case IB_CM_LAP_RECEIVED: - ib_copy_path_rec_to_user(&uvt->resp.u.lap_resp.path, - evt->param.lap_rcvd.alternate_path); - uvt->data_len = IB_CM_LAP_PRIVATE_DATA_SIZE; - uvt->resp.present = IB_UCM_PRES_ALTERNATE; - break; - case IB_CM_APR_RECEIVED: - uvt->resp.u.apr_resp.status = evt->param.apr_rcvd.ap_status; - uvt->data_len = IB_CM_APR_PRIVATE_DATA_SIZE; - uvt->info_len = evt->param.apr_rcvd.info_len; - info = evt->param.apr_rcvd.apr_info; - break; - case IB_CM_SIDR_REQ_RECEIVED: - uvt->resp.u.sidr_req_resp.pkey = - evt->param.sidr_req_rcvd.pkey; - uvt->resp.u.sidr_req_resp.port = - evt->param.sidr_req_rcvd.port; - uvt->data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE; - break; - case IB_CM_SIDR_REP_RECEIVED: - ib_ucm_event_sidr_rep_get(&uvt->resp.u.sidr_rep_resp, - &evt->param.sidr_rep_rcvd); - uvt->data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; - uvt->info_len = evt->param.sidr_rep_rcvd.info_len; - info = evt->param.sidr_rep_rcvd.info; - break; - default: - uvt->resp.u.send_status = evt->param.send_status; - break; - } - - if (uvt->data_len) { - uvt->data = kmemdup(evt->private_data, uvt->data_len, GFP_KERNEL); - if (!uvt->data) - goto err1; - - uvt->resp.present |= IB_UCM_PRES_DATA; - } - - if (uvt->info_len) { - uvt->info = kmemdup(info, uvt->info_len, GFP_KERNEL); - if (!uvt->info) - goto err2; - - uvt->resp.present |= IB_UCM_PRES_INFO; - } - return 0; - -err2: - kfree(uvt->data); -err1: - return -ENOMEM; -} - -static int ib_ucm_event_handler(struct ib_cm_id *cm_id, - const struct ib_cm_event *event) -{ - struct ib_ucm_event *uevent; - struct ib_ucm_context *ctx; - int result = 0; - - ctx = cm_id->context; - - uevent = kzalloc(sizeof *uevent, GFP_KERNEL); - if (!uevent) - goto err1; - - uevent->ctx = ctx; - uevent->cm_id = cm_id; - uevent->resp.uid = ctx->uid; - uevent->resp.id = ctx->id; - uevent->resp.event = event->event; - - result = ib_ucm_event_process(event, uevent); - if (result) - goto err2; - - mutex_lock(&ctx->file->file_mutex); - list_add_tail(&uevent->file_list, &ctx->file->events); - list_add_tail(&uevent->ctx_list, &ctx->events); - wake_up_interruptible(&ctx->file->poll_wait); - mutex_unlock(&ctx->file->file_mutex); - return 0; - -err2: - kfree(uevent); -err1: - /* Destroy new cm_id's */ - return ib_ucm_new_cm_id(event->event); -} - -static ssize_t ib_ucm_event(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_context *ctx; - struct ib_ucm_event_get cmd; - struct ib_ucm_event *uevent; - int result = 0; - - if (out_len < sizeof(struct ib_ucm_event_resp)) - 
return -ENOSPC; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - mutex_lock(&file->file_mutex); - while (list_empty(&file->events)) { - mutex_unlock(&file->file_mutex); - - if (file->filp->f_flags & O_NONBLOCK) - return -EAGAIN; - - if (wait_event_interruptible(file->poll_wait, - !list_empty(&file->events))) - return -ERESTARTSYS; - - mutex_lock(&file->file_mutex); - } - - uevent = list_entry(file->events.next, struct ib_ucm_event, file_list); - - if (ib_ucm_new_cm_id(uevent->resp.event)) { - ctx = ib_ucm_ctx_alloc(file); - if (!ctx) { - result = -ENOMEM; - goto done; - } - - ctx->cm_id = uevent->cm_id; - ctx->cm_id->context = ctx; - uevent->resp.id = ctx->id; - } - - if (copy_to_user(u64_to_user_ptr(cmd.response), - &uevent->resp, sizeof(uevent->resp))) { - result = -EFAULT; - goto done; - } - - if (uevent->data) { - if (cmd.data_len < uevent->data_len) { - result = -ENOMEM; - goto done; - } - if (copy_to_user(u64_to_user_ptr(cmd.data), - uevent->data, uevent->data_len)) { - result = -EFAULT; - goto done; - } - } - - if (uevent->info) { - if (cmd.info_len < uevent->info_len) { - result = -ENOMEM; - goto done; - } - if (copy_to_user(u64_to_user_ptr(cmd.info), - uevent->info, uevent->info_len)) { - result = -EFAULT; - goto done; - } - } - - list_del(&uevent->file_list); - list_del(&uevent->ctx_list); - uevent->ctx->events_reported++; - - kfree(uevent->data); - kfree(uevent->info); - kfree(uevent); -done: - mutex_unlock(&file->file_mutex); - return result; -} - -static ssize_t ib_ucm_create_id(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_create_id cmd; - struct ib_ucm_create_id_resp resp; - struct ib_ucm_context *ctx; - int result; - - if (out_len < sizeof(resp)) - return -ENOSPC; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - mutex_lock(&file->file_mutex); - ctx = ib_ucm_ctx_alloc(file); - mutex_unlock(&file->file_mutex); - if (!ctx) - return -ENOMEM; - - ctx->uid = cmd.uid; - ctx->cm_id = ib_create_cm_id(file->device->ib_dev, - ib_ucm_event_handler, ctx); - if (IS_ERR(ctx->cm_id)) { - result = PTR_ERR(ctx->cm_id); - goto err1; - } - - resp.id = ctx->id; - if (copy_to_user(u64_to_user_ptr(cmd.response), - &resp, sizeof(resp))) { - result = -EFAULT; - goto err2; - } - return 0; - -err2: - ib_destroy_cm_id(ctx->cm_id); -err1: - mutex_lock(&ctx_id_mutex); - idr_remove(&ctx_id_table, ctx->id); - mutex_unlock(&ctx_id_mutex); - kfree(ctx); - return result; -} - -static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_destroy_id cmd; - struct ib_ucm_destroy_id_resp resp; - struct ib_ucm_context *ctx; - int result = 0; - - if (out_len < sizeof(resp)) - return -ENOSPC; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - mutex_lock(&ctx_id_mutex); - ctx = idr_find(&ctx_id_table, cmd.id); - if (!ctx) - ctx = ERR_PTR(-ENOENT); - else if (ctx->file != file) - ctx = ERR_PTR(-EINVAL); - else - idr_remove(&ctx_id_table, ctx->id); - mutex_unlock(&ctx_id_mutex); - - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - ib_ucm_ctx_put(ctx); - wait_for_completion(&ctx->comp); - - /* No new events will be generated after destroying the cm_id. */ - ib_destroy_cm_id(ctx->cm_id); - /* Cleanup events not yet reported to the user. 
*/ - ib_ucm_cleanup_events(ctx); - - resp.events_reported = ctx->events_reported; - if (copy_to_user(u64_to_user_ptr(cmd.response), - &resp, sizeof(resp))) - result = -EFAULT; - - kfree(ctx); - return result; -} - -static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_attr_id_resp resp; - struct ib_ucm_attr_id cmd; - struct ib_ucm_context *ctx; - int result = 0; - - if (out_len < sizeof(resp)) - return -ENOSPC; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - resp.service_id = ctx->cm_id->service_id; - resp.service_mask = ctx->cm_id->service_mask; - resp.local_id = ctx->cm_id->local_id; - resp.remote_id = ctx->cm_id->remote_id; - - if (copy_to_user(u64_to_user_ptr(cmd.response), - &resp, sizeof(resp))) - result = -EFAULT; - - ib_ucm_ctx_put(ctx); - return result; -} - -static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_uverbs_qp_attr resp; - struct ib_ucm_init_qp_attr cmd; - struct ib_ucm_context *ctx; - struct ib_qp_attr qp_attr; - int result = 0; - - if (out_len < sizeof(resp)) - return -ENOSPC; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - resp.qp_attr_mask = 0; - memset(&qp_attr, 0, sizeof qp_attr); - qp_attr.qp_state = cmd.qp_state; - result = ib_cm_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask); - if (result) - goto out; - - ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); - - if (copy_to_user(u64_to_user_ptr(cmd.response), - &resp, sizeof(resp))) - result = -EFAULT; - -out: - ib_ucm_ctx_put(ctx); - return result; -} - -static int ucm_validate_listen(__be64 service_id, __be64 service_mask) -{ - service_id &= service_mask; - - if (((service_id & IB_CMA_SERVICE_ID_MASK) == IB_CMA_SERVICE_ID) || - ((service_id & IB_SDP_SERVICE_ID_MASK) == IB_SDP_SERVICE_ID)) - return -EINVAL; - - return 0; -} - -static ssize_t ib_ucm_listen(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_listen cmd; - struct ib_ucm_context *ctx; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - result = ucm_validate_listen(cmd.service_id, cmd.service_mask); - if (result) - goto out; - - result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask); -out: - ib_ucm_ctx_put(ctx); - return result; -} - -static ssize_t ib_ucm_notify(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_notify cmd; - struct ib_ucm_context *ctx; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - result = ib_cm_notify(ctx->cm_id, (enum ib_event_type) cmd.event); - ib_ucm_ctx_put(ctx); - return result; -} - -static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len) -{ - void *data; - - *dest = NULL; - - if (!len) - return 0; - - data = memdup_user(u64_to_user_ptr(src), len); - if (IS_ERR(data)) - return PTR_ERR(data); - - *dest = data; - return 0; -} - -static int ib_ucm_path_get(struct sa_path_rec **path, u64 src) -{ - struct ib_user_path_rec upath; - struct sa_path_rec *sa_path; - - *path = NULL; - - if (!src) - return 
0; - - sa_path = kmalloc(sizeof(*sa_path), GFP_KERNEL); - if (!sa_path) - return -ENOMEM; - - if (copy_from_user(&upath, u64_to_user_ptr(src), - sizeof(upath))) { - - kfree(sa_path); - return -EFAULT; - } - - ib_copy_path_rec_from_user(sa_path, &upath); - *path = sa_path; - return 0; -} - -static ssize_t ib_ucm_send_req(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_cm_req_param param; - struct ib_ucm_context *ctx; - struct ib_ucm_req cmd; - int result; - - param.private_data = NULL; - param.primary_path = NULL; - param.alternate_path = NULL; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(¶m.private_data, cmd.data, cmd.len); - if (result) - goto done; - - result = ib_ucm_path_get(¶m.primary_path, cmd.primary_path); - if (result) - goto done; - - result = ib_ucm_path_get(¶m.alternate_path, cmd.alternate_path); - if (result) - goto done; - - param.private_data_len = cmd.len; - param.service_id = cmd.sid; - param.qp_num = cmd.qpn; - param.qp_type = cmd.qp_type; - param.starting_psn = cmd.psn; - param.peer_to_peer = cmd.peer_to_peer; - param.responder_resources = cmd.responder_resources; - param.initiator_depth = cmd.initiator_depth; - param.remote_cm_response_timeout = cmd.remote_cm_response_timeout; - param.flow_control = cmd.flow_control; - param.local_cm_response_timeout = cmd.local_cm_response_timeout; - param.retry_count = cmd.retry_count; - param.rnr_retry_count = cmd.rnr_retry_count; - param.max_cm_retries = cmd.max_cm_retries; - param.srq = cmd.srq; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = ib_send_cm_req(ctx->cm_id, ¶m); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - -done: - kfree(param.private_data); - kfree(param.primary_path); - kfree(param.alternate_path); - return result; -} - -static ssize_t ib_ucm_send_rep(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_cm_rep_param param; - struct ib_ucm_context *ctx; - struct ib_ucm_rep cmd; - int result; - - param.private_data = NULL; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(¶m.private_data, cmd.data, cmd.len); - if (result) - return result; - - param.qp_num = cmd.qpn; - param.starting_psn = cmd.psn; - param.private_data_len = cmd.len; - param.responder_resources = cmd.responder_resources; - param.initiator_depth = cmd.initiator_depth; - param.failover_accepted = cmd.failover_accepted; - param.flow_control = cmd.flow_control; - param.rnr_retry_count = cmd.rnr_retry_count; - param.srq = cmd.srq; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - ctx->uid = cmd.uid; - result = ib_send_cm_rep(ctx->cm_id, ¶m); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - - kfree(param.private_data); - return result; -} - -static ssize_t ib_ucm_send_private_data(struct ib_ucm_file *file, - const char __user *inbuf, int in_len, - int (*func)(struct ib_cm_id *cm_id, - const void *private_data, - u8 private_data_len)) -{ - struct ib_ucm_private_data cmd; - struct ib_ucm_context *ctx; - const void *private_data = NULL; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(&private_data, cmd.data, cmd.len); - if (result) - return result; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = func(ctx->cm_id, private_data, cmd.len); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - - kfree(private_data); - 
return result; -} - -static ssize_t ib_ucm_send_rtu(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_rtu); -} - -static ssize_t ib_ucm_send_dreq(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_dreq); -} - -static ssize_t ib_ucm_send_drep(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_drep); -} - -static ssize_t ib_ucm_send_info(struct ib_ucm_file *file, - const char __user *inbuf, int in_len, - int (*func)(struct ib_cm_id *cm_id, - int status, - const void *info, - u8 info_len, - const void *data, - u8 data_len)) -{ - struct ib_ucm_context *ctx; - struct ib_ucm_info cmd; - const void *data = NULL; - const void *info = NULL; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(&data, cmd.data, cmd.data_len); - if (result) - goto done; - - result = ib_ucm_alloc_data(&info, cmd.info, cmd.info_len); - if (result) - goto done; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = func(ctx->cm_id, cmd.status, info, cmd.info_len, - data, cmd.data_len); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - -done: - kfree(data); - kfree(info); - return result; -} - -static ssize_t ib_ucm_send_rej(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_rej); -} - -static ssize_t ib_ucm_send_apr(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_apr); -} - -static ssize_t ib_ucm_send_mra(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_context *ctx; - struct ib_ucm_mra cmd; - const void *data = NULL; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(&data, cmd.data, cmd.len); - if (result) - return result; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = ib_send_cm_mra(ctx->cm_id, cmd.timeout, data, cmd.len); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - - kfree(data); - return result; -} - -static ssize_t ib_ucm_send_lap(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_context *ctx; - struct sa_path_rec *path = NULL; - struct ib_ucm_lap cmd; - const void *data = NULL; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(&data, cmd.data, cmd.len); - if (result) - goto done; - - result = ib_ucm_path_get(&path, cmd.path); - if (result) - goto done; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = ib_send_cm_lap(ctx->cm_id, path, data, cmd.len); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - -done: - kfree(data); - kfree(path); - return result; -} - -static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_cm_sidr_req_param param = {}; - struct ib_ucm_context *ctx; - struct ib_ucm_sidr_req cmd; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(¶m.private_data, cmd.data, cmd.len); - if (result) - goto 
done; - - result = ib_ucm_path_get(¶m.path, cmd.path); - if (result) - goto done; - - param.private_data_len = cmd.len; - param.service_id = cmd.sid; - param.timeout_ms = cmd.timeout; - param.max_cm_retries = cmd.max_cm_retries; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = ib_send_cm_sidr_req(ctx->cm_id, ¶m); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - -done: - kfree(param.private_data); - kfree(param.path); - return result; -} - -static ssize_t ib_ucm_send_sidr_rep(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_cm_sidr_rep_param param; - struct ib_ucm_sidr_rep cmd; - struct ib_ucm_context *ctx; - int result; - - param.info = NULL; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(¶m.private_data, - cmd.data, cmd.data_len); - if (result) - goto done; - - result = ib_ucm_alloc_data(¶m.info, cmd.info, cmd.info_len); - if (result) - goto done; - - param.qp_num = cmd.qpn; - param.qkey = cmd.qkey; - param.status = cmd.status; - param.info_length = cmd.info_len; - param.private_data_len = cmd.data_len; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = ib_send_cm_sidr_rep(ctx->cm_id, ¶m); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - -done: - kfree(param.private_data); - kfree(param.info); - return result; -} - -static ssize_t (*ucm_cmd_table[])(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) = { - [IB_USER_CM_CMD_CREATE_ID] = ib_ucm_create_id, - [IB_USER_CM_CMD_DESTROY_ID] = ib_ucm_destroy_id, - [IB_USER_CM_CMD_ATTR_ID] = ib_ucm_attr_id, - [IB_USER_CM_CMD_LISTEN] = ib_ucm_listen, - [IB_USER_CM_CMD_NOTIFY] = ib_ucm_notify, - [IB_USER_CM_CMD_SEND_REQ] = ib_ucm_send_req, - [IB_USER_CM_CMD_SEND_REP] = ib_ucm_send_rep, - [IB_USER_CM_CMD_SEND_RTU] = ib_ucm_send_rtu, - [IB_USER_CM_CMD_SEND_DREQ] = ib_ucm_send_dreq, - [IB_USER_CM_CMD_SEND_DREP] = ib_ucm_send_drep, - [IB_USER_CM_CMD_SEND_REJ] = ib_ucm_send_rej, - [IB_USER_CM_CMD_SEND_MRA] = ib_ucm_send_mra, - [IB_USER_CM_CMD_SEND_LAP] = ib_ucm_send_lap, - [IB_USER_CM_CMD_SEND_APR] = ib_ucm_send_apr, - [IB_USER_CM_CMD_SEND_SIDR_REQ] = ib_ucm_send_sidr_req, - [IB_USER_CM_CMD_SEND_SIDR_REP] = ib_ucm_send_sidr_rep, - [IB_USER_CM_CMD_EVENT] = ib_ucm_event, - [IB_USER_CM_CMD_INIT_QP_ATTR] = ib_ucm_init_qp_attr, -}; - -static ssize_t ib_ucm_write(struct file *filp, const char __user *buf, - size_t len, loff_t *pos) -{ - struct ib_ucm_file *file = filp->private_data; - struct ib_ucm_cmd_hdr hdr; - ssize_t result; - - if (!ib_safe_file_access(filp)) { - pr_err_once("ucm_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", - task_tgid_vnr(current), current->comm); - return -EACCES; - } - - if (len < sizeof(hdr)) - return -EINVAL; - - if (copy_from_user(&hdr, buf, sizeof(hdr))) - return -EFAULT; - - if (hdr.cmd >= ARRAY_SIZE(ucm_cmd_table)) - return -EINVAL; - hdr.cmd = array_index_nospec(hdr.cmd, ARRAY_SIZE(ucm_cmd_table)); - - if (hdr.in + sizeof(hdr) > len) - return -EINVAL; - - result = ucm_cmd_table[hdr.cmd](file, buf + sizeof(hdr), - hdr.in, hdr.out); - if (!result) - result = len; - - return result; -} - -static __poll_t ib_ucm_poll(struct file *filp, - struct poll_table_struct *wait) -{ - struct ib_ucm_file *file = filp->private_data; - __poll_t mask = 0; - - poll_wait(filp, &file->poll_wait, wait); - - if (!list_empty(&file->events)) - mask = EPOLLIN | EPOLLRDNORM; - - return mask; -} - -/* - * 
ib_ucm_open() does not need the BKL: - * - * - no global state is referred to; - * - there is no ioctl method to race against; - * - no further module initialization is required for open to work - * after the device is registered. - */ -static int ib_ucm_open(struct inode *inode, struct file *filp) -{ - struct ib_ucm_file *file; - - file = kmalloc(sizeof(*file), GFP_KERNEL); - if (!file) - return -ENOMEM; - - INIT_LIST_HEAD(&file->events); - INIT_LIST_HEAD(&file->ctxs); - init_waitqueue_head(&file->poll_wait); - - mutex_init(&file->file_mutex); - - filp->private_data = file; - file->filp = filp; - file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev); - - return nonseekable_open(inode, filp); -} - -static int ib_ucm_close(struct inode *inode, struct file *filp) -{ - struct ib_ucm_file *file = filp->private_data; - struct ib_ucm_context *ctx; - - mutex_lock(&file->file_mutex); - while (!list_empty(&file->ctxs)) { - ctx = list_entry(file->ctxs.next, - struct ib_ucm_context, file_list); - mutex_unlock(&file->file_mutex); - - mutex_lock(&ctx_id_mutex); - idr_remove(&ctx_id_table, ctx->id); - mutex_unlock(&ctx_id_mutex); - - ib_destroy_cm_id(ctx->cm_id); - ib_ucm_cleanup_events(ctx); - kfree(ctx); - - mutex_lock(&file->file_mutex); - } - mutex_unlock(&file->file_mutex); - kfree(file); - return 0; -} - -static void ib_ucm_release_dev(struct device *dev) -{ - struct ib_ucm_device *ucm_dev; - - ucm_dev = container_of(dev, struct ib_ucm_device, dev); - kfree(ucm_dev); -} - -static void ib_ucm_free_dev(struct ib_ucm_device *ucm_dev) -{ - clear_bit(ucm_dev->devnum, dev_map); -} - -static const struct file_operations ucm_fops = { - .owner = THIS_MODULE, - .open = ib_ucm_open, - .release = ib_ucm_close, - .write = ib_ucm_write, - .poll = ib_ucm_poll, - .llseek = no_llseek, -}; - -static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct ib_ucm_device *ucm_dev; - - ucm_dev = container_of(dev, struct ib_ucm_device, dev); - return sprintf(buf, "%s\n", ucm_dev->ib_dev->name); -} -static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); - -static void ib_ucm_add_one(struct ib_device *device) -{ - int devnum; - dev_t base; - struct ib_ucm_device *ucm_dev; - - if (!device->ops.alloc_ucontext || !rdma_cap_ib_cm(device, 1)) - return; - - ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL); - if (!ucm_dev) - return; - - device_initialize(&ucm_dev->dev); - ucm_dev->ib_dev = device; - ucm_dev->dev.release = ib_ucm_release_dev; - - devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES); - if (devnum >= IB_UCM_MAX_DEVICES) - goto err; - ucm_dev->devnum = devnum; - set_bit(devnum, dev_map); - if (devnum >= IB_UCM_NUM_FIXED_MINOR) - base = dynamic_ucm_dev + devnum - IB_UCM_NUM_FIXED_MINOR; - else - base = IB_UCM_BASE_DEV + devnum; - - cdev_init(&ucm_dev->cdev, &ucm_fops); - ucm_dev->cdev.owner = THIS_MODULE; - kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum); - - ucm_dev->dev.class = &cm_class; - ucm_dev->dev.parent = device->dev.parent; - ucm_dev->dev.devt = base; - - dev_set_name(&ucm_dev->dev, "ucm%d", ucm_dev->devnum); - if (cdev_device_add(&ucm_dev->cdev, &ucm_dev->dev)) - goto err_devnum; - - if (device_create_file(&ucm_dev->dev, &dev_attr_ibdev)) - goto err_dev; - - ib_set_client_data(device, &ucm_client, ucm_dev); - return; - -err_dev: - cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev); -err_devnum: - ib_ucm_free_dev(ucm_dev); -err: - put_device(&ucm_dev->dev); - return; -} - -static void ib_ucm_remove_one(struct ib_device *device, void 
*client_data) -{ - struct ib_ucm_device *ucm_dev = client_data; - - if (!ucm_dev) - return; - - cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev); - ib_ucm_free_dev(ucm_dev); - put_device(&ucm_dev->dev); -} - -static CLASS_ATTR_STRING(abi_version, S_IRUGO, - __stringify(IB_USER_CM_ABI_VERSION)); - -static int __init ib_ucm_init(void) -{ - int ret; - - ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR, - "infiniband_cm"); - if (ret) { - pr_err("ucm: couldn't register device number\n"); - goto error1; - } - - ret = alloc_chrdev_region(&dynamic_ucm_dev, 0, IB_UCM_NUM_DYNAMIC_MINOR, - "infiniband_cm"); - if (ret) { - pr_err("ucm: couldn't register dynamic device number\n"); - goto err_alloc; - } - - ret = class_create_file(&cm_class, &class_attr_abi_version.attr); - if (ret) { - pr_err("ucm: couldn't create abi_version attribute\n"); - goto error2; - } - - ret = ib_register_client(&ucm_client); - if (ret) { - pr_err("ucm: couldn't register client\n"); - goto error3; - } - return 0; - -error3: - class_remove_file(&cm_class, &class_attr_abi_version.attr); -error2: - unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR); -err_alloc: - unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR); -error1: - return ret; -} - -static void __exit ib_ucm_cleanup(void) -{ - ib_unregister_client(&ucm_client); - class_remove_file(&cm_class, &class_attr_abi_version.attr); - unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR); - unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR); - idr_destroy(&ctx_id_table); -} - -module_init(ib_ucm_init); -module_exit(ib_ucm_cleanup); diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 01d68ed46c1b..ec3be65a2b88 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -52,6 +52,9 @@ #include <rdma/rdma_cm_ib.h> #include <rdma/ib_addr.h> #include <rdma/ib.h> +#include <rdma/ib_cm.h> +#include <rdma/rdma_netlink.h> +#include "core_priv.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); @@ -66,9 +69,10 @@ static struct ctl_table ucma_ctl_table[] = { .data = &max_backlog, .maxlen = sizeof max_backlog, .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, - { } }; struct ucma_file { @@ -77,34 +81,28 @@ struct ucma_file { struct list_head ctx_list; struct list_head event_list; wait_queue_head_t poll_wait; - struct workqueue_struct *close_wq; }; struct ucma_context { - int id; + u32 id; struct completion comp; - atomic_t ref; + refcount_t ref; int events_reported; - int backlog; + atomic_t backlog; struct ucma_file *file; struct rdma_cm_id *cm_id; + struct mutex mutex; u64 uid; struct list_head list; struct list_head mc_list; - /* mark that device is in process of destroying the internal HW - * resources, protected by the global mut - */ - int closing; - /* sync between removal event and id destroy, protected by file mut */ - int destroying; struct work_struct close_work; }; struct ucma_multicast { struct ucma_context *ctx; - int id; + u32 id; int events_reported; u64 uid; @@ -115,28 +113,27 @@ struct ucma_multicast { struct ucma_event { struct ucma_context *ctx; + struct ucma_context *conn_req_ctx; struct ucma_multicast *mc; struct list_head list; - struct rdma_cm_id *cm_id; struct rdma_ucm_event_resp resp; - struct work_struct close_work; }; -static DEFINE_MUTEX(mut); -static DEFINE_IDR(ctx_idr); -static 
DEFINE_IDR(multicast_idr); +static DEFINE_XARRAY_ALLOC(ctx_table); +static DEFINE_XARRAY_ALLOC(multicast_table); static const struct file_operations ucma_fops; +static int ucma_destroy_private_ctx(struct ucma_context *ctx); static inline struct ucma_context *_ucma_find_context(int id, struct ucma_file *file) { struct ucma_context *ctx; - ctx = idr_find(&ctx_idr, id); + ctx = xa_load(&ctx_table, id); if (!ctx) ctx = ERR_PTR(-ENOENT); - else if (ctx->file != file || !ctx->cm_id) + else if (ctx->file != file) ctx = ERR_PTR(-EINVAL); return ctx; } @@ -145,21 +142,18 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id) { struct ucma_context *ctx; - mutex_lock(&mut); + xa_lock(&ctx_table); ctx = _ucma_find_context(id, file); - if (!IS_ERR(ctx)) { - if (ctx->closing) - ctx = ERR_PTR(-EIO); - else - atomic_inc(&ctx->ref); - } - mutex_unlock(&mut); + if (!IS_ERR(ctx)) + if (!refcount_inc_not_zero(&ctx->ref)) + ctx = ERR_PTR(-ENXIO); + xa_unlock(&ctx_table); return ctx; } static void ucma_put_ctx(struct ucma_context *ctx) { - if (atomic_dec_and_test(&ctx->ref)) + if (refcount_dec_and_test(&ctx->ref)) complete(&ctx->comp); } @@ -180,26 +174,21 @@ static struct ucma_context *ucma_get_ctx_dev(struct ucma_file *file, int id) return ctx; } -static void ucma_close_event_id(struct work_struct *work) -{ - struct ucma_event *uevent_close = container_of(work, struct ucma_event, close_work); - - rdma_destroy_id(uevent_close->cm_id); - kfree(uevent_close); -} - static void ucma_close_id(struct work_struct *work) { struct ucma_context *ctx = container_of(work, struct ucma_context, close_work); /* once all inflight tasks are finished, we close all underlying * resources. The context is still alive till its explicit destryoing - * by its creator. + * by its creator. This puts back the xarray's reference. */ ucma_put_ctx(ctx); wait_for_completion(&ctx->comp); /* No new events will be generated after destroying the id. 
*/ rdma_destroy_id(ctx->cm_id); + + /* Reading the cm_id without holding a positive ref is not allowed */ + ctx->cm_id = NULL; } static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) @@ -211,46 +200,32 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) return NULL; INIT_WORK(&ctx->close_work, ucma_close_id); - atomic_set(&ctx->ref, 1); init_completion(&ctx->comp); INIT_LIST_HEAD(&ctx->mc_list); + /* So list_del() will work if we don't do ucma_finish_ctx() */ + INIT_LIST_HEAD(&ctx->list); ctx->file = file; + mutex_init(&ctx->mutex); - mutex_lock(&mut); - ctx->id = idr_alloc(&ctx_idr, ctx, 0, 0, GFP_KERNEL); - mutex_unlock(&mut); - if (ctx->id < 0) - goto error; - - list_add_tail(&ctx->list, &file->ctx_list); + if (xa_alloc(&ctx_table, &ctx->id, NULL, xa_limit_32b, GFP_KERNEL)) { + kfree(ctx); + return NULL; + } return ctx; - -error: - kfree(ctx); - return NULL; } -static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx) +static void ucma_set_ctx_cm_id(struct ucma_context *ctx, + struct rdma_cm_id *cm_id) { - struct ucma_multicast *mc; - - mc = kzalloc(sizeof(*mc), GFP_KERNEL); - if (!mc) - return NULL; - - mutex_lock(&mut); - mc->id = idr_alloc(&multicast_idr, NULL, 0, 0, GFP_KERNEL); - mutex_unlock(&mut); - if (mc->id < 0) - goto error; - - mc->ctx = ctx; - list_add_tail(&mc->list, &ctx->mc_list); - return mc; + refcount_set(&ctx->ref, 1); + ctx->cm_id = cm_id; +} -error: - kfree(mc); - return NULL; +static void ucma_finish_ctx(struct ucma_context *ctx) +{ + lockdep_assert_held(&ctx->file->mut); + list_add_tail(&ctx->list, &ctx->file->ctx_list); + xa_store(&ctx_table, ctx->id, ctx, GFP_KERNEL); } static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, @@ -260,7 +235,7 @@ static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, memcpy(dst->private_data, src->private_data, src->private_data_len); dst->private_data_len = src->private_data_len; - dst->responder_resources =src->responder_resources; + dst->responder_resources = src->responder_resources; dst->initiator_depth = src->initiator_depth; dst->flow_control = src->flow_control; dst->retry_count = src->retry_count; @@ -282,10 +257,15 @@ static void ucma_copy_ud_event(struct ib_device *device, dst->qkey = src->qkey; } -static void ucma_set_event_context(struct ucma_context *ctx, - struct rdma_cm_event *event, - struct ucma_event *uevent) +static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx, + struct rdma_cm_event *event) { + struct ucma_event *uevent; + + uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); + if (!uevent) + return NULL; + uevent->ctx = ctx; switch (event->event) { case RDMA_CM_EVENT_MULTICAST_JOIN: @@ -300,44 +280,60 @@ static void ucma_set_event_context(struct ucma_context *ctx, uevent->resp.id = ctx->id; break; } + uevent->resp.event = event->event; + uevent->resp.status = event->status; + + if (event->event == RDMA_CM_EVENT_ADDRINFO_RESOLVED) + goto out; + + if (ctx->cm_id->qp_type == IB_QPT_UD) + ucma_copy_ud_event(ctx->cm_id->device, &uevent->resp.param.ud, + &event->param.ud); + else + ucma_copy_conn_event(&uevent->resp.param.conn, + &event->param.conn); + +out: + uevent->resp.ece.vendor_id = event->ece.vendor_id; + uevent->resp.ece.attr_mod = event->ece.attr_mod; + return uevent; } -/* Called with file->mut locked for the relevant context. 
*/ -static void ucma_removal_event_handler(struct rdma_cm_id *cm_id) +static int ucma_connect_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) { - struct ucma_context *ctx = cm_id->context; - struct ucma_event *con_req_eve; - int event_found = 0; + struct ucma_context *listen_ctx = cm_id->context; + struct ucma_context *ctx; + struct ucma_event *uevent; - if (ctx->destroying) - return; + if (!atomic_add_unless(&listen_ctx->backlog, -1, 0)) + return -ENOMEM; + ctx = ucma_alloc_ctx(listen_ctx->file); + if (!ctx) + goto err_backlog; + ucma_set_ctx_cm_id(ctx, cm_id); - /* only if context is pointing to cm_id that it owns it and can be - * queued to be closed, otherwise that cm_id is an inflight one that - * is part of that context event list pending to be detached and - * reattached to its new context as part of ucma_get_event, - * handled separately below. - */ - if (ctx->cm_id == cm_id) { - mutex_lock(&mut); - ctx->closing = 1; - mutex_unlock(&mut); - queue_work(ctx->file->close_wq, &ctx->close_work); - return; - } + uevent = ucma_create_uevent(listen_ctx, event); + if (!uevent) + goto err_alloc; + uevent->conn_req_ctx = ctx; + uevent->resp.id = ctx->id; - list_for_each_entry(con_req_eve, &ctx->file->event_list, list) { - if (con_req_eve->cm_id == cm_id && - con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) { - list_del(&con_req_eve->list); - INIT_WORK(&con_req_eve->close_work, ucma_close_event_id); - queue_work(ctx->file->close_wq, &con_req_eve->close_work); - event_found = 1; - break; - } - } - if (!event_found) - pr_err("ucma_removal_event_handler: warning: connect request event wasn't found\n"); + ctx->cm_id->context = ctx; + + mutex_lock(&ctx->file->mut); + ucma_finish_ctx(ctx); + list_add_tail(&uevent->list, &ctx->file->event_list); + mutex_unlock(&ctx->file->mut); + wake_up_interruptible(&ctx->file->poll_wait); + return 0; + +err_alloc: + ucma_destroy_private_ctx(ctx); +err_backlog: + atomic_inc(&listen_ctx->backlog); + /* Returning error causes the new ID to be destroyed */ + return -ENOMEM; } static int ucma_event_handler(struct rdma_cm_id *cm_id, @@ -345,69 +341,49 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, { struct ucma_event *uevent; struct ucma_context *ctx = cm_id->context; - int ret = 0; - - uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); - if (!uevent) - return event->event == RDMA_CM_EVENT_CONNECT_REQUEST; - mutex_lock(&ctx->file->mut); - uevent->cm_id = cm_id; - ucma_set_event_context(ctx, event, uevent); - uevent->resp.event = event->event; - uevent->resp.status = event->status; - if (cm_id->qp_type == IB_QPT_UD) - ucma_copy_ud_event(cm_id->device, &uevent->resp.param.ud, - &event->param.ud); - else - ucma_copy_conn_event(&uevent->resp.param.conn, - &event->param.conn); + if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) + return ucma_connect_event_handler(cm_id, event); - if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) { - if (!ctx->backlog) { - ret = -ENOMEM; - kfree(uevent); - goto out; - } - ctx->backlog--; - } else if (!ctx->uid || ctx->cm_id != cm_id) { - /* - * We ignore events for new connections until userspace has set - * their context. This can only happen if an error occurs on a - * new connection before the user accepts it. This is okay, - * since the accept will just fail later. However, we do need - * to release the underlying HW resources in case of a device - * removal event. 
- */ - if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) - ucma_removal_event_handler(cm_id); - - kfree(uevent); - goto out; + /* + * We ignore events for new connections until userspace has set their + * context. This can only happen if an error occurs on a new connection + * before the user accepts it. This is okay, since the accept will just + * fail later. However, we do need to release the underlying HW + * resources in case of a device removal event. + */ + if (ctx->uid) { + uevent = ucma_create_uevent(ctx, event); + if (!uevent) + return 0; + + mutex_lock(&ctx->file->mut); + list_add_tail(&uevent->list, &ctx->file->event_list); + mutex_unlock(&ctx->file->mut); + wake_up_interruptible(&ctx->file->poll_wait); } - list_add_tail(&uevent->list, &ctx->file->event_list); - wake_up_interruptible(&ctx->file->poll_wait); - if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) - ucma_removal_event_handler(cm_id); -out: - mutex_unlock(&ctx->file->mut); - return ret; + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) { + xa_lock(&ctx_table); + if (xa_load(&ctx_table, ctx->id) == ctx) + queue_work(system_dfl_wq, &ctx->close_work); + xa_unlock(&ctx_table); + } + return 0; } static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { - struct ucma_context *ctx; struct rdma_ucm_get_event cmd; struct ucma_event *uevent; - int ret = 0; /* * Old 32 bit user space does not send the 4 byte padding in the * reserved field. We don't care, allow it to keep working. */ - if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved)) + if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved) - + sizeof(uevent->resp.ece)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) @@ -427,35 +403,25 @@ static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, mutex_lock(&file->mut); } - uevent = list_entry(file->event_list.next, struct ucma_event, list); - - if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) { - ctx = ucma_alloc_ctx(file); - if (!ctx) { - ret = -ENOMEM; - goto done; - } - uevent->ctx->backlog++; - ctx->cm_id = uevent->cm_id; - ctx->cm_id->context = ctx; - uevent->resp.id = ctx->id; - } + uevent = list_first_entry(&file->event_list, struct ucma_event, list); if (copy_to_user(u64_to_user_ptr(cmd.response), &uevent->resp, min_t(size_t, out_len, sizeof(uevent->resp)))) { - ret = -EFAULT; - goto done; + mutex_unlock(&file->mut); + return -EFAULT; } list_del(&uevent->list); uevent->ctx->events_reported++; if (uevent->mc) uevent->mc->events_reported++; - kfree(uevent); -done: + if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) + atomic_inc(&uevent->ctx->backlog); mutex_unlock(&file->mut); - return ret; + + kfree(uevent); + return 0; } static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type) @@ -496,40 +462,32 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, if (ret) return ret; - mutex_lock(&file->mut); ctx = ucma_alloc_ctx(file); - mutex_unlock(&file->mut); if (!ctx) return -ENOMEM; ctx->uid = cmd.uid; - cm_id = __rdma_create_id(current->nsproxy->net_ns, - ucma_event_handler, ctx, cmd.ps, qp_type, NULL); + cm_id = rdma_create_user_id(ucma_event_handler, ctx, cmd.ps, qp_type); if (IS_ERR(cm_id)) { ret = PTR_ERR(cm_id); goto err1; } + ucma_set_ctx_cm_id(ctx, cm_id); resp.id = ctx->id; if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) { ret = -EFAULT; - goto err2; + goto err1; } - ctx->cm_id = cm_id; + mutex_lock(&file->mut); 
+ ucma_finish_ctx(ctx); + mutex_unlock(&file->mut); return 0; -err2: - rdma_destroy_id(cm_id); err1: - mutex_lock(&mut); - idr_remove(&ctx_idr, ctx->id); - mutex_unlock(&mut); - mutex_lock(&file->mut); - list_del(&ctx->list); - mutex_unlock(&file->mut); - kfree(ctx); + ucma_destroy_private_ctx(ctx); return ret; } @@ -537,19 +495,25 @@ static void ucma_cleanup_multicast(struct ucma_context *ctx) { struct ucma_multicast *mc, *tmp; - mutex_lock(&mut); + xa_lock(&multicast_table); list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) { list_del(&mc->list); - idr_remove(&multicast_idr, mc->id); + /* + * At this point mc->ctx->ref is 0 so the mc cannot leave the + * lock on the reader and this is enough serialization + */ + __xa_erase(&multicast_table, mc->id); kfree(mc); } - mutex_unlock(&mut); + xa_unlock(&multicast_table); } static void ucma_cleanup_mc_events(struct ucma_multicast *mc) { struct ucma_event *uevent, *tmp; + rdma_lock_handler(mc->ctx->cm_id); + mutex_lock(&mc->ctx->file->mut); list_for_each_entry_safe(uevent, tmp, &mc->ctx->file->event_list, list) { if (uevent->mc != mc) continue; @@ -557,45 +521,75 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc) list_del(&uevent->list); kfree(uevent); } + mutex_unlock(&mc->ctx->file->mut); + rdma_unlock_handler(mc->ctx->cm_id); } -/* - * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At - * this point, no new events will be reported from the hardware. However, we - * still need to cleanup the UCMA context for this ID. Specifically, there - * might be events that have not yet been consumed by the user space software. - * These might include pending connect requests which we have not completed - * processing. We cannot call rdma_destroy_id while holding the lock of the - * context (file->mut), as it might cause a deadlock. We therefore extract all - * relevant events from the context pending events list while holding the - * mutex. After that we release them as needed. - */ -static int ucma_free_ctx(struct ucma_context *ctx) +static int ucma_cleanup_ctx_events(struct ucma_context *ctx) { int events_reported; struct ucma_event *uevent, *tmp; LIST_HEAD(list); - - ucma_cleanup_multicast(ctx); - - /* Cleanup events not yet reported to the user. */ + /* Cleanup events not yet reported to the user.*/ mutex_lock(&ctx->file->mut); list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) { - if (uevent->ctx == ctx) + if (uevent->ctx != ctx) + continue; + + if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST && + xa_cmpxchg(&ctx_table, uevent->conn_req_ctx->id, + uevent->conn_req_ctx, XA_ZERO_ENTRY, + GFP_KERNEL) == uevent->conn_req_ctx) { list_move_tail(&uevent->list, &list); + continue; + } + list_del(&uevent->list); + kfree(uevent); } list_del(&ctx->list); + events_reported = ctx->events_reported; mutex_unlock(&ctx->file->mut); + /* + * If this was a listening ID then any connections spawned from it that + * have not been delivered to userspace are cleaned up too. Must be done + * outside any locks. + */ list_for_each_entry_safe(uevent, tmp, &list, list) { - list_del(&uevent->list); - if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) - rdma_destroy_id(uevent->cm_id); + ucma_destroy_private_ctx(uevent->conn_req_ctx); kfree(uevent); } + return events_reported; +} - events_reported = ctx->events_reported; +/* + * When this is called the xarray must have a XA_ZERO_ENTRY in the ctx->id (ie + * the ctx is not public to the user). 
This either because: + * - ucma_finish_ctx() hasn't been called + * - xa_cmpxchg() succeed to remove the entry (only one thread can succeed) + */ +static int ucma_destroy_private_ctx(struct ucma_context *ctx) +{ + int events_reported; + + /* + * Destroy the underlying cm_id. New work queuing is prevented now by + * the removal from the xarray. Once the work is cancled ref will either + * be 0 because the work ran to completion and consumed the ref from the + * xarray, or it will be positive because we still have the ref from the + * xarray. This can also be 0 in cases where cm_id was never set + */ + cancel_work_sync(&ctx->close_work); + if (refcount_read(&ctx->ref)) + ucma_close_id(&ctx->close_work); + + events_reported = ucma_cleanup_ctx_events(ctx); + ucma_cleanup_multicast(ctx); + + WARN_ON(xa_cmpxchg(&ctx_table, ctx->id, XA_ZERO_ENTRY, NULL, + GFP_KERNEL) != NULL); + mutex_destroy(&ctx->mutex); kfree(ctx); return events_reported; } @@ -614,33 +608,19 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; - mutex_lock(&mut); + xa_lock(&ctx_table); ctx = _ucma_find_context(cmd.id, file); - if (!IS_ERR(ctx)) - idr_remove(&ctx_idr, ctx->id); - mutex_unlock(&mut); + if (!IS_ERR(ctx)) { + if (__xa_cmpxchg(&ctx_table, ctx->id, ctx, XA_ZERO_ENTRY, + GFP_KERNEL) != ctx) + ctx = ERR_PTR(-ENOENT); + } + xa_unlock(&ctx_table); if (IS_ERR(ctx)) return PTR_ERR(ctx); - mutex_lock(&ctx->file->mut); - ctx->destroying = 1; - mutex_unlock(&ctx->file->mut); - - flush_workqueue(ctx->file->close_wq); - /* At this point it's guaranteed that there is no inflight - * closing task */ - mutex_lock(&mut); - if (!ctx->closing) { - mutex_unlock(&mut); - ucma_put_ctx(ctx); - wait_for_completion(&ctx->comp); - rdma_destroy_id(ctx->cm_id); - } else { - mutex_unlock(&mut); - } - - resp.events_reported = ucma_free_ctx(ctx); + resp.events_reported = ucma_destroy_private_ctx(ctx); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; @@ -665,7 +645,10 @@ static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); + mutex_unlock(&ctx->mutex); + ucma_put_ctx(ctx); return ret; } @@ -688,7 +671,9 @@ static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -712,8 +697,10 @@ static ssize_t ucma_resolve_ip(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -738,8 +725,32 @@ static ssize_t ucma_resolve_addr(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms); + mutex_unlock(&ctx->mutex); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_resolve_ib_service(struct ucma_file *file, + const char __user *inbuf, int in_len, + int out_len) +{ + struct rdma_ucm_resolve_ib_service cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + 
return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&ctx->mutex); + ret = rdma_resolve_ib_service(ctx->cm_id, &cmd.ibs); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -759,7 +770,9 @@ static ssize_t ucma_resolve_route(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); ret = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -769,8 +782,8 @@ static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, { struct rdma_dev_addr *dev_addr; - resp->num_paths = route->num_paths; - switch (route->num_paths) { + resp->num_paths = route->num_pri_alt_paths; + switch (route->num_pri_alt_paths) { case 0: dev_addr = &route->addr.dev_addr; rdma_addr_get_dgid(dev_addr, @@ -782,7 +795,7 @@ static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, case 2: ib_copy_path_rec_to_user(&resp->ib_route[1], &route->path_rec[1]); - /* fall through */ + fallthrough; case 1: ib_copy_path_rec_to_user(&resp->ib_route[0], &route->path_rec[0]); @@ -796,8 +809,8 @@ static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { - resp->num_paths = route->num_paths; - switch (route->num_paths) { + resp->num_paths = route->num_pri_alt_paths; + switch (route->num_pri_alt_paths) { case 0: rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr, (union ib_gid *)&resp->ib_route[0].dgid); @@ -808,7 +821,7 @@ static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, case 2: ib_copy_path_rec_to_user(&resp->ib_route[1], &route->path_rec[1]); - /* fall through */ + fallthrough; case 1: ib_copy_path_rec_to_user(&resp->ib_route[0], &route->path_rec[0]); @@ -838,7 +851,7 @@ static ssize_t ucma_query_route(struct ucma_file *file, struct sockaddr *addr; int ret = 0; - if (out_len < sizeof(resp)) + if (out_len < offsetof(struct rdma_ucm_query_route_resp, ibdev_index)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) @@ -848,6 +861,7 @@ static ssize_t ucma_query_route(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); memset(&resp, 0, sizeof resp); addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ? 
@@ -861,6 +875,7 @@ static ssize_t ucma_query_route(struct ucma_file *file, goto out; resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; + resp.ibdev_index = ctx->cm_id->device->index; resp.port_num = ctx->cm_id->port_num; if (rdma_cap_ib_sa(ctx->cm_id->device, ctx->cm_id->port_num)) @@ -871,8 +886,9 @@ static ssize_t ucma_query_route(struct ucma_file *file, ucma_copy_iw_route(&resp, &ctx->cm_id->route); out: - if (copy_to_user(u64_to_user_ptr(cmd.response), - &resp, sizeof(resp))) + mutex_unlock(&ctx->mutex); + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, + min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; ucma_put_ctx(ctx); @@ -886,6 +902,7 @@ static void ucma_query_device_addr(struct rdma_cm_id *cm_id, return; resp->node_guid = (__force __u64) cm_id->device->node_guid; + resp->ibdev_index = cm_id->device->index; resp->port_num = cm_id->port_num; resp->pkey = (__force __u16) cpu_to_be16( ib_addr_get_pkey(&cm_id->route.addr.dev_addr)); @@ -898,7 +915,7 @@ static ssize_t ucma_query_addr(struct ucma_context *ctx, struct sockaddr *addr; int ret = 0; - if (out_len < sizeof(resp)) + if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index)) return -ENOSPC; memset(&resp, 0, sizeof resp); @@ -913,7 +930,7 @@ static ssize_t ucma_query_addr(struct ucma_context *ctx, ucma_query_device_addr(ctx->cm_id, &resp); - if (copy_to_user(response, &resp, sizeof(resp))) + if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; return ret; @@ -932,7 +949,7 @@ static ssize_t ucma_query_path(struct ucma_context *ctx, if (!resp) return -ENOMEM; - resp->num_paths = ctx->cm_id->route.num_paths; + resp->num_paths = ctx->cm_id->route.num_pri_alt_paths; for (i = 0, out_len -= sizeof(*resp); i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data); i++, out_len -= sizeof(struct ib_path_rec_data)) { @@ -951,8 +968,7 @@ static ssize_t ucma_query_path(struct ucma_context *ctx, } } - if (copy_to_user(response, resp, - sizeof(*resp) + (i * sizeof(struct ib_path_rec_data)))) + if (copy_to_user(response, resp, struct_size(resp, path_data, i))) ret = -EFAULT; kfree(resp); @@ -966,7 +982,7 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, struct sockaddr_ib *addr; int ret = 0; - if (out_len < sizeof(resp)) + if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index)) return -ENOSPC; memset(&resp, 0, sizeof resp); @@ -999,12 +1015,49 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, &ctx->cm_id->route.addr.dst_addr); } - if (copy_to_user(response, &resp, sizeof(resp))) + if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; return ret; } +static ssize_t ucma_query_ib_service(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_ib_service_resp *resp; + int n, ret = 0; + + if (out_len < sizeof(struct rdma_ucm_query_ib_service_resp)) + return -ENOSPC; + + if (!ctx->cm_id->route.service_recs) + return -ENODATA; + + resp = kzalloc(out_len, GFP_KERNEL); + if (!resp) + return -ENOMEM; + + resp->num_service_recs = ctx->cm_id->route.num_service_recs; + + n = (out_len - sizeof(struct rdma_ucm_query_ib_service_resp)) / + sizeof(struct ib_user_service_rec); + + if (!n) + goto out; + + if (n > ctx->cm_id->route.num_service_recs) + n = ctx->cm_id->route.num_service_recs; + + memcpy(resp->recs, ctx->cm_id->route.service_recs, + sizeof(*resp->recs) * n); + if (copy_to_user(response, resp, struct_size(resp, recs, n))) + ret = -EFAULT; + +out: + kfree(resp); + return ret; 
+} + static ssize_t ucma_query(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) @@ -1022,6 +1075,7 @@ static ssize_t ucma_query(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); switch (cmd.option) { case RDMA_USER_CM_QUERY_ADDR: ret = ucma_query_addr(ctx, response, out_len); @@ -1032,10 +1086,14 @@ static ssize_t ucma_query(struct ucma_file *file, case RDMA_USER_CM_QUERY_GID: ret = ucma_query_gid(ctx, response, out_len); break; + case RDMA_USER_CM_QUERY_IB_SERVICE: + ret = ucma_query_ib_service(ctx, response, out_len); + break; default: ret = -ENOSYS; break; } + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; @@ -1047,25 +1105,30 @@ static void ucma_copy_conn_param(struct rdma_cm_id *id, { dst->private_data = src->private_data; dst->private_data_len = src->private_data_len; - dst->responder_resources =src->responder_resources; + dst->responder_resources = src->responder_resources; dst->initiator_depth = src->initiator_depth; dst->flow_control = src->flow_control; dst->retry_count = src->retry_count; dst->rnr_retry_count = src->rnr_retry_count; dst->srq = src->srq; - dst->qp_num = src->qp_num; + dst->qp_num = src->qp_num & 0xFFFFFF; dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0; } static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { - struct rdma_ucm_connect cmd; struct rdma_conn_param conn_param; + struct rdma_ucm_ece ece = {}; + struct rdma_ucm_connect cmd; struct ucma_context *ctx; + size_t in_size; int ret; - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + if (in_len < offsetofend(typeof(cmd), reserved)) + return -EINVAL; + in_size = min_t(size_t, in_len, sizeof(cmd)); + if (copy_from_user(&cmd, inbuf, in_size)) return -EFAULT; if (!cmd.conn_param.valid) @@ -1076,7 +1139,14 @@ static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, return PTR_ERR(ctx); ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); - ret = rdma_connect(ctx->cm_id, &conn_param); + if (offsetofend(typeof(cmd), ece) <= in_size) { + ece.vendor_id = cmd.ece.vendor_id; + ece.attr_mod = cmd.ece.attr_mod; + } + + mutex_lock(&ctx->mutex); + ret = rdma_connect_ece(ctx->cm_id, &conn_param, &ece); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -1095,9 +1165,13 @@ static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); - ctx->backlog = cmd.backlog > 0 && cmd.backlog < max_backlog ? 
- cmd.backlog : max_backlog; - ret = rdma_listen(ctx->cm_id, ctx->backlog); + if (cmd.backlog <= 0 || cmd.backlog > max_backlog) + cmd.backlog = max_backlog; + atomic_set(&ctx->backlog, cmd.backlog); + + mutex_lock(&ctx->mutex); + ret = rdma_listen(ctx->cm_id, cmd.backlog); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -1107,26 +1181,44 @@ static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf, { struct rdma_ucm_accept cmd; struct rdma_conn_param conn_param; + struct rdma_ucm_ece ece = {}; struct ucma_context *ctx; + size_t in_size; int ret; - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + if (in_len < offsetofend(typeof(cmd), reserved)) + return -EINVAL; + in_size = min_t(size_t, in_len, sizeof(cmd)); + if (copy_from_user(&cmd, inbuf, in_size)) return -EFAULT; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); + if (offsetofend(typeof(cmd), ece) <= in_size) { + ece.vendor_id = cmd.ece.vendor_id; + ece.attr_mod = cmd.ece.attr_mod; + } + if (cmd.conn_param.valid) { ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); - mutex_lock(&file->mut); - ret = __rdma_accept(ctx->cm_id, &conn_param, NULL); - if (!ret) + mutex_lock(&ctx->mutex); + rdma_lock_handler(ctx->cm_id); + ret = rdma_accept_ece(ctx->cm_id, &conn_param, &ece); + if (!ret) { + /* The uid must be set atomically with the handler */ ctx->uid = cmd.uid; - mutex_unlock(&file->mut); - } else - ret = __rdma_accept(ctx->cm_id, NULL, NULL); - + } + rdma_unlock_handler(ctx->cm_id); + mutex_unlock(&ctx->mutex); + } else { + mutex_lock(&ctx->mutex); + rdma_lock_handler(ctx->cm_id); + ret = rdma_accept_ece(ctx->cm_id, NULL, &ece); + rdma_unlock_handler(ctx->cm_id); + mutex_unlock(&ctx->mutex); + } ucma_put_ctx(ctx); return ret; } @@ -1141,11 +1233,25 @@ static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; + if (!cmd.reason) + cmd.reason = IB_CM_REJ_CONSUMER_DEFINED; + + switch (cmd.reason) { + case IB_CM_REJ_CONSUMER_DEFINED: + case IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED: + break; + default: + return -EINVAL; + } + ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len); + mutex_lock(&ctx->mutex); + ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len, + cmd.reason); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -1164,7 +1270,9 @@ static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); ret = rdma_disconnect(ctx->cm_id); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } @@ -1195,7 +1303,9 @@ static ssize_t ucma_init_qp_attr(struct ucma_file *file, resp.qp_attr_mask = 0; memset(&qp_attr, 0, sizeof qp_attr); qp_attr.qp_state = cmd.qp_state; + mutex_lock(&ctx->mutex); ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask); + mutex_unlock(&ctx->mutex); if (ret) goto out; @@ -1236,6 +1346,13 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname, } ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 
1 : 0); break; + case RDMA_OPTION_ID_ACK_TIMEOUT: + if (optlen != sizeof(u8)) { + ret = -EINVAL; + break; + } + ret = rdma_set_ack_timeout(ctx->cm_id, *((u8 *)optval)); + break; default: ret = -ENOSYS; } @@ -1274,9 +1391,13 @@ static int ucma_set_ib_path(struct ucma_context *ctx, struct sa_path_rec opa; sa_convert_path_ib_to_opa(&opa, &sa_path); + mutex_lock(&ctx->mutex); ret = rdma_set_ib_path(ctx->cm_id, &opa); + mutex_unlock(&ctx->mutex); } else { + mutex_lock(&ctx->mutex); ret = rdma_set_ib_path(ctx->cm_id, &sa_path); + mutex_unlock(&ctx->mutex); } if (ret) return ret; @@ -1309,7 +1430,9 @@ static int ucma_set_option_level(struct ucma_context *ctx, int level, switch (level) { case RDMA_OPTION_ID: + mutex_lock(&ctx->mutex); ret = ucma_set_option_id(ctx, optname, optval, optlen); + mutex_unlock(&ctx->mutex); break; case RDMA_OPTION_IB: ret = ucma_set_option_ib(ctx, optname, optval, optlen); @@ -1369,8 +1492,10 @@ static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); + mutex_lock(&ctx->mutex); if (ctx->cm_id->device) ret = rdma_notify(ctx->cm_id, (enum ib_event_type)cmd.event); + mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; @@ -1404,46 +1529,59 @@ static ssize_t ucma_process_join(struct ucma_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); - mutex_lock(&file->mut); - mc = ucma_alloc_multicast(ctx); + mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) { ret = -ENOMEM; - goto err1; + goto err_put_ctx; } + + mc->ctx = ctx; mc->join_state = join_state; mc->uid = cmd->uid; memcpy(&mc->addr, addr, cmd->addr_size); + + xa_lock(&multicast_table); + if (__xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, + GFP_KERNEL)) { + ret = -ENOMEM; + goto err_free_mc; + } + + list_add_tail(&mc->list, &ctx->mc_list); + xa_unlock(&multicast_table); + + mutex_lock(&ctx->mutex); ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr, join_state, mc); + mutex_unlock(&ctx->mutex); if (ret) - goto err2; + goto err_xa_erase; resp.id = mc->id; if (copy_to_user(u64_to_user_ptr(cmd->response), &resp, sizeof(resp))) { ret = -EFAULT; - goto err3; + goto err_leave_multicast; } - mutex_lock(&mut); - idr_replace(&multicast_idr, mc, mc->id); - mutex_unlock(&mut); + xa_store(&multicast_table, mc->id, mc, 0); - mutex_unlock(&file->mut); ucma_put_ctx(ctx); return 0; -err3: +err_leave_multicast: + mutex_lock(&ctx->mutex); rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr); + mutex_unlock(&ctx->mutex); ucma_cleanup_mc_events(mc); -err2: - mutex_lock(&mut); - idr_remove(&multicast_idr, mc->id); - mutex_unlock(&mut); +err_xa_erase: + xa_lock(&multicast_table); list_del(&mc->list); + __xa_erase(&multicast_table, mc->id); +err_free_mc: + xa_unlock(&multicast_table); kfree(mc); -err1: - mutex_unlock(&file->mut); +err_put_ctx: ucma_put_ctx(ctx); return ret; } @@ -1501,28 +1639,30 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; - mutex_lock(&mut); - mc = idr_find(&multicast_idr, cmd.id); + xa_lock(&multicast_table); + mc = xa_load(&multicast_table, cmd.id); if (!mc) mc = ERR_PTR(-ENOENT); - else if (mc->ctx->file != file) + else if (READ_ONCE(mc->ctx->file) != file) mc = ERR_PTR(-EINVAL); - else if (!atomic_inc_not_zero(&mc->ctx->ref)) + else if (!refcount_inc_not_zero(&mc->ctx->ref)) mc = ERR_PTR(-ENXIO); - else - idr_remove(&multicast_idr, mc->id); - mutex_unlock(&mut); if (IS_ERR(mc)) { + xa_unlock(&multicast_table); ret = PTR_ERR(mc); goto out; } + 
list_del(&mc->list); + __xa_erase(&multicast_table, mc->id); + xa_unlock(&multicast_table); + + mutex_lock(&mc->ctx->mutex); rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr); - mutex_lock(&mc->ctx->file->mut); + mutex_unlock(&mc->ctx->mutex); + ucma_cleanup_mc_events(mc); - list_del(&mc->list); - mutex_unlock(&mc->ctx->file->mut); ucma_put_ctx(mc->ctx); resp.events_reported = mc->events_reported; @@ -1535,46 +1675,15 @@ out: return ret; } -static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2) -{ - /* Acquire mutex's based on pointer comparison to prevent deadlock. */ - if (file1 < file2) { - mutex_lock(&file1->mut); - mutex_lock_nested(&file2->mut, SINGLE_DEPTH_NESTING); - } else { - mutex_lock(&file2->mut); - mutex_lock_nested(&file1->mut, SINGLE_DEPTH_NESTING); - } -} - -static void ucma_unlock_files(struct ucma_file *file1, struct ucma_file *file2) -{ - if (file1 < file2) { - mutex_unlock(&file2->mut); - mutex_unlock(&file1->mut); - } else { - mutex_unlock(&file1->mut); - mutex_unlock(&file2->mut); - } -} - -static void ucma_move_events(struct ucma_context *ctx, struct ucma_file *file) -{ - struct ucma_event *uevent, *tmp; - - list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) - if (uevent->ctx == ctx) - list_move_tail(&uevent->list, &file->event_list); -} - static ssize_t ucma_migrate_id(struct ucma_file *new_file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_migrate_id cmd; struct rdma_ucm_migrate_resp resp; + struct ucma_event *uevent, *tmp; struct ucma_context *ctx; - struct fd f; + LIST_HEAD(event_list); struct ucma_file *cur_file; int ret = 0; @@ -1582,50 +1691,106 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file, return -EFAULT; /* Get current fd to protect against it being closed */ - f = fdget(cmd.fd); - if (!f.file) + CLASS(fd, f)(cmd.fd); + if (fd_empty(f)) return -ENOENT; - if (f.file->f_op != &ucma_fops) { - ret = -EINVAL; - goto file_put; - } + if (fd_file(f)->f_op != &ucma_fops) + return -EINVAL; + cur_file = fd_file(f)->private_data; /* Validate current fd and prevent destruction of id. */ - ctx = ucma_get_ctx(f.file->private_data, cmd.id); - if (IS_ERR(ctx)) { - ret = PTR_ERR(ctx); - goto file_put; - } + ctx = ucma_get_ctx(cur_file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); - cur_file = ctx->file; - if (cur_file == new_file) { - resp.events_reported = ctx->events_reported; - goto response; + rdma_lock_handler(ctx->cm_id); + /* + * ctx->file can only be changed under the handler & xa_lock. xa_load() + * must be checked again to ensure the ctx hasn't begun destruction + * since the ucma_get_ctx(). + */ + xa_lock(&ctx_table); + if (_ucma_find_context(cmd.id, cur_file) != ctx) { + xa_unlock(&ctx_table); + ret = -ENOENT; + goto err_unlock; } + ctx->file = new_file; + xa_unlock(&ctx_table); + mutex_lock(&cur_file->mut); + list_del(&ctx->list); /* - * Migrate events between fd's, maintaining order, and avoiding new - * events being added before existing events. + * At this point lock_handler() prevents addition of new uevents for + * this ctx. 
*/ - ucma_lock_files(cur_file, new_file); - mutex_lock(&mut); - - list_move_tail(&ctx->list, &new_file->ctx_list); - ucma_move_events(ctx, new_file); - ctx->file = new_file; + list_for_each_entry_safe(uevent, tmp, &cur_file->event_list, list) + if (uevent->ctx == ctx) + list_move_tail(&uevent->list, &event_list); resp.events_reported = ctx->events_reported; + mutex_unlock(&cur_file->mut); - mutex_unlock(&mut); - ucma_unlock_files(cur_file, new_file); + mutex_lock(&new_file->mut); + list_add_tail(&ctx->list, &new_file->ctx_list); + list_splice_tail(&event_list, &new_file->event_list); + mutex_unlock(&new_file->mut); -response: if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; +err_unlock: + rdma_unlock_handler(ctx->cm_id); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_write_cm_event(struct ucma_file *file, + const char __user *inbuf, int in_len, + int out_len) +{ + struct rdma_ucm_write_cm_event cmd; + struct rdma_cm_event event = {}; + struct ucma_event *uevent; + struct ucma_context *ctx; + int ret = 0; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if ((cmd.event != RDMA_CM_EVENT_USER) && + (cmd.event != RDMA_CM_EVENT_INTERNAL)) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + event.event = cmd.event; + event.status = cmd.status; + event.param.arg = cmd.param.arg; + + uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); + if (!uevent) { + ret = -ENOMEM; + goto out; + } + + uevent->ctx = ctx; + uevent->resp.uid = ctx->uid; + uevent->resp.id = ctx->id; + uevent->resp.event = event.event; + uevent->resp.status = event.status; + memcpy(uevent->resp.param.arg32, &event.param.arg, + sizeof(event.param.arg)); + + mutex_lock(&ctx->file->mut); + list_add_tail(&uevent->list, &ctx->file->event_list); + mutex_unlock(&ctx->file->mut); + wake_up_interruptible(&ctx->file->poll_wait); + +out: ucma_put_ctx(ctx); -file_put: - fdput(f); return ret; } @@ -1654,7 +1819,9 @@ static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, [RDMA_USER_CM_CMD_QUERY] = ucma_query, [RDMA_USER_CM_CMD_BIND] = ucma_bind, [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, - [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast + [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast, + [RDMA_USER_CM_CMD_RESOLVE_IB_SERVICE] = ucma_resolve_ib_service, + [RDMA_USER_CM_CMD_WRITE_CM_EVENT] = ucma_write_cm_event, }; static ssize_t ucma_write(struct file *filp, const char __user *buf, @@ -1665,8 +1832,8 @@ static ssize_t ucma_write(struct file *filp, const char __user *buf, ssize_t ret; if (!ib_safe_file_access(filp)) { - pr_err_once("ucma_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", - task_tgid_vnr(current), current->comm); + pr_err_once("%s: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", + __func__, task_tgid_vnr(current), current->comm); return -EACCES; } @@ -1722,13 +1889,6 @@ static int ucma_open(struct inode *inode, struct file *filp) if (!file) return -ENOMEM; - file->close_wq = alloc_ordered_workqueue("ucma_close_id", - WQ_MEM_RECLAIM); - if (!file->close_wq) { - kfree(file); - return -ENOMEM; - } - INIT_LIST_HEAD(&file->event_list); INIT_LIST_HEAD(&file->ctx_list); init_waitqueue_head(&file->poll_wait); @@ -1737,46 +1897,29 @@ static int ucma_open(struct inode *inode, struct file *filp) filp->private_data = file; file->filp = filp; - return nonseekable_open(inode, filp); + return 
stream_open(inode, filp); } static int ucma_close(struct inode *inode, struct file *filp) { struct ucma_file *file = filp->private_data; - struct ucma_context *ctx, *tmp; - mutex_lock(&file->mut); - list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) { - ctx->destroying = 1; - mutex_unlock(&file->mut); - - mutex_lock(&mut); - idr_remove(&ctx_idr, ctx->id); - mutex_unlock(&mut); - - flush_workqueue(file->close_wq); - /* At that step once ctx was marked as destroying and workqueue - * was flushed we are safe from any inflights handlers that - * might put other closing task. - */ - mutex_lock(&mut); - if (!ctx->closing) { - mutex_unlock(&mut); - ucma_put_ctx(ctx); - wait_for_completion(&ctx->comp); - /* rdma_destroy_id ensures that no event handlers are - * inflight for that id before releasing it. - */ - rdma_destroy_id(ctx->cm_id); - } else { - mutex_unlock(&mut); - } + /* + * All paths that touch ctx_list or ctx_list starting from write() are + * prevented by this being a FD release function. The list_add_tail() in + * ucma_connect_event_handler() can run concurrently, however it only + * adds to the list *after* a listening ID. By only reading the first of + * the list, and relying on ucma_destroy_private_ctx() to block + * ucma_connect_event_handler(), no additional locking is needed. + */ + while (!list_empty(&file->ctx_list)) { + struct ucma_context *ctx = list_first_entry( + &file->ctx_list, struct ucma_context, list); - ucma_free_ctx(ctx); - mutex_lock(&file->mut); + WARN_ON(xa_cmpxchg(&ctx_table, ctx->id, ctx, XA_ZERO_ENTRY, + GFP_KERNEL) != ctx); + ucma_destroy_private_ctx(ctx); } - mutex_unlock(&file->mut); - destroy_workqueue(file->close_wq); kfree(file); return 0; } @@ -1787,7 +1930,6 @@ static const struct file_operations ucma_fops = { .release = ucma_close, .write = ucma_write, .poll = ucma_poll, - .llseek = no_llseek, }; static struct miscdevice ucma_misc = { @@ -1798,13 +1940,25 @@ static struct miscdevice ucma_misc = { .fops = &ucma_fops, }; -static ssize_t show_abi_version(struct device *dev, - struct device_attribute *attr, - char *buf) +static int ucma_get_global_nl_info(struct ib_client_nl_info *res) { - return sprintf(buf, "%d\n", RDMA_USER_CM_ABI_VERSION); + res->abi = RDMA_USER_CM_ABI_VERSION; + res->cdev = ucma_misc.this_device; + return 0; } -static DEVICE_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); + +static struct ib_client rdma_cma_client = { + .name = "rdma_cm", + .get_global_nl_info = ucma_get_global_nl_info, +}; +MODULE_ALIAS_RDMA_CLIENT("rdma_cm"); + +static ssize_t abi_version_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", RDMA_USER_CM_ABI_VERSION); +} +static DEVICE_ATTR_RO(abi_version); static int __init ucma_init(void) { @@ -1826,7 +1980,14 @@ static int __init ucma_init(void) ret = -ENOMEM; goto err2; } + + ret = ib_register_client(&rdma_cma_client); + if (ret) + goto err3; + return 0; +err3: + unregister_net_sysctl_table(ucma_ctl_table_hdr); err2: device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); err1: @@ -1836,11 +1997,10 @@ err1: static void __exit ucma_cleanup(void) { + ib_unregister_client(&rdma_cma_client); unregister_net_sysctl_table(ucma_ctl_table_hdr); device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); misc_deregister(&ucma_misc); - idr_destroy(&ctx_idr); - idr_destroy(&multicast_idr); } module_init(ucma_init); diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 29a45d2f8898..8d3dfef9ebaa 100644 --- 
a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -41,7 +41,7 @@ #define STRUCT_FIELD(header, field) \ .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \ - .struct_size_bytes = sizeof ((struct ib_unpacked_ ## header *) 0)->field, \ + .struct_size_bytes = sizeof_field(struct ib_unpacked_ ## header, field), \ .field_name = #header ":" #field static const struct ib_field lrh_table[] = { @@ -462,86 +462,3 @@ int ib_ud_header_pack(struct ib_ud_header *header, return len; } EXPORT_SYMBOL(ib_ud_header_pack); - -/** - * ib_ud_header_unpack - Unpack UD header struct from wire format - * @header:UD header struct - * @buf:Buffer to pack into - * - * ib_ud_header_pack() unpacks the UD header structure @header from wire - * format in the buffer @buf. - */ -int ib_ud_header_unpack(void *buf, - struct ib_ud_header *header) -{ - ib_unpack(lrh_table, ARRAY_SIZE(lrh_table), - buf, &header->lrh); - buf += IB_LRH_BYTES; - - if (header->lrh.link_version != 0) { - pr_warn("Invalid LRH.link_version %d\n", - header->lrh.link_version); - return -EINVAL; - } - - switch (header->lrh.link_next_header) { - case IB_LNH_IBA_LOCAL: - header->grh_present = 0; - break; - - case IB_LNH_IBA_GLOBAL: - header->grh_present = 1; - ib_unpack(grh_table, ARRAY_SIZE(grh_table), - buf, &header->grh); - buf += IB_GRH_BYTES; - - if (header->grh.ip_version != 6) { - pr_warn("Invalid GRH.ip_version %d\n", - header->grh.ip_version); - return -EINVAL; - } - if (header->grh.next_header != 0x1b) { - pr_warn("Invalid GRH.next_header 0x%02x\n", - header->grh.next_header); - return -EINVAL; - } - break; - - default: - pr_warn("Invalid LRH.link_next_header %d\n", - header->lrh.link_next_header); - return -EINVAL; - } - - ib_unpack(bth_table, ARRAY_SIZE(bth_table), - buf, &header->bth); - buf += IB_BTH_BYTES; - - switch (header->bth.opcode) { - case IB_OPCODE_UD_SEND_ONLY: - header->immediate_present = 0; - break; - case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE: - header->immediate_present = 1; - break; - default: - pr_warn("Invalid BTH.opcode 0x%02x\n", header->bth.opcode); - return -EINVAL; - } - - if (header->bth.transport_header_version != 0) { - pr_warn("Invalid BTH.transport_header_version %d\n", - header->bth.transport_header_version); - return -EINVAL; - } - - ib_unpack(deth_table, ARRAY_SIZE(deth_table), - buf, &header->deth); - buf += IB_DETH_BYTES; - - if (header->immediate_present) - memcpy(&header->immediate_data, buf, sizeof header->immediate_data); - - return 0; -} -EXPORT_SYMBOL(ib_ud_header_unpack); diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index c6144df47ea4..8137031c2a65 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -2,6 +2,7 @@ * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Cisco Systems. All rights reserved. * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2020 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -37,66 +38,142 @@ #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/export.h> -#include <linux/hugetlb.h> #include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/count_zeros.h> #include <rdma/ib_umem_odp.h> #include "uverbs.h" +#define RESCHED_LOOP_CNT_THRESHOLD 0x1000 static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { + bool make_dirty = umem->writable && dirty; struct scatterlist *sg; - struct page *page; + unsigned int i; + + if (dirty) + ib_dma_unmap_sgtable_attrs(dev, &umem->sgt_append.sgt, + DMA_BIDIRECTIONAL, 0); + + for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) { + unpin_user_page_range_dirty_lock(sg_page(sg), + DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty); + + if (i && !(i % RESCHED_LOOP_CNT_THRESHOLD)) + cond_resched(); + } + + sg_free_append_table(&umem->sgt_append); +} + +/** + * ib_umem_find_best_pgsz - Find best HW page size to use for this MR + * + * @umem: umem struct + * @pgsz_bitmap: bitmap of HW supported page sizes + * @virt: IOVA + * + * This helper is intended for HW that support multiple page + * sizes but can do only a single page size in an MR. + * + * Returns 0 if the umem requires page sizes not supported by + * the driver to be mapped. Drivers always supporting PAGE_SIZE + * or smaller will never see a 0 result. + */ +unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, + unsigned long pgsz_bitmap, + unsigned long virt) +{ + unsigned long curr_len = 0; + dma_addr_t curr_base = ~0; + unsigned long va, pgoff; + struct scatterlist *sg; + dma_addr_t mask; + dma_addr_t end; int i; - if (umem->nmap > 0) - ib_dma_unmap_sg(dev, umem->sg_head.sgl, - umem->npages, - DMA_BIDIRECTIONAL); + umem->iova = va = virt; + + if (umem->is_odp) { + unsigned int page_size = BIT(to_ib_umem_odp(umem)->page_shift); + + /* ODP must always be self consistent. */ + if (!(pgsz_bitmap & page_size)) + return 0; + return page_size; + } + + /* The best result is the smallest page size that results in the minimum + * number of required pages. Compute the largest page size that could + * work based on VA address bits that don't change. + */ + mask = pgsz_bitmap & + GENMASK(BITS_PER_LONG - 1, + bits_per((umem->length - 1 + virt) ^ virt)); + /* offset into first SGL */ + pgoff = umem->address & ~PAGE_MASK; + + for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) { + /* If the current entry is physically contiguous with the previous + * one, no need to take its start addresses into consideration. + */ + if (check_add_overflow(curr_base, curr_len, &end) || + end != sg_dma_address(sg)) { + + curr_base = sg_dma_address(sg); + curr_len = 0; + + /* Reduce max page size if VA/PA bits differ */ + mask |= (curr_base + pgoff) ^ va; + + /* The alignment of any VA matching a discontinuity point + * in the physical memory sets the maximum possible page + * size as this must be a starting point of a new page that + * needs to be aligned. + */ + if (i != 0) + mask |= va; + } - for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { + curr_len += sg_dma_len(sg); + va += sg_dma_len(sg) - pgoff; - page = sg_page(sg); - if (!PageDirty(page) && umem->writable && dirty) - set_page_dirty_lock(page); - put_page(page); + pgoff = 0; } - sg_free_table(&umem->sg_head); + /* The mask accumulates 1's in each position where the VA and physical + * address differ, thus the length of trailing 0 is the largest page + * size that can pass the VA through to the physical. 
+ */ + if (mask) + pgsz_bitmap &= GENMASK(count_trailing_zeros(mask), 0); + return pgsz_bitmap ? rounddown_pow_of_two(pgsz_bitmap) : 0; } +EXPORT_SYMBOL(ib_umem_find_best_pgsz); /** * ib_umem_get - Pin and DMA map userspace memory. * - * If access flags indicate ODP memory, avoid pinning. Instead, stores - * the mm for future page fault handling in conjunction with MMU notifiers. - * - * @context: userspace context to pin memory for + * @device: IB device to connect UMEM * @addr: userspace virtual address to start at * @size: length of region to pin * @access: IB_ACCESS_xxx flags for memory being pinned - * @dmasync: flush in-flight DMA when the memory region is written */ -struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, - size_t size, int access, int dmasync) +struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, + size_t size, int access) { struct ib_umem *umem; struct page **page_list; - struct vm_area_struct **vma_list; unsigned long lock_limit; unsigned long new_pinned; unsigned long cur_base; + unsigned long dma_attr = 0; struct mm_struct *mm; unsigned long npages; - int ret; - int i; - unsigned long dma_attrs = 0; - struct scatterlist *sg, *sg_list_start; - unsigned int gup_flags = FOLL_WRITE; - - if (dmasync) - dma_attrs |= DMA_ATTR_WRITE_BARRIER; + int pinned, ret; + unsigned int gup_flags = FOLL_LONGTERM; /* * If the combination of the addr and size requested for this memory @@ -109,49 +186,30 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (!can_do_mlock()) return ERR_PTR(-EPERM); - if (access & IB_ACCESS_ON_DEMAND) { - umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); - if (!umem) - return ERR_PTR(-ENOMEM); - umem->is_odp = 1; - } else { - umem = kzalloc(sizeof(*umem), GFP_KERNEL); - if (!umem) - return ERR_PTR(-ENOMEM); - } + if (access & IB_ACCESS_ON_DEMAND) + return ERR_PTR(-EOPNOTSUPP); - umem->context = context; + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + umem->ibdev = device; umem->length = size; umem->address = addr; - umem->page_shift = PAGE_SHIFT; + /* + * Drivers should call ib_umem_find_best_pgsz() to set the iova + * correctly. 
+ */ + umem->iova = addr; umem->writable = ib_access_writable(access); umem->owning_mm = mm = current->mm; mmgrab(mm); - if (access & IB_ACCESS_ON_DEMAND) { - ret = ib_umem_odp_get(to_ib_umem_odp(umem), access); - if (ret) - goto umem_kfree; - return umem; - } - - /* We assume the memory is from hugetlb until proved otherwise */ - umem->hugetlb = 1; - page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) { ret = -ENOMEM; goto umem_kfree; } - /* - * if we can't alloc the vma_list, it's not so bad; - * just assume the memory is not hugetlb memory - */ - vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL); - if (!vma_list) - umem->hugetlb = 0; - npages = ib_umem_num_pages(umem); if (npages == 0 || npages > UINT_MAX) { ret = -EINVAL; @@ -160,80 +218,55 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - down_write(&mm->mmap_sem); - if (check_add_overflow(mm->pinned_vm, npages, &new_pinned) || - (new_pinned > lock_limit && !capable(CAP_IPC_LOCK))) { - up_write(&mm->mmap_sem); + new_pinned = atomic64_add_return(npages, &mm->pinned_vm); + if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) { + atomic64_sub(npages, &mm->pinned_vm); ret = -ENOMEM; goto out; } - mm->pinned_vm = new_pinned; - up_write(&mm->mmap_sem); cur_base = addr & PAGE_MASK; - ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); - if (ret) - goto vma; - - if (!umem->writable) - gup_flags |= FOLL_FORCE; - - sg_list_start = umem->sg_head.sgl; + if (umem->writable) + gup_flags |= FOLL_WRITE; while (npages) { - down_read(&mm->mmap_sem); - ret = get_user_pages_longterm(cur_base, - min_t(unsigned long, npages, - PAGE_SIZE / sizeof (struct page *)), - gup_flags, page_list, vma_list); - if (ret < 0) { - up_read(&mm->mmap_sem); + cond_resched(); + pinned = pin_user_pages_fast(cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / + sizeof(struct page *)), + gup_flags, page_list); + if (pinned < 0) { + ret = pinned; goto umem_release; } - umem->npages += ret; - cur_base += ret * PAGE_SIZE; - npages -= ret; - - /* Continue to hold the mmap_sem as vma_list access - * needs to be protected. 
- */ - for_each_sg(sg_list_start, sg, ret, i) { - if (vma_list && !is_vm_hugetlb_page(vma_list[i])) - umem->hugetlb = 0; - - sg_set_page(sg, page_list[i], PAGE_SIZE, 0); + cur_base += pinned * PAGE_SIZE; + npages -= pinned; + ret = sg_alloc_append_table_from_pages( + &umem->sgt_append, page_list, pinned, 0, + pinned << PAGE_SHIFT, ib_dma_max_seg_size(device), + npages, GFP_KERNEL); + if (ret) { + unpin_user_pages_dirty_lock(page_list, pinned, 0); + goto umem_release; } - up_read(&mm->mmap_sem); - - /* preparing for next loop */ - sg_list_start = sg; } - umem->nmap = ib_dma_map_sg_attrs(context->device, - umem->sg_head.sgl, - umem->npages, - DMA_BIDIRECTIONAL, - dma_attrs); + if (access & IB_ACCESS_RELAXED_ORDERING) + dma_attr |= DMA_ATTR_WEAK_ORDERING; - if (!umem->nmap) { - ret = -ENOMEM; + ret = ib_dma_map_sgtable_attrs(device, &umem->sgt_append.sgt, + DMA_BIDIRECTIONAL, dma_attr); + if (ret) goto umem_release; - } - - ret = 0; goto out; umem_release: - __ib_umem_release(context->device, umem, 0); -vma: - down_write(&mm->mmap_sem); - mm->pinned_vm -= ib_umem_num_pages(umem); - up_write(&mm->mmap_sem); + __ib_umem_release(device, umem, 0); + atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm); out: - if (vma_list) - free_page((unsigned long) vma_list); free_page((unsigned long) page_list); umem_kfree: if (ret) { @@ -244,82 +277,26 @@ umem_kfree: } EXPORT_SYMBOL(ib_umem_get); -static void __ib_umem_release_tail(struct ib_umem *umem) -{ - mmdrop(umem->owning_mm); - if (umem->is_odp) - kfree(to_ib_umem_odp(umem)); - else - kfree(umem); -} - -static void ib_umem_release_defer(struct work_struct *work) -{ - struct ib_umem *umem = container_of(work, struct ib_umem, work); - - down_write(&umem->owning_mm->mmap_sem); - umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); - up_write(&umem->owning_mm->mmap_sem); - - __ib_umem_release_tail(umem); -} - /** * ib_umem_release - release memory pinned with ib_umem_get * @umem: umem struct to release */ void ib_umem_release(struct ib_umem *umem) { - struct ib_ucontext *context = umem->context; - - if (umem->is_odp) { - ib_umem_odp_release(to_ib_umem_odp(umem)); - __ib_umem_release_tail(umem); + if (!umem) return; - } - - __ib_umem_release(umem->context->device, umem, 1); - - /* - * We may be called with the mm's mmap_sem already held. This - * can happen when a userspace munmap() is the call that drops - * the last reference to our file and calls our release - * method. If there are memory regions to destroy, we'll end - * up here and not be able to take the mmap_sem. In that case - * we defer the vm_locked accounting a workqueue. 
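For context, a consumer of the resulting mapping typically just walks the DMA-mapped append table; the sketch below does that with the same for_each_sgtable_dma_sg() iterator used by ib_umem_find_best_pgsz() above. foo_program_translation() and foo_write_pte() are hypothetical driver-side names.

#include <linux/scatterlist.h>
#include <rdma/ib_umem.h>

/* foo_write_pte() is a stand-in for a driver's translation-table writer. */
static void foo_program_translation(struct ib_umem *umem,
				    void (*foo_write_pte)(dma_addr_t addr,
							  unsigned int len))
{
	struct scatterlist *sg;
	unsigned int i;

	/* Walk the DMA-mapped append table, one mapped segment at a time. */
	for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i)
		foo_write_pte(sg_dma_address(sg), sg_dma_len(sg));
}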
- */ - if (context->closing) { - if (!down_write_trylock(&umem->owning_mm->mmap_sem)) { - INIT_WORK(&umem->work, ib_umem_release_defer); - queue_work(ib_wq, &umem->work); - return; - } - } else { - down_write(&umem->owning_mm->mmap_sem); - } - umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); - up_write(&umem->owning_mm->mmap_sem); - - __ib_umem_release_tail(umem); -} -EXPORT_SYMBOL(ib_umem_release); - -int ib_umem_page_count(struct ib_umem *umem) -{ - int i; - int n; - struct scatterlist *sg; - + if (umem->is_dmabuf) + return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem)); if (umem->is_odp) - return ib_umem_num_pages(umem); + return ib_umem_odp_release(to_ib_umem_odp(umem)); - n = 0; - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) - n += sg_dma_len(sg) >> umem->page_shift; + __ib_umem_release(umem->ibdev, umem, 1); - return n; + atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); + mmdrop(umem->owning_mm); + kfree(umem); } -EXPORT_SYMBOL(ib_umem_page_count); +EXPORT_SYMBOL(ib_umem_release); /* * Copy from the given ib_umem's pages to the given buffer. @@ -338,12 +315,13 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, int ret; if (offset > umem->length || length > umem->length - offset) { - pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n", - offset, umem->length, end); + pr_err("%s not in range. offset: %zd umem length: %zd end: %zd\n", + __func__, offset, umem->length, end); return -EINVAL; } - ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length, + ret = sg_pcopy_to_buffer(umem->sgt_append.sgt.sgl, + umem->sgt_append.sgt.orig_nents, dst, length, offset + ib_umem_offset(umem)); if (ret < 0) diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c new file mode 100644 index 000000000000..0ec2e4120cc9 --- /dev/null +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -0,0 +1,275 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright (c) 2020 Intel Corporation. All rights reserved. 
+ */ + +#include <linux/dma-buf.h> +#include <linux/dma-resv.h> +#include <linux/dma-mapping.h> +#include <linux/module.h> + +#include "uverbs.h" + +MODULE_IMPORT_NS("DMA_BUF"); + +int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) +{ + struct sg_table *sgt; + struct scatterlist *sg; + unsigned long start, end, cur = 0; + unsigned int nmap = 0; + long ret; + int i; + + dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); + + if (umem_dmabuf->revoked) + return -EINVAL; + + if (umem_dmabuf->sgt) + goto wait_fence; + + sgt = dma_buf_map_attachment(umem_dmabuf->attach, + DMA_BIDIRECTIONAL); + if (IS_ERR(sgt)) + return PTR_ERR(sgt); + + /* modify the sg list in-place to match umem address and length */ + + start = ALIGN_DOWN(umem_dmabuf->umem.address, PAGE_SIZE); + end = ALIGN(umem_dmabuf->umem.address + umem_dmabuf->umem.length, + PAGE_SIZE); + for_each_sgtable_dma_sg(sgt, sg, i) { + if (start < cur + sg_dma_len(sg) && cur < end) + nmap++; + if (cur <= start && start < cur + sg_dma_len(sg)) { + unsigned long offset = start - cur; + + umem_dmabuf->first_sg = sg; + umem_dmabuf->first_sg_offset = offset; + sg_dma_address(sg) += offset; + sg_dma_len(sg) -= offset; + cur += offset; + } + if (cur < end && end <= cur + sg_dma_len(sg)) { + unsigned long trim = cur + sg_dma_len(sg) - end; + + umem_dmabuf->last_sg = sg; + umem_dmabuf->last_sg_trim = trim; + sg_dma_len(sg) -= trim; + break; + } + cur += sg_dma_len(sg); + } + + umem_dmabuf->umem.sgt_append.sgt.sgl = umem_dmabuf->first_sg; + umem_dmabuf->umem.sgt_append.sgt.nents = nmap; + umem_dmabuf->sgt = sgt; + +wait_fence: + /* + * Although the sg list is valid now, the content of the pages + * may be not up-to-date. Wait for the exporter to finish + * the migration. + */ + ret = dma_resv_wait_timeout(umem_dmabuf->attach->dmabuf->resv, + DMA_RESV_USAGE_KERNEL, + false, MAX_SCHEDULE_TIMEOUT); + if (ret < 0) + return ret; + if (ret == 0) + return -ETIMEDOUT; + return 0; +} +EXPORT_SYMBOL(ib_umem_dmabuf_map_pages); + +void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) +{ + dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); + + if (!umem_dmabuf->sgt) + return; + + /* retore the original sg list */ + if (umem_dmabuf->first_sg) { + sg_dma_address(umem_dmabuf->first_sg) -= + umem_dmabuf->first_sg_offset; + sg_dma_len(umem_dmabuf->first_sg) += + umem_dmabuf->first_sg_offset; + umem_dmabuf->first_sg = NULL; + umem_dmabuf->first_sg_offset = 0; + } + if (umem_dmabuf->last_sg) { + sg_dma_len(umem_dmabuf->last_sg) += + umem_dmabuf->last_sg_trim; + umem_dmabuf->last_sg = NULL; + umem_dmabuf->last_sg_trim = 0; + } + + dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt, + DMA_BIDIRECTIONAL); + + umem_dmabuf->sgt = NULL; +} +EXPORT_SYMBOL(ib_umem_dmabuf_unmap_pages); + +static struct ib_umem_dmabuf * +ib_umem_dmabuf_get_with_dma_device(struct ib_device *device, + struct device *dma_device, + unsigned long offset, size_t size, + int fd, int access, + const struct dma_buf_attach_ops *ops) +{ + struct dma_buf *dmabuf; + struct ib_umem_dmabuf *umem_dmabuf; + struct ib_umem *umem; + unsigned long end; + struct ib_umem_dmabuf *ret = ERR_PTR(-EINVAL); + + if (check_add_overflow(offset, (unsigned long)size, &end)) + return ret; + + if (unlikely(!ops || !ops->move_notify)) + return ret; + + dmabuf = dma_buf_get(fd); + if (IS_ERR(dmabuf)) + return ERR_CAST(dmabuf); + + if (dmabuf->size < end) + goto out_release_dmabuf; + + umem_dmabuf = kzalloc(sizeof(*umem_dmabuf), GFP_KERNEL); + if (!umem_dmabuf) { + ret = ERR_PTR(-ENOMEM); 
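To see the intended calling convention end to end, here is a hedged sketch of a revocable (non-pinned) dma-buf import: the importer supplies a move_notify callback and performs the initial mapping under the reservation lock. The foo_* hooks are placeholders for device-specific MR programming and teardown, not part of this patch.

#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <linux/err.h>
#include <rdma/ib_umem.h>

/* Hypothetical driver hooks for tearing down / programming the HW MR. */
void foo_invalidate_mr(struct ib_umem_dmabuf *umem_dmabuf);
void foo_map_mr(struct ib_umem_dmabuf *umem_dmabuf);

static void foo_move_notify(struct dma_buf_attachment *attach)
{
	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;

	/* The exporter is about to move the buffer: quiesce HW, then unmap. */
	foo_invalidate_mr(umem_dmabuf);
	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
}

static const struct dma_buf_attach_ops foo_attach_ops = {
	.allow_peer2peer = true,
	.move_notify = foo_move_notify,
};

static int foo_reg_dmabuf_mr(struct ib_device *ibdev, unsigned long offset,
			     size_t length, int fd, int access)
{
	struct ib_umem_dmabuf *umem_dmabuf;
	int ret;

	umem_dmabuf = ib_umem_dmabuf_get(ibdev, offset, length, fd, access,
					 &foo_attach_ops);
	if (IS_ERR(umem_dmabuf))
		return PTR_ERR(umem_dmabuf);

	/* Mapping must be done under the dma-buf reservation lock. */
	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
	ret = ib_umem_dmabuf_map_pages(umem_dmabuf);
	if (!ret)
		foo_map_mr(umem_dmabuf);
	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);

	if (ret)
		ib_umem_release(&umem_dmabuf->umem);
	return ret;
}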
+ goto out_release_dmabuf; + } + + umem = &umem_dmabuf->umem; + umem->ibdev = device; + umem->length = size; + umem->address = offset; + umem->writable = ib_access_writable(access); + umem->is_dmabuf = 1; + + if (!ib_umem_num_pages(umem)) + goto out_free_umem; + + umem_dmabuf->attach = dma_buf_dynamic_attach( + dmabuf, + dma_device, + ops, + umem_dmabuf); + if (IS_ERR(umem_dmabuf->attach)) { + ret = ERR_CAST(umem_dmabuf->attach); + goto out_free_umem; + } + return umem_dmabuf; + +out_free_umem: + kfree(umem_dmabuf); + +out_release_dmabuf: + dma_buf_put(dmabuf); + return ret; +} + +struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, + unsigned long offset, size_t size, + int fd, int access, + const struct dma_buf_attach_ops *ops) +{ + return ib_umem_dmabuf_get_with_dma_device(device, device->dma_device, + offset, size, fd, access, ops); +} +EXPORT_SYMBOL(ib_umem_dmabuf_get); + +static void +ib_umem_dmabuf_unsupported_move_notify(struct dma_buf_attachment *attach) +{ + struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; + + ibdev_warn_ratelimited(umem_dmabuf->umem.ibdev, + "Invalidate callback should not be called when memory is pinned\n"); +} + +static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = { + .allow_peer2peer = true, + .move_notify = ib_umem_dmabuf_unsupported_move_notify, +}; + +struct ib_umem_dmabuf * +ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, + struct device *dma_device, + unsigned long offset, size_t size, + int fd, int access) +{ + struct ib_umem_dmabuf *umem_dmabuf; + int err; + + umem_dmabuf = ib_umem_dmabuf_get_with_dma_device(device, dma_device, offset, + size, fd, access, + &ib_umem_dmabuf_attach_pinned_ops); + if (IS_ERR(umem_dmabuf)) + return umem_dmabuf; + + dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL); + err = dma_buf_pin(umem_dmabuf->attach); + if (err) + goto err_release; + umem_dmabuf->pinned = 1; + + err = ib_umem_dmabuf_map_pages(umem_dmabuf); + if (err) + goto err_unpin; + dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); + + return umem_dmabuf; + +err_unpin: + dma_buf_unpin(umem_dmabuf->attach); +err_release: + dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); + ib_umem_release(&umem_dmabuf->umem); + return ERR_PTR(err); +} +EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned_with_dma_device); + +struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, + unsigned long offset, + size_t size, int fd, + int access) +{ + return ib_umem_dmabuf_get_pinned_with_dma_device(device, device->dma_device, + offset, size, fd, access); +} +EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned); + +void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf) +{ + struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf; + + dma_resv_lock(dmabuf->resv, NULL); + if (umem_dmabuf->revoked) + goto end; + ib_umem_dmabuf_unmap_pages(umem_dmabuf); + if (umem_dmabuf->pinned) { + dma_buf_unpin(umem_dmabuf->attach); + umem_dmabuf->pinned = 0; + } + umem_dmabuf->revoked = 1; +end: + dma_resv_unlock(dmabuf->resv); +} +EXPORT_SYMBOL(ib_umem_dmabuf_revoke); + +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) +{ + struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf; + + ib_umem_dmabuf_revoke(umem_dmabuf); + + dma_buf_detach(dmabuf, umem_dmabuf->attach); + dma_buf_put(dmabuf); + kfree(umem_dmabuf); +} diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index a4ec43093cb3..572a91a62a7b 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ 
-39,421 +39,241 @@ #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/hugetlb.h> -#include <linux/interval_tree_generic.h> +#include <linux/interval_tree.h> +#include <linux/hmm.h> +#include <linux/hmm-dma.h> +#include <linux/pagemap.h> -#include <rdma/ib_verbs.h> -#include <rdma/ib_umem.h> #include <rdma/ib_umem_odp.h> -/* - * The ib_umem list keeps track of memory regions for which the HW - * device request to receive notification when the related memory - * mapping is changed. - * - * ib_umem_lock protects the list. - */ - -static u64 node_start(struct umem_odp_node *n) -{ - struct ib_umem_odp *umem_odp = - container_of(n, struct ib_umem_odp, interval_tree); - - return ib_umem_start(&umem_odp->umem); -} +#include "uverbs.h" -/* Note that the representation of the intervals in the interval tree - * considers the ending point as contained in the interval, while the - * function ib_umem_end returns the first address which is not contained - * in the umem. - */ -static u64 node_last(struct umem_odp_node *n) +static void ib_init_umem_implicit_odp(struct ib_umem_odp *umem_odp) { - struct ib_umem_odp *umem_odp = - container_of(n, struct ib_umem_odp, interval_tree); - - return ib_umem_end(&umem_odp->umem) - 1; -} - -INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, - node_start, node_last, static, rbt_ib_umem) - -static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) -{ - mutex_lock(&umem_odp->umem_mutex); - if (umem_odp->notifiers_count++ == 0) - /* - * Initialize the completion object for waiting on - * notifiers. Since notifier_count is zero, no one should be - * waiting right now. - */ - reinit_completion(&umem_odp->notifier_completion); - mutex_unlock(&umem_odp->umem_mutex); + umem_odp->is_implicit_odp = 1; + umem_odp->umem.is_odp = 1; + mutex_init(&umem_odp->umem_mutex); } -static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) +static int ib_init_umem_odp(struct ib_umem_odp *umem_odp, + const struct mmu_interval_notifier_ops *ops) { - mutex_lock(&umem_odp->umem_mutex); - /* - * This sequence increase will notify the QP page fault that the page - * that is going to be mapped in the spte could have been freed. - */ - ++umem_odp->notifiers_seq; - if (--umem_odp->notifiers_count == 0) - complete_all(&umem_odp->notifier_completion); - mutex_unlock(&umem_odp->umem_mutex); -} + struct ib_device *dev = umem_odp->umem.ibdev; + size_t page_size = 1UL << umem_odp->page_shift; + struct hmm_dma_map *map; + unsigned long start; + unsigned long end; + size_t nr_entries; + int ret = 0; -static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, - u64 start, u64 end, void *cookie) -{ - struct ib_umem *umem = &umem_odp->umem; + umem_odp->umem.is_odp = 1; + mutex_init(&umem_odp->umem_mutex); + start = ALIGN_DOWN(umem_odp->umem.address, page_size); + if (check_add_overflow(umem_odp->umem.address, + (unsigned long)umem_odp->umem.length, &end)) + return -EOVERFLOW; + end = ALIGN(end, page_size); + if (unlikely(end < page_size)) + return -EOVERFLOW; /* - * Increase the number of notifiers running, to - * prevent any further fault handling on this MR. + * The mmu notifier can be called within reclaim contexts and takes the + * umem_mutex. This is rare to trigger in testing, teach lockdep about + * it. */ - ib_umem_notifier_start_account(umem_odp); - umem_odp->dying = 1; - /* Make sure that the fact the umem is dying is out before we release - * all pending page faults. 
*/ - smp_wmb(); - complete_all(&umem_odp->notifier_completion); - umem->context->invalidate_range(umem_odp, ib_umem_start(umem), - ib_umem_end(umem)); - return 0; -} - -static void ib_umem_notifier_release(struct mmu_notifier *mn, - struct mm_struct *mm) -{ - struct ib_ucontext_per_mm *per_mm = - container_of(mn, struct ib_ucontext_per_mm, mn); - - down_read(&per_mm->umem_rwsem); - if (per_mm->active) - rbt_ib_umem_for_each_in_range( - &per_mm->umem_tree, 0, ULLONG_MAX, - ib_umem_notifier_release_trampoline, true, NULL); - up_read(&per_mm->umem_rwsem); -} - -static int invalidate_range_start_trampoline(struct ib_umem_odp *item, - u64 start, u64 end, void *cookie) -{ - ib_umem_notifier_start_account(item); - item->umem.context->invalidate_range(item, start, end); - return 0; -} - -static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, - const struct mmu_notifier_range *range) -{ - struct ib_ucontext_per_mm *per_mm = - container_of(mn, struct ib_ucontext_per_mm, mn); - - if (range->blockable) - down_read(&per_mm->umem_rwsem); - else if (!down_read_trylock(&per_mm->umem_rwsem)) - return -EAGAIN; - - if (!per_mm->active) { - up_read(&per_mm->umem_rwsem); - /* - * At this point active is permanently set and visible to this - * CPU without a lock, that fact is relied on to skip the unlock - * in range_end. - */ - return 0; + if (IS_ENABLED(CONFIG_LOCKDEP)) { + fs_reclaim_acquire(GFP_KERNEL); + mutex_lock(&umem_odp->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); + fs_reclaim_release(GFP_KERNEL); } - return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, - range->end, - invalidate_range_start_trampoline, - range->blockable, NULL); -} - -static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, - u64 end, void *cookie) -{ - ib_umem_notifier_end_account(item); - return 0; -} - -static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, - const struct mmu_notifier_range *range) -{ - struct ib_ucontext_per_mm *per_mm = - container_of(mn, struct ib_ucontext_per_mm, mn); - - if (unlikely(!per_mm->active)) - return; + nr_entries = (end - start) >> PAGE_SHIFT; + if (!(nr_entries * PAGE_SIZE / page_size)) + return -EINVAL; - rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, - range->end, - invalidate_range_end_trampoline, true, NULL); - up_read(&per_mm->umem_rwsem); -} + map = &umem_odp->map; + if (ib_uses_virt_dma(dev)) { + map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list), + GFP_KERNEL | __GFP_NOWARN); + if (!map->pfn_list) + ret = -ENOMEM; + } else + ret = hmm_dma_map_alloc(dev->dma_device, map, + (end - start) >> PAGE_SHIFT, + 1 << umem_odp->page_shift); + if (ret) + return ret; + + ret = mmu_interval_notifier_insert(&umem_odp->notifier, + umem_odp->umem.owning_mm, start, + end - start, ops); + if (ret) + goto out_free_map; -static const struct mmu_notifier_ops ib_umem_notifiers = { - .release = ib_umem_notifier_release, - .invalidate_range_start = ib_umem_notifier_invalidate_range_start, - .invalidate_range_end = ib_umem_notifier_invalidate_range_end, -}; + return 0; -static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) -{ - struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; - struct ib_umem *umem = &umem_odp->umem; - - down_write(&per_mm->umem_rwsem); - if (likely(ib_umem_start(umem) != ib_umem_end(umem))) - rbt_ib_umem_insert(&umem_odp->interval_tree, - &per_mm->umem_tree); - up_write(&per_mm->umem_rwsem); +out_free_map: + if (ib_uses_virt_dma(dev)) + kvfree(map->pfn_list); + else + 
hmm_dma_map_free(dev->dma_device, map); + return ret; } -static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) +/** + * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem + * + * Implicit ODP umems do not have a VA range and do not have any page lists. + * They exist only to hold the per_mm reference to help the driver create + * children umems. + * + * @device: IB device to create UMEM + * @access: ib_reg_mr access flags + */ +struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device, + int access) { - struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; - struct ib_umem *umem = &umem_odp->umem; - - down_write(&per_mm->umem_rwsem); - if (likely(ib_umem_start(umem) != ib_umem_end(umem))) - rbt_ib_umem_remove(&umem_odp->interval_tree, - &per_mm->umem_tree); - complete_all(&umem_odp->notifier_completion); - - up_write(&per_mm->umem_rwsem); -} + struct ib_umem *umem; + struct ib_umem_odp *umem_odp; -static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, - struct mm_struct *mm) -{ - struct ib_ucontext_per_mm *per_mm; - int ret; + if (access & IB_ACCESS_HUGETLB) + return ERR_PTR(-EINVAL); - per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL); - if (!per_mm) + umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL); + if (!umem_odp) return ERR_PTR(-ENOMEM); - - per_mm->context = ctx; - per_mm->mm = mm; - per_mm->umem_tree = RB_ROOT_CACHED; - init_rwsem(&per_mm->umem_rwsem); - per_mm->active = ctx->invalidate_range; - - rcu_read_lock(); - per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); - rcu_read_unlock(); - - WARN_ON(mm != current->mm); - - per_mm->mn.ops = &ib_umem_notifiers; - ret = mmu_notifier_register(&per_mm->mn, per_mm->mm); - if (ret) { - dev_err(&ctx->device->dev, - "Failed to register mmu_notifier %d\n", ret); - goto out_pid; - } - - list_add(&per_mm->ucontext_list, &ctx->per_mm_list); - return per_mm; - -out_pid: - put_pid(per_mm->tgid); - kfree(per_mm); - return ERR_PTR(ret); -} - -static int get_per_mm(struct ib_umem_odp *umem_odp) -{ - struct ib_ucontext *ctx = umem_odp->umem.context; - struct ib_ucontext_per_mm *per_mm; - - /* - * Generally speaking we expect only one or two per_mm in this list, - * so no reason to optimize this search today. - */ - mutex_lock(&ctx->per_mm_list_lock); - list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) { - if (per_mm->mm == umem_odp->umem.owning_mm) - goto found; - } - - per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm); - if (IS_ERR(per_mm)) { - mutex_unlock(&ctx->per_mm_list_lock); - return PTR_ERR(per_mm); - } - -found: - umem_odp->per_mm = per_mm; - per_mm->odp_mrs_count++; - mutex_unlock(&ctx->per_mm_list_lock); - - return 0; -} - -static void free_per_mm(struct rcu_head *rcu) -{ - kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu)); + umem = &umem_odp->umem; + umem->ibdev = device; + umem->writable = ib_access_writable(access); + umem->owning_mm = current->mm; + umem_odp->page_shift = PAGE_SHIFT; + + umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); + ib_init_umem_implicit_odp(umem_odp); + return umem_odp; } +EXPORT_SYMBOL(ib_umem_odp_alloc_implicit); -void put_per_mm(struct ib_umem_odp *umem_odp) +/** + * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit + * parent ODP umem + * + * @root: The parent umem enclosing the child. 
This must be allocated using + * ib_alloc_implicit_odp_umem() + * @addr: The starting userspace VA + * @size: The length of the userspace VA + * @ops: MMU interval ops, currently only @invalidate + */ +struct ib_umem_odp * +ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr, + size_t size, + const struct mmu_interval_notifier_ops *ops) { - struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; - struct ib_ucontext *ctx = umem_odp->umem.context; - bool need_free; - - mutex_lock(&ctx->per_mm_list_lock); - umem_odp->per_mm = NULL; - per_mm->odp_mrs_count--; - need_free = per_mm->odp_mrs_count == 0; - if (need_free) - list_del(&per_mm->ucontext_list); - mutex_unlock(&ctx->per_mm_list_lock); - - if (!need_free) - return; - /* - * NOTE! mmu_notifier_unregister() can happen between a start/end - * callback, resulting in an start/end, and thus an unbalanced - * lock. This doesn't really matter to us since we are about to kfree - * the memory that holds the lock, however LOCKDEP doesn't like this. + * Caller must ensure that root cannot be freed during the call to + * ib_alloc_odp_umem. */ - down_write(&per_mm->umem_rwsem); - per_mm->active = false; - up_write(&per_mm->umem_rwsem); - - WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root)); - mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm); - put_pid(per_mm->tgid); - mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm); -} - -struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, - unsigned long addr, size_t size) -{ - struct ib_ucontext *ctx = per_mm->context; struct ib_umem_odp *odp_data; struct ib_umem *umem; - int pages = size >> PAGE_SHIFT; int ret; + if (WARN_ON(!root->is_implicit_odp)) + return ERR_PTR(-EINVAL); + odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); if (!odp_data) return ERR_PTR(-ENOMEM); umem = &odp_data->umem; - umem->context = ctx; + umem->ibdev = root->umem.ibdev; umem->length = size; umem->address = addr; - umem->page_shift = PAGE_SHIFT; - umem->writable = 1; - umem->is_odp = 1; - odp_data->per_mm = per_mm; - - mutex_init(&odp_data->umem_mutex); - init_completion(&odp_data->notifier_completion); - - odp_data->page_list = - vzalloc(array_size(pages, sizeof(*odp_data->page_list))); - if (!odp_data->page_list) { - ret = -ENOMEM; - goto out_odp_data; - } - - odp_data->dma_list = - vzalloc(array_size(pages, sizeof(*odp_data->dma_list))); - if (!odp_data->dma_list) { - ret = -ENOMEM; - goto out_page_list; - } + umem->writable = root->umem.writable; + umem->owning_mm = root->umem.owning_mm; + odp_data->page_shift = PAGE_SHIFT; + odp_data->notifier.ops = ops; /* - * Caller must ensure that the umem_odp that the per_mm came from - * cannot be freed during the call to ib_alloc_odp_umem. + * A mmget must be held when registering a notifier, the owming_mm only + * has a mm_grab at this point. 
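A rough sketch of how an implicit-ODP driver path might use the two constructors above; the foo_* helpers and the 2M child granularity are assumptions, not something this patch prescribes.

#include <linux/kernel.h>
#include <linux/mmu_notifier.h>
#include <linux/sizes.h>
#include <rdma/ib_umem_odp.h>

static struct ib_umem_odp *
foo_create_implicit_parent(struct ib_device *ibdev, int access)
{
	/* Parent has no VA range; it only anchors the mm/tgid for children. */
	return ib_umem_odp_alloc_implicit(ibdev, access);
}

static struct ib_umem_odp *
foo_fault_child(struct ib_umem_odp *parent, unsigned long fault_addr,
		const struct mmu_interval_notifier_ops *ops)
{
	/* Carve a fixed-size child around the faulting VA (2M is arbitrary). */
	unsigned long start = ALIGN_DOWN(fault_addr, SZ_2M);

	return ib_umem_odp_alloc_child(parent, start, SZ_2M, ops);
}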
*/ - mutex_lock(&ctx->per_mm_list_lock); - per_mm->odp_mrs_count++; - mutex_unlock(&ctx->per_mm_list_lock); - add_umem_to_per_mm(odp_data); + if (!mmget_not_zero(umem->owning_mm)) { + ret = -EFAULT; + goto out_free; + } + odp_data->tgid = get_pid(root->tgid); + ret = ib_init_umem_odp(odp_data, ops); + if (ret) + goto out_tgid; + mmput(umem->owning_mm); return odp_data; -out_page_list: - vfree(odp_data->page_list); -out_odp_data: +out_tgid: + put_pid(odp_data->tgid); + mmput(umem->owning_mm); +out_free: kfree(odp_data); return ERR_PTR(ret); } -EXPORT_SYMBOL(ib_alloc_odp_umem); +EXPORT_SYMBOL(ib_umem_odp_alloc_child); -int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) +/** + * ib_umem_odp_get - Create a umem_odp for a userspace va + * + * @device: IB device struct to get UMEM + * @addr: userspace virtual address to start at + * @size: length of region to pin + * @access: IB_ACCESS_xxx flags for memory being pinned + * @ops: MMU interval ops, currently only @invalidate + * + * The driver should use when the access flags indicate ODP memory. It avoids + * pinning, instead, stores the mm for future page fault handling in + * conjunction with MMU notifiers. + */ +struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device, + unsigned long addr, size_t size, int access, + const struct mmu_interval_notifier_ops *ops) { - struct ib_umem *umem = &umem_odp->umem; - /* - * NOTE: This must called in a process context where umem->owning_mm - * == current->mm - */ - struct mm_struct *mm = umem->owning_mm; - int ret_val; - - if (access & IB_ACCESS_HUGETLB) { - struct vm_area_struct *vma; - struct hstate *h; - - down_read(&mm->mmap_sem); - vma = find_vma(mm, ib_umem_start(umem)); - if (!vma || !is_vm_hugetlb_page(vma)) { - up_read(&mm->mmap_sem); - return -EINVAL; - } - h = hstate_vma(vma); - umem->page_shift = huge_page_shift(h); - up_read(&mm->mmap_sem); - umem->hugetlb = 1; - } else { - umem->hugetlb = 0; - } - - mutex_init(&umem_odp->umem_mutex); - - init_completion(&umem_odp->notifier_completion); - - if (ib_umem_num_pages(umem)) { - umem_odp->page_list = - vzalloc(array_size(sizeof(*umem_odp->page_list), - ib_umem_num_pages(umem))); - if (!umem_odp->page_list) - return -ENOMEM; - - umem_odp->dma_list = - vzalloc(array_size(sizeof(*umem_odp->dma_list), - ib_umem_num_pages(umem))); - if (!umem_odp->dma_list) { - ret_val = -ENOMEM; - goto out_page_list; - } - } + struct ib_umem_odp *umem_odp; + int ret; - ret_val = get_per_mm(umem_odp); - if (ret_val) - goto out_dma_list; - add_umem_to_per_mm(umem_odp); + if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND))) + return ERR_PTR(-EINVAL); - return 0; + umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); + if (!umem_odp) + return ERR_PTR(-ENOMEM); -out_dma_list: - vfree(umem_odp->dma_list); -out_page_list: - vfree(umem_odp->page_list); - return ret_val; + umem_odp->umem.ibdev = device; + umem_odp->umem.length = size; + umem_odp->umem.address = addr; + umem_odp->umem.writable = ib_access_writable(access); + umem_odp->umem.owning_mm = current->mm; + umem_odp->notifier.ops = ops; + + umem_odp->page_shift = PAGE_SHIFT; +#ifdef CONFIG_HUGETLB_PAGE + if (access & IB_ACCESS_HUGETLB) + umem_odp->page_shift = HPAGE_SHIFT; +#endif + + umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); + ret = ib_init_umem_odp(umem_odp, ops); + if (ret) + goto err_put_pid; + return umem_odp; + +err_put_pid: + put_pid(umem_odp->tgid); + kfree(umem_odp); + return ERR_PTR(ret); } +EXPORT_SYMBOL(ib_umem_odp_get); -void ib_umem_odp_release(struct 
ib_umem_odp *umem_odp) +static void ib_umem_odp_free(struct ib_umem_odp *umem_odp) { - struct ib_umem *umem = &umem_odp->umem; + struct ib_device *dev = umem_odp->umem.ibdev; /* * Ensure that no more pages are mapped in the umem. @@ -461,111 +281,36 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) * It is the driver's responsibility to ensure, before calling us, * that the hardware will not attempt to access the MR any more. */ - ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), - ib_umem_end(umem)); - - remove_umem_from_per_mm(umem_odp); - put_per_mm(umem_odp); - vfree(umem_odp->dma_list); - vfree(umem_odp->page_list); + mutex_lock(&umem_odp->umem_mutex); + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); + mutex_unlock(&umem_odp->umem_mutex); + mmu_interval_notifier_remove(&umem_odp->notifier); + if (ib_uses_virt_dma(dev)) + kvfree(umem_odp->map.pfn_list); + else + hmm_dma_map_free(dev->dma_device, &umem_odp->map); } -/* - * Map for DMA and insert a single page into the on-demand paging page tables. - * - * @umem: the umem to insert the page to. - * @page_index: index in the umem to add the page to. - * @page: the page struct to map and add. - * @access_mask: access permissions needed for this page. - * @current_seq: sequence number for synchronization with invalidations. - * the sequence number is taken from - * umem_odp->notifiers_seq. - * - * The function returns -EFAULT if the DMA mapping operation fails. It returns - * -EAGAIN if a concurrent invalidation prevents us from updating the page. - * - * The page is released via put_page even if the operation failed. For - * on-demand pinning, the page is released whenever it isn't stored in the - * umem. - */ -static int ib_umem_odp_map_dma_single_page( - struct ib_umem_odp *umem_odp, - int page_index, - struct page *page, - u64 access_mask, - unsigned long current_seq) +void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { - struct ib_umem *umem = &umem_odp->umem; - struct ib_device *dev = umem->context->device; - dma_addr_t dma_addr; - int stored_page = 0; - int remove_existing_mapping = 0; - int ret = 0; - - /* - * Note: we avoid writing if seq is different from the initial seq, to - * handle case of a racing notifier. This check also allows us to bail - * early if we have a notifier running in parallel with us. - */ - if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) { - ret = -EAGAIN; - goto out; - } - if (!(umem_odp->dma_list[page_index])) { - dma_addr = ib_dma_map_page(dev, - page, - 0, BIT(umem->page_shift), - DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error(dev, dma_addr)) { - ret = -EFAULT; - goto out; - } - umem_odp->dma_list[page_index] = dma_addr | access_mask; - umem_odp->page_list[page_index] = page; - umem->npages++; - stored_page = 1; - } else if (umem_odp->page_list[page_index] == page) { - umem_odp->dma_list[page_index] |= access_mask; - } else { - pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", - umem_odp->page_list[page_index], page); - /* Better remove the mapping now, to prevent any further - * damage. 
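For orientation, the sketch below shows where the mmu_interval_notifier_ops argument typically comes from: an invalidate callback that bumps the collision sequence and unmaps under umem_mutex (using ib_umem_odp_unmap_dma_pages(), shown further below), modeled loosely on existing ODP drivers. foo_invalidate_hw() is hypothetical.

#include <linux/kernel.h>
#include <linux/mmu_notifier.h>
#include <rdma/ib_umem_odp.h>
#include <rdma/ib_verbs.h>

/* Hypothetical hook that quiesces the device for the invalidated range. */
void foo_invalidate_hw(struct ib_umem_odp *umem_odp, u64 start, u64 end);

static bool foo_odp_invalidate(struct mmu_interval_notifier *mni,
			       const struct mmu_notifier_range *range,
			       unsigned long cur_seq)
{
	struct ib_umem_odp *umem_odp =
		container_of(mni, struct ib_umem_odp, notifier);
	u64 start, end;

	if (!mmu_notifier_range_blockable(range))
		return false;

	mutex_lock(&umem_odp->umem_mutex);
	mmu_interval_set_seq(mni, cur_seq);

	start = max_t(u64, ib_umem_start(umem_odp), range->start);
	end = min_t(u64, ib_umem_end(umem_odp), range->end);

	foo_invalidate_hw(umem_odp, start, end);
	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
	mutex_unlock(&umem_odp->umem_mutex);
	return true;
}

static const struct mmu_interval_notifier_ops foo_odp_ops = {
	.invalidate = foo_odp_invalidate,
};

/* Registration: the ops pointer ends up in umem_odp->notifier. */
static struct ib_umem_odp *foo_reg_odp_mr(struct ib_device *ibdev, u64 start,
					  u64 length, int access)
{
	return ib_umem_odp_get(ibdev, start, length,
			       access | IB_ACCESS_ON_DEMAND, &foo_odp_ops);
}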
*/ - remove_existing_mapping = 1; - } - -out: - /* On Demand Paging - avoid pinning the page */ - if (umem->context->invalidate_range || !stored_page) - put_page(page); - - if (remove_existing_mapping && umem->context->invalidate_range) { - ib_umem_notifier_start_account(umem_odp); - umem->context->invalidate_range( - umem_odp, - ib_umem_start(umem) + (page_index << umem->page_shift), - ib_umem_start(umem) + - ((page_index + 1) << umem->page_shift)); - ib_umem_notifier_end_account(umem_odp); - ret = -EAGAIN; - } + if (!umem_odp->is_implicit_odp) + ib_umem_odp_free(umem_odp); - return ret; + put_pid(umem_odp->tgid); + kfree(umem_odp); } +EXPORT_SYMBOL(ib_umem_odp_release); /** - * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR. + * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it. * - * Pins the range of pages passed in the argument, and maps them to - * DMA addresses. The DMA addresses of the mapped pages is updated in - * umem_odp->dma_list. + * Maps the range passed in the argument to DMA addresses. + * Upon success the ODP MR will be locked to let caller complete its device + * page table update. * * Returns the number of pages mapped in success, negative error code * for failure. - * An -EAGAIN error code is returned when a concurrent mmu notifier prevents - * the function from completing its task. - * An -ENOENT error code indicates that userspace process is being terminated - * and mm was already destroyed. * @umem_odp: the umem to map and pin * @user_virt: the address from which we need to map. * @bcnt: the minimal number of bytes to pin and map. The mapping might be @@ -574,227 +319,151 @@ out: * the return value. * @access_mask: bit mask of the requested access permissions for the given * range. - * @current_seq: the MMU notifiers sequance value for synchronization with - * invalidations. the sequance number is read from - * umem_odp->notifiers_seq before calling this function + * @fault: is faulting required for the given range */ -int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, - u64 bcnt, u64 access_mask, - unsigned long current_seq) +int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, + u64 bcnt, u64 access_mask, bool fault) + __acquires(&umem_odp->umem_mutex) { - struct ib_umem *umem = &umem_odp->umem; struct task_struct *owning_process = NULL; struct mm_struct *owning_mm = umem_odp->umem.owning_mm; - struct page **local_page_list = NULL; - u64 page_mask, off; - int j, k, ret = 0, start_idx, npages = 0, page_shift; - unsigned int flags = 0; - phys_addr_t p = 0; - - if (access_mask == 0) - return -EINVAL; - - if (user_virt < ib_umem_start(umem) || - user_virt + bcnt > ib_umem_end(umem)) + int pfn_index, dma_index, ret = 0, start_idx; + unsigned int page_shift, hmm_order, pfn_start_idx; + unsigned long num_pfns, current_seq; + struct hmm_range range = {}; + unsigned long timeout; + + if (user_virt < ib_umem_start(umem_odp) || + user_virt + bcnt > ib_umem_end(umem_odp)) return -EFAULT; - local_page_list = (struct page **)__get_free_page(GFP_KERNEL); - if (!local_page_list) - return -ENOMEM; - - page_shift = umem->page_shift; - page_mask = ~(BIT(page_shift) - 1); - off = user_virt & (~page_mask); - user_virt = user_virt & page_mask; - bcnt += off; /* Charge for the first page offset as well. */ + page_shift = umem_odp->page_shift; /* * owning_process is allowed to be NULL, this means somehow the mm is * existing beyond the lifetime of the originating process.. 
Presumably * mmget_not_zero will fail in this case. */ - owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID); - if (WARN_ON(!mmget_not_zero(umem_odp->umem.owning_mm))) { + owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID); + if (!owning_process || !mmget_not_zero(owning_mm)) { ret = -EINVAL; goto out_put_task; } - if (access_mask & ODP_WRITE_ALLOWED_BIT) - flags |= FOLL_WRITE; + range.notifier = &umem_odp->notifier; + range.start = ALIGN_DOWN(user_virt, 1UL << page_shift); + range.end = ALIGN(user_virt + bcnt, 1UL << page_shift); + pfn_start_idx = (range.start - ib_umem_start(umem_odp)) >> PAGE_SHIFT; + num_pfns = (range.end - range.start) >> PAGE_SHIFT; + if (fault) { + range.default_flags = HMM_PFN_REQ_FAULT; - start_idx = (user_virt - ib_umem_start(umem)) >> page_shift; - k = start_idx; + if (access_mask & HMM_PFN_WRITE) + range.default_flags |= HMM_PFN_REQ_WRITE; + } + + range.hmm_pfns = &(umem_odp->map.pfn_list[pfn_start_idx]); + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); + +retry: + current_seq = range.notifier_seq = + mmu_interval_read_begin(&umem_odp->notifier); - while (bcnt > 0) { - const size_t gup_num_pages = min_t(size_t, - (bcnt + BIT(page_shift) - 1) >> page_shift, - PAGE_SIZE / sizeof(struct page *)); + mmap_read_lock(owning_mm); + ret = hmm_range_fault(&range); + mmap_read_unlock(owning_mm); + if (unlikely(ret)) { + if (ret == -EBUSY && !time_after(jiffies, timeout)) + goto retry; + goto out_put_mm; + } + + start_idx = (range.start - ib_umem_start(umem_odp)) >> page_shift; + dma_index = start_idx; + + mutex_lock(&umem_odp->umem_mutex); + if (mmu_interval_read_retry(&umem_odp->notifier, current_seq)) { + mutex_unlock(&umem_odp->umem_mutex); + goto retry; + } + + for (pfn_index = 0; pfn_index < num_pfns; + pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) { - down_read(&owning_mm->mmap_sem); /* - * Note: this might result in redundent page getting. We can - * avoid this by checking dma_list to be 0 before calling - * get_user_pages. However, this make the code much more - * complex (and doesn't gain us much performance in most use - * cases). + * Since we asked for hmm_range_fault() to populate + * pages it shouldn't return an error entry on success. 
*/ - npages = get_user_pages_remote(owning_process, owning_mm, - user_virt, gup_num_pages, - flags, local_page_list, NULL, NULL); - up_read(&owning_mm->mmap_sem); - - if (npages < 0) { - if (npages != -EAGAIN) - pr_warn("fail to get %zu user pages with error %d\n", gup_num_pages, npages); - else - pr_debug("fail to get %zu user pages with error %d\n", gup_num_pages, npages); - break; - } + WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR); + WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)); + if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) + continue; - bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); - mutex_lock(&umem_odp->umem_mutex); - for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) { - if (user_virt & ~page_mask) { - p += PAGE_SIZE; - if (page_to_phys(local_page_list[j]) != p) { - ret = -EFAULT; - break; - } - put_page(local_page_list[j]); - continue; - } - - ret = ib_umem_odp_map_dma_single_page( - umem_odp, k, local_page_list[j], - access_mask, current_seq); - if (ret < 0) { - if (ret != -EAGAIN) - pr_warn("ib_umem_odp_map_dma_single_page failed with error %d\n", ret); - else - pr_debug("ib_umem_odp_map_dma_single_page failed with error %d\n", ret); - break; - } - - p = page_to_phys(local_page_list[j]); - k++; - } - mutex_unlock(&umem_odp->umem_mutex); + if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED) + continue; - if (ret < 0) { - /* Release left over pages when handling errors. */ - for (++j; j < npages; ++j) - put_page(local_page_list[j]); + hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]); + /* If a hugepage was detected and ODP wasn't set for, the umem + * page_shift will be used, the opposite case is an error. + */ + if (hmm_order + PAGE_SHIFT < page_shift) { + ret = -EINVAL; + ibdev_dbg(umem_odp->umem.ibdev, + "%s: un-expected hmm_order %u, page_shift %u\n", + __func__, hmm_order, page_shift); break; } } + /* upon success lock should stay on hold for the callee */ + if (!ret) + ret = dma_index - start_idx; + else + mutex_unlock(&umem_odp->umem_mutex); - if (ret >= 0) { - if (npages < 0 && k == start_idx) - ret = npages; - else - ret = k - start_idx; - } - - mmput(owning_mm); +out_put_mm: + mmput_async(owning_mm); out_put_task: if (owning_process) put_task_struct(owning_process); - free_page((unsigned long)local_page_list); return ret; } -EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); +EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock); void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { - struct ib_umem *umem = &umem_odp->umem; - int idx; + struct ib_device *dev = umem_odp->umem.ibdev; u64 addr; - struct ib_device *dev = umem->context->device; - - virt = max_t(u64, virt, ib_umem_start(umem)); - bound = min_t(u64, bound, ib_umem_end(umem)); - /* Note that during the run of this function, the - * notifiers_count of the MR is > 0, preventing any racing - * faults from completion. We might be racing with other - * invalidations, so we must make sure we free each page only - * once. 
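Putting ib_umem_odp_map_dma_and_lock() in context, a driver page-fault handler might look roughly like the sketch below; foo_update_device_ptes() stands in for the device-specific page-table update that must happen while umem_mutex is still held.

#include <linux/hmm.h>
#include <linux/mutex.h>
#include <rdma/ib_umem_odp.h>

/* Hypothetical hook that writes the mapped range into the device PTEs. */
int foo_update_device_ptes(struct ib_umem_odp *umem_odp, u64 va, int npages);

static int foo_odp_fault(struct ib_umem_odp *umem_odp, u64 va, u64 bytes,
			 bool write, bool fault)
{
	u64 access_mask = write ? HMM_PFN_WRITE : 0;
	int npages;
	int ret;

	npages = ib_umem_odp_map_dma_and_lock(umem_odp, va, bytes,
					      access_mask, fault);
	if (npages < 0)
		return npages;

	/* umem_mutex is held here; the pfn/dma entries are stable. */
	ret = foo_update_device_ptes(umem_odp, va, npages);
	mutex_unlock(&umem_odp->umem_mutex);
	return ret ?: npages;
}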
*/ - mutex_lock(&umem_odp->umem_mutex); - for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) { - idx = (addr - ib_umem_start(umem)) >> umem->page_shift; - if (umem_odp->page_list[idx]) { - struct page *page = umem_odp->page_list[idx]; - dma_addr_t dma = umem_odp->dma_list[idx]; - dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK; - - WARN_ON(!dma_addr); - - ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE, - DMA_BIDIRECTIONAL); - if (dma & ODP_WRITE_ALLOWED_BIT) { - struct page *head_page = compound_head(page); - /* - * set_page_dirty prefers being called with - * the page lock. However, MMU notifiers are - * called sometimes with and sometimes without - * the lock. We rely on the umem_mutex instead - * to prevent other mmu notifiers from - * continuing and allowing the page mapping to - * be removed. - */ - set_page_dirty(head_page); - } - /* on demand pinning support */ - if (!umem->context->invalidate_range) - put_page(page); - umem_odp->page_list[idx] = NULL; - umem_odp->dma_list[idx] = 0; - umem->npages--; + + lockdep_assert_held(&umem_odp->umem_mutex); + + virt = max_t(u64, virt, ib_umem_start(umem_odp)); + bound = min_t(u64, bound, ib_umem_end(umem_odp)); + for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) { + u64 offset = addr - ib_umem_start(umem_odp); + size_t idx = offset >> umem_odp->page_shift; + unsigned long pfn = umem_odp->map.pfn_list[idx]; + + if (!hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx)) + goto clear; + + if (pfn & HMM_PFN_WRITE) { + struct page *page = hmm_pfn_to_page(pfn); + struct page *head_page = compound_head(page); + /* + * set_page_dirty prefers being called with + * the page lock. However, MMU notifiers are + * called sometimes with and sometimes without + * the lock. We rely on the umem_mutex instead + * to prevent other mmu notifiers from + * continuing and allowing the page mapping to + * be removed. + */ + set_page_dirty(head_page); } + umem_odp->npages--; +clear: + umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS; } - mutex_unlock(&umem_odp->umem_mutex); } EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); - -/* @last is not a part of the interval. See comment for function - * node_last. 
- */ -int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, - u64 start, u64 last, - umem_call_back cb, - bool blockable, - void *cookie) -{ - int ret_val = 0; - struct umem_odp_node *node, *next; - struct ib_umem_odp *umem; - - if (unlikely(start == last)) - return ret_val; - - for (node = rbt_ib_umem_iter_first(root, start, last - 1); - node; node = next) { - /* TODO move the blockable decision up to the callback */ - if (!blockable) - return -EAGAIN; - next = rbt_ib_umem_iter_next(node, start, last - 1); - umem = container_of(node, struct ib_umem_odp, interval_tree); - ret_val = cb(umem, start, last, cookie) || ret_val; - } - - return ret_val; -} -EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range); - -struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, - u64 addr, u64 length) -{ - struct umem_odp_node *node; - - node = rbt_ib_umem_iter_first(root, addr, addr + length - 1); - if (node) - return container_of(node, struct ib_umem_odp, interval_tree); - return NULL; - -} -EXPORT_SYMBOL(rbt_ib_umem_lookup); diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index de8d31ab8945..fd67fc9fe85a 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -49,11 +49,13 @@ #include <linux/sched.h> #include <linux/semaphore.h> #include <linux/slab.h> +#include <linux/nospec.h> #include <linux/uaccess.h> #include <rdma/ib_mad.h> #include <rdma/ib_user_mad.h> +#include <rdma/rdma_netlink.h> #include "core_priv.h" @@ -61,6 +63,8 @@ MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand userspace MAD packet access"); MODULE_LICENSE("Dual BSD/GPL"); +#define MAX_UMAD_RECV_LIST_SIZE 200000 + enum { IB_UMAD_MAX_PORTS = RDMA_MAX_PORTS, IB_UMAD_MAX_AGENTS = 32, @@ -99,7 +103,7 @@ struct ib_umad_port { struct ib_device *ib_dev; struct ib_umad_device *umad_dev; int dev_num; - u8 port_num; + u32 port_num; }; struct ib_umad_device { @@ -111,6 +115,7 @@ struct ib_umad_file { struct mutex mutex; struct ib_umad_port *port; struct list_head recv_list; + atomic_t recv_list_size; struct list_head send_list; struct list_head port_list; spinlock_t send_lock; @@ -129,6 +134,14 @@ struct ib_umad_packet { struct ib_user_mad mad; }; +struct ib_rmpp_mad_hdr { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; +} __packed; + +#define CREATE_TRACE_POINTS +#include <trace/events/ib_umad.h> + static const dev_t base_umad_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE); static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) + IB_UMAD_NUM_FIXED_MINOR; @@ -137,7 +150,7 @@ static dev_t dynamic_issm_dev; static DEFINE_IDA(umad_ida); -static void ib_umad_add_one(struct ib_device *device); +static int ib_umad_add_one(struct ib_device *device); static void ib_umad_remove_one(struct ib_device *device, void *client_data); static void ib_umad_dev_free(struct kref *kref) @@ -160,8 +173,8 @@ static void ib_umad_dev_put(struct ib_umad_device *dev) static int hdr_size(struct ib_umad_file *file) { - return file->use_pkey_index ? sizeof (struct ib_user_mad_hdr) : - sizeof (struct ib_user_mad_hdr_old); + return file->use_pkey_index ? sizeof(struct ib_user_mad_hdr) : + sizeof(struct ib_user_mad_hdr_old); } /* caller must hold file->mutex */ @@ -170,24 +183,28 @@ static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id) return file->agents_dead ? 
NULL : file->agent[id]; } -static int queue_packet(struct ib_umad_file *file, - struct ib_mad_agent *agent, - struct ib_umad_packet *packet) +static int queue_packet(struct ib_umad_file *file, struct ib_mad_agent *agent, + struct ib_umad_packet *packet, bool is_recv_mad) { int ret = 1; mutex_lock(&file->mutex); + if (is_recv_mad && + atomic_read(&file->recv_list_size) > MAX_UMAD_RECV_LIST_SIZE) + goto unlock; + for (packet->mad.hdr.id = 0; packet->mad.hdr.id < IB_UMAD_MAX_AGENTS; packet->mad.hdr.id++) if (agent == __get_agent(file, packet->mad.hdr.id)) { list_add_tail(&packet->list, &file->recv_list); + atomic_inc(&file->recv_list_size); wake_up_interruptible(&file->recv_wait); ret = 0; break; } - +unlock: mutex_unlock(&file->mutex); return ret; @@ -214,7 +231,7 @@ static void send_handler(struct ib_mad_agent *agent, if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) { packet->length = IB_MGMT_MAD_HDR; packet->mad.hdr.status = ETIMEDOUT; - if (!queue_packet(file, agent, packet)) + if (!queue_packet(file, agent, packet, false)) return; } kfree(packet); @@ -274,7 +291,7 @@ static void recv_handler(struct ib_mad_agent *agent, rdma_destroy_ah_attr(&ah_attr); } - if (queue_packet(file, agent, packet)) + if (queue_packet(file, agent, packet, true)) goto err2; return; @@ -334,6 +351,9 @@ static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf, return -EFAULT; } } + + trace_ib_umad_read_recv(file, &packet->mad.hdr, &recv_buf->mad->mad_hdr); + return hdr_size(file) + packet->length; } @@ -353,6 +373,9 @@ static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf, if (copy_to_user(buf, packet->mad.data, packet->length)) return -EFAULT; + trace_ib_umad_read_send(file, &packet->mad.hdr, + (struct ib_mad_hdr *)&packet->mad.data); + return size; } @@ -368,6 +391,11 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf, mutex_lock(&file->mutex); + if (file->agents_dead) { + mutex_unlock(&file->mutex); + return -EIO; + } + while (list_empty(&file->recv_list)) { mutex_unlock(&file->mutex); @@ -381,8 +409,14 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf, mutex_lock(&file->mutex); } + if (file->agents_dead) { + mutex_unlock(&file->mutex); + return -EIO; + } + packet = list_entry(file->recv_list.next, struct ib_umad_packet, list); list_del(&packet->list); + atomic_dec(&file->recv_list_size); mutex_unlock(&file->mutex); @@ -395,6 +429,7 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf, /* Requeue packet */ mutex_lock(&file->mutex); list_add(&packet->list, &file->recv_list); + atomic_inc(&file->recv_list_size); mutex_unlock(&file->mutex); } else { if (packet->recv_wc) @@ -473,11 +508,11 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { struct ib_umad_file *file = filp->private_data; + struct ib_rmpp_mad_hdr *rmpp_mad_hdr; struct ib_umad_packet *packet; struct ib_mad_agent *agent; struct rdma_ah_attr ah_attr; struct ib_ah *ah; - struct ib_rmpp_mad *rmpp_mad; __be64 *tid; int ret, data_len, hdr_len, copy_offset, rmpp_active; u8 base_version; @@ -485,7 +520,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, if (count < hdr_size(file) + IB_MGMT_RMPP_HDR) return -EINVAL; - packet = kzalloc(sizeof *packet + IB_MGMT_RMPP_HDR, GFP_KERNEL); + packet = kzalloc(sizeof(*packet) + IB_MGMT_RMPP_HDR, GFP_KERNEL); if (!packet) return -ENOMEM; @@ -508,9 +543,12 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, mutex_lock(&file->mutex); + 
trace_ib_umad_write(file, &packet->mad.hdr, + (struct ib_mad_hdr *)&packet->mad.data); + agent = __get_agent(file, packet->mad.hdr.id); if (!agent) { - ret = -EINVAL; + ret = -EIO; goto err_up; } @@ -536,13 +574,13 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, goto err_up; } - rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data; - hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class); + rmpp_mad_hdr = (struct ib_rmpp_mad_hdr *)packet->mad.data; + hdr_len = ib_get_mad_data_offset(rmpp_mad_hdr->mad_hdr.mgmt_class); - if (ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) + if (ib_is_mad_class_rmpp(rmpp_mad_hdr->mad_hdr.mgmt_class) && ib_mad_kernel_rmpp_agent(agent)) { copy_offset = IB_MGMT_RMPP_HDR; - rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + rmpp_active = ib_get_rmpp_flags(&rmpp_mad_hdr->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE; } else { copy_offset = IB_MGMT_MAD_HDR; @@ -591,12 +629,12 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, tid = &((struct ib_mad_hdr *) packet->msg->mad)->tid; *tid = cpu_to_be64(((u64) agent->hi_tid) << 32 | (be64_to_cpup(tid) & 0xffffffff)); - rmpp_mad->mad_hdr.tid = *tid; + rmpp_mad_hdr->mad_hdr.tid = *tid; } if (!ib_mad_kernel_rmpp_agent(agent) - && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) - && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) { + && ib_is_mad_class_rmpp(rmpp_mad_hdr->mad_hdr.mgmt_class) + && (ib_get_rmpp_flags(&rmpp_mad_hdr->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) { spin_lock_irq(&file->send_lock); list_add_tail(&packet->list, &file->send_list); spin_unlock_irq(&file->send_lock); @@ -639,10 +677,14 @@ static __poll_t ib_umad_poll(struct file *filp, struct poll_table_struct *wait) /* we will always be able to post a MAD send */ __poll_t mask = EPOLLOUT | EPOLLWRNORM; + mutex_lock(&file->mutex); poll_wait(filp, &file->recv_wait, wait); if (!list_empty(&file->recv_list)) mask |= EPOLLIN | EPOLLRDNORM; + if (file->agents_dead) + mask = EPOLLERR; + mutex_unlock(&file->mutex); return mask; } @@ -660,8 +702,7 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, mutex_lock(&file->mutex); if (!file->port->ib_dev) { - dev_notice(&file->port->dev, - "ib_umad_reg_agent: invalid device\n"); + dev_notice(&file->port->dev, "%s: invalid device\n", __func__); ret = -EPIPE; goto out; } @@ -673,7 +714,7 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, if (ureq.qpn != 0 && ureq.qpn != 1) { dev_notice(&file->port->dev, - "ib_umad_reg_agent: invalid QPN %d specified\n", + "%s: invalid QPN %u specified\n", __func__, ureq.qpn); ret = -EINVAL; goto out; @@ -683,9 +724,9 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, if (!__get_agent(file, agent_id)) goto found; - dev_notice(&file->port->dev, - "ib_umad_reg_agent: Max Agents (%u) reached\n", + dev_notice(&file->port->dev, "%s: Max Agents (%u) reached\n", __func__, IB_UMAD_MAX_AGENTS); + ret = -ENOMEM; goto out; @@ -732,7 +773,7 @@ found: "process %s did not enable P_Key index support.\n", current->comm); dev_warn(&file->port->dev, - " Documentation/infiniband/user_mad.txt has info on the new ABI.\n"); + " Documentation/infiniband/user_mad.rst has info on the new ABI.\n"); } } @@ -762,8 +803,7 @@ static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg) mutex_lock(&file->mutex); if (!file->port->ib_dev) { - dev_notice(&file->port->dev, - "ib_umad_reg_agent2: invalid device\n"); + dev_notice(&file->port->dev, "%s: invalid 
device\n", __func__); ret = -EPIPE; goto out; } @@ -774,17 +814,16 @@ static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg) } if (ureq.qpn != 0 && ureq.qpn != 1) { - dev_notice(&file->port->dev, - "ib_umad_reg_agent2: invalid QPN %d specified\n", - ureq.qpn); + dev_notice(&file->port->dev, "%s: invalid QPN %u specified\n", + __func__, ureq.qpn); ret = -EINVAL; goto out; } if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) { dev_notice(&file->port->dev, - "ib_umad_reg_agent2 failed: invalid registration flags specified 0x%x; supported 0x%x\n", - ureq.flags, IB_USER_MAD_REG_FLAGS_CAP); + "%s failed: invalid registration flags specified 0x%x; supported 0x%x\n", + __func__, ureq.flags, IB_USER_MAD_REG_FLAGS_CAP); ret = -EINVAL; if (put_user((u32)IB_USER_MAD_REG_FLAGS_CAP, @@ -799,8 +838,7 @@ static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg) if (!__get_agent(file, agent_id)) goto found; - dev_notice(&file->port->dev, - "ib_umad_reg_agent2: Max Agents (%u) reached\n", + dev_notice(&file->port->dev, "%s: Max Agents (%u) reached\n", __func__, IB_UMAD_MAX_AGENTS); ret = -ENOMEM; goto out; @@ -812,7 +850,7 @@ found: req.mgmt_class_version = ureq.mgmt_class_version; if (ureq.oui & 0xff000000) { dev_notice(&file->port->dev, - "ib_umad_reg_agent2 failed: oui invalid 0x%08x\n", + "%s failed: oui invalid 0x%08x\n", __func__, ureq.oui); ret = -EINVAL; goto out; @@ -871,11 +909,14 @@ static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg) if (get_user(id, arg)) return -EFAULT; + if (id >= IB_UMAD_MAX_AGENTS) + return -EINVAL; mutex_lock(&file->port->file_mutex); mutex_lock(&file->mutex); - if (id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) { + id = array_index_nospec(id, IB_UMAD_MAX_AGENTS); + if (!__get_agent(file, id)) { ret = -EINVAL; goto out; } @@ -957,19 +998,27 @@ static int ib_umad_open(struct inode *inode, struct file *filp) { struct ib_umad_port *port; struct ib_umad_file *file; - int ret = -ENXIO; + int ret = 0; port = container_of(inode->i_cdev, struct ib_umad_port, cdev); mutex_lock(&port->file_mutex); - if (!port->ib_dev) + if (!port->ib_dev) { + ret = -ENXIO; goto out; + } - ret = -ENOMEM; - file = kzalloc(sizeof *file, GFP_KERNEL); - if (!file) + if (!rdma_dev_access_netns(port->ib_dev, current->nsproxy->net_ns)) { + ret = -EPERM; + goto out; + } + + file = kzalloc(sizeof(*file), GFP_KERNEL); + if (!file) { + ret = -ENOMEM; goto out; + } mutex_init(&file->mutex); spin_lock_init(&file->send_lock); @@ -982,14 +1031,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp) list_add_tail(&file->port_list, &port->file_list); - ret = nonseekable_open(inode, filp); - if (ret) { - list_del(&file->port_list); - kfree(file); - goto out; - } - - ib_umad_dev_get(port->umad_dev); + stream_open(inode, filp); out: mutex_unlock(&port->file_mutex); return ret; @@ -998,7 +1040,6 @@ out: static int ib_umad_close(struct inode *inode, struct file *filp) { struct ib_umad_file *file = filp->private_data; - struct ib_umad_device *dev = file->port->umad_dev; struct ib_umad_packet *packet, *tmp; int already_dead; int i; @@ -1025,9 +1066,8 @@ static int ib_umad_close(struct inode *inode, struct file *filp) ib_unregister_mad_agent(file->agent[i]); mutex_unlock(&file->port->file_mutex); - + mutex_destroy(&file->mutex); kfree(file); - ib_umad_dev_put(dev); return 0; } @@ -1042,7 +1082,6 @@ static const struct file_operations umad_fops = { #endif .open = ib_umad_open, .release = ib_umad_close, - .llseek = no_llseek, }; static int 
ib_umad_sm_open(struct inode *inode, struct file *filp) @@ -1067,23 +1106,20 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp) } } + if (!rdma_dev_access_netns(port->ib_dev, current->nsproxy->net_ns)) { + ret = -EPERM; + goto err_up_sem; + } + ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props); if (ret) goto err_up_sem; filp->private_data = port; - ret = nonseekable_open(inode, filp); - if (ret) - goto err_clr_sm_cap; - - ib_umad_dev_get(port->umad_dev); + nonseekable_open(inode, filp); return 0; -err_clr_sm_cap: - swap(props.set_port_cap_mask, props.clr_port_cap_mask); - ib_modify_port(port->ib_dev, port->port_num, 0, &props); - err_up_sem: up(&port->sm_sem); @@ -1106,7 +1142,6 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp) up(&port->sm_sem); - ib_umad_dev_put(port->umad_dev); return ret; } @@ -1114,14 +1149,61 @@ static const struct file_operations umad_sm_fops = { .owner = THIS_MODULE, .open = ib_umad_sm_open, .release = ib_umad_sm_close, - .llseek = no_llseek, }; +static struct ib_umad_port *get_port(struct ib_device *ibdev, + struct ib_umad_device *umad_dev, + u32 port) +{ + if (!umad_dev) + return ERR_PTR(-EOPNOTSUPP); + if (!rdma_is_port_valid(ibdev, port)) + return ERR_PTR(-EINVAL); + if (!rdma_cap_ib_mad(ibdev, port)) + return ERR_PTR(-EOPNOTSUPP); + + return &umad_dev->ports[port - rdma_start_port(ibdev)]; +} + +static int ib_umad_get_nl_info(struct ib_device *ibdev, void *client_data, + struct ib_client_nl_info *res) +{ + struct ib_umad_port *port = get_port(ibdev, client_data, res->port); + + if (IS_ERR(port)) + return PTR_ERR(port); + + res->abi = IB_USER_MAD_ABI_VERSION; + res->cdev = &port->dev; + return 0; +} + static struct ib_client umad_client = { .name = "umad", .add = ib_umad_add_one, - .remove = ib_umad_remove_one + .remove = ib_umad_remove_one, + .get_nl_info = ib_umad_get_nl_info, }; +MODULE_ALIAS_RDMA_CLIENT("umad"); + +static int ib_issm_get_nl_info(struct ib_device *ibdev, void *client_data, + struct ib_client_nl_info *res) +{ + struct ib_umad_port *port = get_port(ibdev, client_data, res->port); + + if (IS_ERR(port)) + return PTR_ERR(port); + + res->abi = IB_USER_MAD_ABI_VERSION; + res->cdev = &port->sm_dev; + return 0; +} + +static struct ib_client issm_client = { + .name = "issm", + .get_nl_info = ib_issm_get_nl_info, +}; +MODULE_ALIAS_RDMA_CLIENT("issm"); static ssize_t ibdev_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1131,7 +1213,7 @@ static ssize_t ibdev_show(struct device *dev, struct device_attribute *attr, if (!port) return -ENODEV; - return sprintf(buf, "%s\n", dev_name(&port->ib_dev->dev)); + return sysfs_emit(buf, "%s\n", dev_name(&port->ib_dev->dev)); } static DEVICE_ATTR_RO(ibdev); @@ -1143,7 +1225,7 @@ static ssize_t port_show(struct device *dev, struct device_attribute *attr, if (!port) return -ENODEV; - return sprintf(buf, "%d\n", port->port_num); + return sysfs_emit(buf, "%d\n", port->port_num); } static DEVICE_ATTR_RO(port); @@ -1154,15 +1236,15 @@ static struct attribute *umad_class_dev_attrs[] = { }; ATTRIBUTE_GROUPS(umad_class_dev); -static char *umad_devnode(struct device *dev, umode_t *mode) +static char *umad_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); } -static ssize_t abi_version_show(struct class *class, - struct class_attribute *attr, char *buf) +static ssize_t abi_version_show(const struct class *class, + const struct class_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", 
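get_port() above encodes every failure in the returned pointer, which is why its callers only test IS_ERR() and forward PTR_ERR(). A self-contained sketch of that convention, using a hypothetical struct thing and find_thing():

#include <linux/err.h>
#include <linux/slab.h>

struct thing { int id; };

static struct thing *find_thing(int id)
{
        struct thing *t;

        if (id < 0)
                return ERR_PTR(-EINVAL);        /* encode the errno in the pointer */

        t = kzalloc(sizeof(*t), GFP_KERNEL);
        if (!t)
                return ERR_PTR(-ENOMEM);

        t->id = id;
        return t;
}

static int use_thing(int id)
{
        struct thing *t = find_thing(id);

        if (IS_ERR(t))
                return PTR_ERR(t);              /* decode it back into an int */

        kfree(t);
        return 0;
}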
IB_USER_MAD_ABI_VERSION); + return sysfs_emit(buf, "%d\n", IB_USER_MAD_ABI_VERSION); } static CLASS_ATTR_RO(abi_version); @@ -1237,15 +1319,17 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, if (ret) goto err_cdev; - ib_umad_init_port_dev(&port->sm_dev, port, device); - port->sm_dev.devt = base_issm; - dev_set_name(&port->sm_dev, "issm%d", port->dev_num); - cdev_init(&port->sm_cdev, &umad_sm_fops); - port->sm_cdev.owner = THIS_MODULE; + if (rdma_cap_ib_smi(device, port_num)) { + ib_umad_init_port_dev(&port->sm_dev, port, device); + port->sm_dev.devt = base_issm; + dev_set_name(&port->sm_dev, "issm%d", port->dev_num); + cdev_init(&port->sm_cdev, &umad_sm_fops); + port->sm_cdev.owner = THIS_MODULE; - ret = cdev_device_add(&port->sm_cdev, &port->sm_dev); - if (ret) - goto err_dev; + ret = cdev_device_add(&port->sm_cdev, &port->sm_dev); + if (ret) + goto err_dev; + } return 0; @@ -1261,8 +1345,15 @@ err_cdev: static void ib_umad_kill_port(struct ib_umad_port *port) { struct ib_umad_file *file; + bool has_smi = false; int id; + if (rdma_cap_ib_smi(port->ib_dev, port->port_num)) { + cdev_device_del(&port->sm_cdev, &port->sm_dev); + has_smi = true; + } + cdev_device_del(&port->cdev, &port->dev); + mutex_lock(&port->file_mutex); /* Mark ib_dev NULL and block ioctl or other file ops to progress @@ -1273,6 +1364,7 @@ static void ib_umad_kill_port(struct ib_umad_port *port) list_for_each_entry(file, &port->file_list, port_list) { mutex_lock(&file->mutex); file->agents_dead = 1; + wake_up_interruptible(&file->recv_wait); mutex_unlock(&file->mutex); for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id) @@ -1282,44 +1374,51 @@ static void ib_umad_kill_port(struct ib_umad_port *port) mutex_unlock(&port->file_mutex); - cdev_device_del(&port->sm_cdev, &port->sm_dev); - put_device(&port->sm_dev); - cdev_device_del(&port->cdev, &port->dev); - put_device(&port->dev); ida_free(&umad_ida, port->dev_num); + + /* balances device_initialize() */ + if (has_smi) + put_device(&port->sm_dev); + put_device(&port->dev); } -static void ib_umad_add_one(struct ib_device *device) +static int ib_umad_add_one(struct ib_device *device) { struct ib_umad_device *umad_dev; int s, e, i; int count = 0; + int ret; s = rdma_start_port(device); e = rdma_end_port(device); - umad_dev = kzalloc(struct_size(umad_dev, ports, e - s + 1), GFP_KERNEL); + umad_dev = kzalloc(struct_size(umad_dev, ports, + size_add(size_sub(e, s), 1)), + GFP_KERNEL); if (!umad_dev) - return; + return -ENOMEM; kref_init(&umad_dev->kref); for (i = s; i <= e; ++i) { if (!rdma_cap_ib_mad(device, i)) continue; - if (ib_umad_init_port(device, i, umad_dev, - &umad_dev->ports[i - s])) + ret = ib_umad_init_port(device, i, umad_dev, + &umad_dev->ports[i - s]); + if (ret) goto err; count++; } - if (!count) + if (!count) { + ret = -EOPNOTSUPP; goto free; + } ib_set_client_data(device, &umad_client, umad_dev); - return; + return 0; err: while (--i >= s) { @@ -1329,21 +1428,22 @@ err: ib_umad_kill_port(&umad_dev->ports[i - s]); } free: + /* balances kref_init */ ib_umad_dev_put(umad_dev); + return ret; } static void ib_umad_remove_one(struct ib_device *device, void *client_data) { struct ib_umad_device *umad_dev = client_data; - int i; + unsigned int i; - if (!umad_dev) - return; - - for (i = 0; i <= rdma_end_port(device) - rdma_start_port(device); ++i) { - if (rdma_cap_ib_mad(device, i + rdma_start_port(device))) - ib_umad_kill_port(&umad_dev->ports[i]); + rdma_for_each_port (device, i) { + if (rdma_cap_ib_mad(device, i)) + ib_umad_kill_port( + 
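ib_umad_add_one() above sizes the trailing ports[] array with struct_size()/size_add()/size_sub() instead of open-coded arithmetic. A sketch of the same overflow-saturating allocation, with port_table and my_port as placeholder types:

#include <linux/types.h>
#include <linux/overflow.h>
#include <linux/slab.h>

struct my_port { u32 num; };

struct port_table {
        u32 nports;
        struct my_port ports[];         /* flexible array member */
};

static struct port_table *alloc_ports(u32 first, u32 last)
{
        struct port_table *t;

        /*
         * size_sub()/size_add() saturate to SIZE_MAX on under/overflow and
         * struct_size() does the same for the multiply-and-add, so a bogus
         * port range becomes a guaranteed kzalloc() failure instead of a
         * short allocation.
         */
        t = kzalloc(struct_size(t, ports, size_add(size_sub(last, first), 1)),
                    GFP_KERNEL);
        if (!t)
                return NULL;

        t->nports = last - first + 1;
        return t;
}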
&umad_dev->ports[i - rdma_start_port(device)]); } + /* balances kref_init() */ ib_umad_dev_put(umad_dev); } @@ -1375,13 +1475,17 @@ static int __init ib_umad_init(void) } ret = ib_register_client(&umad_client); - if (ret) { - pr_err("couldn't register ib_umad client\n"); + if (ret) goto out_class; - } + + ret = ib_register_client(&issm_client); + if (ret) + goto out_client; return 0; +out_client: + ib_unregister_client(&umad_client); out_class: class_unregister(&umad_class); @@ -1399,6 +1503,7 @@ out: static void __exit ib_umad_cleanup(void) { + ib_unregister_client(&issm_client); ib_unregister_client(&umad_client); class_unregister(&umad_class); unregister_chrdev_region(base_umad_dev, diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index ea0bc6885517..797e2fcc8072 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -97,8 +97,8 @@ ib_uverbs_init_udata_buf_or_null(struct ib_udata *udata, */ struct ib_uverbs_device { - atomic_t refcount; - int num_comp_vectors; + refcount_t refcount; + u32 num_comp_vectors; struct completion comp; struct device dev; /* First group for device attributes, NULL terminated array */ @@ -111,7 +111,6 @@ struct ib_uverbs_device { struct srcu_struct disassociate_srcu; struct mutex lists_mutex; /* protect lists */ struct list_head uverbs_file_list; - struct list_head uverbs_events_file_list; struct uverbs_api *uapi; }; @@ -124,10 +123,9 @@ struct ib_uverbs_event_queue { }; struct ib_uverbs_async_event_file { + struct ib_uobject uobj; struct ib_uverbs_event_queue ev_queue; - struct ib_uverbs_file *uverbs_file; - struct kref ref; - struct list_head list; + struct ib_event_handler event_handler; }; struct ib_uverbs_completion_event_file { @@ -135,37 +133,6 @@ struct ib_uverbs_completion_event_file { struct ib_uverbs_event_queue ev_queue; }; -struct ib_uverbs_file { - struct kref ref; - struct ib_uverbs_device *device; - struct mutex ucontext_lock; - /* - * ucontext must be accessed via ib_uverbs_get_ucontext() or with - * ucontext_lock held - */ - struct ib_ucontext *ucontext; - struct ib_event_handler event_handler; - struct ib_uverbs_async_event_file *async_file; - struct list_head list; - - /* - * To access the uobjects list hw_destroy_rwsem must be held for write - * OR hw_destroy_rwsem held for read AND uobjects_lock held. - * hw_destroy_rwsem should be called across any destruction of the HW - * object of an associated uobject. 
- */ - struct rw_semaphore hw_destroy_rwsem; - spinlock_t uobjects_lock; - struct list_head uobjects; - - struct mutex umap_lock; - struct list_head umaps; - - struct idr idr; - /* spinlock protects write access to idr */ - spinlock_t idr_lock; -}; - struct ib_uverbs_event { union { struct ib_uverbs_async_event_desc async; @@ -184,6 +151,8 @@ struct ib_uverbs_mcast_entry { struct ib_uevent_object { struct ib_uobject uobject; + struct ib_uverbs_async_event_file *event_file; + /* List member for ib_uverbs_async_event_file list */ struct list_head event_list; u32 events_reported; }; @@ -211,36 +180,38 @@ struct ib_uwq_object { }; struct ib_ucq_object { - struct ib_uobject uobject; + struct ib_uevent_object uevent; struct list_head comp_list; - struct list_head async_list; u32 comp_events_reported; - u32 async_events_reported; }; extern const struct file_operations uverbs_event_fops; +extern const struct file_operations uverbs_async_event_fops; void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue); -struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file, - struct ib_device *ib_dev); -void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file); +void ib_uverbs_init_async_event_file(struct ib_uverbs_async_event_file *ev_file); +void ib_uverbs_free_event_queue(struct ib_uverbs_event_queue *event_queue); void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res); +int uverbs_async_event_release(struct inode *inode, struct file *filp); -void ib_uverbs_release_ucq(struct ib_uverbs_file *file, - struct ib_uverbs_completion_event_file *ev_file, +int ib_alloc_ucontext(struct uverbs_attr_bundle *attrs); +int ib_init_ucontext(struct uverbs_attr_bundle *attrs); + +void ib_uverbs_release_ucq(struct ib_uverbs_completion_event_file *ev_file, struct ib_ucq_object *uobj); -void ib_uverbs_release_uevent(struct ib_uverbs_file *file, - struct ib_uevent_object *uobj); +void ib_uverbs_release_uevent(struct ib_uevent_object *uobj); void ib_uverbs_release_file(struct kref *ref); +void ib_uverbs_async_handler(struct ib_uverbs_async_event_file *async_file, + __u64 element, __u64 event, + struct list_head *obj_list, u32 *counter); void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context); void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr); -void ib_uverbs_event_handler(struct ib_event_handler *handler, - struct ib_event *event); int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd, - enum rdma_remove_reason why); + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs); int uverbs_dealloc_mw(struct ib_mw *mw); void ib_uverbs_detach_umcast(struct ib_qp *qp, @@ -276,23 +247,6 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, size_t kern_filter_sz, union ib_flow_spec *ib_spec); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_DEVICE); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_PD); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_MR); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_COMP_CHANNEL); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_CQ); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_QP); -extern const struct 
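ib_ucq_object now embeds a struct ib_uevent_object rather than a bare uobject, so later hunks recover the CQ object with container_of(..., uevent.uobject). A small sketch of container_of() through a nested member, with base/middle/outer as stand-in types:

#include <linux/kernel.h>       /* container_of() */

struct base   { int id; };
struct middle { struct base base; int events; };
struct outer  { struct middle mid; int comp_events; };  /* embeds, like ib_ucq_object */

static struct outer *outer_from_base(struct base *b)
{
        /* Walk back from the innermost embedded member to its container. */
        return container_of(b, struct outer, mid.base);
}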
uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_AH); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_MW); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_SRQ); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_FLOW); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_WQ); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_XRCD); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_DM); -extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_COUNTERS); - /* * ib_uverbs_query_port_resp.port_cap_flags started out as just a copy of the * PortInfo CapabilityMask, but was extended with unique bits. @@ -314,6 +268,24 @@ static inline u32 make_port_cap_flags(const struct ib_port_attr *attr) return res; } +static inline struct ib_uverbs_async_event_file * +ib_uverbs_get_async_event(struct uverbs_attr_bundle *attrs, + u16 id) +{ + struct ib_uobject *async_ev_file_uobj; + struct ib_uverbs_async_event_file *async_ev_file; + + async_ev_file_uobj = uverbs_attr_get_uobject(attrs, id); + if (IS_ERR(async_ev_file_uobj)) + async_ev_file = READ_ONCE(attrs->ufile->default_async_file); + else + async_ev_file = container_of(async_ev_file_uobj, + struct ib_uverbs_async_event_file, + uobj); + if (async_ev_file) + uverbs_uobject_get(&async_ev_file->uobj); + return async_ev_file; +} void copy_port_attr_to_resp(struct ib_port_attr *attr, struct ib_uverbs_query_port_resp *resp, diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 3317300ab036..ce16404cdfb8 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -42,6 +42,7 @@ #include <rdma/uverbs_types.h> #include <rdma/uverbs_std_types.h> +#include <rdma/ib_ucaps.h> #include "rdma_core.h" #include "uverbs.h" @@ -161,8 +162,8 @@ static const void __user *uverbs_request_next_ptr(struct uverbs_req_iter *iter, { const void __user *res = iter->cur; - if (iter->cur + len > iter->end) - return ERR_PTR(-ENOSPC); + if (len > iter->end - iter->cur) + return (void __force __user *)ERR_PTR(-ENOSPC); iter->cur += len; return res; } @@ -174,14 +175,25 @@ static int uverbs_request_finish(struct uverbs_req_iter *iter) return 0; } +/* + * When calling a destroy function during an error unwind we need to pass in + * the udata that is sanitized of all user arguments. Ie from the driver + * perspective it looks like no udata was passed. 
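The uverbs_request_next_ptr() hunk above compares the requested length against the remaining space rather than computing cur + len, so a huge len cannot wrap the pointer past end. The same check in isolation, with req_iter and iter_advance() as illustrative names:

#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/types.h>

struct req_iter {
        const char __user *cur;
        const char __user *end;
};

static int iter_advance(struct req_iter *iter, size_t len)
{
        /*
         * Check against the remaining space (end - cur): the addition
         * cur + len could wrap around the address space and make an
         * oversized request look in-bounds.
         */
        if (len > (size_t)(iter->end - iter->cur))
                return -ENOSPC;
        iter->cur += len;
        return 0;
}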
+ */ +struct ib_udata *uverbs_get_cleared_udata(struct uverbs_attr_bundle *attrs) +{ + attrs->driver_udata = (struct ib_udata){}; + return &attrs->driver_udata; +} + static struct ib_uverbs_completion_event_file * -_ib_uverbs_lookup_comp_file(s32 fd, const struct uverbs_attr_bundle *attrs) +_ib_uverbs_lookup_comp_file(s32 fd, struct uverbs_attr_bundle *attrs) { struct ib_uobject *uobj = ufd_get_read(UVERBS_OBJECT_COMP_CHANNEL, fd, attrs); if (IS_ERR(uobj)) - return (void *)uobj; + return ERR_CAST(uobj); uverbs_uobject_get(uobj); uobj_put_read(uobj); @@ -192,81 +204,74 @@ _ib_uverbs_lookup_comp_file(s32 fd, const struct uverbs_attr_bundle *attrs) #define ib_uverbs_lookup_comp_file(_fd, _ufile) \ _ib_uverbs_lookup_comp_file((_fd)*typecheck(s32, _fd), _ufile) -static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) +int ib_alloc_ucontext(struct uverbs_attr_bundle *attrs) { - struct ib_uverbs_file *file = attrs->ufile; - struct ib_uverbs_get_context cmd; - struct ib_uverbs_get_context_resp resp; - struct ib_ucontext *ucontext; - struct file *filp; - struct ib_rdmacg_object cg_obj; + struct ib_uverbs_file *ufile = attrs->ufile; + struct ib_ucontext *ucontext; struct ib_device *ib_dev; - int ret; - ret = uverbs_request(attrs, &cmd, sizeof(cmd)); - if (ret) - return ret; + ib_dev = srcu_dereference(ufile->device->ib_dev, + &ufile->device->disassociate_srcu); + if (!ib_dev) + return -EIO; - mutex_lock(&file->ucontext_lock); - ib_dev = srcu_dereference(file->device->ib_dev, - &file->device->disassociate_srcu); - if (!ib_dev) { - ret = -EIO; - goto err; - } + ucontext = rdma_zalloc_drv_obj(ib_dev, ib_ucontext); + if (!ucontext) + return -ENOMEM; + + ucontext->device = ib_dev; + ucontext->ufile = ufile; + xa_init_flags(&ucontext->mmap_xa, XA_FLAGS_ALLOC); + + rdma_restrack_new(&ucontext->res, RDMA_RESTRACK_CTX); + rdma_restrack_set_name(&ucontext->res, NULL); + attrs->context = ucontext; + return 0; +} + +int ib_init_ucontext(struct uverbs_attr_bundle *attrs) +{ + struct ib_ucontext *ucontext = attrs->context; + struct ib_uverbs_file *file = attrs->ufile; + int *fd_array; + int fd_count; + int ret; + if (!down_read_trylock(&file->hw_destroy_rwsem)) + return -EIO; + mutex_lock(&file->ucontext_lock); if (file->ucontext) { ret = -EINVAL; goto err; } - ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); + ret = ib_rdmacg_try_charge(&ucontext->cg_obj, ucontext->device, + RDMACG_RESOURCE_HCA_HANDLE); if (ret) goto err; - ucontext = ib_dev->ops.alloc_ucontext(ib_dev, &attrs->driver_udata); - if (IS_ERR(ucontext)) { - ret = PTR_ERR(ucontext); - goto err_alloc; - } - - ucontext->device = ib_dev; - ucontext->cg_obj = cg_obj; - /* ufile is required when some objects are released */ - ucontext->ufile = file; - - ucontext->closing = false; - ucontext->cleanup_retryable = false; - -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - mutex_init(&ucontext->per_mm_list_lock); - INIT_LIST_HEAD(&ucontext->per_mm_list); - if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) - ucontext->invalidate_range = NULL; - -#endif - - resp.num_comp_vectors = file->device->num_comp_vectors; - - ret = get_unused_fd_flags(O_CLOEXEC); - if (ret < 0) - goto err_free; - resp.async_fd = ret; + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_GET_CONTEXT_FD_ARR)) { + fd_count = uverbs_attr_ptr_get_array_size(attrs, + UVERBS_ATTR_GET_CONTEXT_FD_ARR, + sizeof(int)); + if (fd_count < 0) { + ret = fd_count; + goto err_uncharge; + } - filp = ib_uverbs_alloc_async_event_file(file, ib_dev); - if (IS_ERR(filp)) { 
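ib_alloc_ucontext() above initialises the mmap table with xa_init_flags(..., XA_FLAGS_ALLOC) so the XArray can hand out indices itself. A tiny sketch of that allocating-XArray usage; my_xa, my_store() and friends are invented names:

#include <linux/xarray.h>

static DEFINE_XARRAY_ALLOC(my_xa);      /* same effect as xa_init_flags(&xa, XA_FLAGS_ALLOC) */

static int my_store(void *entry, u32 *out_id)
{
        /* Let the XArray pick the next free index in [0, UINT_MAX]; entry must be non-NULL. */
        return xa_alloc(&my_xa, out_id, entry, xa_limit_32b, GFP_KERNEL);
}

static void *my_lookup(u32 id)
{
        return xa_load(&my_xa, id);
}

static void my_remove(u32 id)
{
        xa_erase(&my_xa, id);
}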
- ret = PTR_ERR(filp); - goto err_fd; + fd_array = uverbs_attr_get_alloced_ptr(attrs, + UVERBS_ATTR_GET_CONTEXT_FD_ARR); + ret = ib_get_ucaps(fd_array, fd_count, &ucontext->enabled_caps); + if (ret) + goto err_uncharge; } - ret = uverbs_response(attrs, &resp, sizeof(resp)); + ret = ucontext->device->ops.alloc_ucontext(ucontext, + &attrs->driver_udata); if (ret) - goto err_file; + goto err_uncharge; - fd_install(resp.async_fd, filp); - - ucontext->res.type = RDMA_RESTRACK_CTX; - rdma_restrack_uadd(&ucontext->res); + rdma_restrack_add(&ucontext->res); /* * Make sure that ib_uverbs_get_ucontext() sees the pointer update @@ -275,24 +280,63 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) smp_store_release(&file->ucontext, ucontext); mutex_unlock(&file->ucontext_lock); - + up_read(&file->hw_destroy_rwsem); return 0; -err_file: - ib_uverbs_free_async_event_file(file); - fput(filp); +err_uncharge: + ib_rdmacg_uncharge(&ucontext->cg_obj, ucontext->device, + RDMACG_RESOURCE_HCA_HANDLE); +err: + mutex_unlock(&file->ucontext_lock); + up_read(&file->hw_destroy_rwsem); + return ret; +} -err_fd: - put_unused_fd(resp.async_fd); +static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_get_context_resp resp; + struct ib_uverbs_get_context cmd; + struct ib_device *ib_dev; + struct ib_uobject *uobj; + int ret; -err_free: - ib_dev->ops.dealloc_ucontext(ucontext); + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; -err_alloc: - ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); + ret = ib_alloc_ucontext(attrs); + if (ret) + return ret; -err: - mutex_unlock(&file->ucontext_lock); + uobj = uobj_alloc(UVERBS_OBJECT_ASYNC_EVENT, attrs, &ib_dev); + if (IS_ERR(uobj)) { + ret = PTR_ERR(uobj); + goto err_ucontext; + } + + resp = (struct ib_uverbs_get_context_resp){ + .num_comp_vectors = attrs->ufile->device->num_comp_vectors, + .async_fd = uobj->id, + }; + ret = uverbs_response(attrs, &resp, sizeof(resp)); + if (ret) + goto err_uobj; + + ret = ib_init_ucontext(attrs); + if (ret) + goto err_uobj; + + ib_uverbs_init_async_event_file( + container_of(uobj, struct ib_uverbs_async_event_file, uobj)); + rdma_alloc_commit_uobject(uobj, attrs); + return 0; + +err_uobj: + rdma_alloc_abort_uobject(uobj, attrs, false); +err_ucontext: + rdma_restrack_put(&attrs->context->res); + kfree(attrs->context); + attrs->context = NULL; return ret; } @@ -312,7 +356,7 @@ static void copy_query_dev_fields(struct ib_ucontext *ucontext, resp->hw_ver = attr->hw_ver; resp->max_qp = attr->max_qp; resp->max_qp_wr = attr->max_qp_wr; - resp->device_cap_flags = lower_32_bits(attr->device_cap_flags); + resp->device_cap_flags = lower_32_bits(attr->device_cap_flags); resp->max_sge = min(attr->max_send_sge, attr->max_recv_sge); resp->max_sge_rd = attr->max_sge_rd; resp->max_cq = attr->max_cq; @@ -334,14 +378,12 @@ static void copy_query_dev_fields(struct ib_ucontext *ucontext, resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; resp->max_ah = attr->max_ah; - resp->max_fmr = attr->max_fmr; - resp->max_map_per_fmr = attr->max_map_per_fmr; resp->max_srq = attr->max_srq; resp->max_srq_wr = attr->max_srq_wr; resp->max_srq_sge = attr->max_srq_sge; resp->max_pkeys = attr->max_pkeys; resp->local_ca_ack_delay = attr->local_ca_ack_delay; - resp->phys_port_cnt = ib_dev->phys_port_cnt; + resp->phys_port_cnt = min_t(u32, ib_dev->phys_port_cnt, U8_MAX); } static int ib_uverbs_query_device(struct 
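ib_init_ucontext() publishes the finished ucontext with smp_store_release() so lockless readers see a fully initialised object, as the comment about ib_uverbs_get_ucontext() says. The release/acquire pairing in miniature, with ctx and owner as placeholder types and the reader assumed to use smp_load_acquire():

#include <linux/atomic.h>
#include <linux/err.h>

struct ctx   { int ready_data; };
struct owner { struct ctx *ctx; };      /* published pointer, read locklessly */

static void publish_ctx(struct owner *o, struct ctx *c)
{
        c->ready_data = 42;             /* fully initialise first ...          */
        smp_store_release(&o->ctx, c);  /* ... then publish with release order */
}

static struct ctx *get_ctx(struct owner *o)
{
        /*
         * The acquire pairs with the release above: if the pointer is
         * visible, so is everything written before it was published.
         */
        struct ctx *c = smp_load_acquire(&o->ctx);

        return c ? c : ERR_PTR(-EINVAL);
}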
uverbs_attr_bundle *attrs) @@ -395,8 +437,8 @@ static int ib_uverbs_query_port(struct uverbs_attr_bundle *attrs) static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) { + struct ib_uverbs_alloc_pd_resp resp = {}; struct ib_uverbs_alloc_pd cmd; - struct ib_uverbs_alloc_pd_resp resp; struct ib_uobject *uobj; struct ib_pd *pd; int ret; @@ -410,34 +452,35 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) if (IS_ERR(uobj)) return PTR_ERR(uobj); - pd = ib_dev->ops.alloc_pd(ib_dev, uobj->context, &attrs->driver_udata); - if (IS_ERR(pd)) { - ret = PTR_ERR(pd); + pd = rdma_zalloc_drv_obj(ib_dev, ib_pd); + if (!pd) { + ret = -ENOMEM; goto err; } pd->device = ib_dev; pd->uobject = uobj; - pd->__internal_mr = NULL; atomic_set(&pd->usecnt, 0); - uobj->object = pd; - memset(&resp, 0, sizeof resp); - resp.pd_handle = uobj->id; - pd->res.type = RDMA_RESTRACK_PD; - rdma_restrack_uadd(&pd->res); + rdma_restrack_new(&pd->res, RDMA_RESTRACK_PD); + rdma_restrack_set_name(&pd->res, NULL); - ret = uverbs_response(attrs, &resp, sizeof(resp)); + ret = ib_dev->ops.alloc_pd(pd, &attrs->driver_udata); if (ret) - goto err_copy; + goto err_alloc; + rdma_restrack_add(&pd->res); - return uobj_alloc_commit(uobj); + uobj->object = pd; + uobj_finalize_uobj_create(uobj, attrs); -err_copy: - ib_dealloc_pd(pd); + resp.pd_handle = uobj->id; + return uverbs_response(attrs, &resp, sizeof(resp)); +err_alloc: + rdma_restrack_put(&pd->res); + kfree(pd); err: - uobj_alloc_abort(uobj); + uobj_alloc_abort(uobj, attrs); return ret; } @@ -541,15 +584,15 @@ static void xrcd_table_delete(struct ib_uverbs_device *dev, static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs) { struct ib_uverbs_device *ibudev = attrs->ufile->device; + struct ib_uverbs_open_xrcd_resp resp = {}; struct ib_uverbs_open_xrcd cmd; - struct ib_uverbs_open_xrcd_resp resp; struct ib_uxrcd_object *obj; struct ib_xrcd *xrcd = NULL; - struct fd f = {NULL, 0}; struct inode *inode = NULL; - int ret = 0; int new_xrcd = 0; struct ib_device *ib_dev; + struct fd f = EMPTY_FD; + int ret; ret = uverbs_request(attrs, &cmd, sizeof(cmd)); if (ret) @@ -560,12 +603,12 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs) if (cmd.fd != -1) { /* search for file descriptor */ f = fdget(cmd.fd); - if (!f.file) { + if (fd_empty(f)) { ret = -EBADF; goto err_tree_mutex_unlock; } - inode = file_inode(f.file); + inode = file_inode(fd_file(f)); xrcd = find_xrcd(ibudev, inode); if (!xrcd && !(cmd.oflags & O_CREAT)) { /* no file descriptor. 
Need CREATE flag */ @@ -587,25 +630,16 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs) } if (!xrcd) { - xrcd = ib_dev->ops.alloc_xrcd(ib_dev, obj->uobject.context, - &attrs->driver_udata); + xrcd = ib_alloc_xrcd_user(ib_dev, inode, &attrs->driver_udata); if (IS_ERR(xrcd)) { ret = PTR_ERR(xrcd); goto err; } - - xrcd->inode = inode; - xrcd->device = ib_dev; - atomic_set(&xrcd->usecnt, 0); - mutex_init(&xrcd->tgt_qp_mutex); - INIT_LIST_HEAD(&xrcd->tgt_qp_list); new_xrcd = 1; } atomic_set(&obj->refcnt, 0); obj->uobject.object = xrcd; - memset(&resp, 0, sizeof resp); - resp.xrcd_handle = obj->uobject.id; if (inode) { if (new_xrcd) { @@ -617,33 +651,22 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs) atomic_inc(&xrcd->usecnt); } - ret = uverbs_response(attrs, &resp, sizeof(resp)); - if (ret) - goto err_copy; - - if (f.file) - fdput(f); + fdput(f); mutex_unlock(&ibudev->xrcd_tree_mutex); + uobj_finalize_uobj_create(&obj->uobject, attrs); - return uobj_alloc_commit(&obj->uobject); - -err_copy: - if (inode) { - if (new_xrcd) - xrcd_table_delete(ibudev, inode); - atomic_dec(&xrcd->usecnt); - } + resp.xrcd_handle = obj->uobject.id; + return uverbs_response(attrs, &resp, sizeof(resp)); err_dealloc_xrcd: - ib_dealloc_xrcd(xrcd); + ib_dealloc_xrcd_user(xrcd, uverbs_get_cleared_udata(attrs)); err: - uobj_alloc_abort(&obj->uobject); + uobj_alloc_abort(&obj->uobject, attrs); err_tree_mutex_unlock: - if (f.file) - fdput(f); + fdput(f); mutex_unlock(&ibudev->xrcd_tree_mutex); @@ -662,21 +685,20 @@ static int ib_uverbs_close_xrcd(struct uverbs_attr_bundle *attrs) return uobj_perform_destroy(UVERBS_OBJECT_XRCD, cmd.xrcd_handle, attrs); } -int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, - struct ib_xrcd *xrcd, - enum rdma_remove_reason why) +int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct inode *inode; int ret; - struct ib_uverbs_device *dev = uobject->context->ufile->device; + struct ib_uverbs_device *dev = attrs->ufile->device; inode = xrcd->inode; if (inode && !atomic_dec_and_test(&xrcd->usecnt)) return 0; - ret = ib_dealloc_xrcd(xrcd); - - if (ib_is_destroy_retryable(ret, why, uobject)) { + ret = ib_dealloc_xrcd_user(xrcd, &attrs->driver_udata); + if (ret) { atomic_inc(&xrcd->usecnt); return ret; } @@ -684,13 +706,13 @@ int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, if (inode) xrcd_table_delete(dev, inode); - return ret; + return 0; } static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs) { + struct ib_uverbs_reg_mr_resp resp = {}; struct ib_uverbs_reg_mr cmd; - struct ib_uverbs_reg_mr_resp resp; struct ib_uobject *uobj; struct ib_pd *pd; struct ib_mr *mr; @@ -704,31 +726,22 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs) if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) return -EINVAL; - ret = ib_check_mr_access(cmd.access_flags); - if (ret) - return ret; - uobj = uobj_alloc(UVERBS_OBJECT_MR, attrs, &ib_dev); if (IS_ERR(uobj)) return PTR_ERR(uobj); - pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!pd) { - ret = -EINVAL; + ret = ib_check_mr_access(ib_dev, cmd.access_flags); + if (ret) goto err_free; - } - if (cmd.access_flags & IB_ACCESS_ON_DEMAND) { - if (!(pd->device->attrs.device_cap_flags & - IB_DEVICE_ON_DEMAND_PAGING)) { - pr_debug("ODP support not available\n"); - ret = -EINVAL; - goto err_put; - } + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); + if (IS_ERR(pd)) 
{ + ret = PTR_ERR(pd); + goto err_free; } mr = pd->device->ops.reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, - cmd.access_flags, + cmd.access_flags, NULL, &attrs->driver_udata); if (IS_ERR(mr)) { ret = PTR_ERR(mr); @@ -737,35 +750,31 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs) mr->device = pd->device; mr->pd = pd; + mr->type = IB_MR_TYPE_USER; mr->dm = NULL; + mr->sig_attrs = NULL; mr->uobject = uobj; atomic_inc(&pd->usecnt); - mr->res.type = RDMA_RESTRACK_MR; - rdma_restrack_uadd(&mr->res); + mr->iova = cmd.hca_va; + mr->length = cmd.length; - uobj->object = mr; - - memset(&resp, 0, sizeof resp); - resp.lkey = mr->lkey; - resp.rkey = mr->rkey; - resp.mr_handle = uobj->id; - - ret = uverbs_response(attrs, &resp, sizeof(resp)); - if (ret) - goto err_copy; + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_set_name(&mr->res, NULL); + rdma_restrack_add(&mr->res); + uobj->object = mr; uobj_put_obj_read(pd); + uobj_finalize_uobj_create(uobj, attrs); - return uobj_alloc_commit(uobj); - -err_copy: - ib_dereg_mr(mr); + resp.lkey = mr->lkey; + resp.rkey = mr->rkey; + resp.mr_handle = uobj->id; + return uverbs_response(attrs, &resp, sizeof(resp)); err_put: uobj_put_obj_read(pd); - err_free: - uobj_alloc_abort(uobj); + uobj_alloc_abort(uobj, attrs); return ret; } @@ -773,23 +782,28 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs) { struct ib_uverbs_rereg_mr cmd; struct ib_uverbs_rereg_mr_resp resp; - struct ib_pd *pd = NULL; struct ib_mr *mr; - struct ib_pd *old_pd; int ret; struct ib_uobject *uobj; + struct ib_uobject *new_uobj; + struct ib_device *ib_dev; + struct ib_pd *orig_pd; + struct ib_pd *new_pd; + struct ib_mr *new_mr; ret = uverbs_request(attrs, &cmd, sizeof(cmd)); if (ret) return ret; - if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags) + if (!cmd.flags) return -EINVAL; + if (cmd.flags & ~IB_MR_REREG_SUPPORTED) + return -EOPNOTSUPP; + if ((cmd.flags & IB_MR_REREG_TRANS) && - (!cmd.start || !cmd.hca_va || 0 >= cmd.length || - (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))) - return -EINVAL; + (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) + return -EINVAL; uobj = uobj_get_write(UVERBS_OBJECT_MR, cmd.mr_handle, attrs); if (IS_ERR(uobj)) @@ -803,33 +817,72 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs) } if (cmd.flags & IB_MR_REREG_ACCESS) { - ret = ib_check_mr_access(cmd.access_flags); + ret = ib_check_mr_access(mr->device, cmd.access_flags); if (ret) goto put_uobjs; } + orig_pd = mr->pd; if (cmd.flags & IB_MR_REREG_PD) { - pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, - attrs); - if (!pd) { - ret = -EINVAL; + new_pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, + attrs); + if (IS_ERR(new_pd)) { + ret = PTR_ERR(new_pd); goto put_uobjs; } + } else { + new_pd = mr->pd; } - old_pd = mr->pd; - ret = mr->device->ops.rereg_user_mr(mr, cmd.flags, cmd.start, - cmd.length, cmd.hca_va, - cmd.access_flags, pd, - &attrs->driver_udata); - if (!ret) { + /* + * The driver might create a new HW object as part of the rereg, we need + * to have a uobject ready to hold it. 
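The reg_mr hunks above convert the lookup failures to PTR_ERR() values but keep the usual goto-label unwind: each failure jumps to a label that releases only what was already acquired, in reverse order. The pattern reduced to two resources; setup_two(), res_a and res_b are invented:

#include <linux/slab.h>
#include <linux/errno.h>

struct res_a { int x; };
struct res_b { int y; };

static int setup_two(struct res_a **pa, struct res_b **pb)
{
        struct res_a *a;
        struct res_b *b;
        int ret;

        a = kzalloc(sizeof(*a), GFP_KERNEL);
        if (!a)
                return -ENOMEM;

        b = kzalloc(sizeof(*b), GFP_KERNEL);
        if (!b) {
                ret = -ENOMEM;
                goto err_free_a;        /* undo only what was already done */
        }

        *pa = a;
        *pb = b;
        return 0;

err_free_a:
        kfree(a);
        return ret;
}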
+ */ + new_uobj = uobj_alloc(UVERBS_OBJECT_MR, attrs, &ib_dev); + if (IS_ERR(new_uobj)) { + ret = PTR_ERR(new_uobj); + goto put_uobj_pd; + } + + new_mr = ib_dev->ops.rereg_user_mr(mr, cmd.flags, cmd.start, cmd.length, + cmd.hca_va, cmd.access_flags, new_pd, + &attrs->driver_udata); + if (IS_ERR(new_mr)) { + ret = PTR_ERR(new_mr); + goto put_new_uobj; + } + if (new_mr) { + new_mr->device = new_pd->device; + new_mr->pd = new_pd; + new_mr->type = IB_MR_TYPE_USER; + new_mr->uobject = uobj; + atomic_inc(&new_pd->usecnt); + new_uobj->object = new_mr; + + rdma_restrack_new(&new_mr->res, RDMA_RESTRACK_MR); + rdma_restrack_set_name(&new_mr->res, NULL); + rdma_restrack_add(&new_mr->res); + + /* + * The new uobj for the new HW object is put into the same spot + * in the IDR and the old uobj & HW object is deleted. + */ + rdma_assign_uobject(uobj, new_uobj, attrs); + rdma_alloc_commit_uobject(new_uobj, attrs); + uobj_put_destroy(uobj); + new_uobj = NULL; + uobj = NULL; + mr = new_mr; + } else { if (cmd.flags & IB_MR_REREG_PD) { - atomic_inc(&pd->usecnt); - mr->pd = pd; - atomic_dec(&old_pd->usecnt); + atomic_dec(&orig_pd->usecnt); + mr->pd = new_pd; + atomic_inc(&new_pd->usecnt); + } + if (cmd.flags & IB_MR_REREG_TRANS) { + mr->iova = cmd.hca_va; + mr->length = cmd.length; } - } else { - goto put_uobj_pd; } memset(&resp, 0, sizeof(resp)); @@ -838,12 +891,16 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs) ret = uverbs_response(attrs, &resp, sizeof(resp)); +put_new_uobj: + if (new_uobj) + uobj_alloc_abort(new_uobj, attrs); put_uobj_pd: if (cmd.flags & IB_MR_REREG_PD) - uobj_put_obj_read(pd); + uobj_put_obj_read(new_pd); put_uobjs: - uobj_put_write(uobj); + if (uobj) + uobj_put_write(uobj); return ret; } @@ -863,7 +920,7 @@ static int ib_uverbs_dereg_mr(struct uverbs_attr_bundle *attrs) static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs) { struct ib_uverbs_alloc_mw cmd; - struct ib_uverbs_alloc_mw_resp resp; + struct ib_uverbs_alloc_mw_resp resp = {}; struct ib_uobject *uobj; struct ib_pd *pd; struct ib_mw *mw; @@ -879,41 +936,47 @@ static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs) return PTR_ERR(uobj); pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!pd) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err_free; } - mw = pd->device->ops.alloc_mw(pd, cmd.mw_type, &attrs->driver_udata); - if (IS_ERR(mw)) { - ret = PTR_ERR(mw); + if (cmd.mw_type != IB_MW_TYPE_1 && cmd.mw_type != IB_MW_TYPE_2) { + ret = -EINVAL; goto err_put; } - mw->device = pd->device; - mw->pd = pd; + mw = rdma_zalloc_drv_obj(ib_dev, ib_mw); + if (!mw) { + ret = -ENOMEM; + goto err_put; + } + + mw->device = ib_dev; + mw->pd = pd; mw->uobject = uobj; + mw->type = cmd.mw_type; + + ret = pd->device->ops.alloc_mw(mw, &attrs->driver_udata); + if (ret) + goto err_alloc; + atomic_inc(&pd->usecnt); uobj->object = mw; + uobj_put_obj_read(pd); + uobj_finalize_uobj_create(uobj, attrs); - memset(&resp, 0, sizeof(resp)); - resp.rkey = mw->rkey; + resp.rkey = mw->rkey; resp.mw_handle = uobj->id; + return uverbs_response(attrs, &resp, sizeof(resp)); - ret = uverbs_response(attrs, &resp, sizeof(resp)); - if (ret) - goto err_copy; - - uobj_put_obj_read(pd); - return uobj_alloc_commit(uobj); - -err_copy: - uverbs_dealloc_mw(mw); +err_alloc: + kfree(mw); err_put: uobj_put_obj_read(pd); err_free: - uobj_alloc_abort(uobj); + uobj_alloc_abort(uobj, attrs); return ret; } @@ -946,39 +1009,33 @@ static int ib_uverbs_create_comp_channel(struct uverbs_attr_bundle *attrs) if 
(IS_ERR(uobj)) return PTR_ERR(uobj); - resp.fd = uobj->id; - ev_file = container_of(uobj, struct ib_uverbs_completion_event_file, uobj); ib_uverbs_init_event_queue(&ev_file->ev_queue); + uobj_finalize_uobj_create(uobj, attrs); - ret = uverbs_response(attrs, &resp, sizeof(resp)); - if (ret) { - uobj_alloc_abort(uobj); - return ret; - } - - return uobj_alloc_commit(uobj); + resp.fd = uobj->id; + return uverbs_response(attrs, &resp, sizeof(resp)); } -static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, - struct ib_uverbs_ex_create_cq *cmd) +static int create_cq(struct uverbs_attr_bundle *attrs, + struct ib_uverbs_ex_create_cq *cmd) { struct ib_ucq_object *obj; struct ib_uverbs_completion_event_file *ev_file = NULL; struct ib_cq *cq; int ret; - struct ib_uverbs_ex_create_cq_resp resp; + struct ib_uverbs_ex_create_cq_resp resp = {}; struct ib_cq_init_attr attr = {}; struct ib_device *ib_dev; if (cmd->comp_vector >= attrs->ufile->device->num_comp_vectors) - return ERR_PTR(-EINVAL); + return -EINVAL; obj = (struct ib_ucq_object *)uobj_alloc(UVERBS_OBJECT_CQ, attrs, &ib_dev); if (IS_ERR(obj)) - return obj; + return PTR_ERR(obj); if (cmd->comp_channel >= 0) { ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel, attrs); @@ -988,66 +1045,60 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, } } - obj->uobject.user_handle = cmd->user_handle; - obj->comp_events_reported = 0; - obj->async_events_reported = 0; + obj->uevent.uobject.user_handle = cmd->user_handle; INIT_LIST_HEAD(&obj->comp_list); - INIT_LIST_HEAD(&obj->async_list); + INIT_LIST_HEAD(&obj->uevent.event_list); attr.cqe = cmd->cqe; attr.comp_vector = cmd->comp_vector; attr.flags = cmd->flags; - cq = ib_dev->ops.create_cq(ib_dev, &attr, obj->uobject.context, - &attrs->driver_udata); - if (IS_ERR(cq)) { - ret = PTR_ERR(cq); + cq = rdma_zalloc_drv_obj(ib_dev, ib_cq); + if (!cq) { + ret = -ENOMEM; goto err_file; } - cq->device = ib_dev; - cq->uobject = &obj->uobject; + cq->uobject = obj; cq->comp_handler = ib_uverbs_comp_handler; cq->event_handler = ib_uverbs_cq_event_handler; cq->cq_context = ev_file ? 
&ev_file->ev_queue : NULL; atomic_set(&cq->usecnt, 0); - obj->uobject.object = cq; - memset(&resp, 0, sizeof resp); - resp.base.cq_handle = obj->uobject.id; - resp.base.cqe = cq->cqe; - resp.response_length = uverbs_response_length(attrs, sizeof(resp)); - - cq->res.type = RDMA_RESTRACK_CQ; - rdma_restrack_uadd(&cq->res); + rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); + rdma_restrack_set_name(&cq->res, NULL); - ret = uverbs_response(attrs, &resp, sizeof(resp)); + ret = ib_dev->ops.create_cq(cq, &attr, attrs); if (ret) - goto err_cb; + goto err_free; + rdma_restrack_add(&cq->res); - ret = uobj_alloc_commit(&obj->uobject); - if (ret) - return ERR_PTR(ret); - return obj; + obj->uevent.uobject.object = cq; + obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); + if (obj->uevent.event_file) + uverbs_uobject_get(&obj->uevent.event_file->uobj); + uobj_finalize_uobj_create(&obj->uevent.uobject, attrs); -err_cb: - ib_destroy_cq(cq); + resp.base.cq_handle = obj->uevent.uobject.id; + resp.base.cqe = cq->cqe; + resp.response_length = uverbs_response_length(attrs, sizeof(resp)); + return uverbs_response(attrs, &resp, sizeof(resp)); +err_free: + rdma_restrack_put(&cq->res); + kfree(cq); err_file: if (ev_file) - ib_uverbs_release_ucq(attrs->ufile, ev_file, obj); - + ib_uverbs_release_ucq(ev_file, obj); err: - uobj_alloc_abort(&obj->uobject); - - return ERR_PTR(ret); + uobj_alloc_abort(&obj->uevent.uobject, attrs); + return ret; } static int ib_uverbs_create_cq(struct uverbs_attr_bundle *attrs) { struct ib_uverbs_create_cq cmd; struct ib_uverbs_ex_create_cq cmd_ex; - struct ib_ucq_object *obj; int ret; ret = uverbs_request(attrs, &cmd, sizeof(cmd)); @@ -1060,14 +1111,12 @@ static int ib_uverbs_create_cq(struct uverbs_attr_bundle *attrs) cmd_ex.comp_vector = cmd.comp_vector; cmd_ex.comp_channel = cmd.comp_channel; - obj = create_cq(attrs, &cmd_ex); - return PTR_ERR_OR_ZERO(obj); + return create_cq(attrs, &cmd_ex); } static int ib_uverbs_ex_create_cq(struct uverbs_attr_bundle *attrs) { struct ib_uverbs_ex_create_cq cmd; - struct ib_ucq_object *obj; int ret; ret = uverbs_request(attrs, &cmd, sizeof(cmd)); @@ -1080,8 +1129,7 @@ static int ib_uverbs_ex_create_cq(struct uverbs_attr_bundle *attrs) if (cmd.reserved) return -EINVAL; - obj = create_cq(attrs, &cmd); - return PTR_ERR_OR_ZERO(obj); + return create_cq(attrs, &cmd); } static int ib_uverbs_resize_cq(struct uverbs_attr_bundle *attrs) @@ -1089,15 +1137,15 @@ static int ib_uverbs_resize_cq(struct uverbs_attr_bundle *attrs) struct ib_uverbs_resize_cq cmd; struct ib_uverbs_resize_cq_resp resp = {}; struct ib_cq *cq; - int ret = -EINVAL; + int ret; ret = uverbs_request(attrs, &cmd, sizeof(cmd)); if (ret) return ret; cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) - return -EINVAL; + if (IS_ERR(cq)) + return PTR_ERR(cq); ret = cq->device->ops.resize_cq(cq, cmd.cqe, &attrs->driver_udata); if (ret) @@ -1107,7 +1155,8 @@ static int ib_uverbs_resize_cq(struct uverbs_attr_bundle *attrs) ret = uverbs_response(attrs, &resp, sizeof(resp)); out: - uobj_put_obj_read(cq); + rdma_lookup_put_uobject(&cq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); return ret; } @@ -1157,8 +1206,8 @@ static int ib_uverbs_poll_cq(struct uverbs_attr_bundle *attrs) return ret; cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) - return -EINVAL; + if (IS_ERR(cq)) + return PTR_ERR(cq); /* we copy a struct ib_uverbs_poll_cq_resp to user space */ header_ptr = attrs->ucore.outbuf; @@ -1184,14 +1233,14 @@ static int 
ib_uverbs_poll_cq(struct uverbs_attr_bundle *attrs) ret = -EFAULT; goto out_put; } + ret = 0; if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CORE_OUT)) ret = uverbs_output_written(attrs, UVERBS_ATTR_CORE_OUT); - ret = 0; - out_put: - uobj_put_obj_read(cq); + rdma_lookup_put_uobject(&cq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); return ret; } @@ -1206,14 +1255,14 @@ static int ib_uverbs_req_notify_cq(struct uverbs_attr_bundle *attrs) return ret; cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) - return -EINVAL; + if (IS_ERR(cq)) + return PTR_ERR(cq); ib_req_notify_cq(cq, cmd.solicited_only ? IB_CQ_SOLICITED : IB_CQ_NEXT_COMP); - uobj_put_obj_read(cq); - + rdma_lookup_put_uobject(&cq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); return 0; } @@ -1233,10 +1282,10 @@ static int ib_uverbs_destroy_cq(struct uverbs_attr_bundle *attrs) if (IS_ERR(uobj)) return PTR_ERR(uobj); - obj = container_of(uobj, struct ib_ucq_object, uobject); + obj = container_of(uobj, struct ib_ucq_object, uevent.uobject); memset(&resp, 0, sizeof(resp)); resp.comp_events_reported = obj->comp_events_reported; - resp.async_events_reported = obj->async_events_reported; + resp.async_events_reported = obj->uevent.events_reported; uobj_put_destroy(uobj); @@ -1255,14 +1304,27 @@ static int create_qp(struct uverbs_attr_bundle *attrs, struct ib_srq *srq = NULL; struct ib_qp *qp; struct ib_qp_init_attr attr = {}; - struct ib_uverbs_ex_create_qp_resp resp; + struct ib_uverbs_ex_create_qp_resp resp = {}; int ret; struct ib_rwq_ind_table *ind_tbl = NULL; bool has_sq = true; struct ib_device *ib_dev; - if (cmd->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) - return -EPERM; + switch (cmd->qp_type) { + case IB_QPT_RAW_PACKET: + if (!rdma_uattrs_has_raw_cap(attrs)) + return -EPERM; + fallthrough; + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: + case IB_QPT_DRIVER: + break; + default: + return -EINVAL; + } obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, attrs, &ib_dev); @@ -1276,8 +1338,8 @@ static int create_qp(struct uverbs_attr_bundle *attrs, ind_tbl = uobj_get_obj_read(rwq_ind_table, UVERBS_OBJECT_RWQ_IND_TBL, cmd->rwq_ind_tbl_handle, attrs); - if (!ind_tbl) { - ret = -EINVAL; + if (IS_ERR(ind_tbl)) { + ret = PTR_ERR(ind_tbl); goto err_put; } @@ -1315,8 +1377,10 @@ static int create_qp(struct uverbs_attr_bundle *attrs, if (cmd->is_srq) { srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd->srq_handle, attrs); - if (!srq || srq->srq_type == IB_SRQT_XRC) { - ret = -EINVAL; + if (IS_ERR(srq) || + srq->srq_type == IB_SRQT_XRC) { + ret = IS_ERR(srq) ? 
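create_qp() above whitelists the QP types in a switch and uses the fallthrough keyword after the raw-packet capability check rather than an implicit fall-through. The same shape with made-up type numbers:

#include <linux/compiler.h>
#include <linux/errno.h>

static int check_type(int type, bool privileged)
{
        switch (type) {
        case 3:                         /* a privileged type, e.g. raw packet */
                if (!privileged)
                        return -EPERM;
                fallthrough;            /* otherwise treat it like the plain types */
        case 0:
        case 1:
        case 2:
                return 0;
        default:
                return -EINVAL;         /* reject anything unknown up front */
        }
}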
PTR_ERR(srq) : + -EINVAL; goto err_put; } } @@ -1326,23 +1390,29 @@ static int create_qp(struct uverbs_attr_bundle *attrs, rcq = uobj_get_obj_read( cq, UVERBS_OBJECT_CQ, cmd->recv_cq_handle, attrs); - if (!rcq) { - ret = -EINVAL; + if (IS_ERR(rcq)) { + ret = PTR_ERR(rcq); goto err_put; } } } } - if (has_sq) + if (has_sq) { scq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd->send_cq_handle, attrs); - if (!ind_tbl) + if (IS_ERR(scq)) { + ret = PTR_ERR(scq); + goto err_put; + } + } + + if (!ind_tbl && cmd->qp_type != IB_QPT_XRC_INI) rcq = rcq ?: scq; pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, attrs); - if (!pd || (!scq && has_sq)) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err_put; } @@ -1350,7 +1420,6 @@ static int create_qp(struct uverbs_attr_bundle *attrs, } attr.event_handler = ib_uverbs_qp_event_handler; - attr.qp_context = attrs->ufile; attr.send_cq = scq; attr.recv_cq = rcq; attr.srq = srq; @@ -1358,7 +1427,6 @@ static int create_qp(struct uverbs_attr_bundle *attrs, attr.sq_sig_type = cmd->sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; attr.qp_type = cmd->qp_type; - attr.create_flags = 0; attr.cap.max_send_wr = cmd->max_send_wr; attr.cap.max_recv_wr = cmd->max_recv_wr; @@ -1366,7 +1434,6 @@ static int create_qp(struct uverbs_attr_bundle *attrs, attr.cap.max_recv_sge = cmd->max_recv_sge; attr.cap.max_inline_data = cmd->max_inline_data; - obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); INIT_LIST_HEAD(&obj->mcast_list); @@ -1384,7 +1451,7 @@ static int create_qp(struct uverbs_attr_bundle *attrs, } if (attr.create_flags & IB_QP_CREATE_SOURCE_QPN) { - if (!capable(CAP_NET_RAW)) { + if (!rdma_uattrs_has_raw_cap(attrs)) { ret = -EPERM; goto err_put; } @@ -1392,62 +1459,18 @@ static int create_qp(struct uverbs_attr_bundle *attrs, attr.source_qpn = cmd->source_qpn; } - if (cmd->qp_type == IB_QPT_XRC_TGT) - qp = ib_create_qp(pd, &attr); - else - qp = _ib_create_qp(device, pd, &attr, &attrs->driver_udata, - &obj->uevent.uobject); - + qp = ib_create_qp_user(device, pd, &attr, &attrs->driver_udata, obj, + KBUILD_MODNAME); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto err_put; } - - if (cmd->qp_type != IB_QPT_XRC_TGT) { - ret = ib_create_qp_security(qp, device); - if (ret) - goto err_cb; - - qp->real_qp = qp; - qp->pd = pd; - qp->send_cq = attr.send_cq; - qp->recv_cq = attr.recv_cq; - qp->srq = attr.srq; - qp->rwq_ind_tbl = ind_tbl; - qp->event_handler = attr.event_handler; - qp->qp_context = attr.qp_context; - qp->qp_type = attr.qp_type; - atomic_set(&qp->usecnt, 0); - atomic_inc(&pd->usecnt); - qp->port = 0; - if (attr.send_cq) - atomic_inc(&attr.send_cq->usecnt); - if (attr.recv_cq) - atomic_inc(&attr.recv_cq->usecnt); - if (attr.srq) - atomic_inc(&attr.srq->usecnt); - if (ind_tbl) - atomic_inc(&ind_tbl->usecnt); - } else { - /* It is done in _ib_create_qp for other QP types */ - qp->uobject = &obj->uevent.uobject; - } + ib_qp_usecnt_inc(qp); obj->uevent.uobject.object = qp; - - memset(&resp, 0, sizeof resp); - resp.base.qpn = qp->qp_num; - resp.base.qp_handle = obj->uevent.uobject.id; - resp.base.max_recv_sge = attr.cap.max_recv_sge; - resp.base.max_send_sge = attr.cap.max_send_sge; - resp.base.max_recv_wr = attr.cap.max_recv_wr; - resp.base.max_send_wr = attr.cap.max_send_wr; - resp.base.max_inline_data = attr.cap.max_inline_data; - resp.response_length = uverbs_response_length(attrs, sizeof(resp)); - - ret = uverbs_response(attrs, &resp, sizeof(resp)); - if (ret) - goto err_cb; + obj->uevent.event_file = 
READ_ONCE(attrs->ufile->default_async_file); + if (obj->uevent.event_file) + uverbs_uobject_get(&obj->uevent.event_file->uobj); if (xrcd) { obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, @@ -1459,33 +1482,46 @@ static int create_qp(struct uverbs_attr_bundle *attrs, if (pd) uobj_put_obj_read(pd); if (scq) - uobj_put_obj_read(scq); + rdma_lookup_put_uobject(&scq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); if (rcq && rcq != scq) - uobj_put_obj_read(rcq); + rdma_lookup_put_uobject(&rcq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); if (srq) - uobj_put_obj_read(srq); + rdma_lookup_put_uobject(&srq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); if (ind_tbl) uobj_put_obj_read(ind_tbl); + uobj_finalize_uobj_create(&obj->uevent.uobject, attrs); - return uobj_alloc_commit(&obj->uevent.uobject); -err_cb: - ib_destroy_qp(qp); + resp.base.qpn = qp->qp_num; + resp.base.qp_handle = obj->uevent.uobject.id; + resp.base.max_recv_sge = attr.cap.max_recv_sge; + resp.base.max_send_sge = attr.cap.max_send_sge; + resp.base.max_recv_wr = attr.cap.max_recv_wr; + resp.base.max_send_wr = attr.cap.max_send_wr; + resp.base.max_inline_data = attr.cap.max_inline_data; + resp.response_length = uverbs_response_length(attrs, sizeof(resp)); + return uverbs_response(attrs, &resp, sizeof(resp)); err_put: if (!IS_ERR(xrcd_uobj)) uobj_put_read(xrcd_uobj); - if (pd) + if (!IS_ERR_OR_NULL(pd)) uobj_put_obj_read(pd); - if (scq) - uobj_put_obj_read(scq); - if (rcq && rcq != scq) - uobj_put_obj_read(rcq); - if (srq) - uobj_put_obj_read(srq); - if (ind_tbl) + if (!IS_ERR_OR_NULL(scq)) + rdma_lookup_put_uobject(&scq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + if (!IS_ERR_OR_NULL(rcq) && rcq != scq) + rdma_lookup_put_uobject(&rcq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + if (!IS_ERR_OR_NULL(srq)) + rdma_lookup_put_uobject(&srq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + if (!IS_ERR_OR_NULL(ind_tbl)) uobj_put_obj_read(ind_tbl); - uobj_alloc_abort(&obj->uevent.uobject); + uobj_alloc_abort(&obj->uevent.uobject, attrs); return ret; } @@ -1537,14 +1573,14 @@ static int ib_uverbs_ex_create_qp(struct uverbs_attr_bundle *attrs) static int ib_uverbs_open_qp(struct uverbs_attr_bundle *attrs) { + struct ib_uverbs_create_qp_resp resp = {}; struct ib_uverbs_open_qp cmd; - struct ib_uverbs_create_qp_resp resp; struct ib_uqp_object *obj; struct ib_xrcd *xrcd; - struct ib_uobject *uninitialized_var(xrcd_uobj); struct ib_qp *qp; - struct ib_qp_open_attr attr; + struct ib_qp_open_attr attr = {}; int ret; + struct ib_uobject *xrcd_uobj; struct ib_device *ib_dev; ret = uverbs_request(attrs, &cmd, sizeof(cmd)); @@ -1569,11 +1605,9 @@ static int ib_uverbs_open_qp(struct uverbs_attr_bundle *attrs) } attr.event_handler = ib_uverbs_qp_event_handler; - attr.qp_context = attrs->ufile; attr.qp_num = cmd.qpn; attr.qp_type = cmd.qp_type; - obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); INIT_LIST_HEAD(&obj->mcast_list); @@ -1586,27 +1620,20 @@ static int ib_uverbs_open_qp(struct uverbs_attr_bundle *attrs) obj->uevent.uobject.object = qp; obj->uevent.uobject.user_handle = cmd.user_handle; - memset(&resp, 0, sizeof resp); - resp.qpn = qp->qp_num; - resp.qp_handle = obj->uevent.uobject.id; - - ret = uverbs_response(attrs, &resp, sizeof(resp)); - if (ret) - goto err_destroy; - obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); atomic_inc(&obj->uxrcd->refcnt); - qp->uobject = &obj->uevent.uobject; + qp->uobject = obj; uobj_put_read(xrcd_uobj); + 
uobj_finalize_uobj_create(&obj->uevent.uobject, attrs); - return uobj_alloc_commit(&obj->uevent.uobject); + resp.qpn = qp->qp_num; + resp.qp_handle = obj->uevent.uobject.id; + return uverbs_response(attrs, &resp, sizeof(resp)); -err_destroy: - ib_destroy_qp(qp); err_xrcd: uobj_put_read(xrcd_uobj); err_put: - uobj_alloc_abort(&obj->uevent.uobject); + uobj_alloc_abort(&obj->uevent.uobject, attrs); return ret; } @@ -1653,14 +1680,15 @@ static int ib_uverbs_query_qp(struct uverbs_attr_bundle *attrs) } qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) { - ret = -EINVAL; + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); goto out; } ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr); - uobj_put_obj_read(qp); + rdma_lookup_put_uobject(&qp->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); if (ret) goto out; @@ -1758,8 +1786,8 @@ static int modify_qp(struct uverbs_attr_bundle *attrs, qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd->base.qp_handle, attrs); - if (!qp) { - ret = -EINVAL; + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); goto out; } @@ -1847,8 +1875,15 @@ static int modify_qp(struct uverbs_attr_bundle *attrs, attr->path_mtu = cmd->base.path_mtu; if (cmd->base.attr_mask & IB_QP_PATH_MIG_STATE) attr->path_mig_state = cmd->base.path_mig_state; - if (cmd->base.attr_mask & IB_QP_QKEY) + if (cmd->base.attr_mask & IB_QP_QKEY) { + if (cmd->base.qkey & IB_QP_SET_QKEY && + !(rdma_nl_get_privileged_qkey() || + rdma_uattrs_has_raw_cap(attrs))) { + ret = -EPERM; + goto release_qp; + } attr->qkey = cmd->base.qkey; + } if (cmd->base.attr_mask & IB_QP_RQ_PSN) attr->rq_psn = cmd->base.rq_psn; if (cmd->base.attr_mask & IB_QP_SQ_PSN) @@ -1897,7 +1932,8 @@ static int modify_qp(struct uverbs_attr_bundle *attrs, &attrs->driver_udata); release_qp: - uobj_put_obj_read(qp); + rdma_lookup_put_uobject(&qp->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); out: kfree(attr); @@ -1913,8 +1949,7 @@ static int ib_uverbs_modify_qp(struct uverbs_attr_bundle *attrs) if (ret) return ret; - if (cmd.base.attr_mask & - ~((IB_USER_LEGACY_LAST_QP_ATTR_MASK << 1) - 1)) + if (cmd.base.attr_mask & ~IB_QP_ATTR_STANDARD_BITS) return -EOPNOTSUPP; return modify_qp(attrs, &cmd); @@ -1936,10 +1971,7 @@ static int ib_uverbs_ex_modify_qp(struct uverbs_attr_bundle *attrs) * Last bit is reserved for extending the attr_mask by * using another field. 
*/ - BUILD_BUG_ON(IB_USER_LAST_QP_ATTR_MASK == (1 << 31)); - - if (cmd.base.attr_mask & - ~((IB_USER_LAST_QP_ATTR_MASK << 1) - 1)) + if (cmd.base.attr_mask & ~(IB_QP_ATTR_STANDARD_BITS | IB_QP_RATE_LIMIT)) return -EOPNOTSUPP; ret = modify_qp(attrs, &cmd); @@ -1976,12 +2008,13 @@ static int ib_uverbs_destroy_qp(struct uverbs_attr_bundle *attrs) static void *alloc_wr(size_t wr_size, __u32 num_sge) { - if (num_sge >= (U32_MAX - ALIGN(wr_size, sizeof (struct ib_sge))) / - sizeof (struct ib_sge)) + if (num_sge >= (U32_MAX - ALIGN(wr_size, sizeof(struct ib_sge))) / + sizeof(struct ib_sge)) return NULL; - return kmalloc(ALIGN(wr_size, sizeof (struct ib_sge)) + - num_sge * sizeof (struct ib_sge), GFP_KERNEL); + return kmalloc(ALIGN(wr_size, sizeof(struct ib_sge)) + + num_sge * sizeof(struct ib_sge), + GFP_KERNEL); } static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) @@ -2003,11 +2036,13 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd)); if (ret) return ret; - wqes = uverbs_request_next_ptr(&iter, cmd.wqe_size * cmd.wr_count); + wqes = uverbs_request_next_ptr(&iter, size_mul(cmd.wqe_size, + cmd.wr_count)); if (IS_ERR(wqes)) return PTR_ERR(wqes); - sgls = uverbs_request_next_ptr( - &iter, cmd.sge_count * sizeof(struct ib_uverbs_sge)); + sgls = uverbs_request_next_ptr(&iter, + size_mul(cmd.sge_count, + sizeof(struct ib_uverbs_sge))); if (IS_ERR(sgls)) return PTR_ERR(sgls); ret = uverbs_request_finish(&iter); @@ -2019,8 +2054,8 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) return -ENOMEM; qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) { - ret = -EINVAL; + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); goto out; } @@ -2057,9 +2092,9 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) ud->ah = uobj_get_obj_read(ah, UVERBS_OBJECT_AH, user_wr->wr.ud.ah, attrs); - if (!ud->ah) { + if (IS_ERR(ud->ah)) { + ret = PTR_ERR(ud->ah); kfree(ud); - ret = -EINVAL; goto out_put; } ud->remote_qpn = user_wr->wr.ud.remote_qpn; @@ -2161,7 +2196,8 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) ret = ret2; out_put: - uobj_put_obj_read(qp); + rdma_lookup_put_uobject(&qp->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); while (wr) { if (is_ud && ud_wr(wr)->ah) @@ -2189,14 +2225,14 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count, const struct ib_sge __user *sgls; const void __user *wqes; - if (wqe_size < sizeof (struct ib_uverbs_recv_wr)) + if (wqe_size < sizeof(struct ib_uverbs_recv_wr)) return ERR_PTR(-EINVAL); - wqes = uverbs_request_next_ptr(iter, wqe_size * wr_count); + wqes = uverbs_request_next_ptr(iter, size_mul(wqe_size, wr_count)); if (IS_ERR(wqes)) return ERR_CAST(wqes); - sgls = uverbs_request_next_ptr( - iter, sge_count * sizeof(struct ib_uverbs_sge)); + sgls = uverbs_request_next_ptr(iter, size_mul(sge_count, + sizeof(struct ib_uverbs_sge))); if (IS_ERR(sgls)) return ERR_CAST(sgls); ret = uverbs_request_finish(iter); @@ -2222,14 +2258,14 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count, } if (user_wr->num_sge >= - (U32_MAX - ALIGN(sizeof *next, sizeof (struct ib_sge))) / - sizeof (struct ib_sge)) { + (U32_MAX - ALIGN(sizeof(*next), sizeof(struct ib_sge))) / + sizeof(struct ib_sge)) { ret = -EINVAL; goto err; } - next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) + - user_wr->num_sge * sizeof (struct ib_sge), + next = kmalloc(ALIGN(sizeof(*next), sizeof(struct ib_sge)) + + 
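ib_uverbs_post_send() above sizes the user buffers with size_mul(), which saturates to SIZE_MAX on overflow, so a wrapped wqe_size * wr_count product can never slip under the length check. A standalone version of that guard; check_request_len() is an invented helper:

#include <linux/overflow.h>
#include <linux/errno.h>
#include <linux/types.h>

/* Validate a user-supplied (count, elem_size) pair against a buffer length. */
static int check_request_len(size_t count, size_t elem, size_t buf_len)
{
        /*
         * size_mul() saturates to SIZE_MAX on overflow, so a wrapped
         * product cannot compare smaller than buf_len.
         */
        if (size_mul(count, elem) > buf_len)
                return -ENOSPC;
        return 0;
}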
user_wr->num_sge * sizeof(struct ib_sge), GFP_KERNEL); if (!next) { ret = -ENOMEM; @@ -2247,8 +2283,8 @@ ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count, next->num_sge = user_wr->num_sge; if (next->num_sge) { - next->sg_list = (void *) next + - ALIGN(sizeof *next, sizeof (struct ib_sge)); + next->sg_list = (void *)next + + ALIGN(sizeof(*next), sizeof(struct ib_sge)); if (copy_from_user(next->sg_list, sgls + sg_ind, next->num_sge * sizeof(struct ib_sge))) { @@ -2295,15 +2331,16 @@ static int ib_uverbs_post_recv(struct uverbs_attr_bundle *attrs) return PTR_ERR(wr); qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) { - ret = -EINVAL; + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); goto out; } resp.bad_wr = 0; ret = qp->device->ops.post_recv(qp->real_qp, wr, &bad_wr); - uobj_put_obj_read(qp); + rdma_lookup_put_uobject(&qp->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); if (ret) { for (next = wr; next; next = next->next) { ++resp.bad_wr; @@ -2345,15 +2382,16 @@ static int ib_uverbs_post_srq_recv(struct uverbs_attr_bundle *attrs) return PTR_ERR(wr); srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs); - if (!srq) { - ret = -EINVAL; + if (IS_ERR(srq)) { + ret = PTR_ERR(srq); goto out; } resp.bad_wr = 0; ret = srq->device->ops.post_srq_recv(srq, wr, &bad_wr); - uobj_put_obj_read(srq); + rdma_lookup_put_uobject(&srq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); if (ret) for (next = wr; next; next = next->next) { @@ -2401,8 +2439,8 @@ static int ib_uverbs_create_ah(struct uverbs_attr_bundle *attrs) } pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!pd) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err; } @@ -2433,24 +2471,16 @@ static int ib_uverbs_create_ah(struct uverbs_attr_bundle *attrs) ah->uobject = uobj; uobj->user_handle = cmd.user_handle; uobj->object = ah; - - resp.ah_handle = uobj->id; - - ret = uverbs_response(attrs, &resp, sizeof(resp)); - if (ret) - goto err_copy; - uobj_put_obj_read(pd); - return uobj_alloc_commit(uobj); + uobj_finalize_uobj_create(uobj, attrs); -err_copy: - rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE); + resp.ah_handle = uobj->id; + return uverbs_response(attrs, &resp, sizeof(resp)); err_put: uobj_put_obj_read(pd); - err: - uobj_alloc_abort(uobj); + uobj_alloc_abort(uobj, attrs); return ret; } @@ -2479,10 +2509,10 @@ static int ib_uverbs_attach_mcast(struct uverbs_attr_bundle *attrs) return ret; qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) - return -EINVAL; + if (IS_ERR(qp)) + return PTR_ERR(qp); - obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject); + obj = qp->uobject; mutex_lock(&obj->mcast_lock); list_for_each_entry(mcast, &obj->mcast_list, list) @@ -2509,7 +2539,8 @@ static int ib_uverbs_attach_mcast(struct uverbs_attr_bundle *attrs) out_put: mutex_unlock(&obj->mcast_lock); - uobj_put_obj_read(qp); + rdma_lookup_put_uobject(&qp->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); return ret; } @@ -2520,7 +2551,7 @@ static int ib_uverbs_detach_mcast(struct uverbs_attr_bundle *attrs) struct ib_uqp_object *obj; struct ib_qp *qp; struct ib_uverbs_mcast_entry *mcast; - int ret = -EINVAL; + int ret; bool found = false; ret = uverbs_request(attrs, &cmd, sizeof(cmd)); @@ -2528,10 +2559,10 @@ static int ib_uverbs_detach_mcast(struct uverbs_attr_bundle *attrs) return ret; qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) - return -EINVAL; + if (IS_ERR(qp)) + return PTR_ERR(qp); - obj = 
container_of(qp->uobject, struct ib_uqp_object, uevent.uobject); + obj = qp->uobject; mutex_lock(&obj->mcast_lock); list_for_each_entry(mcast, &obj->mcast_list, list) @@ -2552,7 +2583,8 @@ static int ib_uverbs_detach_mcast(struct uverbs_attr_bundle *attrs) out_put: mutex_unlock(&obj->mcast_lock); - uobj_put_obj_read(qp); + rdma_lookup_put_uobject(&qp->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); return ret; } @@ -2632,7 +2664,7 @@ void flow_resources_add(struct ib_uflow_resources *uflow_res, } EXPORT_SYMBOL(flow_resources_add); -static int kern_spec_to_ib_spec_action(const struct uverbs_attr_bundle *attrs, +static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs, struct ib_uverbs_flow_spec *kern_spec, union ib_flow_spec *ib_spec, struct ib_uflow_resources *uflow_res) @@ -2662,8 +2694,8 @@ static int kern_spec_to_ib_spec_action(const struct uverbs_attr_bundle *attrs, UVERBS_OBJECT_FLOW_ACTION, kern_spec->action.handle, attrs); - if (!ib_spec->action.act) - return -EINVAL; + if (IS_ERR(ib_spec->action.act)) + return PTR_ERR(ib_spec->action.act); ib_spec->action.size = sizeof(struct ib_flow_spec_action_handle); flow_resources_add(uflow_res, @@ -2680,8 +2712,8 @@ static int kern_spec_to_ib_spec_action(const struct uverbs_attr_bundle *attrs, UVERBS_OBJECT_COUNTERS, kern_spec->flow_count.handle, attrs); - if (!ib_spec->flow_count.counters) - return -EINVAL; + if (IS_ERR(ib_spec->flow_count.counters)) + return PTR_ERR(ib_spec->flow_count.counters); ib_spec->flow_count.size = sizeof(struct ib_flow_spec_action_count); flow_resources_add(uflow_res, @@ -2695,12 +2727,6 @@ static int kern_spec_to_ib_spec_action(const struct uverbs_attr_bundle *attrs, return 0; } -static size_t kern_spec_filter_sz(const struct ib_uverbs_flow_spec_hdr *spec) -{ - /* Returns user space filter size, includes padding */ - return (spec->size - sizeof(struct ib_uverbs_flow_spec_hdr)) / 2; -} - static ssize_t spec_filter_size(const void *kern_spec_filter, u16 kern_filter_size, u16 ib_real_filter_sz) { @@ -2739,7 +2765,7 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) { case IB_FLOW_SPEC_ETH: - ib_filter_sz = offsetof(struct ib_flow_eth_filter, real_sz); + ib_filter_sz = sizeof(struct ib_flow_eth_filter); actual_filter_sz = spec_filter_size(kern_spec_mask, kern_filter_sz, ib_filter_sz); @@ -2750,7 +2776,7 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, memcpy(&ib_spec->eth.mask, kern_spec_mask, actual_filter_sz); break; case IB_FLOW_SPEC_IPV4: - ib_filter_sz = offsetof(struct ib_flow_ipv4_filter, real_sz); + ib_filter_sz = sizeof(struct ib_flow_ipv4_filter); actual_filter_sz = spec_filter_size(kern_spec_mask, kern_filter_sz, ib_filter_sz); @@ -2761,7 +2787,7 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, memcpy(&ib_spec->ipv4.mask, kern_spec_mask, actual_filter_sz); break; case IB_FLOW_SPEC_IPV6: - ib_filter_sz = offsetof(struct ib_flow_ipv6_filter, real_sz); + ib_filter_sz = sizeof(struct ib_flow_ipv6_filter); actual_filter_sz = spec_filter_size(kern_spec_mask, kern_filter_sz, ib_filter_sz); @@ -2777,7 +2803,7 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, break; case IB_FLOW_SPEC_TCP: case IB_FLOW_SPEC_UDP: - ib_filter_sz = offsetof(struct ib_flow_tcp_udp_filter, real_sz); + ib_filter_sz = sizeof(struct ib_flow_tcp_udp_filter); actual_filter_sz = spec_filter_size(kern_spec_mask, kern_filter_sz, ib_filter_sz); @@ -2788,7 +2814,7 @@ int 
ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, memcpy(&ib_spec->tcp_udp.mask, kern_spec_mask, actual_filter_sz); break; case IB_FLOW_SPEC_VXLAN_TUNNEL: - ib_filter_sz = offsetof(struct ib_flow_tunnel_filter, real_sz); + ib_filter_sz = sizeof(struct ib_flow_tunnel_filter); actual_filter_sz = spec_filter_size(kern_spec_mask, kern_filter_sz, ib_filter_sz); @@ -2803,7 +2829,7 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, return -EINVAL; break; case IB_FLOW_SPEC_ESP: - ib_filter_sz = offsetof(struct ib_flow_esp_filter, real_sz); + ib_filter_sz = sizeof(struct ib_flow_esp_filter); actual_filter_sz = spec_filter_size(kern_spec_mask, kern_filter_sz, ib_filter_sz); @@ -2814,7 +2840,7 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, memcpy(&ib_spec->esp.mask, kern_spec_mask, actual_filter_sz); break; case IB_FLOW_SPEC_GRE: - ib_filter_sz = offsetof(struct ib_flow_gre_filter, real_sz); + ib_filter_sz = sizeof(struct ib_flow_gre_filter); actual_filter_sz = spec_filter_size(kern_spec_mask, kern_filter_sz, ib_filter_sz); @@ -2825,7 +2851,7 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, memcpy(&ib_spec->gre.mask, kern_spec_mask, actual_filter_sz); break; case IB_FLOW_SPEC_MPLS: - ib_filter_sz = offsetof(struct ib_flow_mpls_filter, real_sz); + ib_filter_sz = sizeof(struct ib_flow_mpls_filter); actual_filter_sz = spec_filter_size(kern_spec_mask, kern_filter_sz, ib_filter_sz); @@ -2844,11 +2870,16 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, static int kern_spec_to_ib_spec_filter(struct ib_uverbs_flow_spec *kern_spec, union ib_flow_spec *ib_spec) { - ssize_t kern_filter_sz; + size_t kern_filter_sz; void *kern_spec_mask; void *kern_spec_val; - kern_filter_sz = kern_spec_filter_sz(&kern_spec->hdr); + if (check_sub_overflow((size_t)kern_spec->hdr.size, + sizeof(struct ib_uverbs_flow_spec_hdr), + &kern_filter_sz)) + return -EINVAL; + + kern_filter_sz /= 2; kern_spec_val = (void *)kern_spec + sizeof(struct ib_uverbs_flow_spec_hdr); @@ -2900,26 +2931,25 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs) return PTR_ERR(obj); pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!pd) { - err = -EINVAL; + if (IS_ERR(pd)) { + err = PTR_ERR(pd); goto err_uobj; } cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) { - err = -EINVAL; + if (IS_ERR(cq)) { + err = PTR_ERR(cq); goto err_put_pd; } wq_init_attr.cq = cq; wq_init_attr.max_sge = cmd.max_sge; wq_init_attr.max_wr = cmd.max_wr; - wq_init_attr.wq_context = attrs->ufile; wq_init_attr.wq_type = cmd.wq_type; wq_init_attr.event_handler = ib_uverbs_wq_event_handler; wq_init_attr.create_flags = cmd.create_flags; - obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); + obj->uevent.uobject.user_handle = cmd.user_handle; wq = pd->device->ops.create_wq(pd, &wq_init_attr, &attrs->driver_udata); if (IS_ERR(wq)) { @@ -2927,41 +2957,38 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs) goto err_put_cq; } - wq->uobject = &obj->uevent.uobject; + wq->uobject = obj; obj->uevent.uobject.object = wq; wq->wq_type = wq_init_attr.wq_type; wq->cq = cq; wq->pd = pd; wq->device = pd->device; - wq->wq_context = wq_init_attr.wq_context; atomic_set(&wq->usecnt, 0); atomic_inc(&pd->usecnt); atomic_inc(&cq->usecnt); - wq->uobject = &obj->uevent.uobject; - obj->uevent.uobject.object = wq; + obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); + if 
(obj->uevent.event_file) + uverbs_uobject_get(&obj->uevent.event_file->uobj); + + uobj_put_obj_read(pd); + rdma_lookup_put_uobject(&cq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + uobj_finalize_uobj_create(&obj->uevent.uobject, attrs); - memset(&resp, 0, sizeof(resp)); resp.wq_handle = obj->uevent.uobject.id; resp.max_sge = wq_init_attr.max_sge; resp.max_wr = wq_init_attr.max_wr; resp.wqn = wq->wq_num; resp.response_length = uverbs_response_length(attrs, sizeof(resp)); - err = uverbs_response(attrs, &resp, sizeof(resp)); - if (err) - goto err_copy; - - uobj_put_obj_read(pd); - uobj_put_obj_read(cq); - return uobj_alloc_commit(&obj->uevent.uobject); + return uverbs_response(attrs, &resp, sizeof(resp)); -err_copy: - ib_destroy_wq(wq); err_put_cq: - uobj_put_obj_read(cq); + rdma_lookup_put_uobject(&cq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); err_put_pd: uobj_put_obj_read(pd); err_uobj: - uobj_alloc_abort(&obj->uevent.uobject); + uobj_alloc_abort(&obj->uevent.uobject, attrs); return err; } @@ -3012,18 +3039,36 @@ static int ib_uverbs_ex_modify_wq(struct uverbs_attr_bundle *attrs) return -EINVAL; wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, cmd.wq_handle, attrs); - if (!wq) - return -EINVAL; + if (IS_ERR(wq)) + return PTR_ERR(wq); - wq_attr.curr_wq_state = cmd.curr_wq_state; - wq_attr.wq_state = cmd.wq_state; if (cmd.attr_mask & IB_WQ_FLAGS) { wq_attr.flags = cmd.flags; wq_attr.flags_mask = cmd.flags_mask; } + + if (cmd.attr_mask & IB_WQ_CUR_STATE) { + if (cmd.curr_wq_state > IB_WQS_ERR) + return -EINVAL; + + wq_attr.curr_wq_state = cmd.curr_wq_state; + } else { + wq_attr.curr_wq_state = wq->state; + } + + if (cmd.attr_mask & IB_WQ_STATE) { + if (cmd.wq_state > IB_WQS_ERR) + return -EINVAL; + + wq_attr.wq_state = cmd.wq_state; + } else { + wq_attr.wq_state = wq_attr.curr_wq_state; + } + ret = wq->device->ops.modify_wq(wq, &wq_attr, cmd.attr_mask, &attrs->driver_udata); - uobj_put_obj_read(wq); + rdma_lookup_put_uobject(&wq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); return ret; } @@ -3031,14 +3076,14 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs) { struct ib_uverbs_ex_create_rwq_ind_table cmd; struct ib_uverbs_ex_create_rwq_ind_table_resp resp = {}; - struct ib_uobject *uobj; + struct ib_uobject *uobj; int err; struct ib_rwq_ind_table_init_attr init_attr = {}; struct ib_rwq_ind_table *rwq_ind_tbl; - struct ib_wq **wqs = NULL; + struct ib_wq **wqs = NULL; u32 *wqs_handles = NULL; struct ib_wq *wq = NULL; - int i, j, num_read_wqs; + int i, num_read_wqs; u32 num_wq_handles; struct uverbs_req_iter iter; struct ib_device *ib_dev; @@ -3078,12 +3123,13 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs) num_read_wqs++) { wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, wqs_handles[num_read_wqs], attrs); - if (!wq) { - err = -EINVAL; + if (IS_ERR(wq)) { + err = PTR_ERR(wq); goto put_wqs; } wqs[num_read_wqs] = wq; + atomic_inc(&wqs[num_read_wqs]->usecnt); } uobj = uobj_alloc(UVERBS_OBJECT_RWQ_IND_TBL, attrs, &ib_dev); @@ -3092,17 +3138,15 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs) goto put_wqs; } - init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size; - init_attr.ind_tbl = wqs; - - rwq_ind_tbl = ib_dev->ops.create_rwq_ind_table(ib_dev, &init_attr, - &attrs->driver_udata); - - if (IS_ERR(rwq_ind_tbl)) { - err = PTR_ERR(rwq_ind_tbl); + rwq_ind_tbl = rdma_zalloc_drv_obj(ib_dev, ib_rwq_ind_table); + if (!rwq_ind_tbl) { + err = -ENOMEM; goto err_uobj; } + init_attr.log_ind_tbl_size = 
cmd.log_ind_tbl_size; + init_attr.ind_tbl = wqs; + rwq_ind_tbl->ind_tbl = wqs; rwq_ind_tbl->log_ind_tbl_size = init_attr.log_ind_tbl_size; rwq_ind_tbl->uobject = uobj; @@ -3110,31 +3154,32 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs) rwq_ind_tbl->device = ib_dev; atomic_set(&rwq_ind_tbl->usecnt, 0); + err = ib_dev->ops.create_rwq_ind_table(rwq_ind_tbl, &init_attr, + &attrs->driver_udata); + if (err) + goto err_create; + for (i = 0; i < num_wq_handles; i++) - atomic_inc(&wqs[i]->usecnt); + rdma_lookup_put_uobject(&wqs[i]->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + kfree(wqs_handles); + uobj_finalize_uobj_create(uobj, attrs); resp.ind_tbl_handle = uobj->id; resp.ind_tbl_num = rwq_ind_tbl->ind_tbl_num; resp.response_length = uverbs_response_length(attrs, sizeof(resp)); + return uverbs_response(attrs, &resp, sizeof(resp)); - err = uverbs_response(attrs, &resp, sizeof(resp)); - if (err) - goto err_copy; - - kfree(wqs_handles); - - for (j = 0; j < num_read_wqs; j++) - uobj_put_obj_read(wqs[j]); - - return uobj_alloc_commit(uobj); - -err_copy: - ib_destroy_rwq_ind_table(rwq_ind_tbl); +err_create: + kfree(rwq_ind_tbl); err_uobj: - uobj_alloc_abort(uobj); + uobj_alloc_abort(uobj, attrs); put_wqs: - for (j = 0; j < num_read_wqs; j++) - uobj_put_obj_read(wqs[j]); + for (i = 0; i < num_read_wqs; i++) { + rdma_lookup_put_uobject(&wqs[i]->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + atomic_dec(&wqs[i]->usecnt); + } err_free: kfree(wqs_handles); kfree(wqs); @@ -3160,7 +3205,7 @@ static int ib_uverbs_ex_destroy_rwq_ind_table(struct uverbs_attr_bundle *attrs) static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs) { struct ib_uverbs_create_flow cmd; - struct ib_uverbs_create_flow_resp resp; + struct ib_uverbs_create_flow_resp resp = {}; struct ib_uobject *uobj; struct ib_flow *flow_id; struct ib_uverbs_flow_attr *kern_flow_attr; @@ -3181,7 +3226,7 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs) if (cmd.comp_mask) return -EINVAL; - if (!capable(CAP_NET_RAW)) + if (!rdma_uattrs_has_raw_cap(attrs)) return -EPERM; if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED) @@ -3228,12 +3273,17 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs) goto err_free_attr; } - qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) { + if (!rdma_is_port_valid(uobj->context->device, cmd.flow_attr.port)) { err = -EINVAL; goto err_uobj; } + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); + if (IS_ERR(qp)) { + err = PTR_ERR(qp); + goto err_uobj; + } + if (qp->qp_type != IB_QPT_UD && qp->qp_type != IB_QPT_RAW_PACKET) { err = -EINVAL; goto err_put; @@ -3277,14 +3327,14 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs) ib_spec += ((union ib_flow_spec *) ib_spec)->size; } if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { - pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n", + pr_warn("create flow failed, flow %d: %u bytes left from uverb cmd\n", i, cmd.flow_attr.size); err = -EINVAL; goto err_free; } - flow_id = qp->device->ops.create_flow( - qp, flow_attr, IB_FLOW_DOMAIN_USER, &attrs->driver_udata); + flow_id = qp->device->ops.create_flow(qp, flow_attr, + &attrs->driver_udata); if (IS_ERR(flow_id)) { err = PTR_ERR(flow_id); @@ -3293,29 +3343,26 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs) ib_set_flow(uobj, flow_id, qp, qp->device, uflow_res); - memset(&resp, 0, sizeof(resp)); - resp.flow_handle = 
uobj->id; - - err = uverbs_response(attrs, &resp, sizeof(resp)); - if (err) - goto err_copy; - - uobj_put_obj_read(qp); + rdma_lookup_put_uobject(&qp->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); kfree(flow_attr); + if (cmd.flow_attr.num_of_specs) kfree(kern_flow_attr); - return uobj_alloc_commit(uobj); -err_copy: - if (!qp->device->ops.destroy_flow(flow_id)) - atomic_dec(&qp->usecnt); + uobj_finalize_uobj_create(uobj, attrs); + + resp.flow_handle = uobj->id; + return uverbs_response(attrs, &resp, sizeof(resp)); + err_free: ib_uverbs_flow_resources_free(uflow_res); err_free_flow_attr: kfree(flow_attr); err_put: - uobj_put_obj_read(qp); + rdma_lookup_put_uobject(&qp->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); err_uobj: - uobj_alloc_abort(uobj); + uobj_alloc_abort(uobj, attrs); err_free_attr: if (cmd.flow_attr.num_of_specs) kfree(kern_flow_attr); @@ -3341,13 +3388,13 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs, struct ib_uverbs_create_xsrq *cmd, struct ib_udata *udata) { - struct ib_uverbs_create_srq_resp resp; + struct ib_uverbs_create_srq_resp resp = {}; struct ib_usrq_object *obj; struct ib_pd *pd; struct ib_srq *srq; - struct ib_uobject *uninitialized_var(xrcd_uobj); struct ib_srq_init_attr attr; int ret; + struct ib_uobject *xrcd_uobj; struct ib_device *ib_dev; obj = (struct ib_usrq_object *)uobj_alloc(UVERBS_OBJECT_SRQ, attrs, @@ -3379,86 +3426,63 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs, if (ib_srq_has_cq(cmd->srq_type)) { attr.ext.cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd->cq_handle, attrs); - if (!attr.ext.cq) { - ret = -EINVAL; + if (IS_ERR(attr.ext.cq)) { + ret = PTR_ERR(attr.ext.cq); goto err_put_xrcd; } } pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, attrs); - if (!pd) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err_put_cq; } attr.event_handler = ib_uverbs_srq_event_handler; - attr.srq_context = attrs->ufile; attr.srq_type = cmd->srq_type; attr.attr.max_wr = cmd->max_wr; attr.attr.max_sge = cmd->max_sge; attr.attr.srq_limit = cmd->srq_limit; - obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); + obj->uevent.uobject.user_handle = cmd->user_handle; - srq = pd->device->ops.create_srq(pd, &attr, udata); + srq = ib_create_srq_user(pd, &attr, obj, udata); if (IS_ERR(srq)) { ret = PTR_ERR(srq); - goto err_put; - } - - srq->device = pd->device; - srq->pd = pd; - srq->srq_type = cmd->srq_type; - srq->uobject = &obj->uevent.uobject; - srq->event_handler = attr.event_handler; - srq->srq_context = attr.srq_context; - - if (ib_srq_has_cq(cmd->srq_type)) { - srq->ext.cq = attr.ext.cq; - atomic_inc(&attr.ext.cq->usecnt); - } - - if (cmd->srq_type == IB_SRQT_XRC) { - srq->ext.xrc.xrcd = attr.ext.xrc.xrcd; - atomic_inc(&attr.ext.xrc.xrcd->usecnt); + goto err_put_pd; } - atomic_inc(&pd->usecnt); - atomic_set(&srq->usecnt, 0); - obj->uevent.uobject.object = srq; obj->uevent.uobject.user_handle = cmd->user_handle; + obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); + if (obj->uevent.event_file) + uverbs_uobject_get(&obj->uevent.event_file->uobj); - memset(&resp, 0, sizeof resp); - resp.srq_handle = obj->uevent.uobject.id; - resp.max_wr = attr.attr.max_wr; - resp.max_sge = attr.attr.max_sge; if (cmd->srq_type == IB_SRQT_XRC) resp.srqn = srq->ext.xrc.srq_num; - ret = uverbs_response(attrs, &resp, sizeof(resp)); - if (ret) - goto err_copy; - if (cmd->srq_type == IB_SRQT_XRC) uobj_put_read(xrcd_uobj); if (ib_srq_has_cq(cmd->srq_type)) - 
uobj_put_obj_read(attr.ext.cq); + rdma_lookup_put_uobject(&attr.ext.cq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); uobj_put_obj_read(pd); - return uobj_alloc_commit(&obj->uevent.uobject); + uobj_finalize_uobj_create(&obj->uevent.uobject, attrs); -err_copy: - ib_destroy_srq(srq); + resp.srq_handle = obj->uevent.uobject.id; + resp.max_wr = attr.attr.max_wr; + resp.max_sge = attr.attr.max_sge; + return uverbs_response(attrs, &resp, sizeof(resp)); -err_put: +err_put_pd: uobj_put_obj_read(pd); - err_put_cq: if (ib_srq_has_cq(cmd->srq_type)) - uobj_put_obj_read(attr.ext.cq); + rdma_lookup_put_uobject(&attr.ext.cq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); err_put_xrcd: if (cmd->srq_type == IB_SRQT_XRC) { @@ -3467,7 +3491,7 @@ err_put_xrcd: } err: - uobj_alloc_abort(&obj->uevent.uobject); + uobj_alloc_abort(&obj->uevent.uobject, attrs); return ret; } @@ -3517,8 +3541,8 @@ static int ib_uverbs_modify_srq(struct uverbs_attr_bundle *attrs) return ret; srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs); - if (!srq) - return -EINVAL; + if (IS_ERR(srq)) + return PTR_ERR(srq); attr.max_wr = cmd.max_wr; attr.srq_limit = cmd.srq_limit; @@ -3526,7 +3550,8 @@ static int ib_uverbs_modify_srq(struct uverbs_attr_bundle *attrs) ret = srq->device->ops.modify_srq(srq, &attr, cmd.attr_mask, &attrs->driver_udata); - uobj_put_obj_read(srq); + rdma_lookup_put_uobject(&srq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); return ret; } @@ -3544,12 +3569,13 @@ static int ib_uverbs_query_srq(struct uverbs_attr_bundle *attrs) return ret; srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs); - if (!srq) - return -EINVAL; + if (IS_ERR(srq)) + return PTR_ERR(srq); ret = ib_query_srq(srq, &attr); - uobj_put_obj_read(srq); + rdma_lookup_put_uobject(&srq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); if (ret) return ret; @@ -3618,7 +3644,6 @@ static int ib_uverbs_ex_query_device(struct uverbs_attr_bundle *attrs) copy_query_dev_fields(ucontext, &resp.base, &attr); -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING resp.odp_caps.general_caps = attr.odp_caps.general_caps; resp.odp_caps.per_transport_caps.rc_odp_caps = attr.odp_caps.per_transport_caps.rc_odp_caps; @@ -3626,7 +3651,7 @@ static int ib_uverbs_ex_query_device(struct uverbs_attr_bundle *attrs) attr.odp_caps.per_transport_caps.uc_odp_caps; resp.odp_caps.per_transport_caps.ud_odp_caps = attr.odp_caps.per_transport_caps.ud_odp_caps; -#endif + resp.xrc_odp_caps = attr.odp_caps.per_transport_caps.xrc_odp_caps; resp.timestamp_mask = attr.timestamp_mask; resp.hca_core_clock = attr.hca_core_clock; @@ -3670,13 +3695,13 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs) return -EOPNOTSUPP; cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) - return -EINVAL; + if (IS_ERR(cq)) + return PTR_ERR(cq); ret = rdma_set_cq_moderation(cq, cmd.attr.cq_count, cmd.attr.cq_period); - uobj_put_obj_read(cq); - + rdma_lookup_put_uobject(&cq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); return ret; } @@ -3689,13 +3714,10 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs) * trailing driver_data flex array. In this case the size of the base struct * cannot be changed. 
*/ -#define offsetof_after(_struct, _member) \ - (offsetof(_struct, _member) + sizeof(((_struct *)NULL)->_member)) - #define UAPI_DEF_WRITE_IO(req, resp) \ .write.has_resp = 1 + \ BUILD_BUG_ON_ZERO(offsetof(req, response) != 0) + \ - BUILD_BUG_ON_ZERO(sizeof(((req *)0)->response) != \ + BUILD_BUG_ON_ZERO(sizeof_field(req, response) != \ sizeof(u64)), \ .write.req_size = sizeof(req), .write.resp_size = sizeof(resp) @@ -3722,11 +3744,11 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs) */ #define UAPI_DEF_WRITE_IO_EX(req, req_last_member, resp, resp_last_member) \ .write.has_resp = 1, \ - .write.req_size = offsetof_after(req, req_last_member), \ - .write.resp_size = offsetof_after(resp, resp_last_member) + .write.req_size = offsetofend(req, req_last_member), \ + .write.resp_size = offsetofend(resp, resp_last_member) #define UAPI_DEF_WRITE_I_EX(req, req_last_member) \ - .write.req_size = offsetof_after(req, req_last_member) + .write.req_size = offsetofend(req, req_last_member) const struct uapi_definition uverbs_def_write_intf[] = { DECLARE_UVERBS_OBJECT( @@ -3735,13 +3757,13 @@ const struct uapi_definition uverbs_def_write_intf[] = { ib_uverbs_create_ah, UAPI_DEF_WRITE_UDATA_IO( struct ib_uverbs_create_ah, - struct ib_uverbs_create_ah_resp), - UAPI_DEF_METHOD_NEEDS_FN(create_ah)), + struct ib_uverbs_create_ah_resp)), DECLARE_UVERBS_WRITE( IB_USER_VERBS_CMD_DESTROY_AH, ib_uverbs_destroy_ah, - UAPI_DEF_WRITE_I(struct ib_uverbs_destroy_ah), - UAPI_DEF_METHOD_NEEDS_FN(destroy_ah))), + UAPI_DEF_WRITE_I(struct ib_uverbs_destroy_ah)), + UAPI_DEF_OBJ_NEEDS_FN(create_user_ah), + UAPI_DEF_OBJ_NEEDS_FN(destroy_ah)), DECLARE_UVERBS_OBJECT( UVERBS_OBJECT_COMP_CHANNEL, @@ -3795,7 +3817,7 @@ const struct uapi_definition uverbs_def_write_intf[] = { IB_USER_VERBS_EX_CMD_MODIFY_CQ, ib_uverbs_ex_modify_cq, UAPI_DEF_WRITE_I(struct ib_uverbs_ex_modify_cq), - UAPI_DEF_METHOD_NEEDS_FN(create_cq))), + UAPI_DEF_METHOD_NEEDS_FN(modify_cq))), DECLARE_UVERBS_OBJECT( UVERBS_OBJECT_DEVICE, @@ -4041,8 +4063,7 @@ const struct uapi_definition uverbs_def_write_intf[] = { DECLARE_UVERBS_WRITE( IB_USER_VERBS_CMD_CLOSE_XRCD, ib_uverbs_close_xrcd, - UAPI_DEF_WRITE_I(struct ib_uverbs_close_xrcd), - UAPI_DEF_METHOD_NEEDS_FN(dealloc_xrcd)), + UAPI_DEF_WRITE_I(struct ib_uverbs_close_xrcd)), DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_OPEN_QP, ib_uverbs_open_qp, UAPI_DEF_WRITE_UDATA_IO( @@ -4052,8 +4073,9 @@ const struct uapi_definition uverbs_def_write_intf[] = { ib_uverbs_open_xrcd, UAPI_DEF_WRITE_UDATA_IO( struct ib_uverbs_open_xrcd, - struct ib_uverbs_open_xrcd_resp), - UAPI_DEF_METHOD_NEEDS_FN(alloc_xrcd))), + struct ib_uverbs_open_xrcd_resp)), + UAPI_DEF_OBJ_NEEDS_FN(alloc_xrcd), + UAPI_DEF_OBJ_NEEDS_FN(dealloc_xrcd)), {}, }; diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 0ca04d224015..f80da6a67e24 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -36,13 +36,15 @@ #include "uverbs.h" struct bundle_alloc_head { - struct bundle_alloc_head *next; + struct_group_tagged(bundle_alloc_head_hdr, hdr, + struct bundle_alloc_head *next; + ); u8 data[]; }; struct bundle_priv { /* Must be first */ - struct bundle_alloc_head alloc_head; + struct bundle_alloc_head_hdr alloc_head; struct bundle_alloc_head *allocated_mem; size_t internal_avail; size_t internal_used; @@ -58,12 +60,13 @@ struct bundle_priv { DECLARE_BITMAP(uobj_finalize, UVERBS_API_ATTR_BKEY_LEN); DECLARE_BITMAP(spec_finalize, UVERBS_API_ATTR_BKEY_LEN); + 
DECLARE_BITMAP(uobj_hw_obj_valid, UVERBS_API_ATTR_BKEY_LEN); /* * Must be last. bundle ends in a flex array which overlaps * internal_buffer. */ - struct uverbs_attr_bundle bundle; + struct uverbs_attr_bundle_hdr bundle; u64 internal_buffer[32]; }; @@ -76,9 +79,10 @@ void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm, unsigned int num_attrs) { struct bundle_priv *pbundle; + struct uverbs_attr_bundle *bundle; size_t bundle_size = offsetof(struct bundle_priv, internal_buffer) + - sizeof(*pbundle->bundle.attrs) * method_elm->key_bitmap_len + + sizeof(*bundle->attrs) * method_elm->key_bitmap_len + sizeof(*pbundle->uattrs) * num_attrs; method_elm->use_stack = bundle_size <= sizeof(*pbundle); @@ -90,7 +94,7 @@ void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm, } /** - * uverbs_alloc() - Quickly allocate memory for use with a bundle + * _uverbs_alloc() - Quickly allocate memory for use with a bundle * @bundle: The bundle * @size: Number of bytes to allocate * @flags: Allocator flags @@ -106,7 +110,7 @@ __malloc void *_uverbs_alloc(struct uverbs_attr_bundle *bundle, size_t size, gfp_t flags) { struct bundle_priv *pbundle = - container_of(bundle, struct bundle_priv, bundle); + container_of(&bundle->hdr, struct bundle_priv, bundle); size_t new_used; void *res; @@ -127,7 +131,7 @@ __malloc void *_uverbs_alloc(struct uverbs_attr_bundle *bundle, size_t size, res = (void *)pbundle->internal_buffer + pbundle->internal_used; pbundle->internal_used = ALIGN(new_used, sizeof(*pbundle->internal_buffer)); - if (flags & __GFP_ZERO) + if (want_init_on_alloc(flags)) memset(res, 0, size); return res; } @@ -136,7 +140,7 @@ EXPORT_SYMBOL(_uverbs_alloc); static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr, u16 len) { - if (uattr->len > sizeof(((struct ib_uverbs_attr *)0)->data)) + if (uattr->len > sizeof_field(struct ib_uverbs_attr, data)) return ib_is_buffer_cleared(u64_to_user_ptr(uattr->data) + len, uattr->len - len); @@ -148,7 +152,7 @@ static int uverbs_set_output(const struct uverbs_attr_bundle *bundle, const struct uverbs_attr *attr) { struct bundle_priv *pbundle = - container_of(bundle, struct bundle_priv, bundle); + container_of(&bundle->hdr, struct bundle_priv, bundle); u16 flags; flags = pbundle->uattrs[attr->ptr_attr.uattr_idx].flags | @@ -165,6 +169,8 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle, struct ib_uverbs_attr *uattr, u32 attr_bkey) { + struct uverbs_attr_bundle *bundle = + container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr); const struct uverbs_attr_spec *spec = &attr_uapi->spec; size_t array_len; u32 *idr_vals; @@ -183,7 +189,7 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle, return -EINVAL; attr->uobjects = - uverbs_alloc(&pbundle->bundle, + uverbs_alloc(bundle, array_size(array_len, sizeof(*attr->uobjects))); if (IS_ERR(attr->uobjects)) return PTR_ERR(attr->uobjects); @@ -207,8 +213,8 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle, for (i = 0; i != array_len; i++) { attr->uobjects[i] = uverbs_get_uobject_from_file( - spec->u2.objs_arr.obj_type, pbundle->bundle.ufile, - spec->u2.objs_arr.access, idr_vals[i]); + spec->u2.objs_arr.obj_type, spec->u2.objs_arr.access, + idr_vals[i], bundle); if (IS_ERR(attr->uobjects[i])) { ret = PTR_ERR(attr->uobjects[i]); break; @@ -220,23 +226,18 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle, return ret; } -static int uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi, - struct 
uverbs_objs_arr_attr *attr, - bool commit) +static void uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi, + struct uverbs_objs_arr_attr *attr, + bool commit, + struct uverbs_attr_bundle *attrs) { const struct uverbs_attr_spec *spec = &attr_uapi->spec; - int current_ret; - int ret = 0; size_t i; - for (i = 0; i != attr->len; i++) { - current_ret = uverbs_finalize_object( - attr->uobjects[i], spec->u2.objs_arr.access, commit); - if (!ret) - ret = current_ret; - } - - return ret; + for (i = 0; i != attr->len; i++) + uverbs_finalize_object(attr->uobjects[i], + spec->u2.objs_arr.access, false, commit, + attrs); } static int uverbs_process_attr(struct bundle_priv *pbundle, @@ -244,7 +245,9 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, struct ib_uverbs_attr *uattr, u32 attr_bkey) { const struct uverbs_attr_spec *spec = &attr_uapi->spec; - struct uverbs_attr *e = &pbundle->bundle.attrs[attr_bkey]; + struct uverbs_attr_bundle *bundle = + container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr); + struct uverbs_attr *e = &bundle->attrs[attr_bkey]; const struct uverbs_attr_spec *val_spec = spec; struct uverbs_obj_attr *o_attr; @@ -263,7 +266,7 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, return -EOPNOTSUPP; e->ptr_attr.enum_id = uattr->attr_data.enum_data.elem_id; - /* fall through */ + fallthrough; case UVERBS_ATTR_TYPE_PTR_IN: /* Ensure that any data provided by userspace beyond the known * struct is zero. Userspace that knows how to use some future @@ -275,7 +278,7 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, !uverbs_is_attr_cleared(uattr, val_spec->u.ptr.len)) return -EOPNOTSUPP; - /* fall through */ + fallthrough; case UVERBS_ATTR_TYPE_PTR_OUT: if (uattr->len < val_spec->u.ptr.min_len || (!val_spec->zero_trailing && @@ -292,7 +295,7 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, if (val_spec->alloc_and_copy && !uverbs_attr_ptr_is_inline(e)) { void *p; - p = uverbs_alloc(&pbundle->bundle, uattr->len); + p = uverbs_alloc(bundle, uattr->len); if (IS_ERR(p)) return PTR_ERR(p); @@ -324,10 +327,8 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, * IDR implementation today rejects negative IDs */ o_attr->uobject = uverbs_get_uobject_from_file( - spec->u.obj.obj_type, - pbundle->bundle.ufile, - spec->u.obj.access, - uattr->data_s64); + spec->u.obj.obj_type, spec->u.obj.access, + uattr->data_s64, bundle); if (IS_ERR(o_attr->uobject)) return PTR_ERR(o_attr->uobject); __set_bit(attr_bkey, pbundle->uobj_finalize); @@ -343,6 +344,14 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, break; + case UVERBS_ATTR_TYPE_RAW_FD: + if (uattr->attr_data.reserved || uattr->len != 0 || + uattr->data_s64 < INT_MIN || uattr->data_s64 > INT_MAX) + return -EINVAL; + /* _uverbs_get_const_signed() is the accessor */ + e->ptr_attr.data = uattr->data_s64; + break; + case UVERBS_ATTR_TYPE_IDRS_ARRAY: return uverbs_process_idrs_array(pbundle, attr_uapi, &e->objs_arr_attr, uattr, @@ -420,6 +429,8 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle, unsigned int num_attrs) { int (*handler)(struct uverbs_attr_bundle *attrs); + struct uverbs_attr_bundle *bundle = + container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr); size_t uattrs_size = array_size(sizeof(*pbundle->uattrs), num_attrs); unsigned int destroy_bkey = pbundle->method_elm->destroy_bkey; unsigned int i; @@ -432,7 +443,7 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle, if (!handler) return -EIO; - pbundle->uattrs = 
uverbs_alloc(&pbundle->bundle, uattrs_size); + pbundle->uattrs = uverbs_alloc(bundle, uattrs_size); if (IS_ERR(pbundle->uattrs)) return PTR_ERR(pbundle->uattrs); if (copy_from_user(pbundle->uattrs, pbundle->user_attrs, uattrs_size)) @@ -451,23 +462,23 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle, return -EINVAL; if (pbundle->method_elm->has_udata) - uverbs_fill_udata(&pbundle->bundle, - &pbundle->bundle.driver_udata, + uverbs_fill_udata(bundle, &pbundle->bundle.driver_udata, UVERBS_ATTR_UHW_IN, UVERBS_ATTR_UHW_OUT); + else + pbundle->bundle.driver_udata = (struct ib_udata){}; if (destroy_bkey != UVERBS_API_ATTR_BKEY_LEN) { - struct uverbs_obj_attr *destroy_attr = - &pbundle->bundle.attrs[destroy_bkey].obj_attr; + struct uverbs_obj_attr *destroy_attr = &bundle->attrs[destroy_bkey].obj_attr; - ret = uobj_destroy(destroy_attr->uobject); + ret = uobj_destroy(destroy_attr->uobject, bundle); if (ret) return ret; __clear_bit(destroy_bkey, pbundle->uobj_finalize); - ret = handler(&pbundle->bundle); + ret = handler(bundle); uobj_put_destroy(destroy_attr->uobject); } else { - ret = handler(&pbundle->bundle); + ret = handler(bundle); } /* @@ -477,10 +488,10 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle, */ if (!ret && pbundle->method_elm->has_udata) { const struct uverbs_attr *attr = - uverbs_attr_get(&pbundle->bundle, UVERBS_ATTR_UHW_OUT); + uverbs_attr_get(bundle, UVERBS_ATTR_UHW_OUT); if (!IS_ERR(attr)) - ret = uverbs_set_output(&pbundle->bundle, attr); + ret = uverbs_set_output(bundle, attr); } /* @@ -494,34 +505,33 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle, return ret; } -static int bundle_destroy(struct bundle_priv *pbundle, bool commit) +static void bundle_destroy(struct bundle_priv *pbundle, bool commit) { unsigned int key_bitmap_len = pbundle->method_elm->key_bitmap_len; + struct uverbs_attr_bundle *bundle = + container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr); struct bundle_alloc_head *memblock; unsigned int i; - int ret = 0; /* fast path for simple uobjects */ i = -1; while ((i = find_next_bit(pbundle->uobj_finalize, key_bitmap_len, i + 1)) < key_bitmap_len) { - struct uverbs_attr *attr = &pbundle->bundle.attrs[i]; - int current_ret; + struct uverbs_attr *attr = &bundle->attrs[i]; - current_ret = uverbs_finalize_object( + uverbs_finalize_object( attr->obj_attr.uobject, - attr->obj_attr.attr_elm->spec.u.obj.access, commit); - if (!ret) - ret = current_ret; + attr->obj_attr.attr_elm->spec.u.obj.access, + test_bit(i, pbundle->uobj_hw_obj_valid), + commit, bundle); } i = -1; while ((i = find_next_bit(pbundle->spec_finalize, key_bitmap_len, i + 1)) < key_bitmap_len) { - struct uverbs_attr *attr = &pbundle->bundle.attrs[i]; + struct uverbs_attr *attr = &bundle->attrs[i]; const struct uverbs_api_attr *attr_uapi; void __rcu **slot; - int current_ret; slot = uapi_get_attr_for_method( pbundle, @@ -532,10 +542,8 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit) attr_uapi = rcu_dereference_protected(*slot, true); if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { - current_ret = uverbs_free_idrs_array( - attr_uapi, &attr->objs_arr_attr, commit); - if (!ret) - ret = current_ret; + uverbs_free_idrs_array(attr_uapi, &attr->objs_arr_attr, + commit, bundle); } } @@ -545,8 +553,6 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit) memblock = memblock->next; kvfree(tmp); } - - return ret; } static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, @@ -559,7 +565,6 @@ static int 
ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, struct bundle_priv *pbundle; struct bundle_priv onstack; void __rcu **slot; - int destroy_ret; int ret; if (unlikely(hdr->driver_id != uapi->driver_id)) @@ -581,7 +586,8 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, method_elm->bundle_size - offsetof(struct bundle_priv, internal_buffer); pbundle->alloc_head.next = NULL; - pbundle->allocated_mem = &pbundle->alloc_head; + pbundle->allocated_mem = container_of(&pbundle->alloc_head, + struct bundle_alloc_head, hdr); } else { pbundle = &onstack; pbundle->internal_avail = sizeof(pbundle->internal_buffer); @@ -592,24 +598,25 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, pbundle->method_elm = method_elm; pbundle->method_key = attrs_iter.index; pbundle->bundle.ufile = ufile; + pbundle->bundle.context = NULL; /* only valid if bundle has uobject */ pbundle->radix = &uapi->radix; pbundle->radix_slots = slot; pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter); pbundle->user_attrs = user_attrs; pbundle->internal_used = ALIGN(pbundle->method_elm->key_bitmap_len * - sizeof(*pbundle->bundle.attrs), - sizeof(*pbundle->internal_buffer)); + sizeof(*container_of(&pbundle->bundle, + struct uverbs_attr_bundle, hdr)->attrs), + sizeof(*pbundle->internal_buffer)); memset(pbundle->bundle.attr_present, 0, sizeof(pbundle->bundle.attr_present)); memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize)); memset(pbundle->spec_finalize, 0, sizeof(pbundle->spec_finalize)); + memset(pbundle->uobj_hw_obj_valid, 0, + sizeof(pbundle->uobj_hw_obj_valid)); ret = ib_uverbs_run_method(pbundle, hdr->num_attrs); - destroy_ret = bundle_destroy(pbundle, ret == 0); - if (unlikely(destroy_ret && !ret)) - return destroy_ret; - + bundle_destroy(pbundle, ret == 0); return ret; } @@ -703,11 +710,13 @@ void uverbs_fill_udata(struct uverbs_attr_bundle *bundle, unsigned int attr_out) { struct bundle_priv *pbundle = - container_of(bundle, struct bundle_priv, bundle); + container_of(&bundle->hdr, struct bundle_priv, bundle); + struct uverbs_attr_bundle *bundle_aux = + container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr); const struct uverbs_attr *in = - uverbs_attr_get(&pbundle->bundle, attr_in); + uverbs_attr_get(bundle_aux, attr_in); const struct uverbs_attr *out = - uverbs_attr_get(&pbundle->bundle, attr_out); + uverbs_attr_get(bundle_aux, attr_out); if (!IS_ERR(in)) { udata->inlen = in->ptr_attr.len; @@ -763,9 +772,10 @@ int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx) return uverbs_set_output(bundle, attr); } -int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, - size_t idx, s64 lower_bound, u64 upper_bound, - s64 *def_val) +int _uverbs_get_const_signed(s64 *to, + const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, s64 lower_bound, u64 upper_bound, + s64 *def_val) { const struct uverbs_attr *attr; @@ -784,13 +794,39 @@ int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, return 0; } -EXPORT_SYMBOL(_uverbs_get_const); +EXPORT_SYMBOL(_uverbs_get_const_signed); + +int _uverbs_get_const_unsigned(u64 *to, + const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, u64 upper_bound, u64 *def_val) +{ + const struct uverbs_attr *attr; + + attr = uverbs_attr_get(attrs_bundle, idx); + if (IS_ERR(attr)) { + if ((PTR_ERR(attr) != -ENOENT) || !def_val) + return PTR_ERR(attr); + + *to = *def_val; + } else { + *to = attr->ptr_attr.data; + } + + if (*to > upper_bound) + return -EINVAL; + + return 0; +} 
+EXPORT_SYMBOL(_uverbs_get_const_unsigned); int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle, size_t idx, const void *from, size_t size) { const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx); + if (IS_ERR(attr)) + return PTR_ERR(attr); + if (size < attr->ptr_attr.len) { if (clear_user(u64_to_user_ptr(attr->ptr_attr.data) + size, attr->ptr_attr.len - size)) @@ -798,3 +834,16 @@ int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle, } return uverbs_copy_to(bundle, idx, from, size); } +EXPORT_SYMBOL(uverbs_copy_to_struct_or_zero); + +/* Once called an abort will call through to the type's destroy_hw() */ +void uverbs_finalize_uobj_create(const struct uverbs_attr_bundle *bundle, + u16 idx) +{ + struct bundle_priv *pbundle = + container_of(&bundle->hdr, struct bundle_priv, bundle); + + __set_bit(uapi_bkey_attr(uapi_key_attr(idx)), + pbundle->uobj_hw_obj_valid); +} +EXPORT_SYMBOL(uverbs_finalize_uobj_create); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 2890a77339e1..973fe2c7ef53 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -51,6 +51,8 @@ #include <rdma/ib.h> #include <rdma/uverbs_std_types.h> +#include <rdma/rdma_netlink.h> +#include <rdma/ib_ucaps.h> #include "uverbs.h" #include "core_priv.h" @@ -71,11 +73,23 @@ enum { #define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR) static dev_t dynamic_uverbs_dev; -static struct class *uverbs_class; static DEFINE_IDA(uverbs_ida); -static void ib_uverbs_add_one(struct ib_device *device); +static int ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); +static struct ib_client uverbs_client; + +static char *uverbs_devnode(const struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0666; + return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); +} + +static const struct class uverbs_class = { + .name = "infiniband_verbs", + .devnode = uverbs_devnode, +}; /* * Must be called with the ufile->device->disassociate_srcu held, and the lock @@ -107,8 +121,11 @@ int uverbs_dealloc_mw(struct ib_mw *mw) int ret; ret = mw->device->ops.dealloc_mw(mw); - if (!ret) - atomic_dec(&pd->usecnt); + if (ret) + return ret; + + atomic_dec(&pd->usecnt); + kfree(mw); return ret; } @@ -119,20 +136,13 @@ static void ib_uverbs_release_dev(struct device *device) uverbs_destroy_api(dev->uapi); cleanup_srcu_struct(&dev->disassociate_srcu); + mutex_destroy(&dev->lists_mutex); + mutex_destroy(&dev->xrcd_tree_mutex); kfree(dev); } -static void ib_uverbs_release_async_event_file(struct kref *ref) -{ - struct ib_uverbs_async_event_file *file = - container_of(ref, struct ib_uverbs_async_event_file, ref); - - kfree(file); -} - -void ib_uverbs_release_ucq(struct ib_uverbs_file *file, - struct ib_uverbs_completion_event_file *ev_file, - struct ib_ucq_object *uobj) +void ib_uverbs_release_ucq(struct ib_uverbs_completion_event_file *ev_file, + struct ib_ucq_object *uobj) { struct ib_uverbs_event *evt, *tmp; @@ -147,25 +157,24 @@ void ib_uverbs_release_ucq(struct ib_uverbs_file *file, uverbs_uobject_put(&ev_file->uobj); } - spin_lock_irq(&file->async_file->ev_queue.lock); - list_for_each_entry_safe(evt, tmp, &uobj->async_list, obj_list) { - list_del(&evt->list); - kfree(evt); - } - spin_unlock_irq(&file->async_file->ev_queue.lock); + ib_uverbs_release_uevent(&uobj->uevent); } -void ib_uverbs_release_uevent(struct ib_uverbs_file *file, - 
struct ib_uevent_object *uobj) +void ib_uverbs_release_uevent(struct ib_uevent_object *uobj) { + struct ib_uverbs_async_event_file *async_file = uobj->event_file; struct ib_uverbs_event *evt, *tmp; - spin_lock_irq(&file->async_file->ev_queue.lock); + if (!async_file) + return; + + spin_lock_irq(&async_file->ev_queue.lock); list_for_each_entry_safe(evt, tmp, &uobj->event_list, obj_list) { list_del(&evt->list); kfree(evt); } - spin_unlock_irq(&file->async_file->ev_queue.lock); + spin_unlock_irq(&async_file->ev_queue.lock); + uverbs_uobject_put(&async_file->uobj); } void ib_uverbs_detach_umcast(struct ib_qp *qp, @@ -198,18 +207,25 @@ void ib_uverbs_release_file(struct kref *ref) ib_dev = srcu_dereference(file->device->ib_dev, &file->device->disassociate_srcu); if (ib_dev && !ib_dev->ops.disassociate_ucontext) - module_put(ib_dev->owner); + module_put(ib_dev->ops.owner); srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); - if (atomic_dec_and_test(&file->device->refcount)) + if (refcount_dec_and_test(&file->device->refcount)) ib_uverbs_comp_dev(file->device); + if (file->default_async_file) + uverbs_uobject_put(&file->default_async_file->uobj); put_device(&file->device->dev); + + if (file->disassociate_page) + __free_pages(file->disassociate_page, 0); + mutex_destroy(&file->disassociation_lock); + mutex_destroy(&file->umap_lock); + mutex_destroy(&file->ucontext_lock); kfree(file); } static ssize_t ib_uverbs_event_read(struct ib_uverbs_event_queue *ev_queue, - struct ib_uverbs_file *uverbs_file, struct file *filp, char __user *buf, size_t count, loff_t *pos, size_t eventsz) @@ -220,25 +236,20 @@ static ssize_t ib_uverbs_event_read(struct ib_uverbs_event_queue *ev_queue, spin_lock_irq(&ev_queue->lock); while (list_empty(&ev_queue->event_list)) { - spin_unlock_irq(&ev_queue->lock); + if (ev_queue->is_closed) { + spin_unlock_irq(&ev_queue->lock); + return -EIO; + } + spin_unlock_irq(&ev_queue->lock); if (filp->f_flags & O_NONBLOCK) return -EAGAIN; if (wait_event_interruptible(ev_queue->poll_wait, (!list_empty(&ev_queue->event_list) || - /* The barriers built into wait_event_interruptible() - * and wake_up() guarentee this will see the null set - * without using RCU - */ - !uverbs_file->device->ib_dev))) + ev_queue->is_closed))) return -ERESTARTSYS; - /* If device was disassociated and no event exists set an error */ - if (list_empty(&ev_queue->event_list) && - !uverbs_file->device->ib_dev) - return -EIO; - spin_lock_irq(&ev_queue->lock); } @@ -274,8 +285,7 @@ static ssize_t ib_uverbs_async_event_read(struct file *filp, char __user *buf, { struct ib_uverbs_async_event_file *file = filp->private_data; - return ib_uverbs_event_read(&file->ev_queue, file->uverbs_file, filp, - buf, count, pos, + return ib_uverbs_event_read(&file->ev_queue, filp, buf, count, pos, sizeof(struct ib_uverbs_async_event_desc)); } @@ -285,9 +295,8 @@ static ssize_t ib_uverbs_comp_event_read(struct file *filp, char __user *buf, struct ib_uverbs_completion_event_file *comp_ev_file = filp->private_data; - return ib_uverbs_event_read(&comp_ev_file->ev_queue, - comp_ev_file->uobj.ufile, filp, - buf, count, pos, + return ib_uverbs_event_read(&comp_ev_file->ev_queue, filp, buf, count, + pos, sizeof(struct ib_uverbs_comp_event_desc)); } @@ -302,6 +311,8 @@ static __poll_t ib_uverbs_event_poll(struct ib_uverbs_event_queue *ev_queue, spin_lock_irq(&ev_queue->lock); if (!list_empty(&ev_queue->event_list)) pollflags = EPOLLIN | EPOLLRDNORM; + else if (ev_queue->is_closed) + pollflags = EPOLLERR; 
spin_unlock_irq(&ev_queue->lock); return pollflags; @@ -310,7 +321,9 @@ static __poll_t ib_uverbs_event_poll(struct ib_uverbs_event_queue *ev_queue, static __poll_t ib_uverbs_async_event_poll(struct file *filp, struct poll_table_struct *wait) { - return ib_uverbs_event_poll(filp->private_data, filp, wait); + struct ib_uverbs_async_event_file *file = filp->private_data; + + return ib_uverbs_event_poll(&file->ev_queue, filp, wait); } static __poll_t ib_uverbs_comp_event_poll(struct file *filp, @@ -324,9 +337,9 @@ static __poll_t ib_uverbs_comp_event_poll(struct file *filp, static int ib_uverbs_async_event_fasync(int fd, struct file *filp, int on) { - struct ib_uverbs_event_queue *ev_queue = filp->private_data; + struct ib_uverbs_async_event_file *file = filp->private_data; - return fasync_helper(fd, filp, on, &ev_queue->async_queue); + return fasync_helper(fd, filp, on, &file->ev_queue.async_queue); } static int ib_uverbs_comp_event_fasync(int fd, struct file *filp, int on) @@ -337,72 +350,20 @@ static int ib_uverbs_comp_event_fasync(int fd, struct file *filp, int on) return fasync_helper(fd, filp, on, &comp_ev_file->ev_queue.async_queue); } -static int ib_uverbs_async_event_close(struct inode *inode, struct file *filp) -{ - struct ib_uverbs_async_event_file *file = filp->private_data; - struct ib_uverbs_file *uverbs_file = file->uverbs_file; - struct ib_uverbs_event *entry, *tmp; - int closed_already = 0; - - mutex_lock(&uverbs_file->device->lists_mutex); - spin_lock_irq(&file->ev_queue.lock); - closed_already = file->ev_queue.is_closed; - file->ev_queue.is_closed = 1; - list_for_each_entry_safe(entry, tmp, &file->ev_queue.event_list, list) { - if (entry->counter) - list_del(&entry->obj_list); - kfree(entry); - } - spin_unlock_irq(&file->ev_queue.lock); - if (!closed_already) { - list_del(&file->list); - ib_unregister_event_handler(&uverbs_file->event_handler); - } - mutex_unlock(&uverbs_file->device->lists_mutex); - - kref_put(&uverbs_file->ref, ib_uverbs_release_file); - kref_put(&file->ref, ib_uverbs_release_async_event_file); - - return 0; -} - -static int ib_uverbs_comp_event_close(struct inode *inode, struct file *filp) -{ - struct ib_uobject *uobj = filp->private_data; - struct ib_uverbs_completion_event_file *file = container_of( - uobj, struct ib_uverbs_completion_event_file, uobj); - struct ib_uverbs_event *entry, *tmp; - - spin_lock_irq(&file->ev_queue.lock); - list_for_each_entry_safe(entry, tmp, &file->ev_queue.event_list, list) { - if (entry->counter) - list_del(&entry->obj_list); - kfree(entry); - } - file->ev_queue.is_closed = 1; - spin_unlock_irq(&file->ev_queue.lock); - - uverbs_close_fd(filp); - - return 0; -} - const struct file_operations uverbs_event_fops = { .owner = THIS_MODULE, .read = ib_uverbs_comp_event_read, .poll = ib_uverbs_comp_event_poll, - .release = ib_uverbs_comp_event_close, + .release = uverbs_uobject_fd_release, .fasync = ib_uverbs_comp_event_fasync, - .llseek = no_llseek, }; -static const struct file_operations uverbs_async_event_fops = { +const struct file_operations uverbs_async_event_fops = { .owner = THIS_MODULE, .read = ib_uverbs_async_event_read, .poll = ib_uverbs_async_event_poll, - .release = ib_uverbs_async_event_close, + .release = uverbs_async_event_release, .fasync = ib_uverbs_async_event_fasync, - .llseek = no_llseek, }; void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context) @@ -427,9 +388,9 @@ void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context) return; } - uobj = container_of(cq->uobject, struct ib_ucq_object, 
uobject); + uobj = cq->uobject; - entry->desc.comp.cq_handle = cq->uobject->user_handle; + entry->desc.comp.cq_handle = cq->uobject->uevent.uobject.user_handle; entry->counter = &uobj->comp_events_reported; list_add_tail(&entry->list, &ev_queue->event_list); @@ -440,102 +401,81 @@ void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context) kill_fasync(&ev_queue->async_queue, SIGIO, POLL_IN); } -static void ib_uverbs_async_handler(struct ib_uverbs_file *file, - __u64 element, __u64 event, - struct list_head *obj_list, - u32 *counter) +void ib_uverbs_async_handler(struct ib_uverbs_async_event_file *async_file, + __u64 element, __u64 event, + struct list_head *obj_list, u32 *counter) { struct ib_uverbs_event *entry; unsigned long flags; - spin_lock_irqsave(&file->async_file->ev_queue.lock, flags); - if (file->async_file->ev_queue.is_closed) { - spin_unlock_irqrestore(&file->async_file->ev_queue.lock, flags); + if (!async_file) + return; + + spin_lock_irqsave(&async_file->ev_queue.lock, flags); + if (async_file->ev_queue.is_closed) { + spin_unlock_irqrestore(&async_file->ev_queue.lock, flags); return; } entry = kmalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) { - spin_unlock_irqrestore(&file->async_file->ev_queue.lock, flags); + spin_unlock_irqrestore(&async_file->ev_queue.lock, flags); return; } - entry->desc.async.element = element; + entry->desc.async.element = element; entry->desc.async.event_type = event; - entry->desc.async.reserved = 0; - entry->counter = counter; + entry->desc.async.reserved = 0; + entry->counter = counter; - list_add_tail(&entry->list, &file->async_file->ev_queue.event_list); + list_add_tail(&entry->list, &async_file->ev_queue.event_list); if (obj_list) list_add_tail(&entry->obj_list, obj_list); - spin_unlock_irqrestore(&file->async_file->ev_queue.lock, flags); + spin_unlock_irqrestore(&async_file->ev_queue.lock, flags); - wake_up_interruptible(&file->async_file->ev_queue.poll_wait); - kill_fasync(&file->async_file->ev_queue.async_queue, SIGIO, POLL_IN); + wake_up_interruptible(&async_file->ev_queue.poll_wait); + kill_fasync(&async_file->ev_queue.async_queue, SIGIO, POLL_IN); } -void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr) +static void uverbs_uobj_event(struct ib_uevent_object *eobj, + struct ib_event *event) { - struct ib_ucq_object *uobj = container_of(event->element.cq->uobject, - struct ib_ucq_object, uobject); + ib_uverbs_async_handler(eobj->event_file, + eobj->uobject.user_handle, event->event, + &eobj->event_list, &eobj->events_reported); +} - ib_uverbs_async_handler(uobj->uobject.ufile, uobj->uobject.user_handle, - event->event, &uobj->async_list, - &uobj->async_events_reported); +void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr) +{ + uverbs_uobj_event(&event->element.cq->uobject->uevent, event); } void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr) { - struct ib_uevent_object *uobj; - /* for XRC target qp's, check that qp is live */ if (!event->element.qp->uobject) return; - uobj = container_of(event->element.qp->uobject, - struct ib_uevent_object, uobject); - - ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle, - event->event, &uobj->event_list, - &uobj->events_reported); + uverbs_uobj_event(&event->element.qp->uobject->uevent, event); } void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr) { - struct ib_uevent_object *uobj = container_of(event->element.wq->uobject, - struct ib_uevent_object, uobject); - - ib_uverbs_async_handler(context_ptr, 
uobj->uobject.user_handle, - event->event, &uobj->event_list, - &uobj->events_reported); + uverbs_uobj_event(&event->element.wq->uobject->uevent, event); } void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr) { - struct ib_uevent_object *uobj; - - uobj = container_of(event->element.srq->uobject, - struct ib_uevent_object, uobject); - - ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle, - event->event, &uobj->event_list, - &uobj->events_reported); + uverbs_uobj_event(&event->element.srq->uobject->uevent, event); } -void ib_uverbs_event_handler(struct ib_event_handler *handler, - struct ib_event *event) +static void ib_uverbs_event_handler(struct ib_event_handler *handler, + struct ib_event *event) { - struct ib_uverbs_file *file = - container_of(handler, struct ib_uverbs_file, event_handler); - - ib_uverbs_async_handler(file, event->element.port_num, event->event, - NULL, NULL); -} - -void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file) -{ - kref_put(&file->async_file->ref, ib_uverbs_release_async_event_file); - file->async_file = NULL; + ib_uverbs_async_handler( + container_of(handler, struct ib_uverbs_async_event_file, + event_handler), + event->element.port_num, event->event, NULL, NULL); } void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue) @@ -547,45 +487,26 @@ void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue) ev_queue->async_queue = NULL; } -struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file, - struct ib_device *ib_dev) +void ib_uverbs_init_async_event_file( + struct ib_uverbs_async_event_file *async_file) { - struct ib_uverbs_async_event_file *ev_file; - struct file *filp; - - ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL); - if (!ev_file) - return ERR_PTR(-ENOMEM); - - ib_uverbs_init_event_queue(&ev_file->ev_queue); - ev_file->uverbs_file = uverbs_file; - kref_get(&ev_file->uverbs_file->ref); - kref_init(&ev_file->ref); - filp = anon_inode_getfile("[infinibandevent]", &uverbs_async_event_fops, - ev_file, O_RDONLY); - if (IS_ERR(filp)) - goto err_put_refs; - - mutex_lock(&uverbs_file->device->lists_mutex); - list_add_tail(&ev_file->list, - &uverbs_file->device->uverbs_events_file_list); - mutex_unlock(&uverbs_file->device->lists_mutex); - - WARN_ON(uverbs_file->async_file); - uverbs_file->async_file = ev_file; - kref_get(&uverbs_file->async_file->ref); - INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler, - ib_dev, - ib_uverbs_event_handler); - ib_register_event_handler(&uverbs_file->event_handler); - /* At that point async file stuff was fully set */ + struct ib_uverbs_file *uverbs_file = async_file->uobj.ufile; + struct ib_device *ib_dev = async_file->uobj.context->device; - return filp; + ib_uverbs_init_event_queue(&async_file->ev_queue); -err_put_refs: - kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file); - kref_put(&ev_file->ref, ib_uverbs_release_async_event_file); - return filp; + /* The first async_event_file becomes the default one for the file. 
*/ + mutex_lock(&uverbs_file->ucontext_lock); + if (!uverbs_file->default_async_file) { + /* Pairs with the put in ib_uverbs_release_file */ + uverbs_uobject_get(&async_file->uobj); + smp_store_release(&uverbs_file->default_async_file, async_file); + } + mutex_unlock(&uverbs_file->ucontext_lock); + + INIT_IB_EVENT_HANDLER(&async_file->event_handler, ib_dev, + ib_uverbs_event_handler); + ib_register_event_handler(&async_file->event_handler); } static ssize_t verify_hdr(struct ib_uverbs_cmd_hdr *hdr, @@ -626,7 +547,7 @@ static ssize_t verify_hdr(struct ib_uverbs_cmd_hdr *hdr, if (hdr->in_words * 4 != count) return -EINVAL; - if (count < method_elm->req_size + sizeof(hdr)) { + if (count < method_elm->req_size + sizeof(*hdr)) { /* * rdma-core v18 and v19 have a bug where they send DESTROY_CQ * with a 16 byte write instead of 24. Old kernels didn't @@ -692,6 +613,8 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, memset(bundle.attr_present, 0, sizeof(bundle.attr_present)); bundle.ufile = file; + bundle.context = NULL; /* only valid if bundle has uobject */ + bundle.uobject = NULL; if (!method_elm->is_ex) { size_t in_len = hdr.in_words * 4 - sizeof(hdr); size_t out_len = hdr.out_words * 4; @@ -716,7 +639,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, * then the command request structure starts * with a '__aligned u64 response' member. */ - ret = get_user(response, (const u64 *)buf); + ret = get_user(response, (const u64 __user *)buf); if (ret) goto out_unlock; @@ -755,11 +678,16 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, } ret = method_elm->handler(&bundle); + if (bundle.uobject) + uverbs_finalize_object(bundle.uobject, UVERBS_ACCESS_NEW, true, + !ret, &bundle); out_unlock: srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); return (ret) ? : count; } +static const struct vm_operations_struct rdma_umap_ops; + static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) { struct ib_uverbs_file *file = filp->private_data; @@ -774,45 +702,18 @@ static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) goto out; } + mutex_lock(&file->disassociation_lock); + + vma->vm_ops = &rdma_umap_ops; ret = ucontext->device->ops.mmap(ucontext, vma); + + mutex_unlock(&file->disassociation_lock); out: srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); return ret; } /* - * Each time we map IO memory into user space this keeps track of the mapping. - * When the device is hot-unplugged we 'zap' the mmaps in user space to point - * to the zero page and allow the hot unplug to proceed. - * - * This is necessary for cases like PCI physical hot unplug as the actual BAR - * memory may vanish after this and access to it from userspace could MCE. - * - * RDMA drivers supporting disassociation must have their user space designed - * to cope in some way with their IO pages going to the zero page. 
- */ -struct rdma_umap_priv { - struct vm_area_struct *vma; - struct list_head list; -}; - -static const struct vm_operations_struct rdma_umap_ops; - -static void rdma_umap_priv_init(struct rdma_umap_priv *priv, - struct vm_area_struct *vma) -{ - struct ib_uverbs_file *ufile = vma->vm_file->private_data; - - priv->vma = vma; - vma->vm_private_data = priv; - vma->vm_ops = &rdma_umap_ops; - - mutex_lock(&ufile->umap_lock); - list_add(&priv->list, &ufile->umaps); - mutex_unlock(&ufile->umap_lock); -} - -/* * The VMA has been dup'd, initialize the vm_private_data with a new tracking * struct */ @@ -828,6 +729,8 @@ static void rdma_umap_open(struct vm_area_struct *vma) /* We are racing with disassociation */ if (!down_read_trylock(&ufile->hw_destroy_rwsem)) goto out_zap; + mutex_lock(&ufile->disassociation_lock); + /* * Disassociation already completed, the VMA should already be zapped. */ @@ -837,12 +740,14 @@ static void rdma_umap_open(struct vm_area_struct *vma) priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) goto out_unlock; - rdma_umap_priv_init(priv, vma); + rdma_umap_priv_init(priv, vma, opriv->entry); + mutex_unlock(&ufile->disassociation_lock); up_read(&ufile->hw_destroy_rwsem); return; out_unlock: + mutex_unlock(&ufile->disassociation_lock); up_read(&ufile->hw_destroy_rwsem); out_zap: /* @@ -868,119 +773,102 @@ static void rdma_umap_close(struct vm_area_struct *vma) * this point. */ mutex_lock(&ufile->umap_lock); + if (priv->entry) + rdma_user_mmap_entry_put(priv->entry); + list_del(&priv->list); mutex_unlock(&ufile->umap_lock); kfree(priv); } -static const struct vm_operations_struct rdma_umap_ops = { - .open = rdma_umap_open, - .close = rdma_umap_close, -}; - -static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext, - struct vm_area_struct *vma, - unsigned long size) -{ - struct ib_uverbs_file *ufile = ucontext->ufile; - struct rdma_umap_priv *priv; - - if (vma->vm_end - vma->vm_start != size) - return ERR_PTR(-EINVAL); - - /* Driver is using this wrong, must be called by ib_uverbs_mmap */ - if (WARN_ON(!vma->vm_file || - vma->vm_file->private_data != ufile)) - return ERR_PTR(-EINVAL); - lockdep_assert_held(&ufile->device->disassociate_srcu); - - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (!priv) - return ERR_PTR(-ENOMEM); - return priv; -} - /* - * Map IO memory into a process. This is to be called by drivers as part of - * their mmap() functions if they wish to send something like PCI-E BAR memory - * to userspace. + * Once the zap_vma_ptes has been called touches to the VMA will come here and + * we return a dummy writable zero page for all the pfns. */ -int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, - unsigned long pfn, unsigned long size, pgprot_t prot) +static vm_fault_t rdma_umap_fault(struct vm_fault *vmf) { - struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size); + struct ib_uverbs_file *ufile = vmf->vma->vm_file->private_data; + struct rdma_umap_priv *priv = vmf->vma->vm_private_data; + vm_fault_t ret = 0; - if (IS_ERR(priv)) - return PTR_ERR(priv); + if (!priv) + return VM_FAULT_SIGBUS; - vma->vm_page_prot = prot; - if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) { - kfree(priv); - return -EAGAIN; + /* Read only pages can just use the system zero page. 
*/ + if (!(vmf->vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) { + vmf->page = ZERO_PAGE(vmf->address); + get_page(vmf->page); + return 0; } - rdma_umap_priv_init(priv, vma); - return 0; -} -EXPORT_SYMBOL(rdma_user_mmap_io); - -/* - * The page case is here for a slightly different reason, the driver expects - * to be able to free the page it is sharing to user space when it destroys - * its ucontext, which means we need to zap the user space references. - * - * We could handle this differently by providing an API to allocate a shared - * page and then only freeing the shared page when the last ufile is - * destroyed. - */ -int rdma_user_mmap_page(struct ib_ucontext *ucontext, - struct vm_area_struct *vma, struct page *page, - unsigned long size) -{ - struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size); - - if (IS_ERR(priv)) - return PTR_ERR(priv); + mutex_lock(&ufile->umap_lock); + if (!ufile->disassociate_page) + ufile->disassociate_page = + alloc_pages(vmf->gfp_mask | __GFP_ZERO, 0); - if (remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), size, - vma->vm_page_prot)) { - kfree(priv); - return -EAGAIN; + if (ufile->disassociate_page) { + /* + * This VMA is forced to always be shared so this doesn't have + * to worry about COW. + */ + vmf->page = ufile->disassociate_page; + get_page(vmf->page); + } else { + ret = VM_FAULT_SIGBUS; } + mutex_unlock(&ufile->umap_lock); - rdma_umap_priv_init(priv, vma); - return 0; + return ret; } -EXPORT_SYMBOL(rdma_user_mmap_page); + +static const struct vm_operations_struct rdma_umap_ops = { + .open = rdma_umap_open, + .close = rdma_umap_close, + .fault = rdma_umap_fault, +}; void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) { struct rdma_umap_priv *priv, *next_priv; - lockdep_assert_held(&ufile->hw_destroy_rwsem); + mutex_lock(&ufile->disassociation_lock); while (1) { struct mm_struct *mm = NULL; /* Get an arbitrary mm pointer that hasn't been cleaned yet */ mutex_lock(&ufile->umap_lock); - if (!list_empty(&ufile->umaps)) { - mm = list_first_entry(&ufile->umaps, - struct rdma_umap_priv, list) - ->vma->vm_mm; - mmget(mm); + while (!list_empty(&ufile->umaps)) { + int ret; + + priv = list_first_entry(&ufile->umaps, + struct rdma_umap_priv, list); + mm = priv->vma->vm_mm; + ret = mmget_not_zero(mm); + if (!ret) { + list_del_init(&priv->list); + if (priv->entry) { + rdma_user_mmap_entry_put(priv->entry); + priv->entry = NULL; + } + mm = NULL; + continue; + } + break; } mutex_unlock(&ufile->umap_lock); - if (!mm) + if (!mm) { + mutex_unlock(&ufile->disassociation_lock); return; + } /* - * The umap_lock is nested under mmap_sem since it used within + * The umap_lock is nested under mmap_lock since it used within * the vma_ops callbacks, so we have to clean the list one mm * at a time to get the lock ordering right. Typically there * will only be one mm, so no big deal. 
*/ - down_write(&mm->mmap_sem); + mmap_read_lock(mm); mutex_lock(&ufile->umap_lock); list_for_each_entry_safe (priv, next_priv, &ufile->umaps, list) { @@ -992,14 +880,42 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); - vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); + + if (priv->entry) { + rdma_user_mmap_entry_put(priv->entry); + priv->entry = NULL; + } } mutex_unlock(&ufile->umap_lock); - up_write(&mm->mmap_sem); + mmap_read_unlock(mm); mmput(mm); } + + mutex_unlock(&ufile->disassociation_lock); } +/** + * rdma_user_mmap_disassociate() - Revoke mmaps for a device + * @device: device to revoke + * + * This function should be called by drivers that need to disable mmaps for the + * device, for instance because it is going to be reset. + */ +void rdma_user_mmap_disassociate(struct ib_device *device) +{ + struct ib_uverbs_device *uverbs_dev = + ib_get_client_data(device, &uverbs_client); + struct ib_uverbs_file *ufile; + + mutex_lock(&uverbs_dev->lists_mutex); + list_for_each_entry(ufile, &uverbs_dev->uverbs_file_list, list) { + if (ufile->ucontext) + uverbs_user_mmap_disassociate(ufile); + } + mutex_unlock(&uverbs_dev->lists_mutex); +} +EXPORT_SYMBOL(rdma_user_mmap_disassociate); + /* * ib_uverbs_open() does not need the BKL: * @@ -1020,7 +936,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) int srcu_key; dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev); - if (!atomic_inc_not_zero(&dev->refcount)) + if (!refcount_inc_not_zero(&dev->refcount)) return -ENXIO; get_device(&dev->dev); @@ -1033,13 +949,18 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) goto err; } + if (!rdma_dev_access_netns(ib_dev, current->nsproxy->net_ns)) { + ret = -EPERM; + goto err; + } + /* In case IB device supports disassociate ucontext, there is no hard * dependency between uverbs device and its low level device. 
*/ module_dependent = !(ib_dev->ops.disassociate_ucontext); if (module_dependent) { - if (!try_module_get(ib_dev->owner)) { + if (!try_module_get(ib_dev->ops.owner)) { ret = -ENODEV; goto err; } @@ -1064,6 +985,8 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) mutex_init(&file->umap_lock); INIT_LIST_HEAD(&file->umaps); + mutex_init(&file->disassociation_lock); + filp->private_data = file; list_add_tail(&file->list, &dev->uverbs_file_list); mutex_unlock(&dev->lists_mutex); @@ -1071,15 +994,15 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) setup_ufile_idr_uobject(file); - return nonseekable_open(inode, filp); + return stream_open(inode, filp); err_module: - module_put(ib_dev->owner); + module_put(ib_dev->ops.owner); err: mutex_unlock(&dev->lists_mutex); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); - if (atomic_dec_and_test(&dev->refcount)) + if (refcount_dec_and_test(&dev->refcount)) ib_uverbs_comp_dev(dev); put_device(&dev->dev); @@ -1096,10 +1019,6 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) list_del_init(&file->list); mutex_unlock(&file->device->lists_mutex); - if (file->async_file) - kref_put(&file->async_file->ref, - ib_uverbs_release_async_event_file); - kref_put(&file->ref, ib_uverbs_release_file); return 0; @@ -1110,9 +1029,8 @@ static const struct file_operations uverbs_fops = { .write = ib_uverbs_write, .open = ib_uverbs_open, .release = ib_uverbs_close, - .llseek = no_llseek, .unlocked_ioctl = ib_uverbs_ioctl, - .compat_ioctl = ib_uverbs_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; static const struct file_operations uverbs_mmap_fops = { @@ -1121,16 +1039,45 @@ static const struct file_operations uverbs_mmap_fops = { .mmap = ib_uverbs_mmap, .open = ib_uverbs_open, .release = ib_uverbs_close, - .llseek = no_llseek, .unlocked_ioctl = ib_uverbs_ioctl, - .compat_ioctl = ib_uverbs_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; +static int ib_uverbs_get_nl_info(struct ib_device *ibdev, void *client_data, + struct ib_client_nl_info *res) +{ + struct ib_uverbs_device *uverbs_dev = client_data; + int ret; + + if (res->port != -1) + return -EINVAL; + + res->abi = ibdev->ops.uverbs_abi_ver; + res->cdev = &uverbs_dev->dev; + + /* + * To support DRIVER_ID binding in userspace some of the driver need + * upgrading to expose their PCI dependent revision information + * through get_context instead of relying on modalias matching. When + * the drivers are fixed they can drop this flag. 
+ */ + if (!ibdev->ops.uverbs_no_driver_id_binding) { + ret = nla_put_u32(res->nl_msg, RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID, + ibdev->ops.driver_id); + if (ret) + return ret; + } + return 0; +} + static struct ib_client uverbs_client = { .name = "uverbs", + .no_kverbs_req = true, .add = ib_uverbs_add_one, - .remove = ib_uverbs_remove_one + .remove = ib_uverbs_remove_one, + .get_nl_info = ib_uverbs_get_nl_info, }; +MODULE_ALIAS_RDMA_CLIENT("uverbs"); static ssize_t ibdev_show(struct device *device, struct device_attribute *attr, char *buf) @@ -1144,7 +1091,7 @@ static ssize_t ibdev_show(struct device *device, struct device_attribute *attr, srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) - ret = sprintf(buf, "%s\n", dev_name(&ib_dev->dev)); + ret = sysfs_emit(buf, "%s\n", dev_name(&ib_dev->dev)); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); return ret; @@ -1163,7 +1110,7 @@ static ssize_t abi_version_show(struct device *device, srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) - ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver); + ret = sysfs_emit(buf, "%u\n", ib_dev->ops.uverbs_abi_ver); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); return ret; @@ -1196,53 +1143,56 @@ static int ib_uverbs_create_uapi(struct ib_device *device, return 0; } -static void ib_uverbs_add_one(struct ib_device *device) +static int ib_uverbs_add_one(struct ib_device *device) { int devnum; dev_t base; struct ib_uverbs_device *uverbs_dev; int ret; - if (!device->ops.alloc_ucontext) - return; + if (!device->ops.alloc_ucontext || + device->type == RDMA_DEVICE_TYPE_SMI) + return -EOPNOTSUPP; uverbs_dev = kzalloc(sizeof(*uverbs_dev), GFP_KERNEL); if (!uverbs_dev) - return; + return -ENOMEM; ret = init_srcu_struct(&uverbs_dev->disassociate_srcu); if (ret) { kfree(uverbs_dev); - return; + return -ENOMEM; } device_initialize(&uverbs_dev->dev); - uverbs_dev->dev.class = uverbs_class; + uverbs_dev->dev.class = &uverbs_class; uverbs_dev->dev.parent = device->dev.parent; uverbs_dev->dev.release = ib_uverbs_release_dev; uverbs_dev->groups[0] = &dev_attr_group; uverbs_dev->dev.groups = uverbs_dev->groups; - atomic_set(&uverbs_dev->refcount, 1); + refcount_set(&uverbs_dev->refcount, 1); init_completion(&uverbs_dev->comp); uverbs_dev->xrcd_tree = RB_ROOT; mutex_init(&uverbs_dev->xrcd_tree_mutex); mutex_init(&uverbs_dev->lists_mutex); INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list); - INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list); rcu_assign_pointer(uverbs_dev->ib_dev, device); uverbs_dev->num_comp_vectors = device->num_comp_vectors; devnum = ida_alloc_max(&uverbs_ida, IB_UVERBS_MAX_DEVICES - 1, GFP_KERNEL); - if (devnum < 0) + if (devnum < 0) { + ret = -ENOMEM; goto err; + } uverbs_dev->devnum = devnum; if (devnum >= IB_UVERBS_NUM_FIXED_MINOR) base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR; else base = IB_UVERBS_BASE_DEV + devnum; - if (ib_uverbs_create_uapi(device, uverbs_dev)) + ret = ib_uverbs_create_uapi(device, uverbs_dev); + if (ret) goto err_uapi; uverbs_dev->dev.devt = base; @@ -1257,30 +1207,25 @@ static void ib_uverbs_add_one(struct ib_device *device) goto err_uapi; ib_set_client_data(device, &uverbs_client, uverbs_dev); - return; + return 0; err_uapi: ida_free(&uverbs_ida, devnum); err: - if (atomic_dec_and_test(&uverbs_dev->refcount)) + if (refcount_dec_and_test(&uverbs_dev->refcount)) ib_uverbs_comp_dev(uverbs_dev); 
wait_for_completion(&uverbs_dev->comp); put_device(&uverbs_dev->dev); - return; + return ret; } static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, struct ib_device *ib_dev) { struct ib_uverbs_file *file; - struct ib_uverbs_async_event_file *event_file; - struct ib_event event; /* Pending running commands to terminate */ uverbs_disassociate_api_pre(uverbs_dev); - event.event = IB_EVENT_DEVICE_FATAL; - event.element.port_num = 0; - event.device = ib_dev; mutex_lock(&uverbs_dev->lists_mutex); while (!list_empty(&uverbs_dev->uverbs_file_list)) { @@ -1296,31 +1241,11 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, */ mutex_unlock(&uverbs_dev->lists_mutex); - ib_uverbs_event_handler(&file->event_handler, &event); uverbs_destroy_ufile_hw(file, RDMA_REMOVE_DRIVER_REMOVE); kref_put(&file->ref, ib_uverbs_release_file); mutex_lock(&uverbs_dev->lists_mutex); } - - while (!list_empty(&uverbs_dev->uverbs_events_file_list)) { - event_file = list_first_entry(&uverbs_dev-> - uverbs_events_file_list, - struct ib_uverbs_async_event_file, - list); - spin_lock_irq(&event_file->ev_queue.lock); - event_file->ev_queue.is_closed = 1; - spin_unlock_irq(&event_file->ev_queue.lock); - - list_del(&event_file->list); - ib_unregister_event_handler( - &event_file->uverbs_file->event_handler); - event_file->uverbs_file->event_handler.device = - NULL; - - wake_up_interruptible(&event_file->ev_queue.poll_wait); - kill_fasync(&event_file->ev_queue.async_queue, SIGIO, POLL_IN); - } mutex_unlock(&uverbs_dev->lists_mutex); uverbs_disassociate_api(uverbs_dev->uapi); @@ -1331,9 +1256,6 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) struct ib_uverbs_device *uverbs_dev = client_data; int wait_clients = 1; - if (!uverbs_dev) - return; - cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev); ida_free(&uverbs_ida, uverbs_dev->devnum); @@ -1353,7 +1275,7 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) wait_clients = 0; } - if (atomic_dec_and_test(&uverbs_dev->refcount)) + if (refcount_dec_and_test(&uverbs_dev->refcount)) ib_uverbs_comp_dev(uverbs_dev); if (wait_clients) wait_for_completion(&uverbs_dev->comp); @@ -1361,13 +1283,6 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) put_device(&uverbs_dev->dev); } -static char *uverbs_devnode(struct device *dev, umode_t *mode) -{ - if (mode) - *mode = 0666; - return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); -} - static int __init ib_uverbs_init(void) { int ret; @@ -1388,16 +1303,13 @@ static int __init ib_uverbs_init(void) goto out_alloc; } - uverbs_class = class_create(THIS_MODULE, "infiniband_verbs"); - if (IS_ERR(uverbs_class)) { - ret = PTR_ERR(uverbs_class); + ret = class_register(&uverbs_class); + if (ret) { pr_err("user_verbs: couldn't create class infiniband_verbs\n"); goto out_chrdev; } - uverbs_class->devnode = uverbs_devnode; - - ret = class_create_file(uverbs_class, &class_attr_abi_version.attr); + ret = class_create_file(&uverbs_class, &class_attr_abi_version.attr); if (ret) { pr_err("user_verbs: couldn't create abi_version attribute\n"); goto out_class; @@ -1412,7 +1324,7 @@ static int __init ib_uverbs_init(void) return 0; out_class: - class_destroy(uverbs_class); + class_unregister(&uverbs_class); out_chrdev: unregister_chrdev_region(dynamic_uverbs_dev, @@ -1429,11 +1341,13 @@ out: static void __exit ib_uverbs_cleanup(void) { ib_unregister_client(&uverbs_client); - class_destroy(uverbs_class); + 
class_unregister(&uverbs_class); unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_NUM_FIXED_MINOR); unregister_chrdev_region(dynamic_uverbs_dev, IB_UVERBS_NUM_DYNAMIC_MINOR); + ib_cleanup_ucaps(); + mmu_notifier_synchronize(); } module_init(ib_uverbs_init); diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c index b8d715c68ca4..e803f609ec87 100644 --- a/drivers/infiniband/core/uverbs_marshall.c +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -66,7 +66,7 @@ void ib_copy_ah_attr_to_user(struct ib_device *device, struct rdma_ah_attr *src = ah_attr; struct rdma_ah_attr conv_ah; - memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved)); + memset(&dst->grh, 0, sizeof(dst->grh)); if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) && (rdma_ah_get_dlid(ah_attr) > be16_to_cpu(IB_LID_PERMISSIVE)) && @@ -171,45 +171,3 @@ void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, __ib_copy_path_rec_to_user(dst, src); } EXPORT_SYMBOL(ib_copy_path_rec_to_user); - -void ib_copy_path_rec_from_user(struct sa_path_rec *dst, - struct ib_user_path_rec *src) -{ - u32 slid, dlid; - - memset(dst, 0, sizeof(*dst)); - if ((ib_is_opa_gid((union ib_gid *)src->sgid)) || - (ib_is_opa_gid((union ib_gid *)src->dgid))) { - dst->rec_type = SA_PATH_REC_TYPE_OPA; - slid = opa_get_lid_from_gid((union ib_gid *)src->sgid); - dlid = opa_get_lid_from_gid((union ib_gid *)src->dgid); - } else { - dst->rec_type = SA_PATH_REC_TYPE_IB; - slid = ntohs(src->slid); - dlid = ntohs(src->dlid); - } - memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid); - memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid); - - sa_path_set_dlid(dst, dlid); - sa_path_set_slid(dst, slid); - sa_path_set_raw_traffic(dst, src->raw_traffic); - dst->flow_label = src->flow_label; - dst->hop_limit = src->hop_limit; - dst->traffic_class = src->traffic_class; - dst->reversible = src->reversible; - dst->numb_path = src->numb_path; - dst->pkey = src->pkey; - dst->sl = src->sl; - dst->mtu_selector = src->mtu_selector; - dst->mtu = src->mtu; - dst->rate_selector = src->rate_selector; - dst->rate = src->rate; - dst->packet_life_time = src->packet_life_time; - dst->preference = src->preference; - dst->packet_life_time_selector = src->packet_life_time_selector; - - /* TODO: No need to set this */ - sa_path_set_dmac_zero(dst); -} -EXPORT_SYMBOL(ib_copy_path_rec_from_user); diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index cbc72312eb41..13776a66e2e4 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -40,14 +40,17 @@ #include "uverbs.h" static int uverbs_free_ah(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { - return rdma_destroy_ah((struct ib_ah *)uobject->object, - RDMA_DESTROY_AH_SLEEPABLE); + return rdma_destroy_ah_user((struct ib_ah *)uobject->object, + RDMA_DESTROY_AH_SLEEPABLE, + &attrs->driver_udata); } static int uverbs_free_flow(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_flow *flow = (struct ib_flow *)uobject->object; struct ib_uflow_object *uflow = @@ -66,150 +69,101 @@ static int uverbs_free_flow(struct ib_uobject *uobject, } static int uverbs_free_mw(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { return uverbs_dealloc_mw((struct ib_mw 
*)uobject->object); } -static int uverbs_free_qp(struct ib_uobject *uobject, - enum rdma_remove_reason why) -{ - struct ib_qp *qp = uobject->object; - struct ib_uqp_object *uqp = - container_of(uobject, struct ib_uqp_object, uevent.uobject); - int ret; - - /* - * If this is a user triggered destroy then do not allow destruction - * until the user cleans up all the mcast bindings. Unlike in other - * places we forcibly clean up the mcast attachments for !DESTROY - * because the mcast attaches are not ubojects and will not be - * destroyed by anything else during cleanup processing. - */ - if (why == RDMA_REMOVE_DESTROY) { - if (!list_empty(&uqp->mcast_list)) - return -EBUSY; - } else if (qp == qp->real_qp) { - ib_uverbs_detach_umcast(qp, uqp); - } - - ret = ib_destroy_qp(qp); - if (ib_is_destroy_retryable(ret, why, uobject)) - return ret; - - if (uqp->uxrcd) - atomic_dec(&uqp->uxrcd->refcnt); - - ib_uverbs_release_uevent(uobject->context->ufile, &uqp->uevent); - return ret; -} - static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_rwq_ind_table *rwq_ind_tbl = uobject->object; struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl; - int ret; - - ret = ib_destroy_rwq_ind_table(rwq_ind_tbl); - if (ib_is_destroy_retryable(ret, why, uobject)) - return ret; + u32 table_size = (1 << rwq_ind_tbl->log_ind_tbl_size); + int ret, i; - kfree(ind_tbl); - return ret; -} - -static int uverbs_free_wq(struct ib_uobject *uobject, - enum rdma_remove_reason why) -{ - struct ib_wq *wq = uobject->object; - struct ib_uwq_object *uwq = - container_of(uobject, struct ib_uwq_object, uevent.uobject); - int ret; - - ret = ib_destroy_wq(wq); - if (ib_is_destroy_retryable(ret, why, uobject)) - return ret; + if (atomic_read(&rwq_ind_tbl->usecnt)) + return -EBUSY; - ib_uverbs_release_uevent(uobject->context->ufile, &uwq->uevent); - return ret; -} - -static int uverbs_free_srq(struct ib_uobject *uobject, - enum rdma_remove_reason why) -{ - struct ib_srq *srq = uobject->object; - struct ib_uevent_object *uevent = - container_of(uobject, struct ib_uevent_object, uobject); - enum ib_srq_type srq_type = srq->srq_type; - int ret; - - ret = ib_destroy_srq(srq); - if (ib_is_destroy_retryable(ret, why, uobject)) + ret = rwq_ind_tbl->device->ops.destroy_rwq_ind_table(rwq_ind_tbl); + if (ret) return ret; - if (srq_type == IB_SRQT_XRC) { - struct ib_usrq_object *us = - container_of(uevent, struct ib_usrq_object, uevent); + for (i = 0; i < table_size; i++) + atomic_dec(&ind_tbl[i]->usecnt); - atomic_dec(&us->uxrcd->refcnt); - } - - ib_uverbs_release_uevent(uobject->context->ufile, uevent); - return ret; + kfree(rwq_ind_tbl); + kfree(ind_tbl); + return 0; } static int uverbs_free_xrcd(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_xrcd *xrcd = uobject->object; struct ib_uxrcd_object *uxrcd = container_of(uobject, struct ib_uxrcd_object, uobject); int ret; - ret = ib_destroy_usecnt(&uxrcd->refcnt, why, uobject); - if (ret) - return ret; + if (atomic_read(&uxrcd->refcnt)) + return -EBUSY; - mutex_lock(&uobject->context->ufile->device->xrcd_tree_mutex); - ret = ib_uverbs_dealloc_xrcd(uobject, xrcd, why); - mutex_unlock(&uobject->context->ufile->device->xrcd_tree_mutex); + mutex_lock(&attrs->ufile->device->xrcd_tree_mutex); + ret = ib_uverbs_dealloc_xrcd(uobject, xrcd, why, attrs); + mutex_unlock(&attrs->ufile->device->xrcd_tree_mutex); 
return ret; } static int uverbs_free_pd(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_pd *pd = uobject->object; - int ret; - ret = ib_destroy_usecnt(&pd->usecnt, why, uobject); - if (ret) - return ret; + if (atomic_read(&pd->usecnt)) + return -EBUSY; - ib_dealloc_pd((struct ib_pd *)uobject->object); - return 0; + return ib_dealloc_pd_user(pd, &attrs->driver_udata); } -static int uverbs_hot_unplug_completion_event_file(struct ib_uobject *uobj, - enum rdma_remove_reason why) +void ib_uverbs_free_event_queue(struct ib_uverbs_event_queue *event_queue) { - struct ib_uverbs_completion_event_file *comp_event_file = - container_of(uobj, struct ib_uverbs_completion_event_file, - uobj); - struct ib_uverbs_event_queue *event_queue = &comp_event_file->ev_queue; + struct ib_uverbs_event *entry, *tmp; spin_lock_irq(&event_queue->lock); + /* + * The user must ensure that no new items are added to the event_list + * once is_closed is set. + */ event_queue->is_closed = 1; spin_unlock_irq(&event_queue->lock); + wake_up_interruptible(&event_queue->poll_wait); + kill_fasync(&event_queue->async_queue, SIGIO, POLL_IN); - if (why == RDMA_REMOVE_DRIVER_REMOVE) { - wake_up_interruptible(&event_queue->poll_wait); - kill_fasync(&event_queue->async_queue, SIGIO, POLL_IN); + spin_lock_irq(&event_queue->lock); + list_for_each_entry_safe(entry, tmp, &event_queue->event_list, list) { + if (entry->counter) + list_del(&entry->obj_list); + list_del(&entry->list); + kfree(entry); } - return 0; -}; + spin_unlock_irq(&event_queue->lock); +} + +static void +uverbs_completion_event_file_destroy_uobj(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + struct ib_uverbs_completion_event_file *file = + container_of(uobj, struct ib_uverbs_completion_event_file, + uobj); + + ib_uverbs_free_event_queue(&file->ev_queue); +} int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs) { @@ -220,15 +174,11 @@ EXPORT_SYMBOL(uverbs_destroy_def_handler); DECLARE_UVERBS_NAMED_OBJECT( UVERBS_OBJECT_COMP_CHANNEL, UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_completion_event_file), - uverbs_hot_unplug_completion_event_file, + uverbs_completion_event_file_destroy_uobj, &uverbs_event_fops, "[infinibandevent]", O_RDONLY)); -DECLARE_UVERBS_NAMED_OBJECT( - UVERBS_OBJECT_QP, - UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), uverbs_free_qp)); - DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_MW_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MW_HANDLE, @@ -240,11 +190,6 @@ DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MW, UVERBS_TYPE_ALLOC_IDR(uverbs_free_mw), &UVERBS_METHOD(UVERBS_METHOD_MW_DESTROY)); -DECLARE_UVERBS_NAMED_OBJECT( - UVERBS_OBJECT_SRQ, - UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), - uverbs_free_srq)); - DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_AH_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_AH_HANDLE, @@ -269,10 +214,6 @@ DECLARE_UVERBS_NAMED_OBJECT( uverbs_free_flow), &UVERBS_METHOD(UVERBS_METHOD_FLOW_DESTROY)); -DECLARE_UVERBS_NAMED_OBJECT( - UVERBS_OBJECT_WQ, - UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), uverbs_free_wq)); - DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_RWQ_IND_TBL_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_RWQ_IND_TBL_HANDLE, @@ -313,18 +254,12 @@ const struct uapi_definition uverbs_def_obj_intf[] = { UAPI_DEF_OBJ_NEEDS_FN(dealloc_pd)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_COMP_CHANNEL, UAPI_DEF_OBJ_NEEDS_FN(dealloc_pd)), - 
UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_QP, - UAPI_DEF_OBJ_NEEDS_FN(destroy_qp)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_AH, UAPI_DEF_OBJ_NEEDS_FN(destroy_ah)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MW, UAPI_DEF_OBJ_NEEDS_FN(dealloc_mw)), - UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_SRQ, - UAPI_DEF_OBJ_NEEDS_FN(destroy_srq)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_FLOW, UAPI_DEF_OBJ_NEEDS_FN(destroy_flow)), - UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_WQ, - UAPI_DEF_OBJ_NEEDS_FN(destroy_wq)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED( UVERBS_OBJECT_RWQ_IND_TBL, UAPI_DEF_OBJ_NEEDS_FN(destroy_rwq_ind_table)), diff --git a/drivers/infiniband/core/uverbs_std_types_async_fd.c b/drivers/infiniband/core/uverbs_std_types_async_fd.c new file mode 100644 index 000000000000..cc24cfdf7aee --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_async_fd.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. + */ + +#include <rdma/uverbs_std_types.h> +#include <rdma/uverbs_ioctl.h> +#include "rdma_core.h" +#include "uverbs.h" + +static int UVERBS_HANDLER(UVERBS_METHOD_ASYNC_EVENT_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_METHOD_ASYNC_EVENT_ALLOC); + + ib_uverbs_init_async_event_file( + container_of(uobj, struct ib_uverbs_async_event_file, uobj)); + return 0; +} + +static void uverbs_async_event_destroy_uobj(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + struct ib_uverbs_async_event_file *event_file = + container_of(uobj, struct ib_uverbs_async_event_file, uobj); + + ib_unregister_event_handler(&event_file->event_handler); + + if (why == RDMA_REMOVE_DRIVER_REMOVE) + ib_uverbs_async_handler(event_file, 0, IB_EVENT_DEVICE_FATAL, + NULL, NULL); +} + +int uverbs_async_event_release(struct inode *inode, struct file *filp) +{ + struct ib_uverbs_async_event_file *event_file; + struct ib_uobject *uobj = filp->private_data; + int ret; + + if (!uobj) + return uverbs_uobject_fd_release(inode, filp); + + event_file = + container_of(uobj, struct ib_uverbs_async_event_file, uobj); + + /* + * The async event FD has to deliver IB_EVENT_DEVICE_FATAL even after + * disassociation, so cleaning the event list must only happen after + * release. The user knows it has reached the end of the event stream + * when it sees IB_EVENT_DEVICE_FATAL. + */ + uverbs_uobject_get(uobj); + ret = uverbs_uobject_fd_release(inode, filp); + ib_uverbs_free_event_queue(&event_file->ev_queue); + uverbs_uobject_put(uobj); + return ret; +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_ASYNC_EVENT_ALLOC, + UVERBS_ATTR_FD(UVERBS_ATTR_ASYNC_EVENT_ALLOC_FD_HANDLE, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_NEW, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_async_event_file), + uverbs_async_event_destroy_uobj, + &uverbs_async_event_fops, + "[infinibandevent]", + O_RDONLY), + &UVERBS_METHOD(UVERBS_METHOD_ASYNC_EVENT_ALLOC)); + +const struct uapi_definition uverbs_def_obj_async_fd[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_ASYNC_EVENT), + {} +}; diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c b/drivers/infiniband/core/uverbs_std_types_counters.c index 309c5e80988d..381aa5797641 100644 --- a/drivers/infiniband/core/uverbs_std_types_counters.c +++ b/drivers/infiniband/core/uverbs_std_types_counters.c @@ -31,20 +31,25 @@ * SOFTWARE. 
*/ +#include "rdma_core.h" #include "uverbs.h" #include <rdma/uverbs_std_types.h> static int uverbs_free_counters(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_counters *counters = uobject->object; int ret; - ret = ib_destroy_usecnt(&counters->usecnt, why, uobject); + if (atomic_read(&counters->usecnt)) + return -EBUSY; + + ret = counters->device->ops.destroy_counters(counters); if (ret) return ret; - - return counters->device->ops.destroy_counters(counters); + kfree(counters); + return 0; } static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)( @@ -52,7 +57,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)( { struct ib_uobject *uobj = uverbs_attr_get_uobject( attrs, UVERBS_ATTR_CREATE_COUNTERS_HANDLE); - struct ib_device *ib_dev = uobj->context->device; + struct ib_device *ib_dev = attrs->context->device; struct ib_counters *counters; int ret; @@ -64,20 +69,19 @@ static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)( if (!ib_dev->ops.create_counters) return -EOPNOTSUPP; - counters = ib_dev->ops.create_counters(ib_dev, attrs); - if (IS_ERR(counters)) { - ret = PTR_ERR(counters); - goto err_create_counters; - } + counters = rdma_zalloc_drv_obj(ib_dev, ib_counters); + if (!counters) + return -ENOMEM; counters->device = ib_dev; counters->uobject = uobj; uobj->object = counters; atomic_set(&counters->usecnt, 0); - return 0; + ret = ib_dev->ops.create_counters(counters, attrs); + if (ret) + kfree(counters); -err_create_counters: return ret; } @@ -103,6 +107,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_READ)( return ret; uattr = uverbs_attr_get(attrs, UVERBS_ATTR_READ_COUNTERS_BUFF); + if (IS_ERR(uattr)) + return PTR_ERR(uattr); read_attr.ncounters = uattr->ptr_attr.len / sizeof(u64); read_attr.counters_buff = uverbs_zalloc( attrs, array_size(read_attr.ncounters, sizeof(u64))); diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index a59ea89e3f2b..fab5d914029d 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -33,28 +33,29 @@ #include <rdma/uverbs_std_types.h> #include "rdma_core.h" #include "uverbs.h" +#include "restrack.h" static int uverbs_free_cq(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_cq *cq = uobject->object; struct ib_uverbs_event_queue *ev_queue = cq->cq_context; struct ib_ucq_object *ucq = - container_of(uobject, struct ib_ucq_object, uobject); + container_of(uobject, struct ib_ucq_object, uevent.uobject); int ret; - ret = ib_destroy_cq(cq); - if (ib_is_destroy_retryable(ret, why, uobject)) + ret = ib_destroy_cq_user(cq, &attrs->driver_udata); + if (ret) return ret; ib_uverbs_release_ucq( - uobject->context->ufile, ev_queue ? 
container_of(ev_queue, struct ib_uverbs_completion_event_file, ev_queue) : NULL, ucq); - return ret; + return 0; } static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( @@ -62,16 +63,22 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( { struct ib_ucq_object *obj = container_of( uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE), - typeof(*obj), uobject); - struct ib_device *ib_dev = obj->uobject.context->device; - int ret; - u64 user_handle; + typeof(*obj), uevent.uobject); + struct ib_uverbs_completion_event_file *ev_file = NULL; + struct ib_device *ib_dev = attrs->context->device; + struct ib_umem_dmabuf *umem_dmabuf; struct ib_cq_init_attr attr = {}; - struct ib_cq *cq; - struct ib_uverbs_completion_event_file *ev_file = NULL; struct ib_uobject *ev_file_uobj; + struct ib_umem *umem = NULL; + u64 buffer_length; + u64 buffer_offset; + struct ib_cq *cq; + u64 user_handle; + u64 buffer_va; + int buffer_fd; + int ret; - if (!ib_dev->ops.create_cq || !ib_dev->ops.destroy_cq) + if ((!ib_dev->ops.create_cq && !ib_dev->ops.create_cq_umem) || !ib_dev->ops.destroy_cq) return -EOPNOTSUPP; ret = uverbs_copy_from(&attr.comp_vector, attrs, @@ -100,44 +107,111 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( uverbs_uobject_get(ev_file_uobj); } + obj->uevent.event_file = ib_uverbs_get_async_event( + attrs, UVERBS_ATTR_CREATE_CQ_EVENT_FD); + if (attr.comp_vector >= attrs->ufile->device->num_comp_vectors) { ret = -EINVAL; goto err_event_file; } - obj->comp_events_reported = 0; - obj->async_events_reported = 0; INIT_LIST_HEAD(&obj->comp_list); - INIT_LIST_HEAD(&obj->async_list); + INIT_LIST_HEAD(&obj->uevent.event_list); + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA)) { - cq = ib_dev->ops.create_cq(ib_dev, &attr, obj->uobject.context, - &attrs->driver_udata); - if (IS_ERR(cq)) { - ret = PTR_ERR(cq); + ret = uverbs_copy_from(&buffer_va, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA); + if (ret) + goto err_event_file; + + ret = uverbs_copy_from(&buffer_length, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH); + if (ret) + goto err_event_file; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD) || + uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET) || + !ib_dev->ops.create_cq_umem) { + ret = -EINVAL; + goto err_event_file; + } + + umem = ib_umem_get(ib_dev, buffer_va, buffer_length, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem)) { + ret = PTR_ERR(umem); + goto err_event_file; + } + } else if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD)) { + + ret = uverbs_get_raw_fd(&buffer_fd, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD); + if (ret) + goto err_event_file; + + ret = uverbs_copy_from(&buffer_offset, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET); + if (ret) + goto err_event_file; + + ret = uverbs_copy_from(&buffer_length, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH); + if (ret) + goto err_event_file; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA) || + !ib_dev->ops.create_cq_umem) { + ret = -EINVAL; + goto err_event_file; + } + + umem_dmabuf = ib_umem_dmabuf_get_pinned(ib_dev, buffer_offset, buffer_length, + buffer_fd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem_dmabuf)) { + ret = PTR_ERR(umem_dmabuf); + goto err_event_file; + } + umem = &umem_dmabuf->umem; + } else if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET) || + uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH) || + !ib_dev->ops.create_cq) { + ret = -EINVAL; + goto err_event_file; + } + + cq = rdma_zalloc_drv_obj(ib_dev, ib_cq); + if 
(!cq) { + ret = -ENOMEM; + ib_umem_release(umem); goto err_event_file; } cq->device = ib_dev; - cq->uobject = &obj->uobject; + cq->uobject = obj; cq->comp_handler = ib_uverbs_comp_handler; cq->event_handler = ib_uverbs_cq_event_handler; cq->cq_context = ev_file ? &ev_file->ev_queue : NULL; - obj->uobject.object = cq; - obj->uobject.user_handle = user_handle; atomic_set(&cq->usecnt, 0); - cq->res.type = RDMA_RESTRACK_CQ; - rdma_restrack_uadd(&cq->res); - ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe, - sizeof(cq->cqe)); + rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); + rdma_restrack_set_name(&cq->res, NULL); + + ret = umem ? ib_dev->ops.create_cq_umem(cq, &attr, umem, attrs) : + ib_dev->ops.create_cq(cq, &attr, attrs); if (ret) - goto err_cq; + goto err_free; - return 0; -err_cq: - ib_destroy_cq(cq); + obj->uevent.uobject.object = cq; + obj->uevent.uobject.user_handle = user_handle; + rdma_restrack_add(&cq->res); + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE); + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe, + sizeof(cq->cqe)); + return ret; + +err_free: + ib_umem_release(umem); + rdma_restrack_put(&cq->res); + kfree(cq); err_event_file: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); if (ev_file) uverbs_uobject_put(ev_file_uobj); return ret; @@ -167,6 +241,21 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_CQ_RESP_CQE, UVERBS_ATTR_TYPE(u32), UA_MANDATORY), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_CQ_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_VA, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_RAW_FD(UVERBS_ATTR_CREATE_CQ_BUFFER_FD, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), UVERBS_ATTR_UHW()); static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)( @@ -175,10 +264,10 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)( struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_CQ_HANDLE); struct ib_ucq_object *obj = - container_of(uobj, struct ib_ucq_object, uobject); + container_of(uobj, struct ib_ucq_object, uevent.uobject); struct ib_uverbs_destroy_cq_resp resp = { .comp_events_reported = obj->comp_events_reported, - .async_events_reported = obj->async_events_reported + .async_events_reported = obj->uevent.events_reported }; return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_CQ_RESP, &resp, @@ -198,11 +287,8 @@ DECLARE_UVERBS_NAMED_METHOD( DECLARE_UVERBS_NAMED_OBJECT( UVERBS_OBJECT_CQ, UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), uverbs_free_cq), - -#if IS_ENABLED(CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI) &UVERBS_METHOD(UVERBS_METHOD_CQ_CREATE), &UVERBS_METHOD(UVERBS_METHOD_CQ_DESTROY) -#endif ); const struct uapi_definition uverbs_def_obj_cq[] = { diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c index 5030ec480370..c0fd283d9d6c 100644 --- a/drivers/infiniband/core/uverbs_std_types_device.c +++ b/drivers/infiniband/core/uverbs_std_types_device.c @@ -3,11 +3,13 @@ * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. 
*/ +#include <linux/overflow.h> #include <rdma/uverbs_std_types.h> #include "rdma_core.h" #include "uverbs.h" #include <rdma/uverbs_ioctl.h> #include <rdma/opa_addr.h> +#include <rdma/ib_cache.h> /* * This ioctl method allows calling any defined write or write_ex @@ -38,7 +40,12 @@ static int UVERBS_HANDLER(UVERBS_METHOD_INVOKE_WRITE)( attrs->ucore.outlen < method_elm->resp_size) return -ENOSPC; - return method_elm->handler(attrs); + attrs->uobject = NULL; + rc = method_elm->handler(attrs); + if (attrs->uobject) + uverbs_finalize_object(attrs->uobject, UVERBS_ACCESS_NEW, true, + !rc, attrs); + return rc; } DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_INVOKE_WRITE, @@ -110,8 +117,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_INFO_HANDLES)( return ret; uapi_object = uapi_get_object(attrs->ufile->device->uapi, object_id); - if (!uapi_object) - return -EINVAL; + if (IS_ERR(uapi_object)) + return PTR_ERR(uapi_object); handles = gather_objects_handle(attrs->ufile, uapi_object, attrs, out_len, &total); @@ -160,7 +167,8 @@ void copy_port_attr_to_resp(struct ib_port_attr *attr, resp->subnet_timeout = attr->subnet_timeout; resp->init_type_reply = attr->init_type_reply; resp->active_width = attr->active_width; - resp->active_speed = attr->active_speed; + /* This ABI needs to be extended to provide any speed more than IB_SPEED_NDR */ + resp->active_speed = min_t(u16, attr->active_speed, IB_SPEED_NDR); resp->phys_state = attr->phys_state; resp->link_layer = rdma_port_get_link_layer(ib_dev, port_num); } @@ -168,12 +176,18 @@ void copy_port_attr_to_resp(struct ib_port_attr *attr, static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT)( struct uverbs_attr_bundle *attrs) { - struct ib_device *ib_dev = attrs->ufile->device->ib_dev; + struct ib_device *ib_dev; struct ib_port_attr attr = {}; struct ib_uverbs_query_port_resp_ex resp = {}; + struct ib_ucontext *ucontext; int ret; u8 port_num; + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + /* FIXME: Extend the UAPI_DEF_OBJ_NEEDS_FN stuff.. 
*/ if (!ib_dev->ops.query_port) return -EOPNOTSUPP; @@ -189,11 +203,253 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT)( copy_port_attr_to_resp(&attr, &resp.legacy_resp, ib_dev, port_num); resp.port_cap_flags2 = attr.port_cap_flags2; + resp.active_speed_ex = attr.active_speed; return uverbs_copy_to_struct_or_zero(attrs, UVERBS_ATTR_QUERY_PORT_RESP, &resp, sizeof(resp)); } +static int UVERBS_HANDLER(UVERBS_METHOD_GET_CONTEXT)( + struct uverbs_attr_bundle *attrs) +{ + u32 num_comp = attrs->ufile->device->num_comp_vectors; + u64 core_support = IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS; + int ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS, + &num_comp, sizeof(num_comp)); + if (IS_UVERBS_COPY_ERR(ret)) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT, + &core_support, sizeof(core_support)); + if (IS_UVERBS_COPY_ERR(ret)) + return ret; + + ret = ib_alloc_ucontext(attrs); + if (ret) + return ret; + ret = ib_init_ucontext(attrs); + if (ret) { + kfree(attrs->context); + attrs->context = NULL; + return ret; + } + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_CONTEXT)( + struct uverbs_attr_bundle *attrs) +{ + u64 core_support = IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS; + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + u32 num_comp; + int ret; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + if (!ib_dev->ops.query_ucontext) + return -EOPNOTSUPP; + + num_comp = attrs->ufile->device->num_comp_vectors; + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS, + &num_comp, sizeof(num_comp)); + if (IS_UVERBS_COPY_ERR(ret)) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT, + &core_support, sizeof(core_support)); + if (IS_UVERBS_COPY_ERR(ret)) + return ret; + + return ucontext->device->ops.query_ucontext(ucontext, attrs); +} + +static int copy_gid_entries_to_user(struct uverbs_attr_bundle *attrs, + struct ib_uverbs_gid_entry *entries, + size_t num_entries, size_t user_entry_size) +{ + const struct uverbs_attr *attr; + void __user *user_entries; + size_t copy_len; + int ret; + int i; + + if (user_entry_size == sizeof(*entries)) { + ret = uverbs_copy_to(attrs, + UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES, + entries, sizeof(*entries) * num_entries); + return ret; + } + + copy_len = min_t(size_t, user_entry_size, sizeof(*entries)); + attr = uverbs_attr_get(attrs, UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES); + if (IS_ERR(attr)) + return PTR_ERR(attr); + + user_entries = u64_to_user_ptr(attr->ptr_attr.data); + for (i = 0; i < num_entries; i++) { + if (copy_to_user(user_entries, entries, copy_len)) + return -EFAULT; + + if (user_entry_size > sizeof(*entries)) { + if (clear_user(user_entries + sizeof(*entries), + user_entry_size - sizeof(*entries))) + return -EFAULT; + } + + entries++; + user_entries += user_entry_size; + } + + return uverbs_output_written(attrs, + UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES); +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_GID_TABLE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_gid_entry *entries; + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + size_t user_entry_size; + ssize_t num_entries; + int max_entries; + u32 flags; + int ret; + + ret = uverbs_get_flags32(&flags, attrs, + UVERBS_ATTR_QUERY_GID_TABLE_FLAGS, 0); + if (ret) + return ret; + + ret = uverbs_get_const(&user_entry_size, attrs, + 
UVERBS_ATTR_QUERY_GID_TABLE_ENTRY_SIZE); + if (ret) + return ret; + + if (!user_entry_size) + return -EINVAL; + + max_entries = uverbs_attr_ptr_get_array_size( + attrs, UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES, + user_entry_size); + if (max_entries <= 0) + return max_entries ?: -EINVAL; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + entries = uverbs_kcalloc(attrs, max_entries, sizeof(*entries)); + if (IS_ERR(entries)) + return PTR_ERR(entries); + + num_entries = rdma_query_gid_table(ib_dev, entries, max_entries); + if (num_entries < 0) + return -EINVAL; + + ret = copy_gid_entries_to_user(attrs, entries, num_entries, + user_entry_size); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, + UVERBS_ATTR_QUERY_GID_TABLE_RESP_NUM_ENTRIES, + &num_entries, sizeof(num_entries)); + return ret; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_GID_ENTRY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_gid_entry entry = {}; + const struct ib_gid_attr *gid_attr; + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + struct net_device *ndev; + u32 gid_index; + u32 port_num; + u32 flags; + int ret; + + ret = uverbs_get_flags32(&flags, attrs, + UVERBS_ATTR_QUERY_GID_ENTRY_FLAGS, 0); + if (ret) + return ret; + + ret = uverbs_get_const(&port_num, attrs, + UVERBS_ATTR_QUERY_GID_ENTRY_PORT); + if (ret) + return ret; + + ret = uverbs_get_const(&gid_index, attrs, + UVERBS_ATTR_QUERY_GID_ENTRY_GID_INDEX); + if (ret) + return ret; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + if (!rdma_is_port_valid(ib_dev, port_num)) + return -EINVAL; + + gid_attr = rdma_get_gid_attr(ib_dev, port_num, gid_index); + if (IS_ERR(gid_attr)) + return PTR_ERR(gid_attr); + + memcpy(&entry.gid, &gid_attr->gid, sizeof(gid_attr->gid)); + entry.gid_index = gid_attr->index; + entry.port_num = gid_attr->port_num; + entry.gid_type = gid_attr->gid_type; + + rcu_read_lock(); + ndev = rdma_read_gid_attr_ndev_rcu(gid_attr); + if (IS_ERR(ndev)) { + if (PTR_ERR(ndev) != -ENODEV) { + ret = PTR_ERR(ndev); + rcu_read_unlock(); + goto out; + } + } else { + entry.netdev_ifindex = ndev->ifindex; + } + rcu_read_unlock(); + + ret = uverbs_copy_to_struct_or_zero( + attrs, UVERBS_ATTR_QUERY_GID_ENTRY_RESP_ENTRY, &entry, + sizeof(entry)); +out: + rdma_put_gid_attr(gid_attr); + return ret; +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_GET_CONTEXT, + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS, + UVERBS_ATTR_TYPE(u32), UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT, + UVERBS_ATTR_TYPE(u64), UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_GET_CONTEXT_FD_ARR, + UVERBS_ATTR_MIN_SIZE(sizeof(int)), + UA_OPTIONAL, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_UHW()); + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_CONTEXT, + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS, + UVERBS_ATTR_TYPE(u32), UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT, + UVERBS_ATTR_TYPE(u64), UA_OPTIONAL)); + DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_INFO_HANDLES, /* Also includes any device specific object ids */ @@ -210,13 +466,41 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_PTR_OUT( UVERBS_ATTR_QUERY_PORT_RESP, UVERBS_ATTR_STRUCT(struct ib_uverbs_query_port_resp_ex, - reserved), + active_speed_ex), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_GID_TABLE, + 
UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_TABLE_ENTRY_SIZE, u64, + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_QUERY_GID_TABLE_FLAGS, u32, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES, + UVERBS_ATTR_MIN_SIZE(0), UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_GID_TABLE_RESP_NUM_ENTRIES, + UVERBS_ATTR_TYPE(u64), UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_GID_ENTRY, + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_ENTRY_PORT, u32, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_ENTRY_GID_INDEX, u32, + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_QUERY_GID_ENTRY_FLAGS, u32, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_GID_ENTRY_RESP_ENTRY, + UVERBS_ATTR_STRUCT(struct ib_uverbs_gid_entry, + netdev_ifindex), + UA_MANDATORY)); + DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE, + &UVERBS_METHOD(UVERBS_METHOD_GET_CONTEXT), &UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE), &UVERBS_METHOD(UVERBS_METHOD_INFO_HANDLES), - &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT)); + &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_CONTEXT), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_TABLE), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_ENTRY)); const struct uapi_definition uverbs_def_obj_device[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DEVICE), diff --git a/drivers/infiniband/core/uverbs_std_types_dm.c b/drivers/infiniband/core/uverbs_std_types_dm.c index 2ef70637bee1..98c522cf86d6 100644 --- a/drivers/infiniband/core/uverbs_std_types_dm.c +++ b/drivers/infiniband/core/uverbs_std_types_dm.c @@ -30,20 +30,20 @@ * SOFTWARE. */ +#include "rdma_core.h" #include "uverbs.h" #include <rdma/uverbs_std_types.h> static int uverbs_free_dm(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_dm *dm = uobject->object; - int ret; - ret = ib_destroy_usecnt(&dm->usecnt, why, uobject); - if (ret) - return ret; + if (atomic_read(&dm->usecnt)) + return -EBUSY; - return dm->device->ops.dealloc_dm(dm); + return dm->device->ops.dealloc_dm(dm, attrs); } static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)( @@ -53,7 +53,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)( struct ib_uobject *uobj = uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DM_HANDLE) ->obj_attr.uobject; - struct ib_device *ib_dev = uobj->context->device; + struct ib_device *ib_dev = attrs->context->device; struct ib_dm *dm; int ret; @@ -70,7 +70,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)( if (ret) return ret; - dm = ib_dev->ops.alloc_dm(ib_dev, uobj->context, &attr, attrs); + dm = ib_dev->ops.alloc_dm(ib_dev, attrs->context, &attr, attrs); if (IS_ERR(dm)) return PTR_ERR(dm); diff --git a/drivers/infiniband/core/uverbs_std_types_dmah.c b/drivers/infiniband/core/uverbs_std_types_dmah.c new file mode 100644 index 000000000000..453ce656c6f2 --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_dmah.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#include "rdma_core.h" +#include "uverbs.h" +#include <rdma/uverbs_std_types.h> +#include "restrack.h" + +static int uverbs_free_dmah(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_dmah *dmah = uobject->object; + int ret; + + if (atomic_read(&dmah->usecnt)) + return -EBUSY; + + ret = dmah->device->ops.dealloc_dmah(dmah, attrs); + if (ret) + return ret; + + rdma_restrack_del(&dmah->res); + kfree(dmah); + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_DMAH_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DMAH_HANDLE) + ->obj_attr.uobject; + struct ib_device *ib_dev = attrs->context->device; + struct ib_dmah *dmah; + int ret; + + dmah = rdma_zalloc_drv_obj(ib_dev, ib_dmah); + if (!dmah) + return -ENOMEM; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_CPU_ID)) { + ret = uverbs_copy_from(&dmah->cpu_id, attrs, + UVERBS_ATTR_ALLOC_DMAH_CPU_ID); + if (ret) + goto err; + + if (!cpumask_test_cpu(dmah->cpu_id, current->cpus_ptr)) { + ret = -EPERM; + goto err; + } + + dmah->valid_fields |= BIT(IB_DMAH_CPU_ID_EXISTS); + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE)) { + dmah->mem_type = uverbs_attr_get_enum_id(attrs, + UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE); + dmah->valid_fields |= BIT(IB_DMAH_MEM_TYPE_EXISTS); + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_PH)) { + ret = uverbs_copy_from(&dmah->ph, attrs, + UVERBS_ATTR_ALLOC_DMAH_PH); + if (ret) + goto err; + + /* Per PCIe spec 6.2-1.0, only the lowest two bits are applicable */ + if (dmah->ph & 0xFC) { + ret = -EINVAL; + goto err; + } + + dmah->valid_fields |= BIT(IB_DMAH_PH_EXISTS); + } + + dmah->device = ib_dev; + dmah->uobject = uobj; + atomic_set(&dmah->usecnt, 0); + + rdma_restrack_new(&dmah->res, RDMA_RESTRACK_DMAH); + rdma_restrack_set_name(&dmah->res, NULL); + + ret = ib_dev->ops.alloc_dmah(dmah, attrs); + if (ret) { + rdma_restrack_put(&dmah->res); + goto err; + } + + uobj->object = dmah; + rdma_restrack_add(&dmah->res); + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_ALLOC_DMAH_HANDLE); + return 0; +err: + kfree(dmah); + return ret; +} + +static const struct uverbs_attr_spec uverbs_dmah_mem_type[] = { + [TPH_MEM_TYPE_VM] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), + }, + [TPH_MEM_TYPE_PM] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), + }, +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_DMAH_ALLOC, + UVERBS_ATTR_IDR(UVERBS_ATTR_ALLOC_DMAH_HANDLE, + UVERBS_OBJECT_DMAH, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMAH_CPU_ID, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE, + uverbs_dmah_mem_type, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMAH_PH, + UVERBS_ATTR_TYPE(u8), + UA_OPTIONAL)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + UVERBS_METHOD_DMAH_FREE, + UVERBS_ATTR_IDR(UVERBS_ATTR_FREE_DMA_HANDLE, + UVERBS_OBJECT_DMAH, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DMAH, + UVERBS_TYPE_ALLOC_IDR(uverbs_free_dmah), + &UVERBS_METHOD(UVERBS_METHOD_DMAH_ALLOC), + &UVERBS_METHOD(UVERBS_METHOD_DMAH_FREE)); + +const struct uapi_definition uverbs_def_obj_dmah[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DMAH, + UAPI_DEF_OBJ_NEEDS_FN(dealloc_dmah), + UAPI_DEF_OBJ_NEEDS_FN(alloc_dmah)), + {} +}; diff --git 
a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c index 4962b87fa600..0ddcf6da66c4 100644 --- a/drivers/infiniband/core/uverbs_std_types_flow_action.c +++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c @@ -30,401 +30,22 @@ * SOFTWARE. */ +#include "rdma_core.h" #include "uverbs.h" #include <rdma/uverbs_std_types.h> static int uverbs_free_flow_action(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_flow_action *action = uobject->object; - int ret; - ret = ib_destroy_usecnt(&action->usecnt, why, uobject); - if (ret) - return ret; + if (atomic_read(&action->usecnt)) + return -EBUSY; return action->device->ops.destroy_flow_action(action); } -static u64 esp_flags_uverbs_to_verbs(struct uverbs_attr_bundle *attrs, - u32 flags, bool is_modify) -{ - u64 verbs_flags = flags; - - if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ESN)) - verbs_flags |= IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED; - - if (is_modify && uverbs_attr_is_valid(attrs, - UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS)) - verbs_flags |= IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS; - - return verbs_flags; -}; - -static int validate_flow_action_esp_keymat_aes_gcm(struct ib_flow_action_attrs_esp_keymats *keymat) -{ - struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm = - &keymat->keymat.aes_gcm; - - if (aes_gcm->iv_algo > IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ) - return -EOPNOTSUPP; - - if (aes_gcm->key_len != 32 && - aes_gcm->key_len != 24 && - aes_gcm->key_len != 16) - return -EINVAL; - - if (aes_gcm->icv_len != 16 && - aes_gcm->icv_len != 8 && - aes_gcm->icv_len != 12) - return -EINVAL; - - return 0; -} - -static int (* const flow_action_esp_keymat_validate[])(struct ib_flow_action_attrs_esp_keymats *keymat) = { - [IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM] = validate_flow_action_esp_keymat_aes_gcm, -}; - -static int flow_action_esp_replay_none(struct ib_flow_action_attrs_esp_replays *replay, - bool is_modify) -{ - /* This is used in order to modify an esp flow action with an enabled - * replay protection to a disabled one. This is only supported via - * modify, as in create verb we can simply drop the REPLAY attribute and - * achieve the same thing. - */ - return is_modify ? 0 : -EINVAL; -} - -static int flow_action_esp_replay_def_ok(struct ib_flow_action_attrs_esp_replays *replay, - bool is_modify) -{ - /* Some replay protections could always be enabled without validating - * anything. 
- */ - return 0; -} - -static int (* const flow_action_esp_replay_validate[])(struct ib_flow_action_attrs_esp_replays *replay, - bool is_modify) = { - [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE] = flow_action_esp_replay_none, - [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP] = flow_action_esp_replay_def_ok, -}; - -static int parse_esp_ip(enum ib_flow_spec_type proto, - const void __user *val_ptr, - size_t len, union ib_flow_spec *out) -{ - int ret; - const struct ib_uverbs_flow_ipv4_filter ipv4 = { - .src_ip = cpu_to_be32(0xffffffffUL), - .dst_ip = cpu_to_be32(0xffffffffUL), - .proto = 0xff, - .tos = 0xff, - .ttl = 0xff, - .flags = 0xff, - }; - const struct ib_uverbs_flow_ipv6_filter ipv6 = { - .src_ip = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, - .dst_ip = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, - .flow_label = cpu_to_be32(0xffffffffUL), - .next_hdr = 0xff, - .traffic_class = 0xff, - .hop_limit = 0xff, - }; - union { - struct ib_uverbs_flow_ipv4_filter ipv4; - struct ib_uverbs_flow_ipv6_filter ipv6; - } user_val = {}; - const void *user_pmask; - size_t val_len; - - /* If the flow IPv4/IPv6 flow specifications are extended, the mask - * should be changed as well. - */ - BUILD_BUG_ON(offsetof(struct ib_uverbs_flow_ipv4_filter, flags) + - sizeof(ipv4.flags) != sizeof(ipv4)); - BUILD_BUG_ON(offsetof(struct ib_uverbs_flow_ipv6_filter, reserved) + - sizeof(ipv6.reserved) != sizeof(ipv6)); - - switch (proto) { - case IB_FLOW_SPEC_IPV4: - if (len > sizeof(user_val.ipv4) && - !ib_is_buffer_cleared(val_ptr + sizeof(user_val.ipv4), - len - sizeof(user_val.ipv4))) - return -EOPNOTSUPP; - - val_len = min_t(size_t, len, sizeof(user_val.ipv4)); - ret = copy_from_user(&user_val.ipv4, val_ptr, - val_len); - if (ret) - return -EFAULT; - - user_pmask = &ipv4; - break; - case IB_FLOW_SPEC_IPV6: - if (len > sizeof(user_val.ipv6) && - !ib_is_buffer_cleared(val_ptr + sizeof(user_val.ipv6), - len - sizeof(user_val.ipv6))) - return -EOPNOTSUPP; - - val_len = min_t(size_t, len, sizeof(user_val.ipv6)); - ret = copy_from_user(&user_val.ipv6, val_ptr, - val_len); - if (ret) - return -EFAULT; - - user_pmask = &ipv6; - break; - default: - return -EOPNOTSUPP; - } - - return ib_uverbs_kern_spec_to_ib_spec_filter(proto, user_pmask, - &user_val, - val_len, out); -} - -static int flow_action_esp_get_encap(struct ib_flow_spec_list *out, - struct uverbs_attr_bundle *attrs) -{ - struct ib_uverbs_flow_action_esp_encap uverbs_encap; - int ret; - - ret = uverbs_copy_from(&uverbs_encap, attrs, - UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP); - if (ret) - return ret; - - /* We currently support only one encap */ - if (uverbs_encap.next_ptr) - return -EOPNOTSUPP; - - if (uverbs_encap.type != IB_FLOW_SPEC_IPV4 && - uverbs_encap.type != IB_FLOW_SPEC_IPV6) - return -EOPNOTSUPP; - - return parse_esp_ip(uverbs_encap.type, - u64_to_user_ptr(uverbs_encap.val_ptr), - uverbs_encap.len, - &out->spec); -} - -struct ib_flow_action_esp_attr { - struct ib_flow_action_attrs_esp hdr; - struct ib_flow_action_attrs_esp_keymats keymat; - struct ib_flow_action_attrs_esp_replays replay; - /* We currently support only one spec */ - struct ib_flow_spec_list encap; -}; - -#define ESP_LAST_SUPPORTED_FLAG IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW -static int parse_flow_action_esp(struct ib_device *ib_dev, - struct uverbs_attr_bundle *attrs, - struct ib_flow_action_esp_attr *esp_attr, - bool is_modify) -{ - struct ib_uverbs_flow_action_esp uverbs_esp = {}; - int 
ret; - - /* Optional param, if it doesn't exist, we get -ENOENT and skip it */ - ret = uverbs_copy_from(&esp_attr->hdr.esn, attrs, - UVERBS_ATTR_FLOW_ACTION_ESP_ESN); - if (IS_UVERBS_COPY_ERR(ret)) - return ret; - - /* This can be called from FLOW_ACTION_ESP_MODIFY where - * UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS is optional - */ - if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS)) { - ret = uverbs_copy_from_or_zero(&uverbs_esp, attrs, - UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS); - if (ret) - return ret; - - if (uverbs_esp.flags & ~((ESP_LAST_SUPPORTED_FLAG << 1) - 1)) - return -EOPNOTSUPP; - - esp_attr->hdr.spi = uverbs_esp.spi; - esp_attr->hdr.seq = uverbs_esp.seq; - esp_attr->hdr.tfc_pad = uverbs_esp.tfc_pad; - esp_attr->hdr.hard_limit_pkts = uverbs_esp.hard_limit_pkts; - } - esp_attr->hdr.flags = esp_flags_uverbs_to_verbs(attrs, uverbs_esp.flags, - is_modify); - - if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT)) { - esp_attr->keymat.protocol = - uverbs_attr_get_enum_id(attrs, - UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT); - ret = uverbs_copy_from_or_zero(&esp_attr->keymat.keymat, - attrs, - UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT); - if (ret) - return ret; - - ret = flow_action_esp_keymat_validate[esp_attr->keymat.protocol](&esp_attr->keymat); - if (ret) - return ret; - - esp_attr->hdr.keymat = &esp_attr->keymat; - } - - if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY)) { - esp_attr->replay.protocol = - uverbs_attr_get_enum_id(attrs, - UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY); - - ret = uverbs_copy_from_or_zero(&esp_attr->replay.replay, - attrs, - UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY); - if (ret) - return ret; - - ret = flow_action_esp_replay_validate[esp_attr->replay.protocol](&esp_attr->replay, - is_modify); - if (ret) - return ret; - - esp_attr->hdr.replay = &esp_attr->replay; - } - - if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP)) { - ret = flow_action_esp_get_encap(&esp_attr->encap, attrs); - if (ret) - return ret; - - esp_attr->hdr.encap = &esp_attr->encap; - } - - return 0; -} - -static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)( - struct uverbs_attr_bundle *attrs) -{ - struct ib_uobject *uobj = uverbs_attr_get_uobject( - attrs, UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE); - struct ib_device *ib_dev = uobj->context->device; - int ret; - struct ib_flow_action *action; - struct ib_flow_action_esp_attr esp_attr = {}; - - if (!ib_dev->ops.create_flow_action_esp) - return -EOPNOTSUPP; - - ret = parse_flow_action_esp(ib_dev, attrs, &esp_attr, false); - if (ret) - return ret; - - /* No need to check as this attribute is marked as MANDATORY */ - action = ib_dev->ops.create_flow_action_esp(ib_dev, &esp_attr.hdr, - attrs); - if (IS_ERR(action)) - return PTR_ERR(action); - - uverbs_flow_action_fill_action(action, uobj, ib_dev, - IB_FLOW_ACTION_ESP); - - return 0; -} - -static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)( - struct uverbs_attr_bundle *attrs) -{ - struct ib_uobject *uobj = uverbs_attr_get_uobject( - attrs, UVERBS_ATTR_MODIFY_FLOW_ACTION_ESP_HANDLE); - struct ib_flow_action *action = uobj->object; - int ret; - struct ib_flow_action_esp_attr esp_attr = {}; - - if (!action->device->ops.modify_flow_action_esp) - return -EOPNOTSUPP; - - ret = parse_flow_action_esp(action->device, attrs, &esp_attr, true); - if (ret) - return ret; - - if (action->type != IB_FLOW_ACTION_ESP) - return -EINVAL; - - return action->device->ops.modify_flow_action_esp(action, - &esp_attr.hdr, - attrs); -} - -static const struct 
uverbs_attr_spec uverbs_flow_action_esp_keymat[] = { - [IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM] = { - .type = UVERBS_ATTR_TYPE_PTR_IN, - UVERBS_ATTR_STRUCT( - struct ib_uverbs_flow_action_esp_keymat_aes_gcm, - aes_key), - }, -}; - -static const struct uverbs_attr_spec uverbs_flow_action_esp_replay[] = { - [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE] = { - .type = UVERBS_ATTR_TYPE_PTR_IN, - UVERBS_ATTR_NO_DATA(), - }, - [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP] = { - .type = UVERBS_ATTR_TYPE_PTR_IN, - UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp_replay_bmp, - size), - }, -}; - -DECLARE_UVERBS_NAMED_METHOD( - UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, - UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE, - UVERBS_OBJECT_FLOW_ACTION, - UVERBS_ACCESS_NEW, - UA_MANDATORY), - UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS, - UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp, - hard_limit_pkts), - UA_MANDATORY), - UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ESN, - UVERBS_ATTR_TYPE(__u32), - UA_OPTIONAL), - UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT, - uverbs_flow_action_esp_keymat, - UA_MANDATORY), - UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY, - uverbs_flow_action_esp_replay, - UA_OPTIONAL), - UVERBS_ATTR_PTR_IN( - UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, - UVERBS_ATTR_TYPE(struct ib_uverbs_flow_action_esp_encap), - UA_OPTIONAL)); - -DECLARE_UVERBS_NAMED_METHOD( - UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY, - UVERBS_ATTR_IDR(UVERBS_ATTR_MODIFY_FLOW_ACTION_ESP_HANDLE, - UVERBS_OBJECT_FLOW_ACTION, - UVERBS_ACCESS_WRITE, - UA_MANDATORY), - UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS, - UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp, - hard_limit_pkts), - UA_OPTIONAL), - UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ESN, - UVERBS_ATTR_TYPE(__u32), - UA_OPTIONAL), - UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT, - uverbs_flow_action_esp_keymat, - UA_OPTIONAL), - UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY, - uverbs_flow_action_esp_replay, - UA_OPTIONAL), - UVERBS_ATTR_PTR_IN( - UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, - UVERBS_ATTR_TYPE(struct ib_uverbs_flow_action_esp_encap), - UA_OPTIONAL)); - DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_FLOW_ACTION_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE, @@ -435,9 +56,7 @@ DECLARE_UVERBS_NAMED_METHOD_DESTROY( DECLARE_UVERBS_NAMED_OBJECT( UVERBS_OBJECT_FLOW_ACTION, UVERBS_TYPE_ALLOC_IDR(uverbs_free_flow_action), - &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE), - &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_DESTROY), - &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)); + &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_DESTROY)); const struct uapi_definition uverbs_def_obj_flow_action[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED( diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c index 4d4be0c2b752..570b9656801d 100644 --- a/drivers/infiniband/core/uverbs_std_types_mr.c +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * Copyright (c) 2020, Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -30,13 +31,17 @@ * SOFTWARE. 
*/ +#include "rdma_core.h" #include "uverbs.h" #include <rdma/uverbs_std_types.h> +#include "restrack.h" static int uverbs_free_mr(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { - return ib_dereg_mr((struct ib_mr *)uobject->object); + return ib_dereg_mr_user((struct ib_mr *)uobject->object, + &attrs->driver_udata); } static int UVERBS_HANDLER(UVERBS_METHOD_ADVISE_MR)( @@ -66,7 +71,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_ADVISE_MR)( num_sge = uverbs_attr_ptr_get_array_size( attrs, UVERBS_ATTR_ADVISE_MR_SGE_LIST, sizeof(struct ib_sge)); - if (num_sge < 0) + if (num_sge <= 0) return num_sge; sg_list = uverbs_attr_get_alloced_ptr(attrs, @@ -111,7 +116,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)( if (!(attr.access_flags & IB_ZERO_BASED)) return -EINVAL; - ret = ib_check_mr_access(attr.access_flags); + ret = ib_check_mr_access(ib_dev, attr.access_flags); if (ret) return ret; @@ -125,28 +130,268 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)( mr->device = pd->device; mr->pd = pd; + mr->type = IB_MR_TYPE_DM; mr->dm = dm; mr->uobject = uobj; atomic_inc(&pd->usecnt); atomic_inc(&dm->usecnt); + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_set_name(&mr->res, NULL); + rdma_restrack_add(&mr->res); uobj->object = mr; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_DM_MR_HANDLE); + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DM_MR_RESP_LKEY, &mr->lkey, sizeof(mr->lkey)); if (ret) - goto err_dereg; + return ret; ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DM_MR_RESP_RKEY, &mr->rkey, sizeof(mr->rkey)); + return ret; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_MR)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_mr *mr = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_QUERY_MR_HANDLE); + int ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_LKEY, &mr->lkey, + sizeof(mr->lkey)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_RKEY, + &mr->rkey, sizeof(mr->rkey)); + + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_LENGTH, + &mr->length, sizeof(mr->length)); + + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_IOVA, + &mr->iova, sizeof(mr->iova)); + + return IS_UVERBS_COPY_ERR(ret) ? 
ret : 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_REG_DMABUF_MR_HANDLE); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DMABUF_MR_PD_HANDLE); + struct ib_device *ib_dev = pd->device; + + u64 offset, length, iova; + u32 fd, access_flags; + struct ib_mr *mr; + int ret; + + if (!ib_dev->ops.reg_user_mr_dmabuf) + return -EOPNOTSUPP; + + ret = uverbs_copy_from(&offset, attrs, + UVERBS_ATTR_REG_DMABUF_MR_OFFSET); + if (ret) + return ret; + + ret = uverbs_copy_from(&length, attrs, + UVERBS_ATTR_REG_DMABUF_MR_LENGTH); + if (ret) + return ret; + + ret = uverbs_copy_from(&iova, attrs, + UVERBS_ATTR_REG_DMABUF_MR_IOVA); + if (ret) + return ret; + + if ((offset & ~PAGE_MASK) != (iova & ~PAGE_MASK)) + return -EINVAL; + + ret = uverbs_copy_from(&fd, attrs, + UVERBS_ATTR_REG_DMABUF_MR_FD); + if (ret) + return ret; + + ret = uverbs_get_flags32(&access_flags, attrs, + UVERBS_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC | + IB_ACCESS_RELAXED_ORDERING); + if (ret) + return ret; + + ret = ib_check_mr_access(ib_dev, access_flags); + if (ret) + return ret; + + mr = pd->device->ops.reg_user_mr_dmabuf(pd, offset, length, iova, fd, + access_flags, NULL, + attrs); + if (IS_ERR(mr)) + return PTR_ERR(mr); + + mr->device = pd->device; + mr->pd = pd; + mr->type = IB_MR_TYPE_USER; + mr->uobject = uobj; + atomic_inc(&pd->usecnt); + + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_set_name(&mr->res, NULL); + rdma_restrack_add(&mr->res); + uobj->object = mr; + + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_DMABUF_MR_HANDLE); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DMABUF_MR_RESP_LKEY, + &mr->lkey, sizeof(mr->lkey)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DMABUF_MR_RESP_RKEY, + &mr->rkey, sizeof(mr->rkey)); + return ret; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_REG_MR)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_REG_MR_HANDLE); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_MR_PD_HANDLE); + u32 valid_access_flags = IB_ACCESS_SUPPORTED; + u64 length, iova, fd_offset = 0, addr = 0; + struct ib_device *ib_dev = pd->device; + struct ib_dmah *dmah = NULL; + bool has_fd_offset = false; + bool has_addr = false; + bool has_fd = false; + u32 access_flags; + struct ib_mr *mr; + int fd; + int ret; + + ret = uverbs_copy_from(&iova, attrs, UVERBS_ATTR_REG_MR_IOVA); if (ret) - goto err_dereg; + return ret; + + ret = uverbs_copy_from(&length, attrs, UVERBS_ATTR_REG_MR_LENGTH); + if (ret) + return ret; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_ADDR)) { + ret = uverbs_copy_from(&addr, attrs, + UVERBS_ATTR_REG_MR_ADDR); + if (ret) + return ret; + has_addr = true; + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_FD_OFFSET)) { + ret = uverbs_copy_from(&fd_offset, attrs, + UVERBS_ATTR_REG_MR_FD_OFFSET); + if (ret) + return ret; + has_fd_offset = true; + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_FD)) { + ret = uverbs_get_raw_fd(&fd, attrs, + UVERBS_ATTR_REG_MR_FD); + if (ret) + return ret; + has_fd = true; + } + + if (has_fd) { + if (!ib_dev->ops.reg_user_mr_dmabuf) + return -EOPNOTSUPP; + + /* FD requires offset and can't come with addr */ + if (!has_fd_offset || has_addr) + return -EINVAL; + + if ((fd_offset & 
~PAGE_MASK) != (iova & ~PAGE_MASK)) + return -EINVAL; + + valid_access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC | + IB_ACCESS_RELAXED_ORDERING; + } else { + if (!has_addr || has_fd_offset) + return -EINVAL; + + if ((addr & ~PAGE_MASK) != (iova & ~PAGE_MASK)) + return -EINVAL; + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_DMA_HANDLE)) { + dmah = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_REG_MR_DMA_HANDLE); + if (IS_ERR(dmah)) + return PTR_ERR(dmah); + } + + ret = uverbs_get_flags32(&access_flags, attrs, + UVERBS_ATTR_REG_MR_ACCESS_FLAGS, + valid_access_flags); + if (ret) + return ret; + + ret = ib_check_mr_access(ib_dev, access_flags); + if (ret) + return ret; + + if (has_fd) + mr = pd->device->ops.reg_user_mr_dmabuf(pd, fd_offset, length, + iova, fd, access_flags, + dmah, attrs); + else + mr = pd->device->ops.reg_user_mr(pd, addr, length, iova, + access_flags, dmah, NULL); - return 0; + if (IS_ERR(mr)) + return PTR_ERR(mr); + + mr->device = pd->device; + mr->pd = pd; + mr->type = IB_MR_TYPE_USER; + mr->uobject = uobj; + atomic_inc(&pd->usecnt); + if (dmah) { + mr->dmah = dmah; + atomic_inc(&dmah->usecnt); + } + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_set_name(&mr->res, NULL); + rdma_restrack_add(&mr->res); + uobj->object = mr; -err_dereg: - ib_dereg_mr(mr); + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_MR_HANDLE); + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_MR_RESP_LKEY, + &mr->lkey, sizeof(mr->lkey)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_MR_RESP_RKEY, + &mr->rkey, sizeof(mr->rkey)); return ret; } @@ -168,6 +413,25 @@ DECLARE_UVERBS_NAMED_METHOD( UA_ALLOC_AND_COPY)); DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_MR, + UVERBS_ATTR_IDR(UVERBS_ATTR_QUERY_MR_HANDLE, + UVERBS_OBJECT_MR, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_RKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_LKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_IOVA, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL)); + +DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_DM_MR_REG, UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_HANDLE, UVERBS_OBJECT_MR, @@ -196,6 +460,75 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_TYPE(u32), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_REG_DMABUF_MR, + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DMABUF_MR_HANDLE, + UVERBS_OBJECT_MR, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DMABUF_MR_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_IOVA, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_FD, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + enum ib_access_flags), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DMABUF_MR_RESP_LKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DMABUF_MR_RESP_RKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_REG_MR, + 
UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_HANDLE, + UVERBS_OBJECT_MR, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_DMA_HANDLE, + UVERBS_OBJECT_DMAH, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_IOVA, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_REG_MR_ACCESS_FLAGS, + enum ib_access_flags, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_ADDR, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_FD_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_RAW_FD(UVERBS_ATTR_REG_MR_FD, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_MR_RESP_LKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_MR_RESP_RKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_MR_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MR_HANDLE, @@ -206,9 +539,12 @@ DECLARE_UVERBS_NAMED_METHOD_DESTROY( DECLARE_UVERBS_NAMED_OBJECT( UVERBS_OBJECT_MR, UVERBS_TYPE_ALLOC_IDR(uverbs_free_mr), + &UVERBS_METHOD(UVERBS_METHOD_ADVISE_MR), &UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG), &UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY), - &UVERBS_METHOD(UVERBS_METHOD_ADVISE_MR)); + &UVERBS_METHOD(UVERBS_METHOD_QUERY_MR), + &UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR), + &UVERBS_METHOD(UVERBS_METHOD_REG_MR)); const struct uapi_definition uverbs_def_obj_mr[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR, diff --git a/drivers/infiniband/core/uverbs_std_types_qp.c b/drivers/infiniband/core/uverbs_std_types_qp.c new file mode 100644 index 000000000000..be0730e8509e --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_qp.c @@ -0,0 +1,380 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include <rdma/uverbs_std_types.h> +#include "rdma_core.h" +#include "uverbs.h" +#include "core_priv.h" + +static int uverbs_free_qp(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_qp *qp = uobject->object; + struct ib_uqp_object *uqp = + container_of(uobject, struct ib_uqp_object, uevent.uobject); + int ret; + + /* + * If this is a user triggered destroy then do not allow destruction + * until the user cleans up all the mcast bindings. Unlike in other + * places we forcibly clean up the mcast attachments for !DESTROY + * because the mcast attaches are not ubojects and will not be + * destroyed by anything else during cleanup processing. 
+ */ + if (why == RDMA_REMOVE_DESTROY) { + if (!list_empty(&uqp->mcast_list)) + return -EBUSY; + } else if (qp == qp->real_qp) { + ib_uverbs_detach_umcast(qp, uqp); + } + + ret = ib_destroy_qp_user(qp, &attrs->driver_udata); + if (ret) + return ret; + + if (uqp->uxrcd) + atomic_dec(&uqp->uxrcd->refcnt); + + ib_uverbs_release_uevent(&uqp->uevent); + return 0; +} + +static int check_creation_flags(enum ib_qp_type qp_type, + u32 create_flags) +{ + create_flags &= ~IB_UVERBS_QP_CREATE_SQ_SIG_ALL; + + if (!create_flags || qp_type == IB_QPT_DRIVER) + return 0; + + if (qp_type != IB_QPT_RAW_PACKET && qp_type != IB_QPT_UD) + return -EINVAL; + + if ((create_flags & IB_UVERBS_QP_CREATE_SCATTER_FCS || + create_flags & IB_UVERBS_QP_CREATE_CVLAN_STRIPPING) && + qp_type != IB_QPT_RAW_PACKET) + return -EINVAL; + + return 0; +} + +static void set_caps(struct ib_qp_init_attr *attr, + struct ib_uverbs_qp_cap *cap, bool req) +{ + if (req) { + attr->cap.max_send_wr = cap->max_send_wr; + attr->cap.max_recv_wr = cap->max_recv_wr; + attr->cap.max_send_sge = cap->max_send_sge; + attr->cap.max_recv_sge = cap->max_recv_sge; + attr->cap.max_inline_data = cap->max_inline_data; + } else { + cap->max_send_wr = attr->cap.max_send_wr; + cap->max_recv_wr = attr->cap.max_recv_wr; + cap->max_send_sge = attr->cap.max_send_sge; + cap->max_recv_sge = attr->cap.max_recv_sge; + cap->max_inline_data = attr->cap.max_inline_data; + } +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QP_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uqp_object *obj = container_of( + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_QP_HANDLE), + typeof(*obj), uevent.uobject); + struct ib_qp_init_attr attr = {}; + struct ib_uverbs_qp_cap cap = {}; + struct ib_rwq_ind_table *rwq_ind_tbl = NULL; + struct ib_qp *qp; + struct ib_pd *pd = NULL; + struct ib_srq *srq = NULL; + struct ib_cq *recv_cq = NULL; + struct ib_cq *send_cq = NULL; + struct ib_xrcd *xrcd = NULL; + struct ib_uobject *xrcd_uobj = NULL; + struct ib_device *device; + u64 user_handle; + int ret; + + ret = uverbs_copy_from_or_zero(&cap, attrs, + UVERBS_ATTR_CREATE_QP_CAP); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_QP_USER_HANDLE); + if (!ret) + ret = uverbs_get_const(&attr.qp_type, attrs, + UVERBS_ATTR_CREATE_QP_TYPE); + if (ret) + return ret; + + switch (attr.qp_type) { + case IB_QPT_XRC_TGT: + if (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_PD_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE)) + return -EINVAL; + + xrcd_uobj = uverbs_attr_get_uobject(attrs, + UVERBS_ATTR_CREATE_QP_XRCD_HANDLE); + if (IS_ERR(xrcd_uobj)) + return PTR_ERR(xrcd_uobj); + + xrcd = (struct ib_xrcd *)xrcd_uobj->object; + if (!xrcd) + return -EINVAL; + device = xrcd->device; + break; + case IB_UVERBS_QPT_RAW_PACKET: + if (!rdma_uattrs_has_raw_cap(attrs)) + return -EPERM; + fallthrough; + case IB_UVERBS_QPT_RC: + case IB_UVERBS_QPT_UC: + case IB_UVERBS_QPT_UD: + case IB_UVERBS_QPT_XRC_INI: + case IB_UVERBS_QPT_DRIVER: + if (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_XRCD_HANDLE) || + (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SRQ_HANDLE) && + attr.qp_type == IB_QPT_XRC_INI)) + return -EINVAL; + + pd = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_PD_HANDLE); + if (IS_ERR(pd)) + return PTR_ERR(pd); + + rwq_ind_tbl = uverbs_attr_get_obj(attrs, + 
UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE); + if (!IS_ERR(rwq_ind_tbl)) { + if (cap.max_recv_wr || cap.max_recv_sge || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SRQ_HANDLE)) + return -EINVAL; + + /* send_cq is optional */ + if (cap.max_send_wr) { + send_cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE); + if (IS_ERR(send_cq)) + return PTR_ERR(send_cq); + } + attr.rwq_ind_tbl = rwq_ind_tbl; + } else { + send_cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE); + if (IS_ERR(send_cq)) + return PTR_ERR(send_cq); + + if (attr.qp_type != IB_QPT_XRC_INI) { + recv_cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE); + if (IS_ERR(recv_cq)) + return PTR_ERR(recv_cq); + } + } + + device = pd->device; + break; + default: + return -EINVAL; + } + + ret = uverbs_get_flags32(&attr.create_flags, attrs, + UVERBS_ATTR_CREATE_QP_FLAGS, + IB_UVERBS_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_UVERBS_QP_CREATE_SCATTER_FCS | + IB_UVERBS_QP_CREATE_CVLAN_STRIPPING | + IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING | + IB_UVERBS_QP_CREATE_SQ_SIG_ALL); + if (ret) + return ret; + + ret = check_creation_flags(attr.qp_type, attr.create_flags); + if (ret) + return ret; + + if (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SOURCE_QPN)) { + ret = uverbs_copy_from(&attr.source_qpn, attrs, + UVERBS_ATTR_CREATE_QP_SOURCE_QPN); + if (ret) + return ret; + attr.create_flags |= IB_QP_CREATE_SOURCE_QPN; + } + + srq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_SRQ_HANDLE); + if (!IS_ERR(srq)) { + if ((srq->srq_type == IB_SRQT_XRC && + attr.qp_type != IB_QPT_XRC_TGT) || + (srq->srq_type != IB_SRQT_XRC && + attr.qp_type == IB_QPT_XRC_TGT)) + return -EINVAL; + attr.srq = srq; + } + + obj->uevent.event_file = ib_uverbs_get_async_event(attrs, + UVERBS_ATTR_CREATE_QP_EVENT_FD); + INIT_LIST_HEAD(&obj->uevent.event_list); + INIT_LIST_HEAD(&obj->mcast_list); + obj->uevent.uobject.user_handle = user_handle; + attr.event_handler = ib_uverbs_qp_event_handler; + attr.send_cq = send_cq; + attr.recv_cq = recv_cq; + attr.xrcd = xrcd; + if (attr.create_flags & IB_UVERBS_QP_CREATE_SQ_SIG_ALL) { + /* This creation bit is uverbs one, need to mask before + * calling drivers. It was added to prevent an extra user attr + * only for that when using ioctl. 
+ */ + attr.create_flags &= ~IB_UVERBS_QP_CREATE_SQ_SIG_ALL; + attr.sq_sig_type = IB_SIGNAL_ALL_WR; + } else { + attr.sq_sig_type = IB_SIGNAL_REQ_WR; + } + + set_caps(&attr, &cap, true); + mutex_init(&obj->mcast_lock); + + qp = ib_create_qp_user(device, pd, &attr, &attrs->driver_udata, obj, + KBUILD_MODNAME); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto err_put; + } + ib_qp_usecnt_inc(qp); + + if (attr.qp_type == IB_QPT_XRC_TGT) { + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, + uobject); + atomic_inc(&obj->uxrcd->refcnt); + } + + obj->uevent.uobject.object = qp; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_QP_HANDLE); + + set_caps(&attr, &cap, false); + ret = uverbs_copy_to_struct_or_zero(attrs, + UVERBS_ATTR_CREATE_QP_RESP_CAP, &cap, + sizeof(cap)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_QP_RESP_QP_NUM, + &qp->qp_num, + sizeof(qp->qp_num)); + + return ret; +err_put: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); + return ret; +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QP_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_HANDLE, + UVERBS_OBJECT_QP, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_XRCD_HANDLE, + UVERBS_OBJECT_XRCD, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_SRQ_HANDLE, + UVERBS_OBJECT_SRQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE, + UVERBS_OBJECT_RWQ_IND_TBL, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_QP_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_QP_CAP, + UVERBS_ATTR_STRUCT(struct ib_uverbs_qp_cap, + max_inline_data), + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_CREATE_QP_TYPE, + enum ib_uverbs_qp_type, + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_CREATE_QP_FLAGS, + enum ib_uverbs_qp_create_flags, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_QP_SOURCE_QPN, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_QP_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_QP_RESP_CAP, + UVERBS_ATTR_STRUCT(struct ib_uverbs_qp_cap, + max_inline_data), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_QP_RESP_QP_NUM, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_UHW()); + +static int UVERBS_HANDLER(UVERBS_METHOD_QP_DESTROY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_QP_HANDLE); + struct ib_uqp_object *obj = + container_of(uobj, struct ib_uqp_object, uevent.uobject); + struct ib_uverbs_destroy_qp_resp resp = { + .events_reported = obj->uevent.events_reported + }; + + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_QP_RESP, &resp, + sizeof(resp)); +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QP_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_QP_HANDLE, + UVERBS_OBJECT_QP, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_QP_RESP, + UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_qp_resp), + UA_MANDATORY)); 
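(For reference, a hedged sketch of the driver-side wiring: the QP object tree declared just below is only registered for devices whose driver implements the ops named in UAPI_DEF_OBJ_NEEDS_FN(), so a hypothetical "foo" driver would expose these ioctl methods roughly as follows; foo_create_qp(), foo_destroy_qp() and struct foo_dev are placeholders assumed to exist elsewhere.)

	#include <rdma/ib_verbs.h>

	static const struct ib_device_ops foo_dev_ops = {
		.owner = THIS_MODULE,
		.driver_id = RDMA_DRIVER_UNKNOWN,	/* placeholder; a real driver uses its own id */
		.create_qp = foo_create_qp,
		.destroy_qp = foo_destroy_qp,
		/* ... the rest of the driver's ops ... */
	};

	/* in the driver's probe path, before ib_register_device() */
	static void foo_set_ops(struct foo_dev *fdev)
	{
		ib_set_device_ops(&fdev->ib_dev, &foo_dev_ops);
	}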
+ +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_QP, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), uverbs_free_qp), + &UVERBS_METHOD(UVERBS_METHOD_QP_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_QP_DESTROY)); + +const struct uapi_definition uverbs_def_obj_qp[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_QP, + UAPI_DEF_OBJ_NEEDS_FN(destroy_qp)), + {} +}; diff --git a/drivers/infiniband/core/uverbs_std_types_srq.c b/drivers/infiniband/core/uverbs_std_types_srq.c new file mode 100644 index 000000000000..e5513f828bdc --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_srq.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include <rdma/uverbs_std_types.h> +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_free_srq(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_srq *srq = uobject->object; + struct ib_uevent_object *uevent = + container_of(uobject, struct ib_uevent_object, uobject); + enum ib_srq_type srq_type = srq->srq_type; + int ret; + + ret = ib_destroy_srq_user(srq, &attrs->driver_udata); + if (ret) + return ret; + + if (srq_type == IB_SRQT_XRC) { + struct ib_usrq_object *us = + container_of(uobject, struct ib_usrq_object, + uevent.uobject); + + atomic_dec(&us->uxrcd->refcnt); + } + + ib_uverbs_release_uevent(uevent); + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_SRQ_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_usrq_object *obj = container_of( + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_SRQ_HANDLE), + typeof(*obj), uevent.uobject); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_SRQ_PD_HANDLE); + struct ib_srq_init_attr attr = {}; + struct ib_uobject *xrcd_uobj; + struct ib_srq *srq; + u64 user_handle; + int ret; + + ret = uverbs_copy_from(&attr.attr.max_sge, attrs, + UVERBS_ATTR_CREATE_SRQ_MAX_SGE); + if (!ret) + ret = uverbs_copy_from(&attr.attr.max_wr, attrs, + UVERBS_ATTR_CREATE_SRQ_MAX_WR); + if (!ret) + ret = uverbs_copy_from(&attr.attr.srq_limit, attrs, + UVERBS_ATTR_CREATE_SRQ_LIMIT); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_SRQ_USER_HANDLE); + if (!ret) + ret = uverbs_get_const(&attr.srq_type, attrs, + UVERBS_ATTR_CREATE_SRQ_TYPE); + if (ret) + return ret; + + if (ib_srq_has_cq(attr.srq_type)) { + attr.ext.cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_SRQ_CQ_HANDLE); + if (IS_ERR(attr.ext.cq)) + return PTR_ERR(attr.ext.cq); + } + + switch (attr.srq_type) { + case IB_UVERBS_SRQT_XRC: + xrcd_uobj = uverbs_attr_get_uobject(attrs, + UVERBS_ATTR_CREATE_SRQ_XRCD_HANDLE); + if (IS_ERR(xrcd_uobj)) + return PTR_ERR(xrcd_uobj); + + attr.ext.xrc.xrcd = (struct ib_xrcd *)xrcd_uobj->object; + if (!attr.ext.xrc.xrcd) + return -EINVAL; + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, + uobject); + atomic_inc(&obj->uxrcd->refcnt); + break; + case IB_UVERBS_SRQT_TM: + ret = uverbs_copy_from(&attr.ext.tag_matching.max_num_tags, + attrs, + UVERBS_ATTR_CREATE_SRQ_MAX_NUM_TAGS); + if (ret) + return ret; + break; + case IB_UVERBS_SRQT_BASIC: + break; + default: + return -EINVAL; + } + + obj->uevent.event_file = ib_uverbs_get_async_event(attrs, + UVERBS_ATTR_CREATE_SRQ_EVENT_FD); + INIT_LIST_HEAD(&obj->uevent.event_list); + attr.event_handler = ib_uverbs_srq_event_handler; + obj->uevent.uobject.user_handle = user_handle; + + srq = ib_create_srq_user(pd, &attr, obj, &attrs->driver_udata); + 
if (IS_ERR(srq)) { + ret = PTR_ERR(srq); + goto err; + } + + obj->uevent.uobject.object = srq; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_SRQ_HANDLE); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_SRQ_RESP_MAX_WR, + &attr.attr.max_wr, + sizeof(attr.attr.max_wr)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_SRQ_RESP_MAX_SGE, + &attr.attr.max_sge, + sizeof(attr.attr.max_sge)); + if (ret) + return ret; + + if (attr.srq_type == IB_SRQT_XRC) { + ret = uverbs_copy_to(attrs, + UVERBS_ATTR_CREATE_SRQ_RESP_SRQ_NUM, + &srq->ext.xrc.srq_num, + sizeof(srq->ext.xrc.srq_num)); + if (ret) + return ret; + } + + return 0; +err: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); + if (attr.srq_type == IB_SRQT_XRC) + atomic_dec(&obj->uxrcd->refcnt); + return ret; +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_SRQ_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_HANDLE, + UVERBS_OBJECT_SRQ, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_CREATE_SRQ_TYPE, + enum ib_uverbs_srq_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_LIMIT, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_XRCD_HANDLE, + UVERBS_OBJECT_XRCD, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_MAX_NUM_TAGS, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_SRQ_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_SRQ_RESP_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_SRQ_RESP_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_SRQ_RESP_SRQ_NUM, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_UHW()); + +static int UVERBS_HANDLER(UVERBS_METHOD_SRQ_DESTROY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_SRQ_HANDLE); + struct ib_usrq_object *obj = + container_of(uobj, struct ib_usrq_object, uevent.uobject); + struct ib_uverbs_destroy_srq_resp resp = { + .events_reported = obj->uevent.events_reported + }; + + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_SRQ_RESP, &resp, + sizeof(resp)); +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_SRQ_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_SRQ_HANDLE, + UVERBS_OBJECT_SRQ, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_SRQ_RESP, + UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_srq_resp), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_SRQ, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), + uverbs_free_srq), + &UVERBS_METHOD(UVERBS_METHOD_SRQ_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_SRQ_DESTROY) +); + +const struct uapi_definition uverbs_def_obj_srq[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_SRQ, + UAPI_DEF_OBJ_NEEDS_FN(destroy_srq)), + {} +}; diff --git 
a/drivers/infiniband/core/uverbs_std_types_wq.c b/drivers/infiniband/core/uverbs_std_types_wq.c new file mode 100644 index 000000000000..7ded8339346f --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_wq.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include <rdma/uverbs_std_types.h> +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_free_wq(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_wq *wq = uobject->object; + struct ib_uwq_object *uwq = + container_of(uobject, struct ib_uwq_object, uevent.uobject); + int ret; + + ret = ib_destroy_wq_user(wq, &attrs->driver_udata); + if (ret) + return ret; + + ib_uverbs_release_uevent(&uwq->uevent); + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_WQ_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uwq_object *obj = container_of( + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_WQ_HANDLE), + typeof(*obj), uevent.uobject); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_WQ_PD_HANDLE); + struct ib_cq *cq = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_WQ_CQ_HANDLE); + struct ib_wq_init_attr wq_init_attr = {}; + struct ib_wq *wq; + u64 user_handle; + int ret; + + ret = uverbs_get_flags32(&wq_init_attr.create_flags, attrs, + UVERBS_ATTR_CREATE_WQ_FLAGS, + IB_UVERBS_WQ_FLAGS_CVLAN_STRIPPING | + IB_UVERBS_WQ_FLAGS_SCATTER_FCS | + IB_UVERBS_WQ_FLAGS_DELAY_DROP | + IB_UVERBS_WQ_FLAGS_PCI_WRITE_END_PADDING); + if (!ret) + ret = uverbs_copy_from(&wq_init_attr.max_sge, attrs, + UVERBS_ATTR_CREATE_WQ_MAX_SGE); + if (!ret) + ret = uverbs_copy_from(&wq_init_attr.max_wr, attrs, + UVERBS_ATTR_CREATE_WQ_MAX_WR); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_WQ_USER_HANDLE); + if (!ret) + ret = uverbs_get_const(&wq_init_attr.wq_type, attrs, + UVERBS_ATTR_CREATE_WQ_TYPE); + if (ret) + return ret; + + if (wq_init_attr.wq_type != IB_WQT_RQ) + return -EINVAL; + + obj->uevent.event_file = ib_uverbs_get_async_event(attrs, + UVERBS_ATTR_CREATE_WQ_EVENT_FD); + obj->uevent.uobject.user_handle = user_handle; + INIT_LIST_HEAD(&obj->uevent.event_list); + wq_init_attr.event_handler = ib_uverbs_wq_event_handler; + wq_init_attr.wq_context = attrs->ufile; + wq_init_attr.cq = cq; + + wq = pd->device->ops.create_wq(pd, &wq_init_attr, &attrs->driver_udata); + if (IS_ERR(wq)) { + ret = PTR_ERR(wq); + goto err; + } + + obj->uevent.uobject.object = wq; + wq->wq_type = wq_init_attr.wq_type; + wq->cq = cq; + wq->pd = pd; + wq->device = pd->device; + wq->wq_context = wq_init_attr.wq_context; + atomic_set(&wq->usecnt, 0); + atomic_inc(&pd->usecnt); + atomic_inc(&cq->usecnt); + wq->uobject = obj; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_WQ_HANDLE); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_WQ_RESP_MAX_WR, + &wq_init_attr.max_wr, + sizeof(wq_init_attr.max_wr)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_WQ_RESP_MAX_SGE, + &wq_init_attr.max_sge, + sizeof(wq_init_attr.max_sge)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_WQ_RESP_WQ_NUM, + &wq->wq_num, + sizeof(wq->wq_num)); + return ret; + +err: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); + return ret; +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_WQ_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_WQ_HANDLE, + UVERBS_OBJECT_WQ, + 
UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_WQ_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_CREATE_WQ_TYPE, + enum ib_wq_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_WQ_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_WQ_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_WQ_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_CREATE_WQ_FLAGS, + enum ib_uverbs_wq_flags, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_WQ_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_WQ_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_WQ_RESP_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_WQ_RESP_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_WQ_RESP_WQ_NUM, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_UHW()); + +static int UVERBS_HANDLER(UVERBS_METHOD_WQ_DESTROY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_WQ_HANDLE); + struct ib_uwq_object *obj = + container_of(uobj, struct ib_uwq_object, uevent.uobject); + + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_WQ_RESP, + &obj->uevent.events_reported, + sizeof(obj->uevent.events_reported)); +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_WQ_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_WQ_HANDLE, + UVERBS_OBJECT_WQ, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_WQ_RESP, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_WQ, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), uverbs_free_wq), + &UVERBS_METHOD(UVERBS_METHOD_WQ_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_WQ_DESTROY) +); + +const struct uapi_definition uverbs_def_obj_wq[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_WQ, + UAPI_DEF_OBJ_NEEDS_FN(destroy_wq)), + {} +}; diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index 9ae08e4b78a3..e00ea63175bd 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -22,6 +22,8 @@ static void *uapi_add_elm(struct uverbs_api *uapi, u32 key, size_t alloc_size) return ERR_PTR(-EOVERFLOW); elm = kzalloc(alloc_size, GFP_KERNEL); + if (!elm) + return ERR_PTR(-ENOMEM); rc = radix_tree_insert(&uapi->radix, key, elm); if (rc) { kfree(elm); @@ -77,10 +79,7 @@ static int uapi_create_write(struct uverbs_api *uapi, method_elm->is_ex = def->write.is_ex; method_elm->handler = def->func_write; - if (def->write.is_ex) - method_elm->disabled = !(ibdev->uverbs_ex_cmd_mask & - BIT_ULL(def->write.command_num)); - else + if (!def->write.is_ex) method_elm->disabled = !(ibdev->uverbs_cmd_mask & BIT_ULL(def->write.command_num)); @@ -188,13 +187,18 @@ static int uapi_merge_obj_tree(struct uverbs_api *uapi, obj_elm->type_attrs = obj->type_attrs; obj_elm->type_class = obj->type_attrs->type_class; /* - * Today drivers are only permitted to use idr_class - * types. They cannot use FD types because we currently have - * no way to revoke the fops pointer after device - * disassociation. + * Today drivers are only permitted to use idr_class and + * fd_class types. 
We can revoke the IDR types during + * disassociation, and the FD types require the driver to use + * struct file_operations.owner to prevent the driver module + * code from unloading while the file is open. This provides + * enough safety that uverbs_uobject_fd_release() will + * continue to work. Drivers using FD are responsible to + * handle disassociation of the device on their own. */ if (WARN_ON(is_driver && - obj->type_attrs->type_class != &uverbs_idr_class)) + obj->type_attrs->type_class != &uverbs_idr_class && + obj->type_attrs->type_class != &uverbs_fd_class)) return -EINVAL; } @@ -443,6 +447,9 @@ static int uapi_finalize(struct uverbs_api *uapi) uapi->num_write_ex = max_write_ex + 1; data = kmalloc_array(uapi->num_write + uapi->num_write_ex, sizeof(*uapi->write_methods), GFP_KERNEL); + if (!data) + return -ENOMEM; + for (i = 0; i != uapi->num_write + uapi->num_write_ex; i++) data[i] = &uapi->notsupp_method; uapi->write_methods = data; @@ -513,7 +520,7 @@ static void uapi_key_okay(u32 key) count++; if (uapi_key_is_attr(key)) count++; - WARN(count != 1, "Bad count %d key=%x", count, key); + WARN(count != 1, "Bad count %u key=%x", count, key); } static void uapi_finalize_disable(struct uverbs_api *uapi) @@ -619,13 +626,18 @@ void uverbs_destroy_api(struct uverbs_api *uapi) } static const struct uapi_definition uverbs_core_api[] = { + UAPI_DEF_CHAIN(uverbs_def_obj_async_fd), UAPI_DEF_CHAIN(uverbs_def_obj_counters), UAPI_DEF_CHAIN(uverbs_def_obj_cq), UAPI_DEF_CHAIN(uverbs_def_obj_device), UAPI_DEF_CHAIN(uverbs_def_obj_dm), + UAPI_DEF_CHAIN(uverbs_def_obj_dmah), UAPI_DEF_CHAIN(uverbs_def_obj_flow_action), UAPI_DEF_CHAIN(uverbs_def_obj_intf), UAPI_DEF_CHAIN(uverbs_def_obj_mr), + UAPI_DEF_CHAIN(uverbs_def_obj_qp), + UAPI_DEF_CHAIN(uverbs_def_obj_srq), + UAPI_DEF_CHAIN(uverbs_def_obj_wq), UAPI_DEF_CHAIN(uverbs_def_write_intf), {}, }; @@ -640,7 +652,7 @@ struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev) return ERR_PTR(-ENOMEM); INIT_RADIX_TREE(&uapi->radix, GFP_KERNEL); - uapi->driver_id = ibdev->driver_id; + uapi->driver_id = ibdev->ops.driver_id; rc = uapi_merge_def(uapi, ibdev, uverbs_core_api, false); if (rc) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index ac011836bb54..11b1a194de44 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -50,8 +50,10 @@ #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> #include <rdma/rw.h> +#include <rdma/lag.h> #include "core_priv.h" +#include <trace/events/rdma_core.h> static int ib_resolve_eth_dmac(struct ib_device *device, struct rdma_ah_attr *ah_attr); @@ -94,10 +96,10 @@ static const char * const wc_statuses[] = { [IB_WC_LOC_EEC_OP_ERR] = "local EE context operation error", [IB_WC_LOC_PROT_ERR] = "local protection error", [IB_WC_WR_FLUSH_ERR] = "WR flushed", - [IB_WC_MW_BIND_ERR] = "memory management operation error", + [IB_WC_MW_BIND_ERR] = "memory bind operation error", [IB_WC_BAD_RESP_ERR] = "bad response error", [IB_WC_LOC_ACCESS_ERR] = "local access error", - [IB_WC_REM_INV_REQ_ERR] = "invalid request error", + [IB_WC_REM_INV_REQ_ERR] = "remote invalid request error", [IB_WC_REM_ACCESS_ERR] = "remote access error", [IB_WC_REM_OP_ERR] = "remote operation error", [IB_WC_RETRY_EXC_ERR] = "transport retry counter exceeded", @@ -145,6 +147,8 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate) case IB_RATE_50_GBPS: return 20; case IB_RATE_400_GBPS: return 160; case IB_RATE_600_GBPS: return 240; + case IB_RATE_800_GBPS: return 320; + case IB_RATE_1600_GBPS: 
return 640; default: return -1; } } @@ -174,6 +178,8 @@ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult) case 20: return IB_RATE_50_GBPS; case 160: return IB_RATE_400_GBPS; case 240: return IB_RATE_600_GBPS; + case 320: return IB_RATE_800_GBPS; + case 640: return IB_RATE_1600_GBPS; default: return IB_RATE_PORT_CURRENT; } } @@ -203,13 +209,15 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) case IB_RATE_50_GBPS: return 53125; case IB_RATE_400_GBPS: return 425000; case IB_RATE_600_GBPS: return 637500; + case IB_RATE_800_GBPS: return 850000; + case IB_RATE_1600_GBPS: return 1700000; default: return -1; } } EXPORT_SYMBOL(ib_rate_to_mbps); __attribute_const__ enum rdma_transport_type -rdma_node_get_transport(enum rdma_node_type node_type) +rdma_node_get_transport(unsigned int node_type) { if (node_type == RDMA_NODE_USNIC) @@ -218,12 +226,15 @@ rdma_node_get_transport(enum rdma_node_type node_type) return RDMA_TRANSPORT_USNIC_UDP; if (node_type == RDMA_NODE_RNIC) return RDMA_TRANSPORT_IWARP; + if (node_type == RDMA_NODE_UNSPECIFIED) + return RDMA_TRANSPORT_UNSPECIFIED; return RDMA_TRANSPORT_IB; } EXPORT_SYMBOL(rdma_node_get_transport); -enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num) +enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, + u32 port_num) { enum rdma_transport_type lt; if (device->ops.get_link_layer) @@ -240,8 +251,10 @@ EXPORT_SYMBOL(rdma_port_get_link_layer); /* Protection domains */ /** - * ib_alloc_pd - Allocates an unused protection domain. + * __ib_alloc_pd - Allocates an unused protection domain. * @device: The device on which to allocate the protection domain. + * @flags: protection domain flags + * @caller: caller's build-time module name * * A protection domain object provides an association between QPs, shared * receive queues, address handles, memory regions, and memory windows. 
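(As an aside, kernel ULPs normally reach __ib_alloc_pd() through the ib_alloc_pd() wrapper and release the PD with ib_dealloc_pd(); a minimal usage sketch follows, with error handling trimmed, "device" assumed to be an already-registered struct ib_device, and foo_setup_pd() a placeholder name.)

	#include <rdma/ib_verbs.h>

	static int foo_setup_pd(struct ib_device *device)
	{
		struct ib_pd *pd;

		pd = ib_alloc_pd(device, 0);	/* expands to __ib_alloc_pd(device, 0, KBUILD_MODNAME) */
		if (IS_ERR(pd))
			return PTR_ERR(pd);

		/* ... create CQs/QPs/MRs against pd here ... */

		ib_dealloc_pd(pd);		/* kernel callers pass no udata */
		return 0;
	}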
@@ -254,18 +267,27 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, { struct ib_pd *pd; int mr_access_flags = 0; + int ret; - pd = device->ops.alloc_pd(device, NULL, NULL); - if (IS_ERR(pd)) - return pd; + pd = rdma_zalloc_drv_obj(device, ib_pd); + if (!pd) + return ERR_PTR(-ENOMEM); pd->device = device; - pd->uobject = NULL; - pd->__internal_mr = NULL; - atomic_set(&pd->usecnt, 0); pd->flags = flags; - if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) + rdma_restrack_new(&pd->res, RDMA_RESTRACK_PD); + rdma_restrack_set_name(&pd->res, caller); + + ret = device->ops.alloc_pd(pd, NULL); + if (ret) { + rdma_restrack_put(&pd->res); + kfree(pd); + return ERR_PTR(ret); + } + rdma_restrack_add(&pd->res); + + if (device->attrs.kernel_cap_flags & IBK_LOCAL_DMA_LKEY) pd->local_dma_lkey = device->local_dma_lkey; else mr_access_flags |= IB_ACCESS_LOCAL_WRITE; @@ -275,10 +297,6 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; } - pd->res.type = RDMA_RESTRACK_PD; - rdma_restrack_set_task(&pd->res, caller); - rdma_restrack_kadd(&pd->res); - if (mr_access_flags) { struct ib_mr *mr; @@ -290,12 +308,13 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, mr->device = pd->device; mr->pd = pd; + mr->type = IB_MR_TYPE_DMA; mr->uobject = NULL; mr->need_inval = false; pd->__internal_mr = mr; - if (!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) + if (!(device->attrs.kernel_cap_flags & IBK_LOCAL_DMA_LKEY)) pd->local_dma_lkey = pd->__internal_mr->lkey; if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) @@ -307,34 +326,33 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, EXPORT_SYMBOL(__ib_alloc_pd); /** - * ib_dealloc_pd - Deallocates a protection domain. + * ib_dealloc_pd_user - Deallocates a protection domain. * @pd: The protection domain to deallocate. + * @udata: Valid user data or NULL for kernel object * * It is an error to call this function while any resources in the pd still * exist. The caller is responsible to synchronously destroy them and * guarantee no new allocations will happen. */ -void ib_dealloc_pd(struct ib_pd *pd) +int ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata) { int ret; if (pd->__internal_mr) { - ret = pd->device->ops.dereg_mr(pd->__internal_mr); + ret = pd->device->ops.dereg_mr(pd->__internal_mr, NULL); WARN_ON(ret); pd->__internal_mr = NULL; } - /* uverbs manipulates usecnt with proper locking, while the kabi - requires the caller to guarantee we can't race here. */ - WARN_ON(atomic_read(&pd->usecnt)); + ret = pd->device->ops.dealloc_pd(pd, udata); + if (ret) + return ret; rdma_restrack_del(&pd->res); - /* Making delalloc_pd a void return is a WIP, no driver should return - an error here. */ - ret = pd->device->ops.dealloc_pd(pd); - WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd"); + kfree(pd); + return ret; } -EXPORT_SYMBOL(ib_dealloc_pd); +EXPORT_SYMBOL(ib_dealloc_pd_user); /* Address handles */ @@ -354,7 +372,7 @@ void rdma_copy_ah_attr(struct rdma_ah_attr *dest, EXPORT_SYMBOL(rdma_copy_ah_attr); /** - * rdma_replace_ah_attr - Replace valid ah_attr with new new one. + * rdma_replace_ah_attr - Replace valid ah_attr with new one. * @old: Pointer to existing ah_attr which needs to be replaced. * old is assumed to be valid or zero'd * @new: Pointer to the new ah_attr. 
@@ -488,27 +506,45 @@ rdma_update_sgid_attr(struct rdma_ah_attr *ah_attr, static struct ib_ah *_rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags, - struct ib_udata *udata) + struct ib_udata *udata, + struct net_device *xmit_slave) { + struct rdma_ah_init_attr init_attr = {}; + struct ib_device *device = pd->device; struct ib_ah *ah; + int ret; might_sleep_if(flags & RDMA_CREATE_AH_SLEEPABLE); - if (!pd->device->ops.create_ah) + if (!udata && !device->ops.create_ah) return ERR_PTR(-EOPNOTSUPP); - ah = pd->device->ops.create_ah(pd, ah_attr, flags, udata); + ah = rdma_zalloc_drv_obj_gfp( + device, ib_ah, + (flags & RDMA_CREATE_AH_SLEEPABLE) ? GFP_KERNEL : GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); - if (!IS_ERR(ah)) { - ah->device = pd->device; - ah->pd = pd; - ah->uobject = NULL; - ah->type = ah_attr->type; - ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL); + ah->device = device; + ah->pd = pd; + ah->type = ah_attr->type; + ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL); + init_attr.ah_attr = ah_attr; + init_attr.flags = flags; + init_attr.xmit_slave = xmit_slave; - atomic_inc(&pd->usecnt); + if (udata) + ret = device->ops.create_user_ah(ah, &init_attr, udata); + else + ret = device->ops.create_ah(ah, &init_attr, NULL); + if (ret) { + if (ah->sgid_attr) + rdma_put_gid_attr(ah->sgid_attr); + kfree(ah); + return ERR_PTR(ret); } + atomic_inc(&pd->usecnt); return ah; } @@ -527,15 +563,22 @@ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags) { const struct ib_gid_attr *old_sgid_attr; + struct net_device *slave; struct ib_ah *ah; int ret; ret = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr); if (ret) return ERR_PTR(ret); - - ah = _rdma_create_ah(pd, ah_attr, flags, NULL); - + slave = rdma_lag_get_ah_roce_slave(pd->device, ah_attr, + (flags & RDMA_CREATE_AH_SLEEPABLE) ? 
+ GFP_KERNEL : GFP_ATOMIC); + if (IS_ERR(slave)) { + rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); + return ERR_CAST(slave); + } + ah = _rdma_create_ah(pd, ah_attr, flags, NULL, slave); + rdma_lag_put_ah_roce_slave(slave); rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); return ah; } @@ -574,7 +617,8 @@ struct ib_ah *rdma_create_user_ah(struct ib_pd *pd, } } - ah = _rdma_create_ah(pd, ah_attr, RDMA_CREATE_AH_SLEEPABLE, udata); + ah = _rdma_create_ah(pd, ah_attr, RDMA_CREATE_AH_SLEEPABLE, + udata, NULL); out: rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); @@ -616,7 +660,7 @@ int ib_get_rdma_header_version(const union rdma_network_hdr *hdr) EXPORT_SYMBOL(ib_get_rdma_header_version); static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device, - u8 port_num, + u32 port_num, const struct ib_grh *grh) { int grh_version; @@ -645,20 +689,21 @@ static bool find_gid_index(const union ib_gid *gid, void *context) { struct find_gid_index_context *ctx = context; + u16 vlan_id = 0xffff; + int ret; if (ctx->gid_type != gid_attr->gid_type) return false; - if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) || - (is_vlan_dev(gid_attr->ndev) && - vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id)) + ret = rdma_read_gid_l2_fields(gid_attr, &vlan_id, NULL); + if (ret) return false; - return true; + return ctx->vlan_id == vlan_id; } static const struct ib_gid_attr * -get_sgid_attr_from_eth(struct ib_device *device, u8 port_num, +get_sgid_attr_from_eth(struct ib_device *device, u32 port_num, u16 vlan_id, const union ib_gid *sgid, enum ib_gid_type gid_type) { @@ -693,7 +738,7 @@ int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, (struct in6_addr *)dgid); return 0; } else if (net_type == RDMA_NETWORK_IPV6 || - net_type == RDMA_NETWORK_IB) { + net_type == RDMA_NETWORK_IB || RDMA_NETWORK_ROCE_V1) { *dgid = hdr->ibgrh.dgid; *sgid = hdr->ibgrh.sgid; return 0; @@ -705,7 +750,7 @@ EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr); /* Resolve destination mac address and hop limit for unicast destination * GID entry, considering the source GID entry as well. - * ah_attribute must have have valid port_num, sgid_index. + * ah_attribute must have valid port_num, sgid_index. */ static int ib_resolve_unicast_gid_dmac(struct ib_device *device, struct rdma_ah_attr *ah_attr) @@ -745,7 +790,7 @@ static int ib_resolve_unicast_gid_dmac(struct ib_device *device, * On success the caller is responsible to call rdma_destroy_ah_attr on the * attr. 
*/ -int ib_init_ah_attr_from_wc(struct ib_device *device, u8 port_num, +int ib_init_ah_attr_from_wc(struct ib_device *device, u32 port_num, const struct ib_wc *wc, const struct ib_grh *grh, struct rdma_ah_attr *ah_attr) { @@ -876,7 +921,7 @@ void rdma_destroy_ah_attr(struct rdma_ah_attr *ah_attr) EXPORT_SYMBOL(rdma_destroy_ah_attr); struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, - const struct ib_grh *grh, u8 port_num) + const struct ib_grh *grh, u32 port_num) { struct rdma_ah_attr ah_attr; struct ib_ah *ah; @@ -925,7 +970,7 @@ int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr) } EXPORT_SYMBOL(rdma_query_ah); -int rdma_destroy_ah(struct ib_ah *ah, u32 flags) +int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata) { const struct ib_gid_attr *sgid_attr = ah->sgid_attr; struct ib_pd *pd; @@ -934,51 +979,87 @@ int rdma_destroy_ah(struct ib_ah *ah, u32 flags) might_sleep_if(flags & RDMA_DESTROY_AH_SLEEPABLE); pd = ah->pd; + ret = ah->device->ops.destroy_ah(ah, flags); - if (!ret) { - atomic_dec(&pd->usecnt); - if (sgid_attr) - rdma_put_gid_attr(sgid_attr); - } + if (ret) + return ret; + + atomic_dec(&pd->usecnt); + if (sgid_attr) + rdma_put_gid_attr(sgid_attr); + kfree(ah); return ret; } -EXPORT_SYMBOL(rdma_destroy_ah); +EXPORT_SYMBOL(rdma_destroy_ah_user); /* Shared receive queues */ -struct ib_srq *ib_create_srq(struct ib_pd *pd, - struct ib_srq_init_attr *srq_init_attr) +/** + * ib_create_srq_user - Creates a SRQ associated with the specified protection + * domain. + * @pd: The protection domain associated with the SRQ. + * @srq_init_attr: A list of initial attributes required to create the + * SRQ. If SRQ creation succeeds, then the attributes are updated to + * the actual capabilities of the created SRQ. + * @uobject: uobject pointer if this is not a kernel SRQ + * @udata: udata pointer if this is not a kernel SRQ + * + * srq_attr->max_wr and srq_attr->max_sge are read the determine the + * requested size of the SRQ, and set to the actual values allocated + * on return. If ib_create_srq() succeeds, then max_wr and max_sge + * will always be at least as large as the requested values. 
+ */ +struct ib_srq *ib_create_srq_user(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_usrq_object *uobject, + struct ib_udata *udata) { struct ib_srq *srq; + int ret; - if (!pd->device->ops.create_srq) - return ERR_PTR(-EOPNOTSUPP); + srq = rdma_zalloc_drv_obj(pd->device, ib_srq); + if (!srq) + return ERR_PTR(-ENOMEM); - srq = pd->device->ops.create_srq(pd, srq_init_attr, NULL); - - if (!IS_ERR(srq)) { - srq->device = pd->device; - srq->pd = pd; - srq->uobject = NULL; - srq->event_handler = srq_init_attr->event_handler; - srq->srq_context = srq_init_attr->srq_context; - srq->srq_type = srq_init_attr->srq_type; - if (ib_srq_has_cq(srq->srq_type)) { - srq->ext.cq = srq_init_attr->ext.cq; - atomic_inc(&srq->ext.cq->usecnt); - } - if (srq->srq_type == IB_SRQT_XRC) { - srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; + srq->device = pd->device; + srq->pd = pd; + srq->event_handler = srq_init_attr->event_handler; + srq->srq_context = srq_init_attr->srq_context; + srq->srq_type = srq_init_attr->srq_type; + srq->uobject = uobject; + + if (ib_srq_has_cq(srq->srq_type)) { + srq->ext.cq = srq_init_attr->ext.cq; + atomic_inc(&srq->ext.cq->usecnt); + } + if (srq->srq_type == IB_SRQT_XRC) { + srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; + if (srq->ext.xrc.xrcd) atomic_inc(&srq->ext.xrc.xrcd->usecnt); - } - atomic_inc(&pd->usecnt); - atomic_set(&srq->usecnt, 0); } + atomic_inc(&pd->usecnt); + + rdma_restrack_new(&srq->res, RDMA_RESTRACK_SRQ); + rdma_restrack_parent_name(&srq->res, &pd->res); + + ret = pd->device->ops.create_srq(srq, srq_init_attr, udata); + if (ret) { + rdma_restrack_put(&srq->res); + atomic_dec(&pd->usecnt); + if (srq->srq_type == IB_SRQT_XRC && srq->ext.xrc.xrcd) + atomic_dec(&srq->ext.xrc.xrcd->usecnt); + if (ib_srq_has_cq(srq->srq_type)) + atomic_dec(&srq->ext.cq->usecnt); + kfree(srq); + return ERR_PTR(ret); + } + + rdma_restrack_add(&srq->res); return srq; } -EXPORT_SYMBOL(ib_create_srq); +EXPORT_SYMBOL(ib_create_srq_user); int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, @@ -998,56 +1079,51 @@ int ib_query_srq(struct ib_srq *srq, } EXPORT_SYMBOL(ib_query_srq); -int ib_destroy_srq(struct ib_srq *srq) +int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata) { - struct ib_pd *pd; - enum ib_srq_type srq_type; - struct ib_xrcd *uninitialized_var(xrcd); - struct ib_cq *uninitialized_var(cq); int ret; if (atomic_read(&srq->usecnt)) return -EBUSY; - pd = srq->pd; - srq_type = srq->srq_type; - if (ib_srq_has_cq(srq_type)) - cq = srq->ext.cq; - if (srq_type == IB_SRQT_XRC) - xrcd = srq->ext.xrc.xrcd; + ret = srq->device->ops.destroy_srq(srq, udata); + if (ret) + return ret; - ret = srq->device->ops.destroy_srq(srq); - if (!ret) { - atomic_dec(&pd->usecnt); - if (srq_type == IB_SRQT_XRC) - atomic_dec(&xrcd->usecnt); - if (ib_srq_has_cq(srq_type)) - atomic_dec(&cq->usecnt); - } + atomic_dec(&srq->pd->usecnt); + if (srq->srq_type == IB_SRQT_XRC && srq->ext.xrc.xrcd) + atomic_dec(&srq->ext.xrc.xrcd->usecnt); + if (ib_srq_has_cq(srq->srq_type)) + atomic_dec(&srq->ext.cq->usecnt); + rdma_restrack_del(&srq->res); + kfree(srq); return ret; } -EXPORT_SYMBOL(ib_destroy_srq); +EXPORT_SYMBOL(ib_destroy_srq_user); /* Queue pairs */ +static void __ib_qp_event_handler(struct ib_event *event, void *context) +{ + struct ib_qp *qp = event->element.qp; + + if (event->event == IB_EVENT_QP_LAST_WQE_REACHED) + complete(&qp->srq_completion); + if (qp->registered_event_handler) + qp->registered_event_handler(event, qp->qp_context); +} + static void 
__ib_shared_qp_event_handler(struct ib_event *event, void *context) { struct ib_qp *qp = context; unsigned long flags; - spin_lock_irqsave(&qp->device->event_handler_lock, flags); + spin_lock_irqsave(&qp->device->qp_open_list_lock, flags); list_for_each_entry(event->element.qp, &qp->open_list, open_list) if (event->element.qp->event_handler) event->element.qp->event_handler(event, event->element.qp->qp_context); - spin_unlock_irqrestore(&qp->device->event_handler_lock, flags); -} - -static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp) -{ - mutex_lock(&xrcd->tgt_qp_mutex); - list_add(&qp->xrcd_list, &xrcd->tgt_qp_list); - mutex_unlock(&xrcd->tgt_qp_mutex); + spin_unlock_irqrestore(&qp->device->qp_open_list_lock, flags); } static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp, @@ -1077,9 +1153,9 @@ static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp, qp->qp_num = real_qp->qp_num; qp->qp_type = real_qp->qp_type; - spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); + spin_lock_irqsave(&real_qp->device->qp_open_list_lock, flags); list_add(&qp->open_list, &real_qp->open_list); - spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); + spin_unlock_irqrestore(&real_qp->device->qp_open_list_lock, flags); return qp; } @@ -1092,24 +1168,24 @@ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, if (qp_open_attr->qp_type != IB_QPT_XRC_TGT) return ERR_PTR(-EINVAL); - qp = ERR_PTR(-EINVAL); - mutex_lock(&xrcd->tgt_qp_mutex); - list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) { - if (real_qp->qp_num == qp_open_attr->qp_num) { - qp = __ib_open_qp(real_qp, qp_open_attr->event_handler, - qp_open_attr->qp_context); - break; - } + down_read(&xrcd->tgt_qps_rwsem); + real_qp = xa_load(&xrcd->tgt_qps, qp_open_attr->qp_num); + if (!real_qp) { + up_read(&xrcd->tgt_qps_rwsem); + return ERR_PTR(-EINVAL); } - mutex_unlock(&xrcd->tgt_qp_mutex); + qp = __ib_open_qp(real_qp, qp_open_attr->event_handler, + qp_open_attr->qp_context); + up_read(&xrcd->tgt_qps_rwsem); return qp; } EXPORT_SYMBOL(ib_open_qp); -static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp, - struct ib_qp_init_attr *qp_init_attr) +static struct ib_qp *create_xrc_qp_user(struct ib_qp *qp, + struct ib_qp_init_attr *qp_init_attr) { struct ib_qp *real_qp = qp; + int err; qp->event_handler = __ib_shared_qp_event_handler; qp->qp_context = qp; @@ -1122,89 +1198,179 @@ static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp, qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, qp_init_attr->qp_context); - if (!IS_ERR(qp)) - __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); - else - real_qp->device->ops.destroy_qp(real_qp); + if (IS_ERR(qp)) + return qp; + + err = xa_err(xa_store(&qp_init_attr->xrcd->tgt_qps, real_qp->qp_num, + real_qp, GFP_KERNEL)); + if (err) { + ib_close_qp(qp); + return ERR_PTR(err); + } return qp; } -struct ib_qp *ib_create_qp(struct ib_pd *pd, - struct ib_qp_init_attr *qp_init_attr) +static struct ib_qp *create_qp(struct ib_device *dev, struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata, + struct ib_uqp_object *uobj, const char *caller) { - struct ib_device *device = pd ? 
pd->device : qp_init_attr->xrcd->device; + struct ib_udata dummy = {}; struct ib_qp *qp; int ret; - if (qp_init_attr->rwq_ind_tbl && - (qp_init_attr->recv_cq || - qp_init_attr->srq || qp_init_attr->cap.max_recv_wr || - qp_init_attr->cap.max_recv_sge)) - return ERR_PTR(-EINVAL); + if (!dev->ops.create_qp) + return ERR_PTR(-EOPNOTSUPP); + + qp = rdma_zalloc_drv_obj_numa(dev, ib_qp); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->device = dev; + qp->pd = pd; + qp->uobject = uobj; + qp->real_qp = qp; + + qp->qp_type = attr->qp_type; + qp->rwq_ind_tbl = attr->rwq_ind_tbl; + qp->srq = attr->srq; + qp->event_handler = __ib_qp_event_handler; + qp->registered_event_handler = attr->event_handler; + qp->port = attr->port_num; + qp->qp_context = attr->qp_context; + + spin_lock_init(&qp->mr_lock); + INIT_LIST_HEAD(&qp->rdma_mrs); + INIT_LIST_HEAD(&qp->sig_mrs); + init_completion(&qp->srq_completion); + + qp->send_cq = attr->send_cq; + qp->recv_cq = attr->recv_cq; + + rdma_restrack_new(&qp->res, RDMA_RESTRACK_QP); + WARN_ONCE(!udata && !caller, "Missing kernel QP owner"); + rdma_restrack_set_name(&qp->res, udata ? NULL : caller); + ret = dev->ops.create_qp(qp, attr, udata); + if (ret) + goto err_create; /* - * If the callers is using the RDMA API calculate the resources - * needed for the RDMA READ/WRITE operations. - * - * Note that these callers need to pass in a port number. + * TODO: The mlx4 internally overwrites send_cq and recv_cq. + * Unfortunately, it is not an easy task to fix that driver. */ - if (qp_init_attr->cap.max_rdma_ctxs) - rdma_rw_init_qp(device, qp_init_attr); + qp->send_cq = attr->send_cq; + qp->recv_cq = attr->recv_cq; - qp = _ib_create_qp(device, pd, qp_init_attr, NULL, NULL); - if (IS_ERR(qp)) + ret = ib_create_qp_security(qp, dev); + if (ret) + goto err_security; + + rdma_restrack_add(&qp->res); + return qp; + +err_security: + qp->device->ops.destroy_qp(qp, udata ? &dummy : NULL); +err_create: + rdma_restrack_put(&qp->res); + kfree(qp); + return ERR_PTR(ret); + +} + +/** + * ib_create_qp_user - Creates a QP associated with the specified protection + * domain. + * @dev: IB device + * @pd: The protection domain associated with the QP. + * @attr: A list of initial attributes required to create the + * QP. If QP creation succeeds, then the attributes are updated to + * the actual capabilities of the created QP. 
+ * @udata: User data + * @uobj: uverbs obect + * @caller: caller's build-time module name + */ +struct ib_qp *ib_create_qp_user(struct ib_device *dev, struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata, + struct ib_uqp_object *uobj, const char *caller) +{ + struct ib_qp *qp, *xrc_qp; + + if (attr->qp_type == IB_QPT_XRC_TGT) + qp = create_qp(dev, pd, attr, NULL, NULL, caller); + else + qp = create_qp(dev, pd, attr, udata, uobj, NULL); + if (attr->qp_type != IB_QPT_XRC_TGT || IS_ERR(qp)) return qp; - ret = ib_create_qp_security(qp, device); - if (ret) { + xrc_qp = create_xrc_qp_user(qp, attr); + if (IS_ERR(xrc_qp)) { ib_destroy_qp(qp); - return ERR_PTR(ret); + return xrc_qp; } - qp->real_qp = qp; - qp->qp_type = qp_init_attr->qp_type; - qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl; + xrc_qp->uobject = uobj; + return xrc_qp; +} +EXPORT_SYMBOL(ib_create_qp_user); - atomic_set(&qp->usecnt, 0); - qp->mrs_used = 0; - spin_lock_init(&qp->mr_lock); - INIT_LIST_HEAD(&qp->rdma_mrs); - INIT_LIST_HEAD(&qp->sig_mrs); - qp->port = 0; +void ib_qp_usecnt_inc(struct ib_qp *qp) +{ + if (qp->pd) + atomic_inc(&qp->pd->usecnt); + if (qp->send_cq) + atomic_inc(&qp->send_cq->usecnt); + if (qp->recv_cq) + atomic_inc(&qp->recv_cq->usecnt); + if (qp->srq) + atomic_inc(&qp->srq->usecnt); + if (qp->rwq_ind_tbl) + atomic_inc(&qp->rwq_ind_tbl->usecnt); +} +EXPORT_SYMBOL(ib_qp_usecnt_inc); - if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) - return ib_create_xrc_qp(qp, qp_init_attr); +void ib_qp_usecnt_dec(struct ib_qp *qp) +{ + if (qp->rwq_ind_tbl) + atomic_dec(&qp->rwq_ind_tbl->usecnt); + if (qp->srq) + atomic_dec(&qp->srq->usecnt); + if (qp->recv_cq) + atomic_dec(&qp->recv_cq->usecnt); + if (qp->send_cq) + atomic_dec(&qp->send_cq->usecnt); + if (qp->pd) + atomic_dec(&qp->pd->usecnt); +} +EXPORT_SYMBOL(ib_qp_usecnt_dec); - qp->event_handler = qp_init_attr->event_handler; - qp->qp_context = qp_init_attr->qp_context; - if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { - qp->recv_cq = NULL; - qp->srq = NULL; - } else { - qp->recv_cq = qp_init_attr->recv_cq; - if (qp_init_attr->recv_cq) - atomic_inc(&qp_init_attr->recv_cq->usecnt); - qp->srq = qp_init_attr->srq; - if (qp->srq) - atomic_inc(&qp_init_attr->srq->usecnt); - } +struct ib_qp *ib_create_qp_kernel(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr, + const char *caller) +{ + struct ib_device *device = pd->device; + struct ib_qp *qp; + int ret; - qp->send_cq = qp_init_attr->send_cq; - qp->xrcd = NULL; + /* + * If the callers is using the RDMA API calculate the resources + * needed for the RDMA READ/WRITE operations. + * + * Note that these callers need to pass in a port number. 
+ */ + if (qp_init_attr->cap.max_rdma_ctxs) + rdma_rw_init_qp(device, qp_init_attr); - atomic_inc(&pd->usecnt); - if (qp_init_attr->send_cq) - atomic_inc(&qp_init_attr->send_cq->usecnt); - if (qp_init_attr->rwq_ind_tbl) - atomic_inc(&qp->rwq_ind_tbl->usecnt); + qp = create_qp(device, pd, qp_init_attr, NULL, NULL, caller); + if (IS_ERR(qp)) + return qp; + + ib_qp_usecnt_inc(qp); if (qp_init_attr->cap.max_rdma_ctxs) { ret = rdma_rw_init_mrs(qp, qp_init_attr); - if (ret) { - pr_err("failed to init MR pool ret= %d\n", ret); - ib_destroy_qp(qp); - return ERR_PTR(ret); - } + if (ret) + goto err; } /* @@ -1215,10 +1381,17 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, qp->max_write_sge = qp_init_attr->cap.max_send_sge; qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge, device->attrs.max_sge_rd); + if (qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) + qp->integrity_en = true; return qp; + +err: + ib_destroy_qp(qp); + return ERR_PTR(ret); + } -EXPORT_SYMBOL(ib_create_qp); +EXPORT_SYMBOL(ib_create_qp_kernel); static const struct { int valid; @@ -1591,22 +1764,48 @@ static bool is_qp_type_connected(const struct ib_qp *qp) qp->qp_type == IB_QPT_XRC_TGT); } -/** +/* * IB core internal function to perform QP attributes modification. */ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { - u8 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + u32 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; const struct ib_gid_attr *old_sgid_attr_av; const struct ib_gid_attr *old_sgid_attr_alt_av; int ret; + attr->xmit_slave = NULL; if (attr_mask & IB_QP_AV) { ret = rdma_fill_sgid_attr(qp->device, &attr->ah_attr, &old_sgid_attr_av); if (ret) return ret; + + if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE && + is_qp_type_connected(qp)) { + struct net_device *slave; + + /* + * If the user provided the qp_attr then we have to + * resolve it. Kerne users have to provide already + * resolved rdma_ah_attr's. + */ + if (udata) { + ret = ib_resolve_eth_dmac(qp->device, + &attr->ah_attr); + if (ret) + goto out_av; + } + slave = rdma_lag_get_ah_roce_slave(qp->device, + &attr->ah_attr, + GFP_KERNEL); + if (IS_ERR(slave)) { + ret = PTR_ERR(slave); + goto out_av; + } + attr->xmit_slave = slave; + } } if (attr_mask & IB_QP_ALT_PATH) { /* @@ -1628,23 +1827,11 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, if (!(rdma_protocol_ib(qp->device, attr->alt_ah_attr.port_num) && rdma_protocol_ib(qp->device, port))) { - ret = EINVAL; + ret = -EINVAL; goto out; } } - /* - * If the user provided the qp_attr then we have to resolve it. Kernel - * users have to provide already resolved rdma_ah_attr's - */ - if (udata && (attr_mask & IB_QP_AV) && - attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE && - is_qp_type_connected(qp)) { - ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); - if (ret) - goto out; - } - if (rdma_ib_or_roce(qp->device, port)) { if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) { dev_warn(&qp->device->dev, @@ -1661,6 +1848,14 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, } } + /* + * Bind this qp to a counter automatically based on the rdma counter + * rules. 
This only set in RST2INIT with port specified + */ + if (!qp->counter && (attr_mask & IB_QP_PORT) && + ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT)) + rdma_counter_bind_qp_auto(qp, attr->port_num); + ret = ib_security_modify_qp(qp, attr, attr_mask, udata); if (ret) goto out; @@ -1678,8 +1873,10 @@ out: if (attr_mask & IB_QP_ALT_PATH) rdma_unfill_sgid_attr(&attr->alt_ah_attr, old_sgid_attr_alt_av); out_av: - if (attr_mask & IB_QP_AV) + if (attr_mask & IB_QP_AV) { + rdma_lag_put_ah_roce_slave(attr->xmit_slave); rdma_unfill_sgid_attr(&attr->ah_attr, old_sgid_attr_av); + } return ret; } @@ -1701,20 +1898,100 @@ int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr, } EXPORT_SYMBOL(ib_modify_qp_with_udata); -int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width) +static void ib_get_width_and_speed(u32 netdev_speed, u32 lanes, + u16 *speed, u8 *width) +{ + if (!lanes) { + if (netdev_speed <= SPEED_1000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_SDR; + } else if (netdev_speed <= SPEED_10000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_FDR10; + } else if (netdev_speed <= SPEED_20000) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_DDR; + } else if (netdev_speed <= SPEED_25000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_EDR; + } else if (netdev_speed <= SPEED_40000) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_FDR10; + } else if (netdev_speed <= SPEED_50000) { + *width = IB_WIDTH_2X; + *speed = IB_SPEED_EDR; + } else if (netdev_speed <= SPEED_100000) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_EDR; + } else if (netdev_speed <= SPEED_200000) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_HDR; + } else { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_NDR; + } + + return; + } + + switch (lanes) { + case 1: + *width = IB_WIDTH_1X; + break; + case 2: + *width = IB_WIDTH_2X; + break; + case 4: + *width = IB_WIDTH_4X; + break; + case 8: + *width = IB_WIDTH_8X; + break; + case 12: + *width = IB_WIDTH_12X; + break; + default: + *width = IB_WIDTH_1X; + } + + switch (netdev_speed / lanes) { + case SPEED_2500: + *speed = IB_SPEED_SDR; + break; + case SPEED_5000: + *speed = IB_SPEED_DDR; + break; + case SPEED_10000: + *speed = IB_SPEED_FDR10; + break; + case SPEED_14000: + *speed = IB_SPEED_FDR; + break; + case SPEED_25000: + *speed = IB_SPEED_EDR; + break; + case SPEED_50000: + *speed = IB_SPEED_HDR; + break; + case SPEED_100000: + *speed = IB_SPEED_NDR; + break; + default: + *speed = IB_SPEED_SDR; + } +} + +int ib_get_eth_speed(struct ib_device *dev, u32 port_num, u16 *speed, u8 *width) { int rc; u32 netdev_speed; struct net_device *netdev; - struct ethtool_link_ksettings lksettings; + struct ethtool_link_ksettings lksettings = {}; if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET) return -EINVAL; - if (!dev->ops.get_netdev) - return -EOPNOTSUPP; - - netdev = dev->ops.get_netdev(dev, port_num); + netdev = ib_device_get_netdev(dev, port_num); if (!netdev) return -ENODEV; @@ -1724,33 +2001,17 @@ int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width) dev_put(netdev); - if (!rc) { + if (!rc && lksettings.base.speed != (u32)SPEED_UNKNOWN) { netdev_speed = lksettings.base.speed; } else { netdev_speed = SPEED_1000; - pr_warn("%s speed is unknown, defaulting to %d\n", netdev->name, - netdev_speed); + if (rc) + pr_warn("%s speed is unknown, defaulting to %u\n", + netdev->name, netdev_speed); } - if (netdev_speed <= SPEED_1000) { - *width = IB_WIDTH_1X; - *speed = IB_SPEED_SDR; - } else if (netdev_speed <= 
SPEED_10000) { - *width = IB_WIDTH_1X; - *speed = IB_SPEED_FDR10; - } else if (netdev_speed <= SPEED_20000) { - *width = IB_WIDTH_4X; - *speed = IB_SPEED_DDR; - } else if (netdev_speed <= SPEED_25000) { - *width = IB_WIDTH_1X; - *speed = IB_SPEED_EDR; - } else if (netdev_speed <= SPEED_40000) { - *width = IB_WIDTH_4X; - *speed = IB_SPEED_FDR10; - } else { - *width = IB_WIDTH_4X; - *speed = IB_SPEED_EDR; - } + ib_get_width_and_speed(netdev_speed, lksettings.lanes, + speed, width); return 0; } @@ -1787,9 +2048,9 @@ int ib_close_qp(struct ib_qp *qp) if (real_qp == qp) return -EINVAL; - spin_lock_irqsave(&real_qp->device->event_handler_lock, flags); + spin_lock_irqsave(&real_qp->device->qp_open_list_lock, flags); list_del(&qp->open_list); - spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); + spin_unlock_irqrestore(&real_qp->device->qp_open_list_lock, flags); atomic_dec(&real_qp->usecnt); if (qp->qp_sec) @@ -1808,34 +2069,27 @@ static int __ib_destroy_shared_qp(struct ib_qp *qp) real_qp = qp->real_qp; xrcd = real_qp->xrcd; - - mutex_lock(&xrcd->tgt_qp_mutex); + down_write(&xrcd->tgt_qps_rwsem); ib_close_qp(qp); if (atomic_read(&real_qp->usecnt) == 0) - list_del(&real_qp->xrcd_list); + xa_erase(&xrcd->tgt_qps, real_qp->qp_num); else real_qp = NULL; - mutex_unlock(&xrcd->tgt_qp_mutex); + up_write(&xrcd->tgt_qps_rwsem); if (real_qp) { ret = ib_destroy_qp(real_qp); if (!ret) atomic_dec(&xrcd->usecnt); - else - __ib_insert_xrcd_qp(xrcd, real_qp); } return 0; } -int ib_destroy_qp(struct ib_qp *qp) +int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata) { const struct ib_gid_attr *alt_path_sgid_attr = qp->alt_path_sgid_attr; const struct ib_gid_attr *av_sgid_attr = qp->av_sgid_attr; - struct ib_pd *pd; - struct ib_cq *scq, *rcq; - struct ib_srq *srq; - struct ib_rwq_ind_table *ind_tbl; struct ib_qp_security *sec; int ret; @@ -1847,11 +2101,6 @@ int ib_destroy_qp(struct ib_qp *qp) if (qp->real_qp != qp) return __ib_destroy_shared_qp(qp); - pd = qp->pd; - scq = qp->send_cq; - rcq = qp->recv_cq; - srq = qp->srq; - ind_tbl = qp->rwq_ind_tbl; sec = qp->qp_sec; if (sec) ib_destroy_qp_security_begin(sec); @@ -1859,33 +2108,28 @@ int ib_destroy_qp(struct ib_qp *qp) if (!qp->uobject) rdma_rw_cleanup_mrs(qp); - rdma_restrack_del(&qp->res); - ret = qp->device->ops.destroy_qp(qp); - if (!ret) { - if (alt_path_sgid_attr) - rdma_put_gid_attr(alt_path_sgid_attr); - if (av_sgid_attr) - rdma_put_gid_attr(av_sgid_attr); - if (pd) - atomic_dec(&pd->usecnt); - if (scq) - atomic_dec(&scq->usecnt); - if (rcq) - atomic_dec(&rcq->usecnt); - if (srq) - atomic_dec(&srq->usecnt); - if (ind_tbl) - atomic_dec(&ind_tbl->usecnt); - if (sec) - ib_destroy_qp_security_end(sec); - } else { + rdma_counter_unbind_qp(qp, qp->port, true); + ret = qp->device->ops.destroy_qp(qp, udata); + if (ret) { if (sec) ib_destroy_qp_security_abort(sec); + return ret; } + if (alt_path_sgid_attr) + rdma_put_gid_attr(alt_path_sgid_attr); + if (av_sgid_attr) + rdma_put_gid_attr(av_sgid_attr); + + ib_qp_usecnt_dec(qp); + if (sec) + ib_destroy_qp_security_end(sec); + + rdma_restrack_del(&qp->res); + kfree(qp); return ret; } -EXPORT_SYMBOL(ib_destroy_qp); +EXPORT_SYMBOL(ib_destroy_qp_user); /* Completion queues */ @@ -1897,45 +2141,70 @@ struct ib_cq *__ib_create_cq(struct ib_device *device, const char *caller) { struct ib_cq *cq; + int ret; + + cq = rdma_zalloc_drv_obj(device, ib_cq); + if (!cq) + return ERR_PTR(-ENOMEM); + + cq->device = device; + cq->uobject = NULL; + cq->comp_handler = comp_handler; + cq->event_handler = 
event_handler; + cq->cq_context = cq_context; + atomic_set(&cq->usecnt, 0); - cq = device->ops.create_cq(device, cq_attr, NULL, NULL); - - if (!IS_ERR(cq)) { - cq->device = device; - cq->uobject = NULL; - cq->comp_handler = comp_handler; - cq->event_handler = event_handler; - cq->cq_context = cq_context; - atomic_set(&cq->usecnt, 0); - cq->res.type = RDMA_RESTRACK_CQ; - rdma_restrack_set_task(&cq->res, caller); - rdma_restrack_kadd(&cq->res); + rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); + rdma_restrack_set_name(&cq->res, caller); + + ret = device->ops.create_cq(cq, cq_attr, NULL); + if (ret) { + rdma_restrack_put(&cq->res); + kfree(cq); + return ERR_PTR(ret); } + rdma_restrack_add(&cq->res); return cq; } EXPORT_SYMBOL(__ib_create_cq); int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period) { + if (cq->shared) + return -EOPNOTSUPP; + return cq->device->ops.modify_cq ? cq->device->ops.modify_cq(cq, cq_count, cq_period) : -EOPNOTSUPP; } EXPORT_SYMBOL(rdma_set_cq_moderation); -int ib_destroy_cq(struct ib_cq *cq) +int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata) { + int ret; + + if (WARN_ON_ONCE(cq->shared)) + return -EOPNOTSUPP; + if (atomic_read(&cq->usecnt)) return -EBUSY; + ret = cq->device->ops.destroy_cq(cq, udata); + if (ret) + return ret; + rdma_restrack_del(&cq->res); - return cq->device->ops.destroy_cq(cq); + kfree(cq); + return ret; } -EXPORT_SYMBOL(ib_destroy_cq); +EXPORT_SYMBOL(ib_destroy_cq_user); int ib_resize_cq(struct ib_cq *cq, int cqe) { + if (cq->shared) + return -EOPNOTSUPP; + return cq->device->ops.resize_cq ? cq->device->ops.resize_cq(cq, cqe, NULL) : -EOPNOTSUPP; } @@ -1943,23 +2212,78 @@ EXPORT_SYMBOL(ib_resize_cq); /* Memory regions */ -int ib_dereg_mr(struct ib_mr *mr) +struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags) +{ + struct ib_mr *mr; + + if (access_flags & IB_ACCESS_ON_DEMAND) { + if (!(pd->device->attrs.kernel_cap_flags & + IBK_ON_DEMAND_PAGING)) { + pr_debug("ODP support not available\n"); + return ERR_PTR(-EINVAL); + } + } + + mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr, + access_flags, NULL, NULL); + + if (IS_ERR(mr)) + return mr; + + mr->device = pd->device; + mr->type = IB_MR_TYPE_USER; + mr->pd = pd; + mr->dm = NULL; + atomic_inc(&pd->usecnt); + mr->iova = virt_addr; + mr->length = length; + + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_parent_name(&mr->res, &pd->res); + rdma_restrack_add(&mr->res); + + return mr; +} +EXPORT_SYMBOL(ib_reg_user_mr); + +int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, + u32 flags, struct ib_sge *sg_list, u32 num_sge) +{ + if (!pd->device->ops.advise_mr) + return -EOPNOTSUPP; + + if (!num_sge) + return 0; + + return pd->device->ops.advise_mr(pd, advice, flags, sg_list, num_sge, + NULL); +} +EXPORT_SYMBOL(ib_advise_mr); + +int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) { struct ib_pd *pd = mr->pd; struct ib_dm *dm = mr->dm; + struct ib_dmah *dmah = mr->dmah; + struct ib_sig_attrs *sig_attrs = mr->sig_attrs; int ret; + trace_mr_dereg(mr); rdma_restrack_del(&mr->res); - ret = mr->device->ops.dereg_mr(mr); + ret = mr->device->ops.dereg_mr(mr, udata); if (!ret) { atomic_dec(&pd->usecnt); if (dm) atomic_dec(&dm->usecnt); + if (dmah) + atomic_dec(&dmah->usecnt); + kfree(sig_attrs); } return ret; } -EXPORT_SYMBOL(ib_dereg_mr); +EXPORT_SYMBOL(ib_dereg_mr_user); /** * ib_alloc_mr() - Allocates a memory region @@ -1973,78 +2297,104 @@ EXPORT_SYMBOL(ib_dereg_mr); * 
max_num_sg * used_page_size. * */ -struct ib_mr *ib_alloc_mr(struct ib_pd *pd, - enum ib_mr_type mr_type, +struct ib_mr *ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg) { struct ib_mr *mr; - if (!pd->device->ops.alloc_mr) - return ERR_PTR(-EOPNOTSUPP); + if (!pd->device->ops.alloc_mr) { + mr = ERR_PTR(-EOPNOTSUPP); + goto out; + } - mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg); - if (!IS_ERR(mr)) { - mr->device = pd->device; - mr->pd = pd; - mr->dm = NULL; - mr->uobject = NULL; - atomic_inc(&pd->usecnt); - mr->need_inval = false; - mr->res.type = RDMA_RESTRACK_MR; - rdma_restrack_kadd(&mr->res); + if (mr_type == IB_MR_TYPE_INTEGRITY) { + WARN_ON_ONCE(1); + mr = ERR_PTR(-EINVAL); + goto out; } + mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg); + if (IS_ERR(mr)) + goto out; + + mr->device = pd->device; + mr->pd = pd; + mr->dm = NULL; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + mr->need_inval = false; + mr->type = mr_type; + mr->sig_attrs = NULL; + + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_parent_name(&mr->res, &pd->res); + rdma_restrack_add(&mr->res); +out: + trace_mr_alloc(pd, mr_type, max_num_sg, mr); return mr; } EXPORT_SYMBOL(ib_alloc_mr); -/* "Fast" memory regions */ - -struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, - int mr_access_flags, - struct ib_fmr_attr *fmr_attr) +/** + * ib_alloc_mr_integrity() - Allocates an integrity memory region + * @pd: protection domain associated with the region + * @max_num_data_sg: maximum data sg entries available for registration + * @max_num_meta_sg: maximum metadata sg entries available for + * registration + * + * Notes: + * Memory registration page/sg lists must not exceed max_num_sg, + * also the integrity page/sg lists must not exceed max_num_meta_sg. 
+ * + */ +struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, + u32 max_num_data_sg, + u32 max_num_meta_sg) { - struct ib_fmr *fmr; - - if (!pd->device->ops.alloc_fmr) - return ERR_PTR(-EOPNOTSUPP); + struct ib_mr *mr; + struct ib_sig_attrs *sig_attrs; - fmr = pd->device->ops.alloc_fmr(pd, mr_access_flags, fmr_attr); - if (!IS_ERR(fmr)) { - fmr->device = pd->device; - fmr->pd = pd; - atomic_inc(&pd->usecnt); + if (!pd->device->ops.alloc_mr_integrity || + !pd->device->ops.map_mr_sg_pi) { + mr = ERR_PTR(-EOPNOTSUPP); + goto out; } - return fmr; -} -EXPORT_SYMBOL(ib_alloc_fmr); - -int ib_unmap_fmr(struct list_head *fmr_list) -{ - struct ib_fmr *fmr; - - if (list_empty(fmr_list)) - return 0; + if (!max_num_meta_sg) { + mr = ERR_PTR(-EINVAL); + goto out; + } - fmr = list_entry(fmr_list->next, struct ib_fmr, list); - return fmr->device->ops.unmap_fmr(fmr_list); -} -EXPORT_SYMBOL(ib_unmap_fmr); + sig_attrs = kzalloc(sizeof(struct ib_sig_attrs), GFP_KERNEL); + if (!sig_attrs) { + mr = ERR_PTR(-ENOMEM); + goto out; + } -int ib_dealloc_fmr(struct ib_fmr *fmr) -{ - struct ib_pd *pd; - int ret; + mr = pd->device->ops.alloc_mr_integrity(pd, max_num_data_sg, + max_num_meta_sg); + if (IS_ERR(mr)) { + kfree(sig_attrs); + goto out; + } - pd = fmr->pd; - ret = fmr->device->ops.dealloc_fmr(fmr); - if (!ret) - atomic_dec(&pd->usecnt); + mr->device = pd->device; + mr->pd = pd; + mr->dm = NULL; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + mr->need_inval = false; + mr->type = IB_MR_TYPE_INTEGRITY; + mr->sig_attrs = sig_attrs; - return ret; + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_parent_name(&mr->res, &pd->res); + rdma_restrack_add(&mr->res); +out: + trace_mr_integ_alloc(pd, max_num_data_sg, max_num_meta_sg, mr); + return mr; } -EXPORT_SYMBOL(ib_dealloc_fmr); +EXPORT_SYMBOL(ib_alloc_mr_integrity); /* Multicast groups */ @@ -2053,7 +2403,7 @@ static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid) struct ib_qp_init_attr init_attr = {}; struct ib_qp_attr attr = {}; int num_eth_ports = 0; - int port; + unsigned int port; /* If QP state >= init, it is assigned to a port and we can check this * port only. @@ -2068,7 +2418,7 @@ static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid) } /* Can't get a quick answer, iterate over all ports */ - for (port = 0; port < qp->device->phys_port_cnt; port++) + rdma_for_each_port(qp->device, port) if (rdma_port_get_link_layer(qp->device, port) != IB_LINK_LAYER_INFINIBAND) num_eth_ports++; @@ -2122,44 +2472,61 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) } EXPORT_SYMBOL(ib_detach_mcast); -struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller) +/** + * ib_alloc_xrcd_user - Allocates an XRC domain. + * @device: The device on which to allocate the XRC domain. 
+ * @inode: inode to connect XRCD + * @udata: Valid user data or NULL for kernel object + */ +struct ib_xrcd *ib_alloc_xrcd_user(struct ib_device *device, + struct inode *inode, struct ib_udata *udata) { struct ib_xrcd *xrcd; + int ret; if (!device->ops.alloc_xrcd) return ERR_PTR(-EOPNOTSUPP); - xrcd = device->ops.alloc_xrcd(device, NULL, NULL); - if (!IS_ERR(xrcd)) { - xrcd->device = device; - xrcd->inode = NULL; - atomic_set(&xrcd->usecnt, 0); - mutex_init(&xrcd->tgt_qp_mutex); - INIT_LIST_HEAD(&xrcd->tgt_qp_list); - } + xrcd = rdma_zalloc_drv_obj(device, ib_xrcd); + if (!xrcd) + return ERR_PTR(-ENOMEM); + + xrcd->device = device; + xrcd->inode = inode; + atomic_set(&xrcd->usecnt, 0); + init_rwsem(&xrcd->tgt_qps_rwsem); + xa_init(&xrcd->tgt_qps); + ret = device->ops.alloc_xrcd(xrcd, udata); + if (ret) + goto err; return xrcd; +err: + kfree(xrcd); + return ERR_PTR(ret); } -EXPORT_SYMBOL(__ib_alloc_xrcd); +EXPORT_SYMBOL(ib_alloc_xrcd_user); -int ib_dealloc_xrcd(struct ib_xrcd *xrcd) +/** + * ib_dealloc_xrcd_user - Deallocates an XRC domain. + * @xrcd: The XRC domain to deallocate. + * @udata: Valid user data or NULL for kernel object + */ +int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata) { - struct ib_qp *qp; int ret; if (atomic_read(&xrcd->usecnt)) return -EBUSY; - while (!list_empty(&xrcd->tgt_qp_list)) { - qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list); - ret = ib_destroy_qp(qp); - if (ret) - return ret; - } - - return xrcd->device->ops.dealloc_xrcd(xrcd); + WARN_ON(!xa_empty(&xrcd->tgt_qps)); + ret = xrcd->device->ops.dealloc_xrcd(xrcd, udata); + if (ret) + return ret; + kfree(xrcd); + return ret; } -EXPORT_SYMBOL(ib_dealloc_xrcd); +EXPORT_SYMBOL(ib_dealloc_xrcd_user); /** * ib_create_wq - Creates a WQ associated with the specified protection @@ -2201,109 +2568,28 @@ struct ib_wq *ib_create_wq(struct ib_pd *pd, EXPORT_SYMBOL(ib_create_wq); /** - * ib_destroy_wq - Destroys the specified WQ. + * ib_destroy_wq_user - Destroys the specified user WQ. * @wq: The WQ to destroy. + * @udata: Valid user data */ -int ib_destroy_wq(struct ib_wq *wq) +int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata) { - int err; struct ib_cq *cq = wq->cq; struct ib_pd *pd = wq->pd; + int ret; if (atomic_read(&wq->usecnt)) return -EBUSY; - err = wq->device->ops.destroy_wq(wq); - if (!err) { - atomic_dec(&pd->usecnt); - atomic_dec(&cq->usecnt); - } - return err; -} -EXPORT_SYMBOL(ib_destroy_wq); - -/** - * ib_modify_wq - Modifies the specified WQ. - * @wq: The WQ to modify. - * @wq_attr: On input, specifies the WQ attributes to modify. - * @wq_attr_mask: A bit-mask used to specify which attributes of the WQ - * are being modified. - * On output, the current values of selected WQ attributes are returned. - */ -int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, - u32 wq_attr_mask) -{ - int err; - - if (!wq->device->ops.modify_wq) - return -EOPNOTSUPP; - - err = wq->device->ops.modify_wq(wq, wq_attr, wq_attr_mask, NULL); - return err; -} -EXPORT_SYMBOL(ib_modify_wq); - -/* - * ib_create_rwq_ind_table - Creates a RQ Indirection Table. - * @device: The device on which to create the rwq indirection table. - * @ib_rwq_ind_table_init_attr: A list of initial attributes required to - * create the Indirection Table. - * - * Note: The life time of ib_rwq_ind_table_init_attr->ind_tbl is not less - * than the created ib_rwq_ind_table object and the caller is responsible - * for its memory allocation/free. 
- */ -struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device, - struct ib_rwq_ind_table_init_attr *init_attr) -{ - struct ib_rwq_ind_table *rwq_ind_table; - int i; - u32 table_size; - - if (!device->ops.create_rwq_ind_table) - return ERR_PTR(-EOPNOTSUPP); - - table_size = (1 << init_attr->log_ind_tbl_size); - rwq_ind_table = device->ops.create_rwq_ind_table(device, - init_attr, NULL); - if (IS_ERR(rwq_ind_table)) - return rwq_ind_table; - - rwq_ind_table->ind_tbl = init_attr->ind_tbl; - rwq_ind_table->log_ind_tbl_size = init_attr->log_ind_tbl_size; - rwq_ind_table->device = device; - rwq_ind_table->uobject = NULL; - atomic_set(&rwq_ind_table->usecnt, 0); - - for (i = 0; i < table_size; i++) - atomic_inc(&rwq_ind_table->ind_tbl[i]->usecnt); - - return rwq_ind_table; -} -EXPORT_SYMBOL(ib_create_rwq_ind_table); - -/* - * ib_destroy_rwq_ind_table - Destroys the specified Indirection Table. - * @wq_ind_table: The Indirection Table to destroy. -*/ -int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table) -{ - int err, i; - u32 table_size = (1 << rwq_ind_table->log_ind_tbl_size); - struct ib_wq **ind_tbl = rwq_ind_table->ind_tbl; - - if (atomic_read(&rwq_ind_table->usecnt)) - return -EBUSY; - - err = rwq_ind_table->device->ops.destroy_rwq_ind_table(rwq_ind_table); - if (!err) { - for (i = 0; i < table_size; i++) - atomic_dec(&ind_tbl[i]->usecnt); - } + ret = wq->device->ops.destroy_wq(wq, udata); + if (ret) + return ret; - return err; + atomic_dec(&pd->usecnt); + atomic_dec(&cq->usecnt); + return ret; } -EXPORT_SYMBOL(ib_destroy_rwq_ind_table); +EXPORT_SYMBOL(ib_destroy_wq_user); int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status) @@ -2315,7 +2601,7 @@ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, } EXPORT_SYMBOL(ib_check_mr_status); -int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, +int ib_set_vf_link_state(struct ib_device *device, int vf, u32 port, int state) { if (!device->ops.set_vf_link_state) @@ -2325,7 +2611,7 @@ int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, } EXPORT_SYMBOL(ib_set_vf_link_state); -int ib_get_vf_config(struct ib_device *device, int vf, u8 port, +int ib_get_vf_config(struct ib_device *device, int vf, u32 port, struct ifla_vf_info *info) { if (!device->ops.get_vf_config) @@ -2335,7 +2621,7 @@ int ib_get_vf_config(struct ib_device *device, int vf, u8 port, } EXPORT_SYMBOL(ib_get_vf_config); -int ib_get_vf_stats(struct ib_device *device, int vf, u8 port, +int ib_get_vf_stats(struct ib_device *device, int vf, u32 port, struct ifla_vf_stats *stats) { if (!device->ops.get_vf_stats) @@ -2345,7 +2631,7 @@ int ib_get_vf_stats(struct ib_device *device, int vf, u8 port, } EXPORT_SYMBOL(ib_get_vf_stats); -int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, +int ib_set_vf_guid(struct ib_device *device, int vf, u32 port, u64 guid, int type) { if (!device->ops.set_vf_guid) @@ -2355,6 +2641,53 @@ int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, } EXPORT_SYMBOL(ib_set_vf_guid); +int ib_get_vf_guid(struct ib_device *device, int vf, u32 port, + struct ifla_vf_guid *node_guid, + struct ifla_vf_guid *port_guid) +{ + if (!device->ops.get_vf_guid) + return -EOPNOTSUPP; + + return device->ops.get_vf_guid(device, vf, port, node_guid, port_guid); +} +EXPORT_SYMBOL(ib_get_vf_guid); +/** + * ib_map_mr_sg_pi() - Map the dma mapped SG lists for PI (protection + * information) and set an appropriate memory region for registration. 
+ * @mr: memory region + * @data_sg: dma mapped scatterlist for data + * @data_sg_nents: number of entries in data_sg + * @data_sg_offset: offset in bytes into data_sg + * @meta_sg: dma mapped scatterlist for metadata + * @meta_sg_nents: number of entries in meta_sg + * @meta_sg_offset: offset in bytes into meta_sg + * @page_size: page vector desired page size + * + * Constraints: + * - The MR must be allocated with type IB_MR_TYPE_INTEGRITY. + * + * Return: 0 on success. + * + * After this completes successfully, the memory region + * is ready for registration. + */ +int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset, unsigned int page_size) +{ + if (unlikely(!mr->device->ops.map_mr_sg_pi || + WARN_ON_ONCE(mr->type != IB_MR_TYPE_INTEGRITY))) + return -EOPNOTSUPP; + + mr->page_size = page_size; + + return mr->device->ops.map_mr_sg_pi(mr, data_sg, data_sg_nents, + data_sg_offset, meta_sg, + meta_sg_nents, meta_sg_offset); +} +EXPORT_SYMBOL(ib_map_mr_sg_pi); + /** * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list * and set it the memory region. @@ -2365,6 +2698,7 @@ EXPORT_SYMBOL(ib_set_vf_guid); * @page_size: page vector desired page size * * Constraints: + * * - The first sg element is allowed to have an offset. * - Each sg element must either be aligned to page_size or virtually * contiguous to the previous element. In case an sg element has a @@ -2398,10 +2732,12 @@ EXPORT_SYMBOL(ib_map_mr_sg); * @mr: memory region * @sgl: dma mapped scatterlist * @sg_nents: number of entries in sg - * @sg_offset_p: IN: start offset in bytes into sg - * OUT: offset in bytes for element n of the sg of the first + * @sg_offset_p: ==== ======================================================= + * IN start offset in bytes into sg + * OUT offset in bytes for element n of the sg of the first * byte that has not been processed where n is the return * value of this function. + * ==== ======================================================= * @set_page: driver page assignment function pointer * * Core service helper for drivers to convert the largest @@ -2566,6 +2902,72 @@ static void __ib_drain_rq(struct ib_qp *qp) wait_for_completion(&rdrain.done); } +/* + * __ib_drain_srq() - Block until Last WQE Reached event arrives, or timeout + * expires. + * @qp: queue pair associated with SRQ to drain + * + * Quoting 10.3.1 Queue Pair and EE Context States: + * + * Note, for QPs that are associated with an SRQ, the Consumer should take the + * QP through the Error State before invoking a Destroy QP or a Modify QP to the + * Reset State. The Consumer may invoke the Destroy QP without first performing + * a Modify QP to the Error State and waiting for the Affiliated Asynchronous + * Last WQE Reached Event. However, if the Consumer does not wait for the + * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment + * leakage may occur. 
Therefore, it is good programming practice to tear down a + * QP that is associated with an SRQ by using the following process: + * + * - Put the QP in the Error State + * - Wait for the Affiliated Asynchronous Last WQE Reached Event; + * - either: + * drain the CQ by invoking the Poll CQ verb and either wait for CQ + * to be empty or the number of Poll CQ operations has exceeded + * CQ capacity size; + * - or + * post another WR that completes on the same CQ and wait for this + * WR to return as a WC; + * - and then invoke a Destroy QP or Reset QP. + * + * We use the first option. + */ +static void __ib_drain_srq(struct ib_qp *qp) +{ + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct ib_cq *cq; + int n, polled = 0; + int ret; + + if (!qp->srq) { + WARN_ONCE(1, "QP 0x%p is not associated with SRQ\n", qp); + return; + } + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + WARN_ONCE(ret, "failed to drain shared recv queue: %d\n", ret); + return; + } + + if (ib_srq_has_cq(qp->srq->srq_type)) { + cq = qp->srq->ext.cq; + } else if (qp->recv_cq) { + cq = qp->recv_cq; + } else { + WARN_ONCE(1, "QP 0x%p has no CQ associated with SRQ\n", qp); + return; + } + + if (wait_for_completion_timeout(&qp->srq_completion, 60 * HZ) > 0) { + while (polled != cq->cqe) { + n = ib_process_cq_direct(cq, cq->cqe - polled); + if (!n) + return; + polled += n; + } + } +} + /** * ib_drain_sq() - Block until all SQ CQEs have been consumed by the * application. @@ -2591,6 +2993,7 @@ void ib_drain_sq(struct ib_qp *qp) qp->device->ops.drain_sq(qp); else __ib_drain_sq(qp); + trace_cq_drain_complete(qp->send_cq); } EXPORT_SYMBOL(ib_drain_sq); @@ -2619,6 +3022,7 @@ void ib_drain_rq(struct ib_qp *qp) qp->device->ops.drain_rq(qp); else __ib_drain_rq(qp); + trace_cq_drain_complete(qp->recv_cq); } EXPORT_SYMBOL(ib_drain_rq); @@ -2642,10 +3046,12 @@ void ib_drain_qp(struct ib_qp *qp) ib_drain_sq(qp); if (!qp->srq) ib_drain_rq(qp); + else + __ib_drain_srq(qp); } EXPORT_SYMBOL(ib_drain_qp); -struct net_device *rdma_alloc_netdev(struct ib_device *device, u8 port_num, +struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *)) @@ -2671,7 +3077,7 @@ struct net_device *rdma_alloc_netdev(struct ib_device *device, u8 port_num, } EXPORT_SYMBOL(rdma_alloc_netdev); -int rdma_init_netdev(struct ib_device *device, u8 port_num, +int rdma_init_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), @@ -2692,3 +3098,90 @@ int rdma_init_netdev(struct ib_device *device, u8 port_num, netdev, params.param); } EXPORT_SYMBOL(rdma_init_netdev); + +void __rdma_block_iter_start(struct ib_block_iter *biter, + struct scatterlist *sglist, unsigned int nents, + unsigned long pgsz) +{ + memset(biter, 0, sizeof(struct ib_block_iter)); + biter->__sg = sglist; + biter->__sg_nents = nents; + + /* Driver provides best block size to use */ + biter->__pg_bit = __fls(pgsz); +} +EXPORT_SYMBOL(__rdma_block_iter_start); + +bool __rdma_block_iter_next(struct ib_block_iter *biter) +{ + unsigned int block_offset; + unsigned int delta; + + if (!biter->__sg_nents || !biter->__sg) + return false; + + biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance; + block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1); + delta = BIT_ULL(biter->__pg_bit) - block_offset; + + while (biter->__sg_nents && 
biter->__sg && + sg_dma_len(biter->__sg) - biter->__sg_advance <= delta) { + delta -= sg_dma_len(biter->__sg) - biter->__sg_advance; + biter->__sg_advance = 0; + biter->__sg = sg_next(biter->__sg); + biter->__sg_nents--; + } + biter->__sg_advance += delta; + + return true; +} +EXPORT_SYMBOL(__rdma_block_iter_next); + +/** + * rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct + * for the drivers. + * @descs: array of static descriptors + * @num_counters: number of elements in array + * @lifespan: milliseconds between updates + */ +struct rdma_hw_stats *rdma_alloc_hw_stats_struct( + const struct rdma_stat_desc *descs, int num_counters, + unsigned long lifespan) +{ + struct rdma_hw_stats *stats; + + stats = kzalloc(struct_size(stats, value, num_counters), GFP_KERNEL); + if (!stats) + return NULL; + + stats->is_disabled = kcalloc(BITS_TO_LONGS(num_counters), + sizeof(*stats->is_disabled), GFP_KERNEL); + if (!stats->is_disabled) + goto err; + + stats->descs = descs; + stats->num_counters = num_counters; + stats->lifespan = msecs_to_jiffies(lifespan); + mutex_init(&stats->lock); + + return stats; + +err: + kfree(stats); + return NULL; +} +EXPORT_SYMBOL(rdma_alloc_hw_stats_struct); + +/** + * rdma_free_hw_stats_struct - Helper function to release rdma_hw_stats + * @stats: statistics to release + */ +void rdma_free_hw_stats_struct(struct rdma_hw_stats *stats) +{ + if (!stats) + return; + + kfree(stats->is_disabled); + kfree(stats); +} +EXPORT_SYMBOL(rdma_free_hw_stats_struct); |
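
Illustrative usage, not part of the patch above: a minimal sketch of how a provider driver might hook up the rdma_alloc_hw_stats_struct()/rdma_free_hw_stats_struct() helpers added at the end of verbs.c. The "demo" descriptor names and the exact ops-callback wiring (alloc_hw_device_stats) are assumptions for the example only; the allocation call itself matches the helper's signature in the hunk above.

    #include <linux/kernel.h>
    #include <rdma/ib_verbs.h>

    /* Hypothetical counter descriptors for an imaginary "demo" device. */
    static const struct rdma_stat_desc demo_stat_descs[] = {
            { .name = "demo_tx_packets" },
            { .name = "demo_rx_packets" },
    };

    /* Candidate .alloc_hw_device_stats implementation for ib_device_ops. */
    static struct rdma_hw_stats *demo_alloc_hw_device_stats(struct ib_device *ibdev)
    {
            /*
             * lifespan (in ms) bounds how often the core re-reads the
             * counters from the driver; RDMA_HW_STATS_DEFAULT_LIFESPAN
             * is the usual choice.
             */
            return rdma_alloc_hw_stats_struct(demo_stat_descs,
                                              ARRAY_SIZE(demo_stat_descs),
                                              RDMA_HW_STATS_DEFAULT_LIFESPAN);
    }

On teardown the structure is released with rdma_free_hw_stats_struct(), which also frees the per-counter is_disabled bitmap that the allocator sets up.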
