diff options
Diffstat (limited to 'drivers/infiniband/core/cma.c')
| -rw-r--r-- | drivers/infiniband/core/cma.c | 3554 |
1 files changed, 2329 insertions, 1225 deletions
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 0eb393237ba2..95e89f5c147c 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1,36 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. - * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 1999-2019, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. */ #include <linux/completion.h> @@ -38,8 +11,9 @@ #include <linux/in6.h> #include <linux/mutex.h> #include <linux/random.h> +#include <linux/rbtree.h> #include <linux/igmp.h> -#include <linux/idr.h> +#include <linux/xarray.h> #include <linux/inetdevice.h> #include <linux/slab.h> #include <linux/module.h> @@ -47,6 +21,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netevent.h> #include <net/tcp.h> #include <net/ipv6.h> #include <net/ip_fib.h> @@ -62,16 +37,17 @@ #include <rdma/iw_cm.h> #include "core_priv.h" +#include "cma_priv.h" +#include "cma_trace.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 -#define CMA_QUERY_CLASSPORT_INFO_TIMEOUT 3000 #define CMA_MAX_CM_RETRIES 15 -#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) -#define CMA_IBOE_PACKET_LIFETIME 18 +#define CMA_IBOE_PACKET_LIFETIME 16 +#define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP static const char * const cma_events[] = { [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved", @@ -92,6 +68,11 @@ static const char * const cma_events[] = { [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit", }; +static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, + enum ib_gid_type gid_type); + +static void cma_netevent_work_handler(struct work_struct *_work); + const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) { size_t index = event; @@ -115,7 +96,13 @@ const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id, } EXPORT_SYMBOL(rdma_reject_msg); -bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) +/** + * rdma_is_consumer_reject - return true if the consumer rejected the connect + * request. + * @id: Communication identifier that received the REJECT event. + * @reason: Value returned in the REJECT event status field. + */ +static bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) { if (rdma_ib_or_roce(id->device, id->port_num)) return reason == IB_CM_REJ_CONSUMER_DEFINED; @@ -126,7 +113,6 @@ bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) WARN_ON_ONCE(1); return false; } -EXPORT_SYMBOL(rdma_is_consumer_reject); const void *rdma_consumer_reject_data(struct rdma_cm_id *id, struct rdma_cm_event *ev, u8 *data_len) @@ -144,7 +130,22 @@ const void *rdma_consumer_reject_data(struct rdma_cm_id *id, } EXPORT_SYMBOL(rdma_consumer_reject_data); -static void cma_add_one(struct ib_device *device); +/** + * rdma_iw_cm_id() - return the iw_cm_id pointer for this cm_id. + * @id: Communication Identifier + */ +struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv; + + id_priv = container_of(id, struct rdma_id_private, id); + if (id->device->node_type == RDMA_NODE_RNIC) + return id_priv->cm_id.iw; + return NULL; +} +EXPORT_SYMBOL(rdma_iw_cm_id); + +static int cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device, void *client_data); static struct ib_client cma_client = { @@ -154,18 +155,20 @@ static struct ib_client cma_client = { }; static struct ib_sa_client sa_client; -static struct rdma_addr_client addr_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); +static struct rb_root id_table = RB_ROOT; +/* Serialize operations of id_table tree */ +static DEFINE_SPINLOCK(id_table_lock); static struct workqueue_struct *cma_wq; static unsigned int cma_pernet_id; struct cma_pernet { - struct idr tcp_ps; - struct idr udp_ps; - struct idr ipoib_ps; - struct idr ib_ps; + struct xarray tcp_ps; + struct xarray udp_ps; + struct xarray ipoib_ps; + struct xarray ib_ps; }; static struct cma_pernet *cma_pernet(struct net *net) @@ -173,7 +176,8 @@ static struct cma_pernet *cma_pernet(struct net *net) return net_generic(net, cma_pernet_id); } -static struct idr *cma_pernet_idr(struct net *net, enum rdma_port_space ps) +static +struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps) { struct cma_pernet *pernet = cma_pernet(net); @@ -191,60 +195,64 @@ static struct idr *cma_pernet_idr(struct net *net, enum rdma_port_space ps) } } +struct id_table_entry { + struct list_head id_list; + struct rb_node rb_node; +}; + struct cma_device { struct list_head list; struct ib_device *device; struct completion comp; - atomic_t refcount; + refcount_t refcount; struct list_head id_list; enum ib_gid_type *default_gid_type; u8 *default_roce_tos; }; struct rdma_bind_list { - enum rdma_port_space ps; + enum rdma_ucm_port_space ps; struct hlist_head owners; unsigned short port; }; -struct class_port_info_context { - struct ib_class_port_info *class_port_info; - struct ib_device *device; - struct completion done; - struct ib_sa_query *sa_query; - u8 port_num; -}; - -static int cma_ps_alloc(struct net *net, enum rdma_port_space ps, +static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps, struct rdma_bind_list *bind_list, int snum) { - struct idr *idr = cma_pernet_idr(net, ps); + struct xarray *xa = cma_pernet_xa(net, ps); - return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL); + return xa_insert(xa, snum, bind_list, GFP_KERNEL); } static struct rdma_bind_list *cma_ps_find(struct net *net, - enum rdma_port_space ps, int snum) + enum rdma_ucm_port_space ps, int snum) { - struct idr *idr = cma_pernet_idr(net, ps); + struct xarray *xa = cma_pernet_xa(net, ps); - return idr_find(idr, snum); + return xa_load(xa, snum); } -static void cma_ps_remove(struct net *net, enum rdma_port_space ps, int snum) +static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps, + int snum) { - struct idr *idr = cma_pernet_idr(net, ps); + struct xarray *xa = cma_pernet_xa(net, ps); - idr_remove(idr, snum); + xa_erase(xa, snum); } enum { CMA_OPTION_AFONLY, }; -void cma_ref_dev(struct cma_device *cma_dev) +void cma_dev_get(struct cma_device *cma_dev) { - atomic_inc(&cma_dev->refcount); + refcount_inc(&cma_dev->refcount); +} + +void cma_dev_put(struct cma_device *cma_dev) +{ + if (refcount_dec_and_test(&cma_dev->refcount)) + complete(&cma_dev->comp); } struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, @@ -262,13 +270,13 @@ struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, } if (found_cma_dev) - cma_ref_dev(found_cma_dev); + cma_dev_get(found_cma_dev); mutex_unlock(&lock); return found_cma_dev; } int cma_get_default_gid_type(struct cma_device *cma_dev, - unsigned int port) + u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; @@ -277,7 +285,7 @@ int cma_get_default_gid_type(struct cma_device *cma_dev, } int cma_set_default_gid_type(struct cma_device *cma_dev, - unsigned int port, + u32 port, enum ib_gid_type default_gid_type) { unsigned long supported_gids; @@ -285,6 +293,10 @@ int cma_set_default_gid_type(struct cma_device *cma_dev, if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; + if (default_gid_type == IB_GID_TYPE_IB && + rdma_protocol_roce_eth_encap(cma_dev->device, port)) + default_gid_type = IB_GID_TYPE_ROCE; + supported_gids = roce_gid_type_mask_support(cma_dev->device, port); if (!(supported_gids & 1 << default_gid_type)) @@ -296,7 +308,7 @@ int cma_set_default_gid_type(struct cma_device *cma_dev, return 0; } -int cma_get_default_roce_tos(struct cma_device *cma_dev, unsigned int port) +int cma_get_default_roce_tos(struct cma_device *cma_dev, u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; @@ -304,7 +316,7 @@ int cma_get_default_roce_tos(struct cma_device *cma_dev, unsigned int port) return cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)]; } -int cma_set_default_roce_tos(struct cma_device *cma_dev, unsigned int port, +int cma_set_default_roce_tos(struct cma_device *cma_dev, u32 port, u8 default_roce_tos) { if (!rdma_is_port_valid(cma_dev->device, port)) @@ -326,57 +338,19 @@ struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) * We do this by disabling removal notification while a callback is in process, * and reporting it after the callback completes. */ -struct rdma_id_private { - struct rdma_cm_id id; - - struct rdma_bind_list *bind_list; - struct hlist_node node; - struct list_head list; /* listen_any_list or cma_device.list */ - struct list_head listen_list; /* per device listens */ - struct cma_device *cma_dev; - struct list_head mc_list; - - int internal_id; - enum rdma_cm_state state; - spinlock_t lock; - struct mutex qp_mutex; - - struct completion comp; - atomic_t refcount; - struct mutex handler_mutex; - - int backlog; - int timeout_ms; - struct ib_sa_query *query; - int query_id; - union { - struct ib_cm_id *ib; - struct iw_cm_id *iw; - } cm_id; - - u32 seq_num; - u32 qkey; - u32 qp_num; - pid_t owner; - u32 options; - u8 srq; - u8 tos; - bool tos_set; - u8 reuseaddr; - u8 afonly; - enum ib_gid_type gid_type; -}; struct cma_multicast { struct rdma_id_private *id_priv; union { - struct ib_sa_multicast *ib; - } multicast; + struct ib_sa_multicast *sa_mc; + struct { + struct work_struct work; + struct rdma_cm_event event; + } iboe_join; + }; struct list_head list; void *context; struct sockaddr_storage addr; - struct kref mcref; - bool igmp_joined; u8 join_state; }; @@ -388,18 +362,6 @@ struct cma_work { struct rdma_cm_event event; }; -struct cma_ndev_work { - struct work_struct work; - struct rdma_id_private *id; - struct rdma_cm_event event; -}; - -struct iboe_mcast_work { - struct work_struct work; - struct rdma_id_private *id; - struct cma_multicast *mc; -}; - union cma_ip_addr { struct in6_addr ip6; struct { @@ -419,31 +381,31 @@ struct cma_hdr { #define CMA_VERSION 0x00 struct cma_req_info { + struct sockaddr_storage listen_addr_storage; + struct sockaddr_storage src_addr_storage; struct ib_device *device; - int port; union ib_gid local_gid; __be64 service_id; + int port; + bool has_gid; u16 pkey; - bool has_gid:1; }; -static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&id_priv->lock, flags); - ret = (id_priv->state == comp); - spin_unlock_irqrestore(&id_priv->lock, flags); - return ret; -} - static int cma_comp_exch(struct rdma_id_private *id_priv, enum rdma_cm_state comp, enum rdma_cm_state exch) { unsigned long flags; int ret; + /* + * The FSM uses a funny double locking where state is protected by both + * the handler_mutex and the spinlock. State is not allowed to change + * to/from a handler_mutex protected value without also holding + * handler_mutex. + */ + if (comp == RDMA_CM_CONNECT || exch == RDMA_CM_CONNECT) + lockdep_assert_held(&id_priv->handler_mutex); + spin_lock_irqsave(&id_priv->lock, flags); if ((ret = (id_priv->state == comp))) id_priv->state = exch; @@ -451,27 +413,24 @@ static int cma_comp_exch(struct rdma_id_private *id_priv, return ret; } -static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv, - enum rdma_cm_state exch) +static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) { - unsigned long flags; - enum rdma_cm_state old; + return hdr->ip_version >> 4; +} - spin_lock_irqsave(&id_priv->lock, flags); - old = id_priv->state; - id_priv->state = exch; - spin_unlock_irqrestore(&id_priv->lock, flags); - return old; +static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) +{ + hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } -static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) +static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) { - return hdr->ip_version >> 4; + return (struct sockaddr *)&id_priv->id.route.addr.src_addr; } -static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) +static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) { - hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); + return (struct sockaddr *)&id_priv->id.route.addr.dst_addr; } static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) @@ -494,16 +453,135 @@ static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) return (in_dev) ? 0 : -ENODEV; } +static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa, + struct id_table_entry *entry_b) +{ + struct rdma_id_private *id_priv = list_first_entry( + &entry_b->id_list, struct rdma_id_private, id_list_entry); + int ifindex_b = id_priv->id.route.addr.dev_addr.bound_dev_if; + struct sockaddr *sb = cma_dst_addr(id_priv); + + if (ifindex_a != ifindex_b) + return (ifindex_a > ifindex_b) ? 1 : -1; + + if (sa->sa_family != sb->sa_family) + return sa->sa_family - sb->sa_family; + + if (sa->sa_family == AF_INET && + __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in)) { + return memcmp(&((struct sockaddr_in *)sa)->sin_addr, + &((struct sockaddr_in *)sb)->sin_addr, + sizeof(((struct sockaddr_in *)sa)->sin_addr)); + } + + if (sa->sa_family == AF_INET6 && + __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in6)) { + return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr, + &((struct sockaddr_in6 *)sb)->sin6_addr); + } + + return -1; +} + +static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv) +{ + struct rb_node **new, *parent = NULL; + struct id_table_entry *this, *node; + unsigned long flags; + int result; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + spin_lock_irqsave(&id_table_lock, flags); + new = &id_table.rb_node; + while (*new) { + this = container_of(*new, struct id_table_entry, rb_node); + result = compare_netdev_and_ip( + node_id_priv->id.route.addr.dev_addr.bound_dev_if, + cma_dst_addr(node_id_priv), this); + + parent = *new; + if (result < 0) + new = &((*new)->rb_left); + else if (result > 0) + new = &((*new)->rb_right); + else { + list_add_tail(&node_id_priv->id_list_entry, + &this->id_list); + kfree(node); + goto unlock; + } + } + + INIT_LIST_HEAD(&node->id_list); + list_add_tail(&node_id_priv->id_list_entry, &node->id_list); + + rb_link_node(&node->rb_node, parent, new); + rb_insert_color(&node->rb_node, &id_table); + +unlock: + spin_unlock_irqrestore(&id_table_lock, flags); + return 0; +} + +static struct id_table_entry * +node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa) +{ + struct rb_node *node = root->rb_node; + struct id_table_entry *data; + int result; + + while (node) { + data = container_of(node, struct id_table_entry, rb_node); + result = compare_netdev_and_ip(ifindex, sa, data); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return data; + } + + return NULL; +} + +static void cma_remove_id_from_tree(struct rdma_id_private *id_priv) +{ + struct id_table_entry *data; + unsigned long flags; + + spin_lock_irqsave(&id_table_lock, flags); + if (list_empty(&id_priv->id_list_entry)) + goto out; + + data = node_from_ndev_ip(&id_table, + id_priv->id.route.addr.dev_addr.bound_dev_if, + cma_dst_addr(id_priv)); + if (!data) + goto out; + + list_del_init(&id_priv->id_list_entry); + if (list_empty(&data->id_list)) { + rb_erase(&data->rb_node, &id_table); + kfree(data); + } +out: + spin_unlock_irqrestore(&id_table_lock, flags); +} + static void _cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { - cma_ref_dev(cma_dev); + cma_dev_get(cma_dev); id_priv->cma_dev = cma_dev; - id_priv->gid_type = 0; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); - list_add_tail(&id_priv->list, &cma_dev->id_list); + list_add_tail(&id_priv->device_item, &cma_dev->id_list); + + trace_cm_id_attach(id_priv, cma_dev->device); } static void cma_attach_to_dev(struct rdma_id_private *id_priv, @@ -515,60 +593,30 @@ static void cma_attach_to_dev(struct rdma_id_private *id_priv, rdma_start_port(cma_dev->device)]; } -void cma_deref_dev(struct cma_device *cma_dev) -{ - if (atomic_dec_and_test(&cma_dev->refcount)) - complete(&cma_dev->comp); -} - -static inline void release_mc(struct kref *kref) -{ - struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref); - - kfree(mc->multicast.ib); - kfree(mc); -} - static void cma_release_dev(struct rdma_id_private *id_priv) { mutex_lock(&lock); - list_del(&id_priv->list); - cma_deref_dev(id_priv->cma_dev); + list_del_init(&id_priv->device_item); + cma_dev_put(id_priv->cma_dev); id_priv->cma_dev = NULL; + id_priv->id.device = NULL; + if (id_priv->id.route.addr.dev_addr.sgid_attr) { + rdma_put_gid_attr(id_priv->id.route.addr.dev_addr.sgid_attr); + id_priv->id.route.addr.dev_addr.sgid_attr = NULL; + } mutex_unlock(&lock); } -static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) -{ - return (struct sockaddr *) &id_priv->id.route.addr.src_addr; -} - -static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) -{ - return (struct sockaddr *) &id_priv->id.route.addr.dst_addr; -} - static inline unsigned short cma_family(struct rdma_id_private *id_priv) { return id_priv->id.route.addr.src_addr.ss_family; } -static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) +static int cma_set_default_qkey(struct rdma_id_private *id_priv) { struct ib_sa_mcmember_rec rec; int ret = 0; - if (id_priv->qkey) { - if (qkey && id_priv->qkey != qkey) - return -EINVAL; - return 0; - } - - if (qkey) { - id_priv->qkey = qkey; - return 0; - } - switch (id_priv->id.ps) { case RDMA_PS_UDP: case RDMA_PS_IB: @@ -588,6 +636,16 @@ static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) return ret; } +static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) +{ + if (!qkey || + (id_priv->qkey && (id_priv->qkey != qkey))) + return -EINVAL; + + id_priv->qkey = qkey; + return 0; +} + static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) { dev_addr->dev_type = ARPHRD_INFINIBAND; @@ -600,7 +658,7 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a int ret; if (addr->sa_family != AF_IB) { - ret = rdma_translate_ip(addr, dev_addr, NULL); + ret = rdma_translate_ip(addr, dev_addr); } else { cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); ret = 0; @@ -609,99 +667,257 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a return ret; } -static inline int cma_validate_port(struct ib_device *device, u8 port, - enum ib_gid_type gid_type, - union ib_gid *gid, int dev_type, - int bound_if_index) +static const struct ib_gid_attr * +cma_validate_port(struct ib_device *device, u32 port, + enum ib_gid_type gid_type, + union ib_gid *gid, + struct rdma_id_private *id_priv) { - int ret = -ENODEV; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + const struct ib_gid_attr *sgid_attr = ERR_PTR(-ENODEV); + int bound_if_index = dev_addr->bound_dev_if; + int dev_type = dev_addr->dev_type; struct net_device *ndev = NULL; + struct net_device *pdev = NULL; + + if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net)) + goto out; if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) - return ret; + goto out; if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) - return ret; + goto out; - if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) - ndev = dev_get_by_index(&init_net, bound_if_index); - else - gid_type = IB_GID_TYPE_IB; + /* + * For drivers that do not associate more than one net device with + * their gid tables, such as iWARP drivers, it is sufficient to + * return the first table entry. + * + * Other driver classes might be included in the future. + */ + if (rdma_protocol_iwarp(device, port)) { + sgid_attr = rdma_get_gid_attr(device, port, 0); + if (IS_ERR(sgid_attr)) + goto out; + rcu_read_lock(); + ndev = rcu_dereference(sgid_attr->ndev); + if (ndev->ifindex != bound_if_index) { + pdev = dev_get_by_index_rcu(dev_addr->net, bound_if_index); + if (pdev) { + if (is_vlan_dev(pdev)) { + pdev = vlan_dev_real_dev(pdev); + if (ndev->ifindex == pdev->ifindex) + bound_if_index = pdev->ifindex; + } + if (is_vlan_dev(ndev)) { + pdev = vlan_dev_real_dev(ndev); + if (bound_if_index == pdev->ifindex) + bound_if_index = ndev->ifindex; + } + } + } + if (!net_eq(dev_net(ndev), dev_addr->net) || + ndev->ifindex != bound_if_index) { + rdma_put_gid_attr(sgid_attr); + sgid_attr = ERR_PTR(-ENODEV); + } + rcu_read_unlock(); + goto out; + } - ret = ib_find_cached_gid_by_port(device, gid, gid_type, port, - ndev, NULL); + /* + * For a RXE device, it should work with TUN device and normal ethernet + * devices. Use driver_id to check if a device is a RXE device or not. + * ARPHDR_NONE means a TUN device. + */ + if (device->ops.driver_id == RDMA_DRIVER_RXE) { + if ((dev_type == ARPHRD_NONE || dev_type == ARPHRD_ETHER) + && rdma_protocol_roce(device, port)) { + ndev = dev_get_by_index(dev_addr->net, bound_if_index); + if (!ndev) + goto out; + } + } else { + if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { + ndev = dev_get_by_index(dev_addr->net, bound_if_index); + if (!ndev) + goto out; + } else { + gid_type = IB_GID_TYPE_IB; + } + } - if (ndev) - dev_put(ndev); + sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev); + dev_put(ndev); +out: + return sgid_attr; +} - return ret; +static void cma_bind_sgid_attr(struct rdma_id_private *id_priv, + const struct ib_gid_attr *sgid_attr) +{ + WARN_ON(id_priv->id.route.addr.dev_addr.sgid_attr); + id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr; } -static int cma_acquire_dev(struct rdma_id_private *id_priv, - struct rdma_id_private *listen_id_priv) +/** + * cma_acquire_dev_by_src_ip - Acquire cma device, port, gid attribute + * based on source ip address. + * @id_priv: cm_id which should be bound to cma device + * + * cma_acquire_dev_by_src_ip() binds cm id to cma device, port and GID attribute + * based on source IP address. It returns 0 on success or error code otherwise. + * It is applicable to active and passive side cm_id. + */ +static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; - struct cma_device *cma_dev; + const struct ib_gid_attr *sgid_attr; union ib_gid gid, iboe_gid, *gidp; + struct cma_device *cma_dev; + enum ib_gid_type gid_type; int ret = -ENODEV; - u8 port; + u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; - mutex_lock(&lock); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &iboe_gid); memcpy(&gid, dev_addr->src_dev_addr + - rdma_addr_gid_offset(dev_addr), sizeof gid); - - if (listen_id_priv) { - cma_dev = listen_id_priv->cma_dev; - port = listen_id_priv->id.port_num; - gidp = rdma_protocol_roce(cma_dev->device, port) ? - &iboe_gid : &gid; - - ret = cma_validate_port(cma_dev->device, port, - rdma_protocol_ib(cma_dev->device, port) ? - IB_GID_TYPE_IB : - listen_id_priv->gid_type, gidp, - dev_addr->dev_type, - dev_addr->bound_dev_if); - if (!ret) { - id_priv->id.port_num = port; - goto out; + rdma_addr_gid_offset(dev_addr), sizeof(gid)); + + mutex_lock(&lock); + list_for_each_entry(cma_dev, &dev_list, list) { + rdma_for_each_port (cma_dev->device, port) { + gidp = rdma_protocol_roce(cma_dev->device, port) ? + &iboe_gid : &gid; + gid_type = cma_dev->default_gid_type[port - 1]; + sgid_attr = cma_validate_port(cma_dev->device, port, + gid_type, gidp, id_priv); + if (!IS_ERR(sgid_attr)) { + id_priv->id.port_num = port; + cma_bind_sgid_attr(id_priv, sgid_attr); + cma_attach_to_dev(id_priv, cma_dev); + ret = 0; + goto out; + } } } +out: + mutex_unlock(&lock); + return ret; +} + +/** + * cma_ib_acquire_dev - Acquire cma device, port and SGID attribute + * @id_priv: cm id to bind to cma device + * @listen_id_priv: listener cm id to match against + * @req: Pointer to req structure containaining incoming + * request information + * cma_ib_acquire_dev() acquires cma device, port and SGID attribute when + * rdma device matches for listen_id and incoming request. It also verifies + * that a GID table entry is present for the source address. + * Returns 0 on success, or returns error code otherwise. + */ +static int cma_ib_acquire_dev(struct rdma_id_private *id_priv, + const struct rdma_id_private *listen_id_priv, + struct cma_req_info *req) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + const struct ib_gid_attr *sgid_attr; + enum ib_gid_type gid_type; + union ib_gid gid; + + if (dev_addr->dev_type != ARPHRD_INFINIBAND && + id_priv->id.ps == RDMA_PS_IPOIB) + return -EINVAL; + + if (rdma_protocol_roce(req->device, req->port)) + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &gid); + else + memcpy(&gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof(gid)); + + gid_type = listen_id_priv->cma_dev->default_gid_type[req->port - 1]; + sgid_attr = cma_validate_port(req->device, req->port, + gid_type, &gid, id_priv); + if (IS_ERR(sgid_attr)) + return PTR_ERR(sgid_attr); + + id_priv->id.port_num = req->port; + cma_bind_sgid_attr(id_priv, sgid_attr); + /* Need to acquire lock to protect against reader + * of cma_dev->id_list such as cma_netdev_callback() and + * cma_process_remove(). + */ + mutex_lock(&lock); + cma_attach_to_dev(id_priv, listen_id_priv->cma_dev); + mutex_unlock(&lock); + rdma_restrack_add(&id_priv->res); + return 0; +} + +static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, + const struct rdma_id_private *listen_id_priv) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + const struct ib_gid_attr *sgid_attr; + struct cma_device *cma_dev; + enum ib_gid_type gid_type; + int ret = -ENODEV; + union ib_gid gid; + u32 port; + + if (dev_addr->dev_type != ARPHRD_INFINIBAND && + id_priv->id.ps == RDMA_PS_IPOIB) + return -EINVAL; + + memcpy(&gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof(gid)); + + mutex_lock(&lock); + + cma_dev = listen_id_priv->cma_dev; + port = listen_id_priv->id.port_num; + gid_type = listen_id_priv->gid_type; + sgid_attr = cma_validate_port(cma_dev->device, port, + gid_type, &gid, id_priv); + if (!IS_ERR(sgid_attr)) { + id_priv->id.port_num = port; + cma_bind_sgid_attr(id_priv, sgid_attr); + ret = 0; + goto out; + } list_for_each_entry(cma_dev, &dev_list, list) { - for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { - if (listen_id_priv && - listen_id_priv->cma_dev == cma_dev && + rdma_for_each_port (cma_dev->device, port) { + if (listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; - gidp = rdma_protocol_roce(cma_dev->device, port) ? - &iboe_gid : &gid; - - ret = cma_validate_port(cma_dev->device, port, - rdma_protocol_ib(cma_dev->device, port) ? - IB_GID_TYPE_IB : - cma_dev->default_gid_type[port - 1], - gidp, dev_addr->dev_type, - dev_addr->bound_dev_if); - if (!ret) { + gid_type = cma_dev->default_gid_type[port - 1]; + sgid_attr = cma_validate_port(cma_dev->device, port, + gid_type, &gid, id_priv); + if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; + cma_bind_sgid_attr(id_priv, sgid_attr); + ret = 0; goto out; } } } out: - if (!ret) + if (!ret) { cma_attach_to_dev(id_priv, cma_dev); + rdma_restrack_add(&id_priv->res); + } mutex_unlock(&lock); return ret; @@ -715,9 +931,10 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) struct cma_device *cma_dev, *cur_dev; struct sockaddr_ib *addr; union ib_gid gid, sgid, *dgid; + unsigned int p; u16 pkey, index; - u8 p; enum ib_port_state port_state; + int ret; int i; cma_dev = NULL; @@ -725,8 +942,9 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) dgid = (union ib_gid *) &addr->sib_addr; pkey = ntohs(addr->sib_pkey); + mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { - for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { + rdma_for_each_port (cur_dev->device, p) { if (!rdma_cap_af_ib(cur_dev->device, p)) continue; @@ -735,9 +953,14 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) if (ib_get_cached_port_state(cur_dev->device, p, &port_state)) continue; - for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i, - &gid, NULL); - i++) { + + for (i = 0; i < cur_dev->device->port_data[p].immutable.gid_tbl_len; + ++i) { + ret = rdma_query_gid(cur_dev->device, p, i, + &gid); + if (ret) + continue; + if (!memcmp(&gid, dgid, sizeof(gid))) { cma_dev = cur_dev; sgid = gid; @@ -751,32 +974,39 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; + goto found; } } } } - - if (!cma_dev) - return -ENODEV; + mutex_unlock(&lock); + return -ENODEV; found: cma_attach_to_dev(id_priv, cma_dev); - addr = (struct sockaddr_ib *) cma_src_addr(id_priv); - memcpy(&addr->sib_addr, &sgid, sizeof sgid); + rdma_restrack_add(&id_priv->res); + mutex_unlock(&lock); + addr = (struct sockaddr_ib *)cma_src_addr(id_priv); + memcpy(&addr->sib_addr, &sgid, sizeof(sgid)); cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr); return 0; } -static void cma_deref_id(struct rdma_id_private *id_priv) +static void cma_id_get(struct rdma_id_private *id_priv) +{ + refcount_inc(&id_priv->refcount); +} + +static void cma_id_put(struct rdma_id_private *id_priv) { - if (atomic_dec_and_test(&id_priv->refcount)) + if (refcount_dec_and_test(&id_priv->refcount)) complete(&id_priv->comp); } -struct rdma_cm_id *rdma_create_id(struct net *net, - rdma_cm_event_handler event_handler, - void *context, enum rdma_port_space ps, - enum ib_qp_type qp_type) +static struct rdma_id_private * +__rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, + void *context, enum rdma_ucm_port_space ps, + enum ib_qp_type qp_type, const struct rdma_id_private *parent) { struct rdma_id_private *id_priv; @@ -784,26 +1014,68 @@ struct rdma_cm_id *rdma_create_id(struct net *net, if (!id_priv) return ERR_PTR(-ENOMEM); - id_priv->owner = task_pid_nr(current); id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; id_priv->tos_set = false; + id_priv->timeout_set = false; + id_priv->min_rnr_timer_set = false; + id_priv->gid_type = IB_GID_TYPE_IB; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); - atomic_set(&id_priv->refcount, 1); + refcount_set(&id_priv->refcount, 1); mutex_init(&id_priv->handler_mutex); + INIT_LIST_HEAD(&id_priv->device_item); + INIT_LIST_HEAD(&id_priv->id_list_entry); INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); id_priv->id.route.addr.dev_addr.net = get_net(net); + id_priv->seq_num &= 0x00ffffff; + INIT_WORK(&id_priv->id.net_work, cma_netevent_work_handler); + + rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID); + if (parent) + rdma_restrack_parent_name(&id_priv->res, &parent->res); + + return id_priv; +} - return &id_priv->id; +struct rdma_cm_id * +__rdma_create_kernel_id(struct net *net, rdma_cm_event_handler event_handler, + void *context, enum rdma_ucm_port_space ps, + enum ib_qp_type qp_type, const char *caller) +{ + struct rdma_id_private *ret; + + ret = __rdma_create_id(net, event_handler, context, ps, qp_type, NULL); + if (IS_ERR(ret)) + return ERR_CAST(ret); + + rdma_restrack_set_name(&ret->res, caller); + return &ret->id; +} +EXPORT_SYMBOL(__rdma_create_kernel_id); + +struct rdma_cm_id *rdma_create_user_id(rdma_cm_event_handler event_handler, + void *context, + enum rdma_ucm_port_space ps, + enum ib_qp_type qp_type) +{ + struct rdma_id_private *ret; + + ret = __rdma_create_id(current->nsproxy->net_ns, event_handler, context, + ps, qp_type, NULL); + if (IS_ERR(ret)) + return ERR_CAST(ret); + + rdma_restrack_set_name(&ret->res, NULL); + return &ret->id; } -EXPORT_SYMBOL(rdma_create_id); +EXPORT_SYMBOL(rdma_create_user_id); static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { @@ -852,27 +1124,34 @@ int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (id->device != pd->device) - return -EINVAL; + if (id->device != pd->device) { + ret = -EINVAL; + goto out_err; + } qp_init_attr->port_num = id->port_num; qp = ib_create_qp(pd, qp_init_attr); - if (IS_ERR(qp)) - return PTR_ERR(qp); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto out_err; + } if (id->qp_type == IB_QPT_UD) ret = cma_init_ud_qp(id_priv, qp); else ret = cma_init_conn_qp(id_priv, qp); if (ret) - goto err; + goto out_destroy; id->qp = qp; id_priv->qp_num = qp->qp_num; id_priv->srq = (qp->srq != NULL); + trace_cm_qp_create(id_priv, pd, qp_init_attr, 0); return 0; -err: +out_destroy: ib_destroy_qp(qp); +out_err: + trace_cm_qp_create(id_priv, pd, qp_init_attr, ret); return ret; } EXPORT_SYMBOL(rdma_create_qp); @@ -882,6 +1161,7 @@ void rdma_destroy_qp(struct rdma_cm_id *id) struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); + trace_cm_qp_destroy(id_priv); mutex_lock(&id_priv->qp_mutex); ib_destroy_qp(id_priv->id.qp); id_priv->id.qp = NULL; @@ -894,7 +1174,6 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; - union ib_gid sgid; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { @@ -917,12 +1196,6 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, if (ret) goto out; - ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num, - rdma_ah_read_grh(&qp_attr.ah_attr)->sgid_index, - &sgid, NULL); - if (ret) - goto out; - BUG_ON(id_priv->cma_dev->device != id_priv->id.device); if (conn_param) @@ -997,7 +1270,7 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (id_priv->id.qp_type == IB_QPT_UD) { - ret = cma_set_qkey(id_priv, 0); + ret = cma_set_default_qkey(id_priv); if (ret) return ret; @@ -1035,65 +1308,88 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, qp_attr_mask); qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask |= IB_QP_PORT; - } else + } else { ret = -ENOSYS; + } + + if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set) + qp_attr->timeout = id_priv->timeout; + + if ((*qp_attr_mask & IB_QP_MIN_RNR_TIMER) && id_priv->min_rnr_timer_set) + qp_attr->min_rnr_timer = id_priv->min_rnr_timer; return ret; } EXPORT_SYMBOL(rdma_init_qp_attr); -static inline int cma_zero_addr(struct sockaddr *addr) +static inline bool cma_zero_addr(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: - return ipv6_addr_any(&((struct sockaddr_in6 *) addr)->sin6_addr); + return ipv6_addr_any(&((struct sockaddr_in6 *)addr)->sin6_addr); case AF_IB: - return ib_addr_any(&((struct sockaddr_ib *) addr)->sib_addr); + return ib_addr_any(&((struct sockaddr_ib *)addr)->sib_addr); default: - return 0; + return false; } } -static inline int cma_loopback_addr(struct sockaddr *addr) +static inline bool cma_loopback_addr(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: - return ipv4_is_loopback(((struct sockaddr_in *) addr)->sin_addr.s_addr); + return ipv4_is_loopback( + ((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: - return ipv6_addr_loopback(&((struct sockaddr_in6 *) addr)->sin6_addr); + return ipv6_addr_loopback( + &((struct sockaddr_in6 *)addr)->sin6_addr); case AF_IB: - return ib_addr_loopback(&((struct sockaddr_ib *) addr)->sib_addr); + return ib_addr_loopback( + &((struct sockaddr_ib *)addr)->sib_addr); default: - return 0; + return false; } } -static inline int cma_any_addr(struct sockaddr *addr) +static inline bool cma_any_addr(const struct sockaddr *addr) { return cma_zero_addr(addr) || cma_loopback_addr(addr); } -static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst) +static int cma_addr_cmp(const struct sockaddr *src, const struct sockaddr *dst) { if (src->sa_family != dst->sa_family) return -1; switch (src->sa_family) { case AF_INET: - return ((struct sockaddr_in *) src)->sin_addr.s_addr != - ((struct sockaddr_in *) dst)->sin_addr.s_addr; - case AF_INET6: - return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr, - &((struct sockaddr_in6 *) dst)->sin6_addr); + return ((struct sockaddr_in *)src)->sin_addr.s_addr != + ((struct sockaddr_in *)dst)->sin_addr.s_addr; + case AF_INET6: { + struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)src; + struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst; + bool link_local; + + if (ipv6_addr_cmp(&src_addr6->sin6_addr, + &dst_addr6->sin6_addr)) + return 1; + link_local = ipv6_addr_type(&dst_addr6->sin6_addr) & + IPV6_ADDR_LINKLOCAL; + /* Link local must match their scope_ids */ + return link_local ? (src_addr6->sin6_scope_id != + dst_addr6->sin6_scope_id) : + 0; + } + default: return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr, &((struct sockaddr_ib *) dst)->sib_addr); } } -static __be16 cma_port(struct sockaddr *addr) +static __be16 cma_port(const struct sockaddr *addr) { struct sockaddr_ib *sib; @@ -1111,15 +1407,15 @@ static __be16 cma_port(struct sockaddr *addr) } } -static inline int cma_any_port(struct sockaddr *addr) +static inline int cma_any_port(const struct sockaddr *addr) { return !cma_port(addr); } static void cma_save_ib_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, - struct rdma_cm_id *listen_id, - struct sa_path_rec *path) + const struct rdma_cm_id *listen_id, + const struct sa_path_rec *path) { struct sockaddr_ib *listen_ib, *ib; @@ -1204,7 +1500,7 @@ static u16 cma_port_from_service_id(__be64 service_id) static int cma_save_ip_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, - struct ib_cm_event *ib_event, + const struct ib_cm_event *ib_event, __be64 service_id) { struct cma_hdr *hdr; @@ -1234,8 +1530,8 @@ static int cma_save_ip_info(struct sockaddr *src_addr, static int cma_save_net_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, - struct rdma_cm_id *listen_id, - struct ib_cm_event *ib_event, + const struct rdma_cm_id *listen_id, + const struct ib_cm_event *ib_event, sa_family_t sa_family, __be64 service_id) { if (sa_family == AF_IB) { @@ -1308,7 +1604,7 @@ static bool validate_ipv4_net_dev(struct net_device *net_dev, return false; memset(&fl4, 0, sizeof(fl4)); - fl4.flowi4_iif = net_dev->ifindex; + fl4.flowi4_oif = net_dev->ifindex; fl4.daddr = daddr; fl4.saddr = saddr; @@ -1329,7 +1625,7 @@ static bool validate_ipv6_net_dev(struct net_device *net_dev, IPV6_ADDR_LINKLOCAL; struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr, &src_addr->sin6_addr, net_dev->ifindex, - strict); + NULL, strict); bool ret; if (!rt) @@ -1367,12 +1663,36 @@ static bool validate_net_dev(struct net_device *net_dev, } } -static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event, - const struct cma_req_info *req) +static struct net_device * +roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event) +{ + const struct ib_gid_attr *sgid_attr = NULL; + struct net_device *ndev; + + if (ib_event->event == IB_CM_REQ_RECEIVED) + sgid_attr = ib_event->param.req_rcvd.ppath_sgid_attr; + else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) + sgid_attr = ib_event->param.sidr_req_rcvd.sgid_attr; + + if (!sgid_attr) + return NULL; + + rcu_read_lock(); + ndev = rdma_read_gid_attr_ndev_rcu(sgid_attr); + if (IS_ERR(ndev)) + ndev = NULL; + else + dev_hold(ndev); + rcu_read_unlock(); + return ndev; +} + +static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event, + struct cma_req_info *req) { - struct sockaddr_storage listen_addr_storage, src_addr_storage; - struct sockaddr *listen_addr = (struct sockaddr *)&listen_addr_storage, - *src_addr = (struct sockaddr *)&src_addr_storage; + struct sockaddr *listen_addr = + (struct sockaddr *)&req->listen_addr_storage; + struct sockaddr *src_addr = (struct sockaddr *)&req->src_addr_storage; struct net_device *net_dev; const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL; int err; @@ -1382,20 +1702,19 @@ static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event, if (err) return ERR_PTR(err); - net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey, - gid, listen_addr); + if (rdma_protocol_roce(req->device, req->port)) + net_dev = roce_get_net_dev_by_cm_event(ib_event); + else + net_dev = ib_get_net_dev_by_params(req->device, req->port, + req->pkey, + gid, listen_addr); if (!net_dev) return ERR_PTR(-ENODEV); - if (!validate_net_dev(net_dev, listen_addr, src_addr)) { - dev_put(net_dev); - return ERR_PTR(-EHOSTUNREACH); - } - return net_dev; } -static enum rdma_port_space rdma_ps_from_service_id(__be64 service_id) +static enum rdma_ucm_port_space rdma_ps_from_service_id(__be64 service_id) { return (be64_to_cpu(service_id) >> 16) & 0xffff; } @@ -1436,38 +1755,52 @@ static bool cma_match_private_data(struct rdma_id_private *id_priv, return true; } -static bool cma_protocol_roce_dev_port(struct ib_device *device, int port_num) +static bool cma_protocol_roce(const struct rdma_cm_id *id) { - enum rdma_link_layer ll = rdma_port_get_link_layer(device, port_num); - enum rdma_transport_type transport = - rdma_node_get_transport(device->node_type); + struct ib_device *device = id->device; + const u32 port_num = id->port_num ?: rdma_start_port(device); - return ll == IB_LINK_LAYER_ETHERNET && transport == RDMA_TRANSPORT_IB; + return rdma_protocol_roce(device, port_num); } -static bool cma_protocol_roce(const struct rdma_cm_id *id) +static bool cma_is_req_ipv6_ll(const struct cma_req_info *req) { - struct ib_device *device = id->device; - const int port_num = id->port_num ?: rdma_start_port(device); + const struct sockaddr *daddr = + (const struct sockaddr *)&req->listen_addr_storage; + const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; - return cma_protocol_roce_dev_port(device, port_num); + /* Returns true if the req is for IPv6 link local */ + return (daddr->sa_family == AF_INET6 && + (ipv6_addr_type(&daddr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)); } static bool cma_match_net_dev(const struct rdma_cm_id *id, const struct net_device *net_dev, - u8 port_num) + const struct cma_req_info *req) { const struct rdma_addr *addr = &id->route.addr; if (!net_dev) - /* This request is an AF_IB request or a RoCE request */ - return (!id->port_num || id->port_num == port_num) && - (addr->src_addr.ss_family == AF_IB || - cma_protocol_roce_dev_port(id->device, port_num)); + /* This request is an AF_IB request */ + return (!id->port_num || id->port_num == req->port) && + (addr->src_addr.ss_family == AF_IB); - return !addr->dev_addr.bound_dev_if || - (net_eq(dev_net(net_dev), addr->dev_addr.net) && - addr->dev_addr.bound_dev_if == net_dev->ifindex); + /* + * If the request is not for IPv6 link local, allow matching + * request to any netdevice of the one or multiport rdma device. + */ + if (!cma_is_req_ipv6_ll(req)) + return true; + /* + * Net namespaces must match, and if the listner is listening + * on a specific netdevice than netdevice must match as well. + */ + if (net_eq(dev_net(net_dev), addr->dev_addr.net) && + (!!addr->dev_addr.bound_dev_if == + (addr->dev_addr.bound_dev_if == net_dev->ifindex))) + return true; + else + return false; } static struct rdma_id_private *cma_find_listener( @@ -1479,19 +1812,22 @@ static struct rdma_id_private *cma_find_listener( { struct rdma_id_private *id_priv, *id_priv_dev; + lockdep_assert_held(&lock); + if (!bind_list) return ERR_PTR(-EINVAL); hlist_for_each_entry(id_priv, &bind_list->owners, node) { if (cma_match_private_data(id_priv, ib_event->private_data)) { if (id_priv->id.device == cm_id->device && - cma_match_net_dev(&id_priv->id, net_dev, req->port)) + cma_match_net_dev(&id_priv->id, net_dev, req)) return id_priv; list_for_each_entry(id_priv_dev, &id_priv->listen_list, - listen_list) { + listen_item) { if (id_priv_dev->id.device == cm_id->device && - cma_match_net_dev(&id_priv_dev->id, net_dev, req->port)) + cma_match_net_dev(&id_priv_dev->id, + net_dev, req)) return id_priv_dev; } } @@ -1500,46 +1836,81 @@ static struct rdma_id_private *cma_find_listener( return ERR_PTR(-EINVAL); } -static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id, - struct ib_cm_event *ib_event, - struct net_device **net_dev) +static struct rdma_id_private * +cma_ib_id_from_event(struct ib_cm_id *cm_id, + const struct ib_cm_event *ib_event, + struct cma_req_info *req, + struct net_device **net_dev) { - struct cma_req_info req; struct rdma_bind_list *bind_list; struct rdma_id_private *id_priv; int err; - err = cma_save_req_info(ib_event, &req); + err = cma_save_req_info(ib_event, req); if (err) return ERR_PTR(err); - *net_dev = cma_get_net_dev(ib_event, &req); + *net_dev = cma_get_net_dev(ib_event, req); if (IS_ERR(*net_dev)) { if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { /* Assuming the protocol is AF_IB */ *net_dev = NULL; - } else if (cma_protocol_roce_dev_port(req.device, req.port)) { - /* TODO find the net dev matching the request parameters - * through the RoCE GID table */ - *net_dev = NULL; } else { return ERR_CAST(*net_dev); } } + mutex_lock(&lock); + /* + * Net namespace might be getting deleted while route lookup, + * cm_id lookup is in progress. Therefore, perform netdevice + * validation, cm_id lookup under rcu lock. + * RCU lock along with netdevice state check, synchronizes with + * netdevice migrating to different net namespace and also avoids + * case where net namespace doesn't get deleted while lookup is in + * progress. + * If the device state is not IFF_UP, its properties such as ifindex + * and nd_net cannot be trusted to remain valid without rcu lock. + * net/core/dev.c change_net_namespace() ensures to synchronize with + * ongoing operations on net device after device is closed using + * synchronize_net(). + */ + rcu_read_lock(); + if (*net_dev) { + /* + * If netdevice is down, it is likely that it is administratively + * down or it might be migrating to different namespace. + * In that case avoid further processing, as the net namespace + * or ifindex may change. + */ + if (((*net_dev)->flags & IFF_UP) == 0) { + id_priv = ERR_PTR(-EHOSTUNREACH); + goto err; + } + + if (!validate_net_dev(*net_dev, + (struct sockaddr *)&req->src_addr_storage, + (struct sockaddr *)&req->listen_addr_storage)) { + id_priv = ERR_PTR(-EHOSTUNREACH); + goto err; + } + } + bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net, - rdma_ps_from_service_id(req.service_id), - cma_port_from_service_id(req.service_id)); - id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev); + rdma_ps_from_service_id(req->service_id), + cma_port_from_service_id(req->service_id)); + id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev); +err: + rcu_read_unlock(); + mutex_unlock(&lock); if (IS_ERR(id_priv) && *net_dev) { dev_put(*net_dev); *net_dev = NULL; } - return id_priv; } -static inline int cma_user_data_offset(struct rdma_id_private *id_priv) +static inline u8 cma_user_data_offset(struct rdma_id_private *id_priv) { return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr); } @@ -1552,28 +1923,36 @@ static void cma_cancel_route(struct rdma_id_private *id_priv) } } -static void cma_cancel_listens(struct rdma_id_private *id_priv) +static void _cma_cancel_listens(struct rdma_id_private *id_priv) { struct rdma_id_private *dev_id_priv; + lockdep_assert_held(&lock); + /* * Remove from listen_any_list to prevent added devices from spawning * additional listen requests. */ - mutex_lock(&lock); - list_del(&id_priv->list); + list_del_init(&id_priv->listen_any_item); while (!list_empty(&id_priv->listen_list)) { - dev_id_priv = list_entry(id_priv->listen_list.next, - struct rdma_id_private, listen_list); + dev_id_priv = + list_first_entry(&id_priv->listen_list, + struct rdma_id_private, listen_item); /* sync with device removal to avoid duplicate destruction */ - list_del_init(&dev_id_priv->list); - list_del(&dev_id_priv->listen_list); + list_del_init(&dev_id_priv->device_item); + list_del_init(&dev_id_priv->listen_item); mutex_unlock(&lock); rdma_destroy_id(&dev_id_priv->id); mutex_lock(&lock); } +} + +static void cma_cancel_listens(struct rdma_id_private *id_priv) +{ + mutex_lock(&lock); + _cma_cancel_listens(id_priv); mutex_unlock(&lock); } @@ -1582,6 +1961,14 @@ static void cma_cancel_operation(struct rdma_id_private *id_priv, { switch (state) { case RDMA_CM_ADDR_QUERY: + /* + * We can avoid doing the rdma_addr_cancel() based on state, + * only RDMA_CM_ADDR_QUERY has a work that could still execute. + * Notice that the addr_handler work could still be exiting + * outside this state, however due to the interaction with the + * handler_mutex the work is guaranteed not to touch id_priv + * during exit. + */ rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; case RDMA_CM_ROUTE_QUERY: @@ -1613,55 +2000,60 @@ static void cma_release_port(struct rdma_id_private *id_priv) mutex_unlock(&lock); } +static void destroy_mc(struct rdma_id_private *id_priv, + struct cma_multicast *mc) +{ + bool send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); + + if (rdma_cap_ib_mcast(id_priv->id.device, id_priv->id.port_num)) + ib_sa_free_multicast(mc->sa_mc); + + if (rdma_protocol_roce(id_priv->id.device, id_priv->id.port_num)) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(dev_addr->net, + dev_addr->bound_dev_if); + if (ndev && !send_only) { + enum ib_gid_type gid_type; + union ib_gid mgid; + + gid_type = id_priv->cma_dev->default_gid_type + [id_priv->id.port_num - + rdma_start_port( + id_priv->cma_dev->device)]; + cma_iboe_set_mgid((struct sockaddr *)&mc->addr, &mgid, + gid_type); + cma_igmp_send(ndev, &mgid, false); + } + dev_put(ndev); + + cancel_work_sync(&mc->iboe_join.work); + } + kfree(mc); +} + static void cma_leave_mc_groups(struct rdma_id_private *id_priv) { struct cma_multicast *mc; while (!list_empty(&id_priv->mc_list)) { - mc = container_of(id_priv->mc_list.next, - struct cma_multicast, list); + mc = list_first_entry(&id_priv->mc_list, struct cma_multicast, + list); list_del(&mc->list); - if (rdma_cap_ib_mcast(id_priv->cma_dev->device, - id_priv->id.port_num)) { - ib_sa_free_multicast(mc->multicast.ib); - kfree(mc); - } else { - if (mc->igmp_joined) { - struct rdma_dev_addr *dev_addr = - &id_priv->id.route.addr.dev_addr; - struct net_device *ndev = NULL; - - if (dev_addr->bound_dev_if) - ndev = dev_get_by_index(&init_net, - dev_addr->bound_dev_if); - if (ndev) { - cma_igmp_send(ndev, - &mc->multicast.ib->rec.mgid, - false); - dev_put(ndev); - } - } - kref_put(&mc->mcref, release_mc); - } + destroy_mc(id_priv, mc); } } -void rdma_destroy_id(struct rdma_cm_id *id) +static void _destroy_id(struct rdma_id_private *id_priv, + enum rdma_cm_state state) { - struct rdma_id_private *id_priv; - enum rdma_cm_state state; - - id_priv = container_of(id, struct rdma_id_private, id); - state = cma_exch(id_priv, RDMA_CM_DESTROYING); cma_cancel_operation(id_priv, state); - /* - * Wait for any active callback to finish. New callbacks will find - * the id_priv state set to destroying and abort. - */ - mutex_lock(&id_priv->handler_mutex); - mutex_unlock(&id_priv->handler_mutex); - + rdma_restrack_del(&id_priv->res); + cma_remove_id_from_tree(id_priv); if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) @@ -1675,16 +2067,56 @@ void rdma_destroy_id(struct rdma_cm_id *id) } cma_release_port(id_priv); - cma_deref_id(id_priv); + cma_id_put(id_priv); wait_for_completion(&id_priv->comp); if (id_priv->internal_id) - cma_deref_id(id_priv->id.context); + cma_id_put(id_priv->id.context); kfree(id_priv->id.route.path_rec); + kfree(id_priv->id.route.path_rec_inbound); + kfree(id_priv->id.route.path_rec_outbound); + kfree(id_priv->id.route.service_recs); + put_net(id_priv->id.route.addr.dev_addr.net); kfree(id_priv); } + +/* + * destroy an ID from within the handler_mutex. This ensures that no other + * handlers can start running concurrently. + */ +static void destroy_id_handler_unlock(struct rdma_id_private *id_priv) + __releases(&idprv->handler_mutex) +{ + enum rdma_cm_state state; + unsigned long flags; + + trace_cm_id_destroy(id_priv); + + /* + * Setting the state to destroyed under the handler mutex provides a + * fence against calling handler callbacks. If this is invoked due to + * the failure of a handler callback then it guarentees that no future + * handlers will be called. + */ + lockdep_assert_held(&id_priv->handler_mutex); + spin_lock_irqsave(&id_priv->lock, flags); + state = id_priv->state; + id_priv->state = RDMA_CM_DESTROYING; + spin_unlock_irqrestore(&id_priv->lock, flags); + mutex_unlock(&id_priv->handler_mutex); + _destroy_id(id_priv, state); +} + +void rdma_destroy_id(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + mutex_lock(&id_priv->handler_mutex); + destroy_id_handler_unlock(id_priv); +} EXPORT_SYMBOL(rdma_destroy_id); static int cma_rep_recv(struct rdma_id_private *id_priv) @@ -1699,6 +2131,7 @@ static int cma_rep_recv(struct rdma_id_private *id_priv) if (ret) goto reject; + trace_cm_send_rtu(id_priv); ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); if (ret) goto reject; @@ -1707,13 +2140,14 @@ static int cma_rep_recv(struct rdma_id_private *id_priv) reject: pr_debug_ratelimited("RDMA CM: CONNECT_ERROR: failed to handle reply. status %d\n", ret); cma_modify_qp_err(id_priv); + trace_cm_send_rej(id_priv); ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); return ret; } static void cma_set_rep_event_data(struct rdma_cm_event *event, - struct ib_cm_rep_event_param *rep_data, + const struct ib_cm_rep_event_param *rep_data, void *private_data) { event->param.conn.private_data = private_data; @@ -1724,22 +2158,40 @@ static void cma_set_rep_event_data(struct rdma_cm_event *event, event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; event->param.conn.srq = rep_data->srq; event->param.conn.qp_num = rep_data->remote_qpn; + + event->ece.vendor_id = rep_data->ece.vendor_id; + event->ece.attr_mod = rep_data->ece.attr_mod; +} + +static int cma_cm_event_handler(struct rdma_id_private *id_priv, + struct rdma_cm_event *event) +{ + int ret; + + lockdep_assert_held(&id_priv->handler_mutex); + + trace_cm_event_handler(id_priv, event); + ret = id_priv->id.event_handler(&id_priv->id, event); + trace_cm_event_done(id_priv, event, ret); + return ret; } -static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) +static int cma_ib_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; - struct rdma_cm_event event; - int ret = 0; + struct rdma_cm_event event = {}; + enum rdma_cm_state state; + int ret; mutex_lock(&id_priv->handler_mutex); + state = READ_ONCE(id_priv->state); if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && - id_priv->state != RDMA_CM_CONNECT) || + state != RDMA_CM_CONNECT) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && - id_priv->state != RDMA_CM_DISCONNECT)) + state != RDMA_CM_DISCONNECT)) goto out; - memset(&event, 0, sizeof event); switch (ib_event->event) { case IB_CM_REQ_ERROR: case IB_CM_REP_ERROR: @@ -1747,9 +2199,11 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: - if (cma_comp(id_priv, RDMA_CM_CONNECT) && - (id_priv->id.qp_type != IB_QPT_UD)) - ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + if (state == RDMA_CM_CONNECT && + (id_priv->id.qp_type != IB_QPT_UD)) { + trace_cm_prepare_mra(id_priv); + ib_prepare_cm_mra(cm_id); + } if (id_priv->id.qp) { event.status = cma_rep_recv(id_priv); event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR : @@ -1765,7 +2219,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) event.event = RDMA_CM_EVENT_ESTABLISHED; break; case IB_CM_DREQ_ERROR: - event.status = -ETIMEDOUT; /* fall through */ + event.status = -ETIMEDOUT; + fallthrough; case IB_CM_DREQ_RECEIVED: case IB_CM_DREP_RECEIVED: if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, @@ -1794,24 +2249,24 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) goto out; } - ret = id_priv->id.event_handler(&id_priv->id, &event); + ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; - cma_exch(id_priv, RDMA_CM_DESTROYING); - mutex_unlock(&id_priv->handler_mutex); - rdma_destroy_id(&id_priv->id); + destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); - return ret; + return 0; } -static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, - struct ib_cm_event *ib_event, - struct net_device *net_dev) +static struct rdma_id_private * +cma_ib_new_conn_id(const struct rdma_cm_id *listen_id, + const struct ib_cm_event *ib_event, + struct net_device *net_dev) { + struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; struct rdma_route *rt; @@ -1821,33 +2276,34 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, ib_event->param.req_rcvd.primary_path->service_id; int ret; - id = rdma_create_id(listen_id->route.addr.dev_addr.net, - listen_id->event_handler, listen_id->context, - listen_id->ps, ib_event->param.req_rcvd.qp_type); - if (IS_ERR(id)) + listen_id_priv = container_of(listen_id, struct rdma_id_private, id); + id_priv = __rdma_create_id(listen_id->route.addr.dev_addr.net, + listen_id->event_handler, listen_id->context, + listen_id->ps, + ib_event->param.req_rcvd.qp_type, + listen_id_priv); + if (IS_ERR(id_priv)) return NULL; - id_priv = container_of(id, struct rdma_id_private, id); + id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, service_id)) goto err; rt = &id->route; - rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; - rt->path_rec = kmalloc(sizeof *rt->path_rec * rt->num_paths, - GFP_KERNEL); + rt->num_pri_alt_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; + rt->path_rec = kmalloc_array(rt->num_pri_alt_paths, + sizeof(*rt->path_rec), GFP_KERNEL); if (!rt->path_rec) goto err; rt->path_rec[0] = *path; - if (rt->num_paths == 2) + if (rt->num_pri_alt_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (net_dev) { - ret = rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL); - if (ret) - goto err; + rdma_copy_src_l2_addr(&rt->addr.dev_addr, net_dev); } else { if (!cma_protocol_roce(listen_id) && cma_any_addr(cma_src_addr(id_priv))) { @@ -1870,22 +2326,26 @@ err: return NULL; } -static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, - struct ib_cm_event *ib_event, - struct net_device *net_dev) +static struct rdma_id_private * +cma_ib_new_udp_id(const struct rdma_cm_id *listen_id, + const struct ib_cm_event *ib_event, + struct net_device *net_dev) { + const struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; struct net *net = listen_id->route.addr.dev_addr.net; int ret; - id = rdma_create_id(net, listen_id->event_handler, listen_id->context, - listen_id->ps, IB_QPT_UD); - if (IS_ERR(id)) + listen_id_priv = container_of(listen_id, struct rdma_id_private, id); + id_priv = __rdma_create_id(net, listen_id->event_handler, + listen_id->context, listen_id->ps, IB_QPT_UD, + listen_id_priv); + if (IS_ERR(id_priv)) return NULL; - id_priv = container_of(id, struct rdma_id_private, id); + id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, @@ -1893,9 +2353,7 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, goto err; if (net_dev) { - ret = rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL); - if (ret) - goto err; + rdma_copy_src_l2_addr(&id->route.addr.dev_addr, net_dev); } else { if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), @@ -1913,7 +2371,7 @@ err: } static void cma_set_req_event_data(struct rdma_cm_event *event, - struct ib_cm_req_event_param *req_data, + const struct ib_cm_req_event_param *req_data, void *private_data, int offset) { event->param.conn.private_data = private_data + offset; @@ -1925,9 +2383,13 @@ static void cma_set_req_event_data(struct rdma_cm_event *event, event->param.conn.rnr_retry_count = req_data->rnr_retry_count; event->param.conn.srq = req_data->srq; event->param.conn.qp_num = req_data->remote_qpn; + + event->ece.vendor_id = req_data->ece.vendor_id; + event->ece.attr_mod = req_data->ece.attr_mod; } -static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_event) +static int cma_ib_check_req_qp_type(const struct rdma_cm_id *id, + const struct ib_cm_event *ib_event) { return (((ib_event->event == IB_CM_REQ_RECEIVED) && (ib_event->param.req_rcvd.qp_type == id->qp_type)) || @@ -1936,94 +2398,81 @@ static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_e (!id->qp_type)); } -static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) +static int cma_ib_req_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *ib_event) { struct rdma_id_private *listen_id, *conn_id = NULL; - struct rdma_cm_event event; + struct rdma_cm_event event = {}; + struct cma_req_info req = {}; struct net_device *net_dev; - int offset, ret; + u8 offset; + int ret; - listen_id = cma_id_from_event(cm_id, ib_event, &net_dev); + listen_id = cma_ib_id_from_event(cm_id, ib_event, &req, &net_dev); if (IS_ERR(listen_id)) return PTR_ERR(listen_id); - if (!cma_check_req_qp_type(&listen_id->id, ib_event)) { + trace_cm_req_handler(listen_id, ib_event->event); + if (!cma_ib_check_req_qp_type(&listen_id->id, ib_event)) { ret = -EINVAL; goto net_dev_put; } mutex_lock(&listen_id->handler_mutex); - if (listen_id->state != RDMA_CM_LISTEN) { + if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) { ret = -ECONNABORTED; - goto err1; + goto err_unlock; } - memset(&event, 0, sizeof event); offset = cma_user_data_offset(listen_id); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { - conn_id = cma_new_udp_id(&listen_id->id, ib_event, net_dev); + conn_id = cma_ib_new_udp_id(&listen_id->id, ib_event, net_dev); event.param.ud.private_data = ib_event->private_data + offset; event.param.ud.private_data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; } else { - conn_id = cma_new_conn_id(&listen_id->id, ib_event, net_dev); + conn_id = cma_ib_new_conn_id(&listen_id->id, ib_event, net_dev); cma_set_req_event_data(&event, &ib_event->param.req_rcvd, ib_event->private_data, offset); } if (!conn_id) { ret = -ENOMEM; - goto err1; + goto err_unlock; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); - ret = cma_acquire_dev(conn_id, listen_id); - if (ret) - goto err2; + ret = cma_ib_acquire_dev(conn_id, listen_id, &req); + if (ret) { + destroy_id_handler_unlock(conn_id); + goto err_unlock; + } conn_id->cm_id.ib = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_ib_handler; - /* - * Protect against the user destroying conn_id from another thread - * until we're done accessing it. - */ - atomic_inc(&conn_id->refcount); - ret = conn_id->id.event_handler(&conn_id->id, &event); - if (ret) - goto err3; - /* - * Acquire mutex to prevent user executing rdma_destroy_id() - * while we're accessing the cm_id. - */ - mutex_lock(&lock); - if (cma_comp(conn_id, RDMA_CM_CONNECT) && - (conn_id->id.qp_type != IB_QPT_UD)) - ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); - mutex_unlock(&lock); - mutex_unlock(&conn_id->handler_mutex); - mutex_unlock(&listen_id->handler_mutex); - cma_deref_id(conn_id); - if (net_dev) - dev_put(net_dev); - return 0; + ret = cma_cm_event_handler(conn_id, &event); + if (ret) { + /* Destroy the CM ID by returning a non-zero value. */ + conn_id->cm_id.ib = NULL; + mutex_unlock(&listen_id->handler_mutex); + destroy_id_handler_unlock(conn_id); + goto net_dev_put; + } -err3: - cma_deref_id(conn_id); - /* Destroy the CM ID by returning a non-zero value. */ - conn_id->cm_id.ib = NULL; -err2: - cma_exch(conn_id, RDMA_CM_DESTROYING); + if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT && + conn_id->id.qp_type != IB_QPT_UD) { + trace_cm_prepare_mra(cm_id->context); + ib_prepare_cm_mra(cm_id); + } mutex_unlock(&conn_id->handler_mutex); -err1: + +err_unlock: mutex_unlock(&listen_id->handler_mutex); - if (conn_id) - rdma_destroy_id(&conn_id->id); net_dev_put: - if (net_dev) - dev_put(net_dev); + dev_put(net_dev); return ret; } @@ -2037,19 +2486,45 @@ __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) } EXPORT_SYMBOL(rdma_get_service_id); +void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid, + union ib_gid *dgid) +{ + struct rdma_addr *addr = &cm_id->route.addr; + + if (!cm_id->device) { + if (sgid) + memset(sgid, 0, sizeof(*sgid)); + if (dgid) + memset(dgid, 0, sizeof(*dgid)); + return; + } + + if (rdma_protocol_roce(cm_id->device, cm_id->port_num)) { + if (sgid) + rdma_ip2gid((struct sockaddr *)&addr->src_addr, sgid); + if (dgid) + rdma_ip2gid((struct sockaddr *)&addr->dst_addr, dgid); + } else { + if (sgid) + rdma_addr_get_sgid(&addr->dev_addr, sgid); + if (dgid) + rdma_addr_get_dgid(&addr->dev_addr, dgid); + } +} +EXPORT_SYMBOL(rdma_read_gids); + static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; - struct rdma_cm_event event; + struct rdma_cm_event event = {}; int ret = 0; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; mutex_lock(&id_priv->handler_mutex); - if (id_priv->state != RDMA_CM_CONNECT) + if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; - memset(&event, 0, sizeof event); switch (iw_event->event) { case IW_CM_EVENT_CLOSE: event.event = RDMA_CM_EVENT_DISCONNECTED; @@ -2083,19 +2558,17 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) event.param.conn.responder_resources = iw_event->ord; break; default: - BUG_ON(1); + goto out; } event.status = iw_event->status; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; - ret = id_priv->id.event_handler(&id_priv->id, &event); + ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.iw = NULL; - cma_exch(id_priv, RDMA_CM_DESTROYING); - mutex_unlock(&id_priv->handler_mutex); - rdma_destroy_id(&id_priv->id); + destroy_id_handler_unlock(id_priv); return ret; } @@ -2107,44 +2580,48 @@ out: static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { - struct rdma_cm_id *new_cm_id; struct rdma_id_private *listen_id, *conn_id; - struct rdma_cm_event event; + struct rdma_cm_event event = {}; int ret = -ECONNABORTED; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; + event.event = RDMA_CM_EVENT_CONNECT_REQUEST; + event.param.conn.private_data = iw_event->private_data; + event.param.conn.private_data_len = iw_event->private_data_len; + event.param.conn.initiator_depth = iw_event->ird; + event.param.conn.responder_resources = iw_event->ord; + listen_id = cm_id->context; mutex_lock(&listen_id->handler_mutex); - if (listen_id->state != RDMA_CM_LISTEN) + if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) goto out; /* Create a new RDMA id for the new IW CM ID */ - new_cm_id = rdma_create_id(listen_id->id.route.addr.dev_addr.net, + conn_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, listen_id->id.event_handler, - listen_id->id.context, - RDMA_PS_TCP, IB_QPT_RC); - if (IS_ERR(new_cm_id)) { + listen_id->id.context, RDMA_PS_TCP, + IB_QPT_RC, listen_id); + if (IS_ERR(conn_id)) { ret = -ENOMEM; goto out; } - conn_id = container_of(new_cm_id, struct rdma_id_private, id); mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; - ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr, NULL); + ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr); if (ret) { - mutex_unlock(&conn_id->handler_mutex); - rdma_destroy_id(new_cm_id); - goto out; + mutex_unlock(&listen_id->handler_mutex); + destroy_id_handler_unlock(conn_id); + return ret; } - ret = cma_acquire_dev(conn_id, listen_id); + ret = cma_iw_acquire_dev(conn_id, listen_id); if (ret) { - mutex_unlock(&conn_id->handler_mutex); - rdma_destroy_id(new_cm_id); - goto out; + mutex_unlock(&listen_id->handler_mutex); + destroy_id_handler_unlock(conn_id); + return ret; } conn_id->cm_id.iw = cm_id; @@ -2154,31 +2631,16 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); - memset(&event, 0, sizeof event); - event.event = RDMA_CM_EVENT_CONNECT_REQUEST; - event.param.conn.private_data = iw_event->private_data; - event.param.conn.private_data_len = iw_event->private_data_len; - event.param.conn.initiator_depth = iw_event->ird; - event.param.conn.responder_resources = iw_event->ord; - - /* - * Protect against the user destroying conn_id from another thread - * until we're done accessing it. - */ - atomic_inc(&conn_id->refcount); - ret = conn_id->id.event_handler(&conn_id->id, &event); + ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; - cma_exch(conn_id, RDMA_CM_DESTROYING); - mutex_unlock(&conn_id->handler_mutex); - cma_deref_id(conn_id); - rdma_destroy_id(&conn_id->id); - goto out; + mutex_unlock(&listen_id->handler_mutex); + destroy_id_handler_unlock(conn_id); + return ret; } mutex_unlock(&conn_id->handler_mutex); - cma_deref_id(conn_id); out: mutex_unlock(&listen_id->handler_mutex); @@ -2193,7 +2655,8 @@ static int cma_ib_listen(struct rdma_id_private *id_priv) addr = cma_src_addr(id_priv); svc_id = rdma_get_service_id(&id_priv->id, addr); - id = ib_cm_insert_listen(id_priv->id.device, cma_req_handler, svc_id); + id = ib_cm_insert_listen(id_priv->id.device, + cma_ib_req_handler, svc_id); if (IS_ERR(id)) return PTR_ERR(id); id_priv->cm_id.ib = id; @@ -2212,7 +2675,11 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) if (IS_ERR(id)) return PTR_ERR(id); + mutex_lock(&id_priv->qp_mutex); id->tos = id_priv->tos; + id->tos_set = id_priv->tos_set; + mutex_unlock(&id_priv->qp_mutex); + id->afonly = id_priv->afonly; id_priv->cm_id.iw = id; memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), @@ -2233,54 +2700,88 @@ static int cma_listen_handler(struct rdma_cm_id *id, { struct rdma_id_private *id_priv = id->context; + /* Listening IDs are always destroyed on removal */ + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + return -1; + id->context = id_priv->id.context; id->event_handler = id_priv->id.event_handler; + trace_cm_event_handler(id_priv, event); return id_priv->id.event_handler(id, event); } -static void cma_listen_on_dev(struct rdma_id_private *id_priv, - struct cma_device *cma_dev) +static int cma_listen_on_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev, + struct rdma_id_private **to_destroy) { struct rdma_id_private *dev_id_priv; - struct rdma_cm_id *id; struct net *net = id_priv->id.route.addr.dev_addr.net; int ret; - if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) - return; + lockdep_assert_held(&lock); - id = rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, - id_priv->id.qp_type); - if (IS_ERR(id)) - return; + *to_destroy = NULL; + if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) + return 0; - dev_id_priv = container_of(id, struct rdma_id_private, id); + dev_id_priv = + __rdma_create_id(net, cma_listen_handler, id_priv, + id_priv->id.ps, id_priv->id.qp_type, id_priv); + if (IS_ERR(dev_id_priv)) + return PTR_ERR(dev_id_priv); dev_id_priv->state = RDMA_CM_ADDR_BOUND; memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); _cma_attach_to_dev(dev_id_priv, cma_dev); - list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); - atomic_inc(&id_priv->refcount); + rdma_restrack_add(&dev_id_priv->res); + cma_id_get(id_priv); dev_id_priv->internal_id = 1; dev_id_priv->afonly = id_priv->afonly; + mutex_lock(&id_priv->qp_mutex); + dev_id_priv->tos_set = id_priv->tos_set; + dev_id_priv->tos = id_priv->tos; + mutex_unlock(&id_priv->qp_mutex); - ret = rdma_listen(id, id_priv->backlog); + ret = rdma_listen(&dev_id_priv->id, id_priv->backlog); if (ret) - pr_warn("RDMA CMA: cma_listen_on_dev, error %d, listening on device %s\n", - ret, cma_dev->device->name); + goto err_listen; + list_add_tail(&dev_id_priv->listen_item, &id_priv->listen_list); + return 0; +err_listen: + /* Caller must destroy this after releasing lock */ + *to_destroy = dev_id_priv; + dev_warn(&cma_dev->device->dev, "RDMA CMA: %s, error %d\n", __func__, ret); + return ret; } -static void cma_listen_on_all(struct rdma_id_private *id_priv) +static int cma_listen_on_all(struct rdma_id_private *id_priv) { + struct rdma_id_private *to_destroy; struct cma_device *cma_dev; + int ret; mutex_lock(&lock); - list_add_tail(&id_priv->list, &listen_any_list); - list_for_each_entry(cma_dev, &dev_list, list) - cma_listen_on_dev(id_priv, cma_dev); + list_add_tail(&id_priv->listen_any_item, &listen_any_list); + list_for_each_entry(cma_dev, &dev_list, list) { + ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); + if (ret) { + /* Prevent racing with cma_process_remove() */ + if (to_destroy) + list_del_init(&to_destroy->device_item); + goto err_listen; + } + } mutex_unlock(&lock); + return 0; + +err_listen: + _cma_cancel_listens(id_priv); + mutex_unlock(&lock); + if (to_destroy) + rdma_destroy_id(&to_destroy->id); + return ret; } void rdma_set_service_type(struct rdma_cm_id *id, int tos) @@ -2288,36 +2789,159 @@ void rdma_set_service_type(struct rdma_cm_id *id, int tos) struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); + mutex_lock(&id_priv->qp_mutex); id_priv->tos = (u8) tos; id_priv->tos_set = true; + mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_set_service_type); +/** + * rdma_set_ack_timeout() - Set the ack timeout of QP associated + * with a connection identifier. + * @id: Communication identifier to associated with service type. + * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec. + * + * This function should be called before rdma_connect() on active side, + * and on passive side before rdma_accept(). It is applicable to primary + * path only. The timeout will affect the local side of the QP, it is not + * negotiated with remote side and zero disables the timer. In case it is + * set before rdma_resolve_route, the value will also be used to determine + * PacketLifeTime for RoCE. + * + * Return: 0 for success + */ +int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout) +{ + struct rdma_id_private *id_priv; + + if (id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_INI) + return -EINVAL; + + id_priv = container_of(id, struct rdma_id_private, id); + mutex_lock(&id_priv->qp_mutex); + id_priv->timeout = timeout; + id_priv->timeout_set = true; + mutex_unlock(&id_priv->qp_mutex); + + return 0; +} +EXPORT_SYMBOL(rdma_set_ack_timeout); + +/** + * rdma_set_min_rnr_timer() - Set the minimum RNR Retry timer of the + * QP associated with a connection identifier. + * @id: Communication identifier to associated with service type. + * @min_rnr_timer: 5-bit value encoded as Table 45: "Encoding for RNR NAK + * Timer Field" in the IBTA specification. + * + * This function should be called before rdma_connect() on active + * side, and on passive side before rdma_accept(). The timer value + * will be associated with the local QP. When it receives a send it is + * not read to handle, typically if the receive queue is empty, an RNR + * Retry NAK is returned to the requester with the min_rnr_timer + * encoded. The requester will then wait at least the time specified + * in the NAK before retrying. The default is zero, which translates + * to a minimum RNR Timer value of 655 ms. + * + * Return: 0 for success + */ +int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer) +{ + struct rdma_id_private *id_priv; + + /* It is a five-bit value */ + if (min_rnr_timer & 0xe0) + return -EINVAL; + + if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT)) + return -EINVAL; + + id_priv = container_of(id, struct rdma_id_private, id); + mutex_lock(&id_priv->qp_mutex); + id_priv->min_rnr_timer = min_rnr_timer; + id_priv->min_rnr_timer_set = true; + mutex_unlock(&id_priv->qp_mutex); + + return 0; +} +EXPORT_SYMBOL(rdma_set_min_rnr_timer); + +static int route_set_path_rec_inbound(struct cma_work *work, + struct sa_path_rec *path_rec) +{ + struct rdma_route *route = &work->id->id.route; + + if (!route->path_rec_inbound) { + route->path_rec_inbound = + kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL); + if (!route->path_rec_inbound) + return -ENOMEM; + } + + *route->path_rec_inbound = *path_rec; + return 0; +} + +static int route_set_path_rec_outbound(struct cma_work *work, + struct sa_path_rec *path_rec) +{ + struct rdma_route *route = &work->id->id.route; + + if (!route->path_rec_outbound) { + route->path_rec_outbound = + kzalloc(sizeof(*route->path_rec_outbound), GFP_KERNEL); + if (!route->path_rec_outbound) + return -ENOMEM; + } + + *route->path_rec_outbound = *path_rec; + return 0; +} + static void cma_query_handler(int status, struct sa_path_rec *path_rec, - void *context) + unsigned int num_prs, void *context) { struct cma_work *work = context; struct rdma_route *route; + int i; route = &work->id->id.route; - if (!status) { - route->num_paths = 1; - *route->path_rec = *path_rec; - } else { - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ADDR_RESOLVED; - work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; - work->event.status = status; - pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n", - status); + if (status) + goto fail; + + for (i = 0; i < num_prs; i++) { + if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP)) + *route->path_rec = path_rec[i]; + else if (path_rec[i].flags & IB_PATH_INBOUND) + status = route_set_path_rec_inbound(work, &path_rec[i]); + else if (path_rec[i].flags & IB_PATH_OUTBOUND) + status = route_set_path_rec_outbound(work, + &path_rec[i]); + else + status = -EINVAL; + + if (status) + goto fail; } + route->num_pri_alt_paths = 1; + queue_work(cma_wq, &work->work); + return; + +fail: + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; + work->event.status = status; + pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n", + status); queue_work(cma_wq, &work->work); } -static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, - struct cma_work *work) +static int cma_query_ib_route(struct rdma_id_private *id_priv, + unsigned long timeout_ms, struct cma_work *work) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sa_path_rec path_rec; @@ -2369,53 +2993,84 @@ static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, return (id_priv->query_id < 0) ? id_priv->query_id : 0; } -static void cma_work_handler(struct work_struct *_work) +static void cma_iboe_join_work_handler(struct work_struct *work) { - struct cma_work *work = container_of(_work, struct cma_work, work); - struct rdma_id_private *id_priv = work->id; - int destroy = 0; + struct cma_multicast *mc = + container_of(work, struct cma_multicast, iboe_join.work); + struct rdma_cm_event *event = &mc->iboe_join.event; + struct rdma_id_private *id_priv = mc->id_priv; + int ret; mutex_lock(&id_priv->handler_mutex); - if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) - goto out; + if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || + READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) + goto out_unlock; - if (id_priv->id.event_handler(&id_priv->id, &work->event)) { - cma_exch(id_priv, RDMA_CM_DESTROYING); - destroy = 1; - } -out: + ret = cma_cm_event_handler(id_priv, event); + WARN_ON(ret); + +out_unlock: mutex_unlock(&id_priv->handler_mutex); - cma_deref_id(id_priv); - if (destroy) - rdma_destroy_id(&id_priv->id); - kfree(work); + if (event->event == RDMA_CM_EVENT_MULTICAST_JOIN) + rdma_destroy_ah_attr(&event->param.ud.ah_attr); } -static void cma_ndev_work_handler(struct work_struct *_work) +static void cma_work_handler(struct work_struct *_work) { - struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work); + struct cma_work *work = container_of(_work, struct cma_work, work); struct rdma_id_private *id_priv = work->id; - int destroy = 0; mutex_lock(&id_priv->handler_mutex); - if (id_priv->state == RDMA_CM_DESTROYING || - id_priv->state == RDMA_CM_DEVICE_REMOVAL) - goto out; + if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || + READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) + goto out_unlock; + if (work->old_state != 0 || work->new_state != 0) { + if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) + goto out_unlock; + } - if (id_priv->id.event_handler(&id_priv->id, &work->event)) { - cma_exch(id_priv, RDMA_CM_DESTROYING); - destroy = 1; + if (cma_cm_event_handler(id_priv, &work->event)) { + cma_id_put(id_priv); + destroy_id_handler_unlock(id_priv); + goto out_free; } -out: +out_unlock: mutex_unlock(&id_priv->handler_mutex); - cma_deref_id(id_priv); - if (destroy) - rdma_destroy_id(&id_priv->id); + cma_id_put(id_priv); +out_free: + if (work->event.event == RDMA_CM_EVENT_MULTICAST_JOIN) + rdma_destroy_ah_attr(&work->event.param.ud.ah_attr); kfree(work); } -static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) +static void cma_init_resolve_route_work(struct cma_work *work, + struct rdma_id_private *id_priv) +{ + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ROUTE_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; +} + +static void enqueue_resolve_addr_work(struct cma_work *work, + struct rdma_id_private *id_priv) +{ + /* Balances with cma_id_put() in cma_work_handler */ + cma_id_get(id_priv); + + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ADDR_QUERY; + work->new_state = RDMA_CM_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + + queue_work(cma_wq, &work->work); +} + +static int cma_resolve_ib_route(struct rdma_id_private *id_priv, + unsigned long timeout_ms) { struct rdma_route *route = &id_priv->id.route; struct cma_work *work; @@ -2425,13 +3080,10 @@ static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) if (!work) return -ENOMEM; - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ROUTE_RESOLVED; - work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + cma_init_resolve_route_work(work, id_priv); - route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); + if (!route->path_rec) + route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; @@ -2450,10 +3102,62 @@ err1: return ret; } -int rdma_set_ib_paths(struct rdma_cm_id *id, - struct sa_path_rec *path_rec, int num_paths) +static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, + unsigned long supported_gids, + enum ib_gid_type default_gid) +{ + if ((network_type == RDMA_NETWORK_IPV4 || + network_type == RDMA_NETWORK_IPV6) && + test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) + return IB_GID_TYPE_ROCE_UDP_ENCAP; + + return default_gid; +} + +/* + * cma_iboe_set_path_rec_l2_fields() is helper function which sets + * path record type based on GID type. + * It also sets up other L2 fields which includes destination mac address + * netdev ifindex, of the path record. + * It returns the netdev of the bound interface for this path record entry. + */ +static struct net_device * +cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv) +{ + struct rdma_route *route = &id_priv->id.route; + enum ib_gid_type gid_type = IB_GID_TYPE_ROCE; + struct rdma_addr *addr = &route->addr; + unsigned long supported_gids; + struct net_device *ndev; + + if (!addr->dev_addr.bound_dev_if) + return NULL; + + ndev = dev_get_by_index(addr->dev_addr.net, + addr->dev_addr.bound_dev_if); + if (!ndev) + return NULL; + + supported_gids = roce_gid_type_mask_support(id_priv->id.device, + id_priv->id.port_num); + gid_type = cma_route_gid_type(addr->dev_addr.network, + supported_gids, + id_priv->gid_type); + /* Use the hint from IP Stack to select GID Type */ + if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) + gid_type = ib_network_to_gid_type(addr->dev_addr.network); + route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); + + route->path_rec->roce.route_resolved = true; + sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); + return ndev; +} + +int rdma_set_ib_path(struct rdma_cm_id *id, + struct sa_path_rec *path_rec) { struct rdma_id_private *id_priv; + struct net_device *ndev; int ret; id_priv = container_of(id, struct rdma_id_private, id); @@ -2461,22 +3165,35 @@ int rdma_set_ib_paths(struct rdma_cm_id *id, RDMA_CM_ROUTE_RESOLVED)) return -EINVAL; - id->route.path_rec = kmemdup(path_rec, sizeof *path_rec * num_paths, + id->route.path_rec = kmemdup(path_rec, sizeof(*path_rec), GFP_KERNEL); if (!id->route.path_rec) { ret = -ENOMEM; goto err; } - id->route.num_paths = num_paths; + if (rdma_protocol_roce(id->device, id->port_num)) { + ndev = cma_iboe_set_path_rec_l2_fields(id_priv); + if (!ndev) { + ret = -ENODEV; + goto err_free; + } + dev_put(ndev); + } + + id->route.num_pri_alt_paths = 1; return 0; + +err_free: + kfree(id->route.path_rec); + id->route.path_rec = NULL; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); return ret; } -EXPORT_SYMBOL(rdma_set_ib_paths); +EXPORT_SYMBOL(rdma_set_ib_path); -static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) +static int cma_resolve_iw_route(struct rdma_id_private *id_priv) { struct cma_work *work; @@ -2484,43 +3201,91 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) if (!work) return -ENOMEM; - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ROUTE_RESOLVED; - work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; } -static int iboe_tos_to_sl(struct net_device *ndev, int tos) +static int get_vlan_ndev_tc(struct net_device *vlan_ndev, int prio) { - int prio; struct net_device *dev; - prio = rt_tos2priority(tos); - dev = is_vlan_dev(ndev) ? vlan_dev_real_dev(ndev) : ndev; + dev = vlan_dev_real_dev(vlan_ndev); if (dev->num_tc) return netdev_get_prio_tc_map(dev, prio); -#if IS_ENABLED(CONFIG_VLAN_8021Q) + return (vlan_dev_get_egress_qos_mask(vlan_ndev, prio) & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; +} + +struct iboe_prio_tc_map { + int input_prio; + int output_tc; + bool found; +}; + +static int get_lower_vlan_dev_tc(struct net_device *dev, + struct netdev_nested_priv *priv) +{ + struct iboe_prio_tc_map *map = (struct iboe_prio_tc_map *)priv->data; + + if (is_vlan_dev(dev)) + map->output_tc = get_vlan_ndev_tc(dev, map->input_prio); + else if (dev->num_tc) + map->output_tc = netdev_get_prio_tc_map(dev, map->input_prio); + else + map->output_tc = 0; + /* We are interested only in first level VLAN device, so always + * return 1 to stop iterating over next level devices. + */ + map->found = true; + return 1; +} + +static int iboe_tos_to_sl(struct net_device *ndev, int tos) +{ + struct iboe_prio_tc_map prio_tc_map = {}; + int prio = rt_tos2priority(tos); + struct netdev_nested_priv priv; + + /* If VLAN device, get it directly from the VLAN netdev */ if (is_vlan_dev(ndev)) - return (vlan_dev_get_egress_qos_mask(ndev, prio) & - VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; -#endif - return 0; + return get_vlan_ndev_tc(ndev, prio); + + prio_tc_map.input_prio = prio; + priv.data = (void *)&prio_tc_map; + rcu_read_lock(); + netdev_walk_all_lower_dev_rcu(ndev, + get_lower_vlan_dev_tc, + &priv); + rcu_read_unlock(); + /* If map is found from lower device, use it; Otherwise + * continue with the current netdevice to get priority to tc map. + */ + if (prio_tc_map.found) + return prio_tc_map.output_tc; + else if (ndev->num_tc) + return netdev_get_prio_tc_map(ndev, prio); + else + return 0; } -static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, - unsigned long supported_gids, - enum ib_gid_type default_gid) +static __be32 cma_get_roce_udp_flow_label(struct rdma_id_private *id_priv) { - if ((network_type == RDMA_NETWORK_IPV4 || - network_type == RDMA_NETWORK_IPV6) && - test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) - return IB_GID_TYPE_ROCE_UDP_ENCAP; + struct sockaddr_in6 *addr6; + u16 dport, sport; + u32 hash, fl; - return default_gid; + addr6 = (struct sockaddr_in6 *)cma_src_addr(id_priv); + fl = be32_to_cpu(addr6->sin6_flowinfo) & IB_GRH_FLOWLABEL_MASK; + if ((cma_family(id_priv) != AF_INET6) || !fl) { + dport = be16_to_cpu(cma_port(cma_dst_addr(id_priv))); + sport = be16_to_cpu(cma_port(cma_src_addr(id_priv))); + hash = (u32)sport * 31 + dport; + fl = hash & IB_GRH_FLOWLABEL_MASK; + } + + return cpu_to_be32(fl); } static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) @@ -2529,64 +3294,39 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) struct rdma_addr *addr = &route->addr; struct cma_work *work; int ret; - struct net_device *ndev = NULL; - enum ib_gid_type gid_type = IB_GID_TYPE_IB; + struct net_device *ndev; + u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; - u8 tos = id_priv->tos_set ? id_priv->tos : default_roce_tos; + u8 tos; + mutex_lock(&id_priv->qp_mutex); + tos = id_priv->tos_set ? id_priv->tos : default_roce_tos; + mutex_unlock(&id_priv->qp_mutex); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } - route->num_paths = 1; - - if (addr->dev_addr.bound_dev_if) { - unsigned long supported_gids; + route->num_pri_alt_paths = 1; - ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); - if (!ndev) { - ret = -ENODEV; - goto err2; - } - - supported_gids = roce_gid_type_mask_support(id_priv->id.device, - id_priv->id.port_num); - gid_type = cma_route_gid_type(addr->dev_addr.network, - supported_gids, - id_priv->gid_type); - route->path_rec->rec_type = - sa_conv_gid_to_pathrec_type(gid_type); - sa_path_set_ndev(route->path_rec, &init_net); - sa_path_set_ifindex(route->path_rec, ndev->ifindex); - } + ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { ret = -ENODEV; goto err2; } - sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); - rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); - /* Use the hint from IP Stack to select GID Type */ - if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) - gid_type = ib_network_to_gid_type(addr->dev_addr.network); - route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); - if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) /* TODO: get the hoplimit from the inet/inet6 device */ route->path_rec->hop_limit = addr->dev_addr.hoplimit; @@ -2599,20 +3339,34 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->path_rec->traffic_class = tos; route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; - route->path_rec->rate = iboe_get_rate(ndev); + route->path_rec->rate = IB_RATE_PORT_CURRENT; dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; - route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; + /* In case ACK timeout is set, use this value to calculate + * PacketLifeTime. As per IBTA 12.7.34, + * local ACK timeout = (2 * PacketLifeTime + Local CA’s ACK delay). + * Assuming a negligible local ACK delay, we can use + * PacketLifeTime = local ACK timeout/2 + * as a reasonable approximation for RoCE networks. + */ + mutex_lock(&id_priv->qp_mutex); + if (id_priv->timeout_set && id_priv->timeout) + route->path_rec->packet_life_time = id_priv->timeout - 1; + else + route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; + mutex_unlock(&id_priv->qp_mutex); + if (!route->path_rec->mtu) { ret = -EINVAL; goto err2; } - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ROUTE_RESOLVED; - work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; - work->event.status = 0; + if (rdma_protocol_roce_udp_encap(id_priv->id.device, + id_priv->id.port_num)) + route->path_rec->flow_label = + cma_get_roce_udp_flow_label(id_priv); + cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; @@ -2620,27 +3374,39 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) err2: kfree(route->path_rec); route->path_rec = NULL; + route->num_pri_alt_paths = 0; err1: kfree(work); return ret; } -int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) +int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) { struct rdma_id_private *id_priv; + enum rdma_cm_state state; int ret; + if (!timeout_ms) + return -EINVAL; + id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) + state = id_priv->state; + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, + RDMA_CM_ROUTE_QUERY) && + !cma_comp_exch(id_priv, RDMA_CM_ADDRINFO_RESOLVED, + RDMA_CM_ROUTE_QUERY)) return -EINVAL; - atomic_inc(&id_priv->refcount); + cma_id_get(id_priv); if (rdma_cap_ib_sa(id->device, id->port_num)) ret = cma_resolve_ib_route(id_priv, timeout_ms); - else if (rdma_protocol_roce(id->device, id->port_num)) + else if (rdma_protocol_roce(id->device, id->port_num)) { ret = cma_resolve_iboe_route(id_priv); + if (!ret) + cma_add_id_to_tree(id_priv); + } else if (rdma_protocol_iwarp(id->device, id->port_num)) - ret = cma_resolve_iw_route(id_priv, timeout_ms); + ret = cma_resolve_iw_route(id_priv); else ret = -ENOSYS; @@ -2649,8 +3415,8 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) return 0; err: - cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); - cma_deref_id(id_priv); + cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, state); + cma_id_put(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_route); @@ -2677,9 +3443,9 @@ static int cma_bind_loopback(struct rdma_id_private *id_priv) struct cma_device *cma_dev, *cur_dev; union ib_gid gid; enum ib_port_state port_state; + unsigned int p; u16 pkey; int ret; - u8 p; cma_dev = NULL; mutex_lock(&lock); @@ -2691,7 +3457,7 @@ static int cma_bind_loopback(struct rdma_id_private *id_priv) if (!cma_dev) cma_dev = cur_dev; - for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { + rdma_for_each_port (cur_dev->device, p) { if (!ib_get_cached_port_state(cur_dev->device, p, &port_state) && port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; @@ -2708,7 +3474,7 @@ static int cma_bind_loopback(struct rdma_id_private *id_priv) p = 1; port_found: - ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid, NULL); + ret = rdma_query_gid(cma_dev->device, p, 0, &gid); if (ret) goto out; @@ -2724,6 +3490,7 @@ port_found: ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); + rdma_restrack_add(&id_priv->res); cma_set_loopback(cma_src_addr(id_priv)); out: mutex_unlock(&lock); @@ -2734,25 +3501,36 @@ static void addr_handler(int status, struct sockaddr *src_addr, struct rdma_dev_addr *dev_addr, void *context) { struct rdma_id_private *id_priv = context; - struct rdma_cm_event event; + struct rdma_cm_event event = {}; + struct sockaddr *addr; + struct sockaddr_storage old_addr; - memset(&event, 0, sizeof event); mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_RESOLVED)) goto out; - memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr)); + /* + * Store the previous src address, so that if we fail to acquire + * matching rdma device, old address can be restored back, which helps + * to cancel the cma listen operation correctly. + */ + addr = cma_src_addr(id_priv); + memcpy(&old_addr, addr, rdma_addr_size(addr)); + memcpy(addr, src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) { - status = cma_acquire_dev(id_priv, NULL); + status = cma_acquire_dev_by_src_ip(id_priv); if (status) pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n", status); - } else { + rdma_restrack_add(&id_priv->res); + } else if (status) { pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. status %d\n", status); } if (status) { + memcpy(addr, &old_addr, + rdma_addr_size((struct sockaddr *)&old_addr)); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ADDR_BOUND)) goto out; @@ -2761,16 +3539,12 @@ static void addr_handler(int status, struct sockaddr *src_addr, } else event.event = RDMA_CM_EVENT_ADDR_RESOLVED; - if (id_priv->id.event_handler(&id_priv->id, &event)) { - cma_exch(id_priv, RDMA_CM_DESTROYING); - mutex_unlock(&id_priv->handler_mutex); - cma_deref_id(id_priv); - rdma_destroy_id(&id_priv->id); + if (cma_cm_event_handler(id_priv, &event)) { + destroy_id_handler_unlock(id_priv); return; } out: mutex_unlock(&id_priv->handler_mutex); - cma_deref_id(id_priv); } static int cma_resolve_loopback(struct rdma_id_private *id_priv) @@ -2792,12 +3566,7 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv) rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - work->old_state = RDMA_CM_ADDR_QUERY; - work->new_state = RDMA_CM_ADDR_RESOLVED; - work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; - queue_work(cma_wq, &work->work); + enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); @@ -2822,88 +3591,13 @@ static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - work->old_state = RDMA_CM_ADDR_QUERY; - work->new_state = RDMA_CM_ADDR_RESOLVED; - work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; - queue_work(cma_wq, &work->work); + enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); return ret; } -static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - struct sockaddr *dst_addr) -{ - if (!src_addr || !src_addr->sa_family) { - src_addr = (struct sockaddr *) &id->route.addr.src_addr; - src_addr->sa_family = dst_addr->sa_family; - if (IS_ENABLED(CONFIG_IPV6) && - dst_addr->sa_family == AF_INET6) { - struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr; - struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr; - src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; - if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id; - } else if (dst_addr->sa_family == AF_IB) { - ((struct sockaddr_ib *) src_addr)->sib_pkey = - ((struct sockaddr_ib *) dst_addr)->sib_pkey; - } - } - return rdma_bind_addr(id, src_addr); -} - -int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - struct sockaddr *dst_addr, int timeout_ms) -{ - struct rdma_id_private *id_priv; - int ret; - - id_priv = container_of(id, struct rdma_id_private, id); - memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); - if (id_priv->state == RDMA_CM_IDLE) { - ret = cma_bind_addr(id, src_addr, dst_addr); - if (ret) { - memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); - return ret; - } - } - - if (cma_family(id_priv) != dst_addr->sa_family) { - memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); - return -EINVAL; - } - - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { - memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); - return -EINVAL; - } - - atomic_inc(&id_priv->refcount); - if (cma_any_addr(dst_addr)) { - ret = cma_resolve_loopback(id_priv); - } else { - if (dst_addr->sa_family == AF_IB) { - ret = cma_resolve_ib_addr(id_priv); - } else { - ret = rdma_resolve_ip(&addr_client, cma_src_addr(id_priv), - dst_addr, &id->route.addr.dev_addr, - timeout_ms, addr_handler, id_priv); - } - } - if (ret) - goto err; - - return 0; -err: - cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); - cma_deref_id(id_priv); - return ret; -} -EXPORT_SYMBOL(rdma_resolve_addr); - int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) { struct rdma_id_private *id_priv; @@ -2912,7 +3606,8 @@ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); - if (reuse || id_priv->state == RDMA_CM_IDLE) { + if ((reuse && id_priv->state != RDMA_CM_LISTEN) || + id_priv->state == RDMA_CM_IDLE) { id_priv->reuseaddr = reuse; ret = 0; } else { @@ -2951,6 +3646,8 @@ static void cma_bind_port(struct rdma_bind_list *bind_list, u64 sid, mask; __be16 port; + lockdep_assert_held(&lock); + addr = cma_src_addr(id_priv); port = htons(bind_list->port); @@ -2973,12 +3670,14 @@ static void cma_bind_port(struct rdma_bind_list *bind_list, hlist_add_head(&id_priv->node, &bind_list->owners); } -static int cma_alloc_port(enum rdma_port_space ps, +static int cma_alloc_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; int ret; + lockdep_assert_held(&lock); + bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; @@ -2989,7 +3688,7 @@ static int cma_alloc_port(enum rdma_port_space ps, goto err; bind_list->ps = ps; - bind_list->port = (unsigned short)ret; + bind_list->port = snum; cma_bind_port(bind_list, id_priv); return 0; err: @@ -3005,6 +3704,8 @@ static int cma_port_is_unique(struct rdma_bind_list *bind_list, struct sockaddr *saddr = cma_src_addr(id_priv); __be16 dport = cma_port(daddr); + lockdep_assert_held(&lock); + hlist_for_each_entry(cur_id, &bind_list->owners, node) { struct sockaddr *cur_daddr = cma_dst_addr(cur_id); struct sockaddr *cur_saddr = cma_src_addr(cur_id); @@ -3014,7 +3715,8 @@ static int cma_port_is_unique(struct rdma_bind_list *bind_list, continue; /* different dest port -> unique */ - if (!cma_any_port(cur_daddr) && + if (!cma_any_port(daddr) && + !cma_any_port(cur_daddr) && (dport != cur_dport)) continue; @@ -3025,7 +3727,8 @@ static int cma_port_is_unique(struct rdma_bind_list *bind_list, continue; /* different dst address -> unique */ - if (!cma_any_addr(cur_daddr) && + if (!cma_any_addr(daddr) && + !cma_any_addr(cur_daddr) && cma_addr_cmp(daddr, cur_daddr)) continue; @@ -3034,7 +3737,7 @@ static int cma_port_is_unique(struct rdma_bind_list *bind_list, return 0; } -static int cma_alloc_any_port(enum rdma_port_space ps, +static int cma_alloc_any_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv) { static unsigned int last_used_port; @@ -3042,9 +3745,11 @@ static int cma_alloc_any_port(enum rdma_port_space ps, unsigned int rover; struct net *net = id_priv->id.route.addr.dev_addr.net; + lockdep_assert_held(&lock); + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rover = prandom_u32() % remaining + low; + rover = get_random_u32_inclusive(low, remaining + low - 1); retry: if (last_used_port != rover) { struct rdma_bind_list *bind_list; @@ -3089,13 +3794,14 @@ static int cma_check_port(struct rdma_bind_list *bind_list, struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; + lockdep_assert_held(&lock); + addr = cma_src_addr(id_priv); hlist_for_each_entry(cur_id, &bind_list->owners, node) { if (id_priv == cur_id) continue; - if ((cur_id->state != RDMA_CM_LISTEN) && reuseaddr && - cur_id->reuseaddr) + if (reuseaddr && cur_id->reuseaddr) continue; cur_addr = cma_src_addr(cur_id); @@ -3112,13 +3818,15 @@ static int cma_check_port(struct rdma_bind_list *bind_list, return 0; } -static int cma_use_port(enum rdma_port_space ps, +static int cma_use_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list; unsigned short snum; int ret; + lockdep_assert_held(&lock); + snum = ntohs(cma_port(cma_src_addr(id_priv))); if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; @@ -3134,20 +3842,8 @@ static int cma_use_port(enum rdma_port_space ps, return ret; } -static int cma_bind_listen(struct rdma_id_private *id_priv) -{ - struct rdma_bind_list *bind_list = id_priv->bind_list; - int ret = 0; - - mutex_lock(&lock); - if (bind_list->owners.first->next) - ret = cma_check_port(bind_list, id_priv, 0); - mutex_unlock(&lock); - return ret; -} - -static enum rdma_port_space cma_select_inet_ps( - struct rdma_id_private *id_priv) +static enum rdma_ucm_port_space +cma_select_inet_ps(struct rdma_id_private *id_priv) { switch (id_priv->id.ps) { case RDMA_PS_TCP: @@ -3161,9 +3857,10 @@ static enum rdma_port_space cma_select_inet_ps( } } -static enum rdma_port_space cma_select_ib_ps(struct rdma_id_private *id_priv) +static enum rdma_ucm_port_space +cma_select_ib_ps(struct rdma_id_private *id_priv) { - enum rdma_port_space ps = 0; + enum rdma_ucm_port_space ps = 0; struct sockaddr_ib *sib; u64 sid_ps, mask, sid; @@ -3194,7 +3891,7 @@ static enum rdma_port_space cma_select_ib_ps(struct rdma_id_private *id_priv) static int cma_get_port(struct rdma_id_private *id_priv) { - enum rdma_port_space ps; + enum rdma_ucm_port_space ps; int ret; if (cma_family(id_priv) != AF_IB) @@ -3238,28 +3935,41 @@ static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, int rdma_listen(struct rdma_cm_id *id, int backlog) { - struct rdma_id_private *id_priv; + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); int ret; - id_priv = container_of(id, struct rdma_id_private, id); - if (id_priv->state == RDMA_CM_IDLE) { - id->route.addr.src_addr.ss_family = AF_INET; - ret = rdma_bind_addr(id, cma_src_addr(id_priv)); + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) { + struct sockaddr_in any_in = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + }; + + /* For a well behaved ULP state will be RDMA_CM_IDLE */ + ret = rdma_bind_addr(id, (struct sockaddr *)&any_in); if (ret) return ret; + if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, + RDMA_CM_LISTEN))) + return -EINVAL; } - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) - return -EINVAL; - + /* + * Once the ID reaches RDMA_CM_LISTEN it is not allowed to be reusable + * any more, and has to be unique in the bind list. + */ if (id_priv->reuseaddr) { - ret = cma_bind_listen(id_priv); + mutex_lock(&lock); + ret = cma_check_port(id_priv->bind_list, id_priv, 0); + if (!ret) + id_priv->reuseaddr = 0; + mutex_unlock(&lock); if (ret) goto err; } id_priv->backlog = backlog; - if (id->device) { + if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id->device, 1)) { ret = cma_ib_listen(id_priv); if (ret) @@ -3272,42 +3982,48 @@ int rdma_listen(struct rdma_cm_id *id, int backlog) ret = -ENOSYS; goto err; } - } else - cma_listen_on_all(id_priv); + } else { + ret = cma_listen_on_all(id_priv); + if (ret) + goto err; + } return 0; err: id_priv->backlog = 0; + /* + * All the failure paths that lead here will not allow the req_handler's + * to have run. + */ cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_listen); -int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) +static int rdma_bind_addr_dst(struct rdma_id_private *id_priv, + struct sockaddr *addr, const struct sockaddr *daddr) { - struct rdma_id_private *id_priv; + struct sockaddr *id_daddr; int ret; - struct sockaddr *daddr; if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && addr->sa_family != AF_IB) return -EAFNOSUPPORT; - id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) return -EINVAL; - ret = cma_check_linklocal(&id->route.addr.dev_addr, addr); + ret = cma_check_linklocal(&id_priv->id.route.addr.dev_addr, addr); if (ret) goto err1; memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); if (!cma_any_addr(addr)) { - ret = cma_translate_addr(addr, &id->route.addr.dev_addr); + ret = cma_translate_addr(addr, &id_priv->id.route.addr.dev_addr); if (ret) goto err1; - ret = cma_acquire_dev(id_priv, NULL); + ret = cma_acquire_dev_by_src_ip(id_priv); if (ret) goto err1; } @@ -3323,13 +4039,17 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) } #endif } + id_daddr = cma_dst_addr(id_priv); + if (daddr != id_daddr) + memcpy(id_daddr, daddr, rdma_addr_size(addr)); + id_daddr->sa_family = addr->sa_family; + ret = cma_get_port(id_priv); if (ret) goto err2; - daddr = cma_dst_addr(id_priv); - daddr->sa_family = addr->sa_family; - + if (!cma_any_addr(addr)) + rdma_restrack_add(&id_priv->res); return 0; err2: if (id_priv->cma_dev) @@ -3338,6 +4058,129 @@ err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; } + +static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + const struct sockaddr *dst_addr) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + struct sockaddr_storage zero_sock = {}; + + if (src_addr && src_addr->sa_family) + return rdma_bind_addr_dst(id_priv, src_addr, dst_addr); + + /* + * When the src_addr is not specified, automatically supply an any addr + */ + zero_sock.ss_family = dst_addr->sa_family; + if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) { + struct sockaddr_in6 *src_addr6 = + (struct sockaddr_in6 *)&zero_sock; + struct sockaddr_in6 *dst_addr6 = + (struct sockaddr_in6 *)dst_addr; + + src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; + if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + id->route.addr.dev_addr.bound_dev_if = + dst_addr6->sin6_scope_id; + } else if (dst_addr->sa_family == AF_IB) { + ((struct sockaddr_ib *)&zero_sock)->sib_pkey = + ((struct sockaddr_ib *)dst_addr)->sib_pkey; + } + return rdma_bind_addr_dst(id_priv, (struct sockaddr *)&zero_sock, dst_addr); +} + +/* + * If required, resolve the source address for bind and leave the id_priv in + * state RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior + * calls made by ULP, a previously bound ID will not be re-bound and src_addr is + * ignored. + */ +static int resolve_prepare_src(struct rdma_id_private *id_priv, + struct sockaddr *src_addr, + const struct sockaddr *dst_addr) +{ + int ret; + + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { + /* For a well behaved ULP state will be RDMA_CM_IDLE */ + ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr); + if (ret) + return ret; + if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, + RDMA_CM_ADDR_QUERY))) + return -EINVAL; + + } else { + memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); + } + + if (cma_family(id_priv) != dst_addr->sa_family) { + ret = -EINVAL; + goto err_state; + } + return 0; + +err_state: + cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); + return ret; +} + +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + const struct sockaddr *dst_addr, unsigned long timeout_ms) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + int ret; + + ret = resolve_prepare_src(id_priv, src_addr, dst_addr); + if (ret) + return ret; + + if (cma_any_addr(dst_addr)) { + ret = cma_resolve_loopback(id_priv); + } else { + if (dst_addr->sa_family == AF_IB) { + ret = cma_resolve_ib_addr(id_priv); + } else { + /* + * The FSM can return back to RDMA_CM_ADDR_BOUND after + * rdma_resolve_ip() is called, eg through the error + * path in addr_handler(). If this happens the existing + * request must be canceled before issuing a new one. + * Since canceling a request is a bit slow and this + * oddball path is rare, keep track once a request has + * been issued. The track turns out to be a permanent + * state since this is the only cancel as it is + * immediately before rdma_resolve_ip(). + */ + if (id_priv->used_resolve_ip) + rdma_addr_cancel(&id->route.addr.dev_addr); + else + id_priv->used_resolve_ip = 1; + ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, + &id->route.addr.dev_addr, + timeout_ms, addr_handler, + false, id_priv); + } + } + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_addr); + +int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + return rdma_bind_addr_dst(id_priv, addr, cma_dst_addr(id_priv)); +} EXPORT_SYMBOL(rdma_bind_addr); static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) @@ -3371,18 +4214,18 @@ static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) } static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, - struct ib_cm_event *ib_event) + const struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; - struct rdma_cm_event event; - struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; - int ret = 0; + struct rdma_cm_event event = {}; + const struct ib_cm_sidr_rep_event_param *rep = + &ib_event->param.sidr_rep_rcvd; + int ret; mutex_lock(&id_priv->handler_mutex); - if (id_priv->state != RDMA_CM_CONNECT) + if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; - memset(&event, 0, sizeof event); switch (ib_event->event) { case IB_CM_SIDR_REQ_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; @@ -3405,9 +4248,11 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, event.status = ret; break; } - ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num, - id_priv->id.route.path_rec, - &event.param.ud.ah_attr); + ib_init_ah_attr_from_path(id_priv->id.device, + id_priv->id.port_num, + id_priv->id.route.path_rec, + &event.param.ud.ah_attr, + rep->sgid_attr); event.param.ud.qp_num = rep->qpn; event.param.ud.qkey = rep->qkey; event.event = RDMA_CM_EVENT_ESTABLISHED; @@ -3419,18 +4264,18 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, goto out; } - ret = id_priv->id.event_handler(&id_priv->id, &event); + ret = cma_cm_event_handler(id_priv, &event); + + rdma_destroy_ah_attr(&event.param.ud.ah_attr); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; - cma_exch(id_priv, RDMA_CM_DESTROYING); - mutex_unlock(&id_priv->handler_mutex); - rdma_destroy_id(&id_priv->id); + destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); - return ret; + return 0; } static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, @@ -3439,12 +4284,12 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, struct ib_cm_sidr_req_param req; struct ib_cm_id *id; void *private_data; - int offset, ret; + u8 offset; + int ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); - req.private_data_len = offset + conn_param->private_data_len; - if (req.private_data_len < conn_param->private_data_len) + if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) return -EINVAL; if (req.private_data_len) { @@ -3475,10 +4320,12 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, id_priv->cm_id.ib = id; req.path = id_priv->id.route.path_rec; + req.sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; + trace_cm_send_sidr_req(id_priv); ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); @@ -3496,12 +4343,12 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, struct rdma_route *route; void *private_data; struct ib_cm_id *id; - int offset, ret; + u8 offset; + int ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); - req.private_data_len = offset + conn_param->private_data_len; - if (req.private_data_len < conn_param->private_data_len) + if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) return -EINVAL; if (req.private_data_len) { @@ -3532,9 +4379,13 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, } req.primary_path = &route->path_rec[0]; - if (route->num_paths == 2) + req.primary_path_inbound = route->path_rec_inbound; + req.primary_path_outbound = route->path_rec_outbound; + if (route->num_pri_alt_paths == 2) req.alternate_path = &route->path_rec[1]; + req.ppath_sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; + /* Alternate path SGID attribute currently unsupported */ req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.qp_num = id_priv->qp_num; req.qp_type = id_priv->id.qp_type; @@ -3548,7 +4399,10 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 1 : 0; + req.ece.vendor_id = id_priv->ece.vendor_id; + req.ece.attr_mod = id_priv->ece.attr_mod; + trace_cm_send_req(id_priv); ret = ib_send_cm_req(id_priv->cm_id.ib, &req); out: if (ret && !IS_ERR(id)) { @@ -3571,7 +4425,11 @@ static int cma_connect_iw(struct rdma_id_private *id_priv, if (IS_ERR(cm_id)) return PTR_ERR(cm_id); + mutex_lock(&id_priv->qp_mutex); cm_id->tos = id_priv->tos; + cm_id->tos_set = id_priv->tos_set; + mutex_unlock(&id_priv->qp_mutex); + id_priv->cm_id.iw = cm_id; memcpy(&cm_id->local_addr, cma_src_addr(id_priv), @@ -3602,12 +4460,23 @@ out: return ret; } -int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +/** + * rdma_connect_locked - Initiate an active connection request. + * @id: Connection identifier to connect. + * @conn_param: Connection information used for connected QPs. + * + * Same as rdma_connect() but can only be called from the + * RDMA_CM_EVENT_ROUTE_RESOLVED handler callback. + */ +int rdma_connect_locked(struct rdma_cm_id *id, + struct rdma_conn_param *conn_param) { - struct rdma_id_private *id_priv; + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); int ret; - id_priv = container_of(id, struct rdma_id_private, id); + lockdep_assert_held(&id_priv->handler_mutex); + if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) return -EINVAL; @@ -3621,20 +4490,66 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); - } else if (rdma_cap_iw_cm(id->device, id->port_num)) + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = cma_connect_iw(id_priv, conn_param); - else + } else { ret = -ENOSYS; + } if (ret) - goto err; - + goto err_state; return 0; -err: +err_state: cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); return ret; } +EXPORT_SYMBOL(rdma_connect_locked); + +/** + * rdma_connect - Initiate an active connection request. + * @id: Connection identifier to connect. + * @conn_param: Connection information used for connected QPs. + * + * Users must have resolved a route for the rdma_cm_id to connect with by having + * called rdma_resolve_route before calling this routine. + * + * This call will either connect to a remote QP or obtain remote QP information + * for unconnected rdma_cm_id's. The actual operation is based on the + * rdma_cm_id's port space. + */ +int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + int ret; + + mutex_lock(&id_priv->handler_mutex); + ret = rdma_connect_locked(id, conn_param); + mutex_unlock(&id_priv->handler_mutex); + return ret; +} EXPORT_SYMBOL(rdma_connect); +/** + * rdma_connect_ece - Initiate an active connection request with ECE data. + * @id: Connection identifier to connect. + * @conn_param: Connection information used for connected QPs. + * @ece: ECE parameters + * + * See rdma_connect() explanation. + */ +int rdma_connect_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + struct rdma_ucm_ece *ece) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + id_priv->ece.vendor_id = ece->vendor_id; + id_priv->ece.attr_mod = ece->attr_mod; + + return rdma_connect(id, conn_param); +} +EXPORT_SYMBOL(rdma_connect_ece); + static int cma_accept_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { @@ -3660,7 +4575,10 @@ static int cma_accept_ib(struct rdma_id_private *id_priv, rep.flow_control = conn_param->flow_control; rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); rep.srq = id_priv->srq ? 1 : 0; + rep.ece.vendor_id = id_priv->ece.vendor_id; + rep.ece.attr_mod = id_priv->ece.attr_mod; + trace_cm_send_rep(id_priv); ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); out: return ret; @@ -3683,9 +4601,9 @@ static int cma_accept_iw(struct rdma_id_private *id_priv, iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; - if (id_priv->id.qp) { + if (id_priv->id.qp) iw_param.qpn = id_priv->qp_num; - } else + else iw_param.qpn = conn_param->qp_num; return iw_cm_accept(id_priv->cm_id.iw, &iw_param); @@ -3701,28 +4619,53 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv, memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { - ret = cma_set_qkey(id_priv, qkey); + if (qkey) + ret = cma_set_qkey(id_priv, qkey); + else + ret = cma_set_default_qkey(id_priv); if (ret) return ret; rep.qp_num = id_priv->qp_num; rep.qkey = id_priv->qkey; + + rep.ece.vendor_id = id_priv->ece.vendor_id; + rep.ece.attr_mod = id_priv->ece.attr_mod; } + rep.private_data = private_data; rep.private_data_len = private_data_len; + trace_cm_send_sidr_rep(id_priv); return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } +/** + * rdma_accept - Called to accept a connection request or response. + * @id: Connection identifier associated with the request. + * @conn_param: Information needed to establish the connection. This must be + * provided if accepting a connection request. If accepting a connection + * response, this parameter must be NULL. + * + * Typically, this routine is only called by the listener to accept a connection + * request. It must also be called on the active side of a connection if the + * user is performing their own QP transitions. + * + * In the case of error, a reject message is sent to the remote side and the + * state of the qp associated with the id is modified to error, such that any + * previously posted receive buffers would be flushed. + * + * This function is for use by kernel ULPs and must be called from under the + * handler callback. + */ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { - struct rdma_id_private *id_priv; + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); int ret; - id_priv = container_of(id, struct rdma_id_private, id); - - id_priv->owner = task_pid_nr(current); + lockdep_assert_held(&id_priv->handler_mutex); - if (!cma_comp(id_priv, RDMA_CM_CONNECT)) + if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) return -EINVAL; if (!id->qp && conn_param) { @@ -3746,22 +4689,53 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) else ret = cma_rep_recv(id_priv); } - } else if (rdma_cap_iw_cm(id->device, id->port_num)) + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = cma_accept_iw(id_priv, conn_param); - else + } else { ret = -ENOSYS; - + } if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); - rdma_reject(id, NULL, 0); + rdma_reject(id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); return ret; } EXPORT_SYMBOL(rdma_accept); +int rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + struct rdma_ucm_ece *ece) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + id_priv->ece.vendor_id = ece->vendor_id; + id_priv->ece.attr_mod = ece->attr_mod; + + return rdma_accept(id, conn_param); +} +EXPORT_SYMBOL(rdma_accept_ece); + +void rdma_lock_handler(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + mutex_lock(&id_priv->handler_mutex); +} +EXPORT_SYMBOL(rdma_lock_handler); + +void rdma_unlock_handler(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + mutex_unlock(&id_priv->handler_mutex); +} +EXPORT_SYMBOL(rdma_unlock_handler); + int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { struct rdma_id_private *id_priv; @@ -3784,7 +4758,7 @@ int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) EXPORT_SYMBOL(rdma_notify); int rdma_reject(struct rdma_cm_id *id, const void *private_data, - u8 private_data_len) + u8 private_data_len, u8 reason) { struct rdma_id_private *id_priv; int ret; @@ -3794,18 +4768,20 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data, return -EINVAL; if (rdma_cap_ib_cm(id->device, id->port_num)) { - if (id->qp_type == IB_QPT_UD) + if (id->qp_type == IB_QPT_UD) { ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0, private_data, private_data_len); - else - ret = ib_send_cm_rej(id_priv->cm_id.ib, - IB_CM_REJ_CONSUMER_DEFINED, NULL, - 0, private_data, private_data_len); + } else { + trace_cm_send_rej(id_priv); + ret = ib_send_cm_rej(id_priv->cm_id.ib, reason, NULL, 0, + private_data, private_data_len); + } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_reject(id_priv->cm_id.iw, private_data, private_data_len); - } else + } else { ret = -ENOSYS; + } return ret; } @@ -3825,8 +4801,13 @@ int rdma_disconnect(struct rdma_cm_id *id) if (ret) goto out; /* Initiate or respond to a disconnect. */ - if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) - ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0); + trace_cm_disconnect(id_priv); + if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) { + if (!ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0)) + trace_cm_sent_drep(id_priv); + } else { + trace_cm_sent_dreq(id_priv); + } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); } else @@ -3837,65 +4818,68 @@ out: } EXPORT_SYMBOL(rdma_disconnect); +static void cma_make_mc_event(int status, struct rdma_id_private *id_priv, + struct ib_sa_multicast *multicast, + struct rdma_cm_event *event, + struct cma_multicast *mc) +{ + struct rdma_dev_addr *dev_addr; + enum ib_gid_type gid_type; + struct net_device *ndev; + + if (status) + pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n", + status); + + event->status = status; + event->param.ud.private_data = mc->context; + if (status) { + event->event = RDMA_CM_EVENT_MULTICAST_ERROR; + return; + } + + dev_addr = &id_priv->id.route.addr.dev_addr; + ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); + gid_type = + id_priv->cma_dev + ->default_gid_type[id_priv->id.port_num - + rdma_start_port( + id_priv->cma_dev->device)]; + + event->event = RDMA_CM_EVENT_MULTICAST_JOIN; + if (ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, + &multicast->rec, ndev, gid_type, + &event->param.ud.ah_attr)) { + event->event = RDMA_CM_EVENT_MULTICAST_ERROR; + goto out; + } + + event->param.ud.qp_num = 0xFFFFFF; + event->param.ud.qkey = id_priv->qkey; + +out: + dev_put(ndev); +} + static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) { - struct rdma_id_private *id_priv; struct cma_multicast *mc = multicast->context; - struct rdma_cm_event event; + struct rdma_id_private *id_priv = mc->id_priv; + struct rdma_cm_event event = {}; int ret = 0; - id_priv = mc->id_priv; mutex_lock(&id_priv->handler_mutex); - if (id_priv->state != RDMA_CM_ADDR_BOUND && - id_priv->state != RDMA_CM_ADDR_RESOLVED) + if (READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL || + READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING) goto out; - if (!status) - status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); - else - pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n", - status); - mutex_lock(&id_priv->qp_mutex); - if (!status && id_priv->id.qp) { - status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, - be16_to_cpu(multicast->rec.mlid)); - if (status) - pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to attach QP. status %d\n", - status); - } - mutex_unlock(&id_priv->qp_mutex); - - memset(&event, 0, sizeof event); - event.status = status; - event.param.ud.private_data = mc->context; - if (!status) { - struct rdma_dev_addr *dev_addr = - &id_priv->id.route.addr.dev_addr; - struct net_device *ndev = - dev_get_by_index(&init_net, dev_addr->bound_dev_if); - enum ib_gid_type gid_type = - id_priv->cma_dev->default_gid_type[id_priv->id.port_num - - rdma_start_port(id_priv->cma_dev->device)]; - - event.event = RDMA_CM_EVENT_MULTICAST_JOIN; - ib_init_ah_from_mcmember(id_priv->id.device, - id_priv->id.port_num, &multicast->rec, - ndev, gid_type, - &event.param.ud.ah_attr); - event.param.ud.qp_num = 0xFFFFFF; - event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); - if (ndev) - dev_put(ndev); - } else - event.event = RDMA_CM_EVENT_MULTICAST_ERROR; - - ret = id_priv->id.event_handler(&id_priv->id, &event); - if (ret) { - cma_exch(id_priv, RDMA_CM_DESTROYING); - mutex_unlock(&id_priv->handler_mutex); - rdma_destroy_id(&id_priv->id); - return 0; + ret = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); + if (!ret) { + cma_make_mc_event(status, id_priv, multicast, &event, mc); + ret = cma_cm_event_handler(id_priv, &event); } + rdma_destroy_ah_attr(&event.param.ud.ah_attr); + WARN_ON(ret); out: mutex_unlock(&id_priv->handler_mutex); @@ -3919,7 +4903,7 @@ static void cma_set_mgid(struct rdma_id_private *id_priv, memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else if (addr->sa_family == AF_IB) { memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid); - } else if ((addr->sa_family == AF_INET6)) { + } else if (addr->sa_family == AF_INET6) { ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ @@ -3946,9 +4930,11 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, if (ret) return ret; - ret = cma_set_qkey(id_priv, 0); - if (ret) - return ret; + if (!id_priv->qkey) { + ret = cma_set_default_qkey(id_priv); + if (ret) + return ret; + } cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); rec.qkey = cpu_to_be32(id_priv->qkey); @@ -3956,16 +4942,6 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); rec.join_state = mc->join_state; - if ((rec.join_state == BIT(SENDONLY_FULLMEMBER_JOIN)) && - (!ib_sa_sendonly_fullmem_support(&sa_client, - id_priv->id.device, - id_priv->id.port_num))) { - pr_warn("RDMA CM: %s port %u Unable to multicast join\n" - "RDMA CM: SM doesn't support Send Only Full Member option\n", - id_priv->id.device->name, id_priv->id.port_num); - return -EOPNOTSUPP; - } - comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL | @@ -3979,26 +4955,14 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, IB_SA_MCMEMBER_REC_MTU | IB_SA_MCMEMBER_REC_HOP_LIMIT; - mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device, - id_priv->id.port_num, &rec, - comp_mask, GFP_KERNEL, - cma_ib_mc_handler, mc); - return PTR_ERR_OR_ZERO(mc->multicast.ib); -} - -static void iboe_mcast_work_handler(struct work_struct *work) -{ - struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work); - struct cma_multicast *mc = mw->mc; - struct ib_sa_multicast *m = mc->multicast.ib; - - mc->multicast.ib->context = mc; - cma_ib_mc_handler(0, m); - kref_put(&mc->mcref, release_mc); - kfree(mw); + mc->sa_mc = ib_sa_join_multicast(&sa_client, id_priv->id.device, + id_priv->id.port_num, &rec, comp_mask, + GFP_KERNEL, cma_ib_mc_handler, mc); + return PTR_ERR_OR_ZERO(mc->sa_mc); } -static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) +static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, + enum ib_gid_type gid_type) { struct sockaddr_in *sin = (struct sockaddr_in *)addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; @@ -4008,8 +4972,10 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { - mgid->raw[0] = 0xff; - mgid->raw[1] = 0x0e; + mgid->raw[0] = + (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0xff; + mgid->raw[1] = + (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0x0e; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; @@ -4027,55 +4993,39 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { - struct iboe_mcast_work *work; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; + struct ib_sa_multicast ib = {}; enum ib_gid_type gid_type; bool send_only; send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); - if (cma_zero_addr((struct sockaddr *)&mc->addr)) + if (cma_zero_addr(addr)) return -EINVAL; - work = kzalloc(sizeof *work, GFP_KERNEL); - if (!work) - return -ENOMEM; - - mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL); - if (!mc->multicast.ib) { - err = -ENOMEM; - goto out1; - } - - cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid); - - mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff); - if (id_priv->id.ps == RDMA_PS_UDP) - mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); + gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + cma_iboe_set_mgid(addr, &ib.rec.mgid, gid_type); + ib.rec.pkey = cpu_to_be16(0xffff); if (dev_addr->bound_dev_if) - ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); - if (!ndev) { - err = -ENODEV; - goto out2; - } - mc->multicast.ib->rec.rate = iboe_get_rate(ndev); - mc->multicast.ib->rec.hop_limit = 1; - mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); + ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); + if (!ndev) + return -ENODEV; + + ib.rec.rate = IB_RATE_PORT_CURRENT; + ib.rec.hop_limit = 1; + ib.rec.mtu = iboe_get_mtu(ndev->mtu); - gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - - rdma_start_port(id_priv->cma_dev->device)]; if (addr->sa_family == AF_INET) { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { - mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; + ib.rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; if (!send_only) { - err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, + err = cma_igmp_send(ndev, &ib.rec.mgid, true); - if (!err) - mc->igmp_joined = true; } } } else { @@ -4083,67 +5033,69 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, err = -ENOTSUPP; } dev_put(ndev); - if (err || !mc->multicast.ib->rec.mtu) { - if (!err) - err = -EINVAL; - goto out2; - } - rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, - &mc->multicast.ib->rec.port_gid); - work->id = id_priv; - work->mc = mc; - INIT_WORK(&work->work, iboe_mcast_work_handler); - kref_get(&mc->mcref); - queue_work(cma_wq, &work->work); + if (err || !ib.rec.mtu) + return err ?: -EINVAL; - return 0; + if (!id_priv->qkey) + cma_set_default_qkey(id_priv); -out2: - kfree(mc->multicast.ib); -out1: - kfree(work); - return err; + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &ib.rec.port_gid); + INIT_WORK(&mc->iboe_join.work, cma_iboe_join_work_handler); + cma_make_mc_event(0, id_priv, &ib, &mc->iboe_join.event, mc); + queue_work(cma_wq, &mc->iboe_join.work); + return 0; } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, u8 join_state, void *context) { - struct rdma_id_private *id_priv; + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); struct cma_multicast *mc; int ret; - id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp(id_priv, RDMA_CM_ADDR_BOUND) && - !cma_comp(id_priv, RDMA_CM_ADDR_RESOLVED)) + /* Not supported for kernel QPs */ + if (WARN_ON(id->qp)) + return -EINVAL; + + /* ULP is calling this wrong. */ + if (!id->device || (READ_ONCE(id_priv->state) != RDMA_CM_ADDR_BOUND && + READ_ONCE(id_priv->state) != RDMA_CM_ADDR_RESOLVED)) + return -EINVAL; + + if (id_priv->id.qp_type != IB_QPT_UD) return -EINVAL; - mc = kmalloc(sizeof *mc, GFP_KERNEL); + mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return -ENOMEM; memcpy(&mc->addr, addr, rdma_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; - mc->igmp_joined = false; mc->join_state = join_state; - spin_lock(&id_priv->lock); - list_add(&mc->list, &id_priv->mc_list); - spin_unlock(&id_priv->lock); if (rdma_protocol_roce(id->device, id->port_num)) { - kref_init(&mc->mcref); ret = cma_iboe_join_multicast(id_priv, mc); - } else if (rdma_cap_ib_mcast(id->device, id->port_num)) + if (ret) + goto out_err; + } else if (rdma_cap_ib_mcast(id->device, id->port_num)) { ret = cma_join_ib_multicast(id_priv, mc); - else + if (ret) + goto out_err; + } else { ret = -ENOSYS; - - if (ret) { - spin_lock_irq(&id_priv->lock); - list_del(&mc->list); - spin_unlock_irq(&id_priv->lock); - kfree(mc); + goto out_err; } + + spin_lock(&id_priv->lock); + list_add(&mc->list, &id_priv->mc_list); + spin_unlock(&id_priv->lock); + + return 0; +out_err: + kfree(mc); return ret; } EXPORT_SYMBOL(rdma_join_multicast); @@ -4156,41 +5108,14 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irq(&id_priv->lock); list_for_each_entry(mc, &id_priv->mc_list, list) { - if (!memcmp(&mc->addr, addr, rdma_addr_size(addr))) { - list_del(&mc->list); - spin_unlock_irq(&id_priv->lock); - - if (id->qp) - ib_detach_mcast(id->qp, - &mc->multicast.ib->rec.mgid, - be16_to_cpu(mc->multicast.ib->rec.mlid)); - - BUG_ON(id_priv->cma_dev->device != id->device); - - if (rdma_cap_ib_mcast(id->device, id->port_num)) { - ib_sa_free_multicast(mc->multicast.ib); - kfree(mc); - } else if (rdma_protocol_roce(id->device, id->port_num)) { - if (mc->igmp_joined) { - struct rdma_dev_addr *dev_addr = - &id->route.addr.dev_addr; - struct net_device *ndev = NULL; - - if (dev_addr->bound_dev_if) - ndev = dev_get_by_index(&init_net, - dev_addr->bound_dev_if); - if (ndev) { - cma_igmp_send(ndev, - &mc->multicast.ib->rec.mgid, - false); - dev_put(ndev); - } - mc->igmp_joined = false; - } - kref_put(&mc->mcref, release_mc); - } - return; - } + if (memcmp(&mc->addr, addr, rdma_addr_size(addr)) != 0) + continue; + list_del(&mc->list); + spin_unlock_irq(&id_priv->lock); + + WARN_ON(id_priv->cma_dev->device != id->device); + destroy_mc(id_priv, mc); + return; } spin_unlock_irq(&id_priv->lock); } @@ -4199,7 +5124,7 @@ EXPORT_SYMBOL(rdma_leave_multicast); static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr; - struct cma_ndev_work *work; + struct cma_work *work; dev_addr = &id_priv->id.route.addr.dev_addr; @@ -4212,10 +5137,10 @@ static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id if (!work) return -ENOMEM; - INIT_WORK(&work->work, cma_ndev_work_handler); + INIT_WORK(&work->work, cma_work_handler); work->id = id_priv; work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; - atomic_inc(&id_priv->refcount); + cma_id_get(id_priv); queue_work(cma_wq, &work->work); } @@ -4233,12 +5158,12 @@ static int cma_netdev_callback(struct notifier_block *self, unsigned long event, if (event != NETDEV_BONDING_FAILOVER) return NOTIFY_DONE; - if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING)) + if (!netif_is_bond_master(ndev)) return NOTIFY_DONE; mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) - list_for_each_entry(id_priv, &cma_dev->id_list, list) { + list_for_each_entry(id_priv, &cma_dev->id_list, device_item) { ret = cma_netdev_change(ndev, id_priv); if (ret) goto out; @@ -4249,124 +5174,242 @@ out: return ret; } +static void cma_netevent_work_handler(struct work_struct *_work) +{ + struct rdma_id_private *id_priv = + container_of(_work, struct rdma_id_private, id.net_work); + struct rdma_cm_event event = {}; + + mutex_lock(&id_priv->handler_mutex); + + if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || + READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) + goto out_unlock; + + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = -ETIMEDOUT; + + if (cma_cm_event_handler(id_priv, &event)) { + __acquire(&id_priv->handler_mutex); + id_priv->cm_id.ib = NULL; + cma_id_put(id_priv); + destroy_id_handler_unlock(id_priv); + return; + } + +out_unlock: + mutex_unlock(&id_priv->handler_mutex); + cma_id_put(id_priv); +} + +static int cma_netevent_callback(struct notifier_block *self, + unsigned long event, void *ctx) +{ + struct id_table_entry *ips_node = NULL; + struct rdma_id_private *current_id; + struct neighbour *neigh = ctx; + unsigned long flags; + + if (event != NETEVENT_NEIGH_UPDATE) + return NOTIFY_DONE; + + spin_lock_irqsave(&id_table_lock, flags); + if (neigh->tbl->family == AF_INET6) { + struct sockaddr_in6 neigh_sock_6; + + neigh_sock_6.sin6_family = AF_INET6; + neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key; + ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, + (struct sockaddr *)&neigh_sock_6); + } else if (neigh->tbl->family == AF_INET) { + struct sockaddr_in neigh_sock_4; + + neigh_sock_4.sin_family = AF_INET; + neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key); + ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, + (struct sockaddr *)&neigh_sock_4); + } else + goto out; + + if (!ips_node) + goto out; + + list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) { + if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr, + neigh->ha, ETH_ALEN)) + continue; + cma_id_get(current_id); + if (!queue_work(cma_wq, ¤t_id->id.net_work)) + cma_id_put(current_id); + } +out: + spin_unlock_irqrestore(&id_table_lock, flags); + return NOTIFY_DONE; +} + static struct notifier_block cma_nb = { .notifier_call = cma_netdev_callback }; -static void cma_add_one(struct ib_device *device) +static struct notifier_block cma_netevent_cb = { + .notifier_call = cma_netevent_callback +}; + +static void cma_send_device_removal_put(struct rdma_id_private *id_priv) +{ + struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL }; + enum rdma_cm_state state; + unsigned long flags; + + mutex_lock(&id_priv->handler_mutex); + /* Record that we want to remove the device */ + spin_lock_irqsave(&id_priv->lock, flags); + state = id_priv->state; + if (state == RDMA_CM_DESTROYING || state == RDMA_CM_DEVICE_REMOVAL) { + spin_unlock_irqrestore(&id_priv->lock, flags); + mutex_unlock(&id_priv->handler_mutex); + cma_id_put(id_priv); + return; + } + id_priv->state = RDMA_CM_DEVICE_REMOVAL; + spin_unlock_irqrestore(&id_priv->lock, flags); + + if (cma_cm_event_handler(id_priv, &event)) { + /* + * At this point the ULP promises it won't call + * rdma_destroy_id() concurrently + */ + cma_id_put(id_priv); + mutex_unlock(&id_priv->handler_mutex); + trace_cm_id_destroy(id_priv); + _destroy_id(id_priv, state); + return; + } + mutex_unlock(&id_priv->handler_mutex); + + /* + * If this races with destroy then the thread that first assigns state + * to a destroying does the cancel. + */ + cma_cancel_operation(id_priv, state); + cma_id_put(id_priv); +} + +static void cma_process_remove(struct cma_device *cma_dev) +{ + mutex_lock(&lock); + while (!list_empty(&cma_dev->id_list)) { + struct rdma_id_private *id_priv = list_first_entry( + &cma_dev->id_list, struct rdma_id_private, device_item); + + list_del_init(&id_priv->listen_item); + list_del_init(&id_priv->device_item); + cma_id_get(id_priv); + mutex_unlock(&lock); + + cma_send_device_removal_put(id_priv); + + mutex_lock(&lock); + } + mutex_unlock(&lock); + + cma_dev_put(cma_dev); + wait_for_completion(&cma_dev->comp); +} + +static bool cma_supported(struct ib_device *device) { + u32 i; + + rdma_for_each_port(device, i) { + if (rdma_cap_ib_cm(device, i) || rdma_cap_iw_cm(device, i)) + return true; + } + return false; +} + +static int cma_add_one(struct ib_device *device) +{ + struct rdma_id_private *to_destroy; struct cma_device *cma_dev; struct rdma_id_private *id_priv; - unsigned int i; unsigned long supported_gids = 0; + int ret; + u32 i; + + if (!cma_supported(device)) + return -EOPNOTSUPP; - cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL); + cma_dev = kmalloc(sizeof(*cma_dev), GFP_KERNEL); if (!cma_dev) - return; + return -ENOMEM; cma_dev->device = device; cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_gid_type), GFP_KERNEL); - if (!cma_dev->default_gid_type) + if (!cma_dev->default_gid_type) { + ret = -ENOMEM; goto free_cma_dev; + } cma_dev->default_roce_tos = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_roce_tos), GFP_KERNEL); - if (!cma_dev->default_roce_tos) + if (!cma_dev->default_roce_tos) { + ret = -ENOMEM; goto free_gid_type; + } - for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + rdma_for_each_port (device, i) { supported_gids = roce_gid_type_mask_support(device, i); WARN_ON(!supported_gids); - cma_dev->default_gid_type[i - rdma_start_port(device)] = - find_first_bit(&supported_gids, BITS_PER_LONG); + if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE)) + cma_dev->default_gid_type[i - rdma_start_port(device)] = + CMA_PREFERRED_ROCE_GID_TYPE; + else + cma_dev->default_gid_type[i - rdma_start_port(device)] = + find_first_bit(&supported_gids, BITS_PER_LONG); cma_dev->default_roce_tos[i - rdma_start_port(device)] = 0; } init_completion(&cma_dev->comp); - atomic_set(&cma_dev->refcount, 1); + refcount_set(&cma_dev->refcount, 1); INIT_LIST_HEAD(&cma_dev->id_list); ib_set_client_data(device, &cma_client, cma_dev); mutex_lock(&lock); list_add_tail(&cma_dev->list, &dev_list); - list_for_each_entry(id_priv, &listen_any_list, list) - cma_listen_on_dev(id_priv, cma_dev); + list_for_each_entry(id_priv, &listen_any_list, listen_any_item) { + ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); + if (ret) + goto free_listen; + } mutex_unlock(&lock); - return; + trace_cm_add_one(device); + return 0; + +free_listen: + list_del(&cma_dev->list); + mutex_unlock(&lock); + /* cma_process_remove() will delete to_destroy */ + cma_process_remove(cma_dev); + kfree(cma_dev->default_roce_tos); free_gid_type: kfree(cma_dev->default_gid_type); free_cma_dev: kfree(cma_dev); - - return; -} - -static int cma_remove_id_dev(struct rdma_id_private *id_priv) -{ - struct rdma_cm_event event; - enum rdma_cm_state state; - int ret = 0; - - /* Record that we want to remove the device */ - state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL); - if (state == RDMA_CM_DESTROYING) - return 0; - - cma_cancel_operation(id_priv, state); - mutex_lock(&id_priv->handler_mutex); - - /* Check for destruction from another callback. */ - if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL)) - goto out; - - memset(&event, 0, sizeof event); - event.event = RDMA_CM_EVENT_DEVICE_REMOVAL; - ret = id_priv->id.event_handler(&id_priv->id, &event); -out: - mutex_unlock(&id_priv->handler_mutex); return ret; } -static void cma_process_remove(struct cma_device *cma_dev) -{ - struct rdma_id_private *id_priv; - int ret; - - mutex_lock(&lock); - while (!list_empty(&cma_dev->id_list)) { - id_priv = list_entry(cma_dev->id_list.next, - struct rdma_id_private, list); - - list_del(&id_priv->listen_list); - list_del_init(&id_priv->list); - atomic_inc(&id_priv->refcount); - mutex_unlock(&lock); - - ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv); - cma_deref_id(id_priv); - if (ret) - rdma_destroy_id(&id_priv->id); - - mutex_lock(&lock); - } - mutex_unlock(&lock); - - cma_deref_dev(cma_dev); - wait_for_completion(&cma_dev->comp); -} - static void cma_remove_one(struct ib_device *device, void *client_data) { struct cma_device *cma_dev = client_data; - if (!cma_dev) - return; + trace_cm_remove_one(device); mutex_lock(&lock); list_del(&cma_dev->list); @@ -4378,93 +5421,14 @@ static void cma_remove_one(struct ib_device *device, void *client_data) kfree(cma_dev); } -static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct nlmsghdr *nlh; - struct rdma_cm_id_stats *id_stats; - struct rdma_id_private *id_priv; - struct rdma_cm_id *id = NULL; - struct cma_device *cma_dev; - int i_dev = 0, i_id = 0; - - /* - * We export all of the IDs as a sequence of messages. Each - * ID gets its own netlink message. - */ - mutex_lock(&lock); - - list_for_each_entry(cma_dev, &dev_list, list) { - if (i_dev < cb->args[0]) { - i_dev++; - continue; - } - - i_id = 0; - list_for_each_entry(id_priv, &cma_dev->id_list, list) { - if (i_id < cb->args[1]) { - i_id++; - continue; - } - - id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq, - sizeof *id_stats, RDMA_NL_RDMA_CM, - RDMA_NL_RDMA_CM_ID_STATS, - NLM_F_MULTI); - if (!id_stats) - goto out; - - memset(id_stats, 0, sizeof *id_stats); - id = &id_priv->id; - id_stats->node_type = id->route.addr.dev_addr.dev_type; - id_stats->port_num = id->port_num; - id_stats->bound_dev_if = - id->route.addr.dev_addr.bound_dev_if; - - if (ibnl_put_attr(skb, nlh, - rdma_addr_size(cma_src_addr(id_priv)), - cma_src_addr(id_priv), - RDMA_NL_RDMA_CM_ATTR_SRC_ADDR)) - goto out; - if (ibnl_put_attr(skb, nlh, - rdma_addr_size(cma_src_addr(id_priv)), - cma_dst_addr(id_priv), - RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) - goto out; - - id_stats->pid = id_priv->owner; - id_stats->port_space = id->ps; - id_stats->cm_state = id_priv->state; - id_stats->qp_num = id_priv->qp_num; - id_stats->qp_type = id->qp_type; - - i_id++; - } - - cb->args[1] = 0; - i_dev++; - } - -out: - mutex_unlock(&lock); - cb->args[0] = i_dev; - cb->args[1] = i_id; - - return skb->len; -} - -static const struct ibnl_client_cbs cma_cb_table[] = { - [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats, - .module = THIS_MODULE }, -}; - static int cma_init_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); - idr_init(&pernet->tcp_ps); - idr_init(&pernet->udp_ps); - idr_init(&pernet->ipoib_ps); - idr_init(&pernet->ib_ps); + xa_init(&pernet->tcp_ps); + xa_init(&pernet->udp_ps); + xa_init(&pernet->ipoib_ps); + xa_init(&pernet->ib_ps); return 0; } @@ -4473,10 +5437,10 @@ static void cma_exit_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); - idr_destroy(&pernet->tcp_ps); - idr_destroy(&pernet->udp_ps); - idr_destroy(&pernet->ipoib_ps); - idr_destroy(&pernet->ib_ps); + WARN_ON(!xa_empty(&pernet->tcp_ps)); + WARN_ON(!xa_empty(&pernet->udp_ps)); + WARN_ON(!xa_empty(&pernet->ipoib_ps)); + WARN_ON(!xa_empty(&pernet->ib_ps)); } static struct pernet_operations cma_pernet_operations = { @@ -4490,6 +5454,19 @@ static int __init cma_init(void) { int ret; + /* + * There is a rare lock ordering dependency in cma_netdev_callback() + * that only happens when bonding is enabled. Teach lockdep that rtnl + * must never be nested under lock so it can find these without having + * to test with bonding. + */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + rtnl_lock(); + mutex_lock(&lock); + mutex_unlock(&lock); + rtnl_unlock(); + } + cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM); if (!cma_wq) return -ENOMEM; @@ -4499,24 +5476,26 @@ static int __init cma_init(void) goto err_wq; ib_sa_register_client(&sa_client); - rdma_addr_register_client(&addr_client); register_netdevice_notifier(&cma_nb); + register_netevent_notifier(&cma_netevent_cb); ret = ib_register_client(&cma_client); if (ret) goto err; - if (ibnl_add_client(RDMA_NL_RDMA_CM, ARRAY_SIZE(cma_cb_table), - cma_cb_table)) - pr_warn("RDMA CMA: failed to add netlink callback\n"); - cma_configfs_init(); + ret = cma_configfs_init(); + if (ret) + goto err_ib; return 0; +err_ib: + ib_unregister_client(&cma_client); err: + unregister_netevent_notifier(&cma_netevent_cb); unregister_netdevice_notifier(&cma_nb); - rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); + unregister_pernet_subsys(&cma_pernet_operations); err_wq: destroy_workqueue(cma_wq); return ret; @@ -4525,10 +5504,9 @@ err_wq: static void __exit cma_cleanup(void) { cma_configfs_exit(); - ibnl_remove_client(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); + unregister_netevent_notifier(&cma_netevent_cb); unregister_netdevice_notifier(&cma_nb); - rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); destroy_workqueue(cma_wq); @@ -4536,3 +5514,129 @@ static void __exit cma_cleanup(void) module_init(cma_init); module_exit(cma_cleanup); + +static void cma_query_ib_service_handler(int status, + struct sa_service_rec *recs, + unsigned int num_recs, void *context) +{ + struct cma_work *work = context; + struct rdma_id_private *id_priv = work->id; + struct sockaddr_ib *addr; + + if (status) + goto fail; + + if (!num_recs) { + status = -ENOENT; + goto fail; + } + + if (id_priv->id.route.service_recs) { + status = -EALREADY; + goto fail; + } + + id_priv->id.route.service_recs = + kmalloc_array(num_recs, sizeof(*recs), GFP_KERNEL); + if (!id_priv->id.route.service_recs) { + status = -ENOMEM; + goto fail; + } + + id_priv->id.route.num_service_recs = num_recs; + memcpy(id_priv->id.route.service_recs, recs, sizeof(*recs) * num_recs); + + addr = (struct sockaddr_ib *)&id_priv->id.route.addr.dst_addr; + addr->sib_family = AF_IB; + addr->sib_addr = *(struct ib_addr *)&recs->gid; + addr->sib_pkey = recs->pkey; + addr->sib_sid = recs->id; + rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, + (union ib_gid *)&addr->sib_addr); + ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, + ntohs(addr->sib_pkey)); + + queue_work(cma_wq, &work->work); + return; + +fail: + work->old_state = RDMA_CM_ADDRINFO_QUERY; + work->new_state = RDMA_CM_ADDR_BOUND; + work->event.event = RDMA_CM_EVENT_ADDRINFO_ERROR; + work->event.status = status; + pr_debug_ratelimited( + "RDMA CM: SERVICE_ERROR: failed to query service record. status %d\n", + status); + queue_work(cma_wq, &work->work); +} + +static int cma_resolve_ib_service(struct rdma_id_private *id_priv, + struct rdma_ucm_ib_service *ibs) +{ + struct sa_service_rec sr = {}; + ib_sa_comp_mask mask = 0; + struct cma_work *work; + + work = kzalloc(sizeof(*work), GFP_KERNEL); + if (!work) + return -ENOMEM; + + cma_id_get(id_priv); + + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ADDRINFO_QUERY; + work->new_state = RDMA_CM_ADDRINFO_RESOLVED; + work->event.event = RDMA_CM_EVENT_ADDRINFO_RESOLVED; + + if (ibs->flags & RDMA_USER_CM_IB_SERVICE_FLAG_ID) { + sr.id = cpu_to_be64(ibs->service_id); + mask |= IB_SA_SERVICE_REC_SERVICE_ID; + } + if (ibs->flags & RDMA_USER_CM_IB_SERVICE_FLAG_NAME) { + strscpy(sr.name, ibs->service_name, sizeof(sr.name)); + mask |= IB_SA_SERVICE_REC_SERVICE_NAME; + } + + id_priv->query_id = ib_sa_service_rec_get(&sa_client, + id_priv->id.device, + id_priv->id.port_num, + &sr, mask, + 2000, GFP_KERNEL, + cma_query_ib_service_handler, + work, &id_priv->query); + + if (id_priv->query_id < 0) { + cma_id_put(id_priv); + kfree(work); + return id_priv->query_id; + } + + return 0; +} + +int rdma_resolve_ib_service(struct rdma_cm_id *id, + struct rdma_ucm_ib_service *ibs) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!id_priv->cma_dev || + !cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDRINFO_QUERY)) + return -EINVAL; + + if (rdma_cap_ib_sa(id->device, id->port_num)) + ret = cma_resolve_ib_service(id_priv, ibs); + else + ret = -EOPNOTSUPP; + + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, RDMA_CM_ADDRINFO_QUERY, RDMA_CM_ADDR_BOUND); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_ib_service); |
