summaryrefslogtreecommitdiff
path: root/drivers/infiniband/core
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-05-09 09:02:46 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2019-05-09 09:02:46 -0700
commitdce45af5c2e9e85f22578f2f8065f225f5d11764 (patch)
treee01e7a294586c3074142fb485516ce718a1a82d2 /drivers/infiniband/core
parent055128ee008b00fba14e3638e7e84fc2cff8d77d (diff)
parentb79656ed44c6865e17bcd93472ec39488bcc4984 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe: "This has been a smaller cycle than normal. One new driver was accepted, which is unusual, and at least one more driver remains in review on the list. Summary: - Driver fixes for hns, hfi1, nes, rxe, i40iw, mlx5, cxgb4, vmw_pvrdma - Many patches from MatthewW converting radix tree and IDR users to use xarray - Introduction of tracepoints to the MAD layer - Build large SGLs at the start for DMA mapping and get the driver to split them - Generally clean SGL handling code throughout the subsystem - Support for restricting RDMA devices to net namespaces for containers - Progress to remove object allocation boilerplate code from drivers - Change in how the mlx5 driver shows representor ports linked to VFs - mlx5 uapi feature to access the on chip SW ICM memory - Add a new driver for 'EFA'. This is HW that supports user space packet processing through QPs in Amazon's cloud" * tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (186 commits) RDMA/ipoib: Allow user space differentiate between valid dev_port IB/core, ipoib: Do not overreact to SM LID change event RDMA/device: Don't fire uevent before device is fully initialized lib/scatterlist: Remove leftover from sg_page_iter comment RDMA/efa: Add driver to Kconfig/Makefile RDMA/efa: Add the efa module RDMA/efa: Add EFA verbs implementation RDMA/efa: Add common command handlers RDMA/efa: Implement functions that submit and complete admin commands RDMA/efa: Add the ABI definitions RDMA/efa: Add the com service API definitions RDMA/efa: Add the efa_com.h file RDMA/efa: Add the efa.h header file RDMA/efa: Add EFA device definitions RDMA: Add EFA related definitions RDMA/umem: Remove hugetlb flag RDMA/bnxt_re: Use core helpers to get aligned DMA address RDMA/i40iw: Use core helpers to get aligned DMA address within a supported page size RDMA/verbs: Add a DMA iterator to return aligned contiguous memory blocks RDMA/umem: Add API to find best driver supported page size in an MR ...
Diffstat (limited to 'drivers/infiniband/core')
-rw-r--r--drivers/infiniband/core/addr.c1
-rw-r--r--drivers/infiniband/core/cache.c145
-rw-r--r--drivers/infiniband/core/cm.c94
-rw-r--r--drivers/infiniband/core/cm_msgs.h22
-rw-r--r--drivers/infiniband/core/cma.c83
-rw-r--r--drivers/infiniband/core/core_priv.h18
-rw-r--r--drivers/infiniband/core/cq.c21
-rw-r--r--drivers/infiniband/core/device.c632
-rw-r--r--drivers/infiniband/core/iwcm.c35
-rw-r--r--drivers/infiniband/core/mad.c87
-rw-r--r--drivers/infiniband/core/mad_priv.h4
-rw-r--r--drivers/infiniband/core/multicast.c1
-rw-r--r--drivers/infiniband/core/nldev.c112
-rw-r--r--drivers/infiniband/core/rdma_core.c200
-rw-r--r--drivers/infiniband/core/rdma_core.h11
-rw-r--r--drivers/infiniband/core/sa_query.c44
-rw-r--r--drivers/infiniband/core/sysfs.c93
-rw-r--r--drivers/infiniband/core/ucm.c35
-rw-r--r--drivers/infiniband/core/umem.c179
-rw-r--r--drivers/infiniband/core/umem_odp.c20
-rw-r--r--drivers/infiniband/core/user_mad.c22
-rw-r--r--drivers/infiniband/core/uverbs.h7
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c99
-rw-r--r--drivers/infiniband/core/uverbs_ioctl.c29
-rw-r--r--drivers/infiniband/core/uverbs_main.c69
-rw-r--r--drivers/infiniband/core/uverbs_std_types.c52
-rw-r--r--drivers/infiniband/core/uverbs_std_types_counters.c6
-rw-r--r--drivers/infiniband/core/uverbs_std_types_cq.c12
-rw-r--r--drivers/infiniband/core/uverbs_std_types_dm.c10
-rw-r--r--drivers/infiniband/core/uverbs_std_types_flow_action.c6
-rw-r--r--drivers/infiniband/core/uverbs_std_types_mr.c9
-rw-r--r--drivers/infiniband/core/verbs.c233
32 files changed, 1679 insertions, 712 deletions
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 744b6ec0acb0..ba01b90c04e7 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -45,6 +45,7 @@
#include <net/ipv6_stubs.h>
#include <net/ip6_route.h>
#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
#include <rdma/ib_sa.h>
#include <rdma/ib.h>
#include <rdma/rdma_netlink.h>
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 43c67e5f43c6..18e476b3ced0 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -78,11 +78,22 @@ enum gid_table_entry_state {
GID_TABLE_ENTRY_PENDING_DEL = 3,
};
+struct roce_gid_ndev_storage {
+ struct rcu_head rcu_head;
+ struct net_device *ndev;
+};
+
struct ib_gid_table_entry {
struct kref kref;
struct work_struct del_work;
struct ib_gid_attr attr;
void *context;
+ /* Store the ndev pointer to release reference later on in
+ * call_rcu context because by that time gid_table_entry
+ * and attr might be already freed. So keep a copy of it.
+ * ndev_storage is freed by rcu callback.
+ */
+ struct roce_gid_ndev_storage *ndev_storage;
enum gid_table_entry_state state;
};
@@ -206,6 +217,20 @@ static void schedule_free_gid(struct kref *kref)
queue_work(ib_wq, &entry->del_work);
}
+static void put_gid_ndev(struct rcu_head *head)
+{
+ struct roce_gid_ndev_storage *storage =
+ container_of(head, struct roce_gid_ndev_storage, rcu_head);
+
+ WARN_ON(!storage->ndev);
+ /* At this point its safe to release netdev reference,
+ * as all callers working on gid_attr->ndev are done
+ * using this netdev.
+ */
+ dev_put(storage->ndev);
+ kfree(storage);
+}
+
static void free_gid_entry_locked(struct ib_gid_table_entry *entry)
{
struct ib_device *device = entry->attr.device;
@@ -228,8 +253,8 @@ static void free_gid_entry_locked(struct ib_gid_table_entry *entry)
/* Now this index is ready to be allocated */
write_unlock_irq(&table->rwlock);
- if (entry->attr.ndev)
- dev_put(entry->attr.ndev);
+ if (entry->ndev_storage)
+ call_rcu(&entry->ndev_storage->rcu_head, put_gid_ndev);
kfree(entry);
}
@@ -266,14 +291,25 @@ static struct ib_gid_table_entry *
alloc_gid_entry(const struct ib_gid_attr *attr)
{
struct ib_gid_table_entry *entry;
+ struct net_device *ndev;
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
return NULL;
+
+ ndev = rcu_dereference_protected(attr->ndev, 1);
+ if (ndev) {
+ entry->ndev_storage = kzalloc(sizeof(*entry->ndev_storage),
+ GFP_KERNEL);
+ if (!entry->ndev_storage) {
+ kfree(entry);
+ return NULL;
+ }
+ dev_hold(ndev);
+ entry->ndev_storage->ndev = ndev;
+ }
kref_init(&entry->kref);
memcpy(&entry->attr, attr, sizeof(*attr));
- if (entry->attr.ndev)
- dev_hold(entry->attr.ndev);
INIT_WORK(&entry->del_work, free_gid_work);
entry->state = GID_TABLE_ENTRY_INVALID;
return entry;
@@ -343,6 +379,7 @@ static int add_roce_gid(struct ib_gid_table_entry *entry)
static void del_gid(struct ib_device *ib_dev, u8 port,
struct ib_gid_table *table, int ix)
{
+ struct roce_gid_ndev_storage *ndev_storage;
struct ib_gid_table_entry *entry;
lockdep_assert_held(&table->lock);
@@ -360,6 +397,13 @@ static void del_gid(struct ib_device *ib_dev, u8 port,
table->data_vec[ix] = NULL;
write_unlock_irq(&table->rwlock);
+ ndev_storage = entry->ndev_storage;
+ if (ndev_storage) {
+ entry->ndev_storage = NULL;
+ rcu_assign_pointer(entry->attr.ndev, NULL);
+ call_rcu(&ndev_storage->rcu_head, put_gid_ndev);
+ }
+
if (rdma_cap_roce_gid_table(ib_dev, port))
ib_dev->ops.del_gid(&entry->attr, &entry->context);
@@ -543,30 +587,11 @@ out_unlock:
int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
union ib_gid *gid, struct ib_gid_attr *attr)
{
- struct net_device *idev;
- unsigned long mask;
- int ret;
-
- idev = ib_device_get_netdev(ib_dev, port);
- if (idev && attr->ndev != idev) {
- union ib_gid default_gid;
-
- /* Adding default GIDs is not permitted */
- make_default_gid(idev, &default_gid);
- if (!memcmp(gid, &default_gid, sizeof(*gid))) {
- dev_put(idev);
- return -EPERM;
- }
- }
- if (idev)
- dev_put(idev);
-
- mask = GID_ATTR_FIND_MASK_GID |
- GID_ATTR_FIND_MASK_GID_TYPE |
- GID_ATTR_FIND_MASK_NETDEV;
+ unsigned long mask = GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_GID_TYPE |
+ GID_ATTR_FIND_MASK_NETDEV;
- ret = __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false);
- return ret;
+ return __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false);
}
static int
@@ -1263,11 +1288,72 @@ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr)
read_lock_irqsave(&table->rwlock, flags);
valid = is_gid_entry_valid(table->data_vec[attr->index]);
- if (valid && attr->ndev && (READ_ONCE(attr->ndev->flags) & IFF_UP))
- ndev = attr->ndev;
+ if (valid) {
+ ndev = rcu_dereference(attr->ndev);
+ if (!ndev ||
+ (ndev && ((READ_ONCE(ndev->flags) & IFF_UP) == 0)))
+ ndev = ERR_PTR(-ENODEV);
+ }
read_unlock_irqrestore(&table->rwlock, flags);
return ndev;
}
+EXPORT_SYMBOL(rdma_read_gid_attr_ndev_rcu);
+
+static int get_lower_dev_vlan(struct net_device *lower_dev, void *data)
+{
+ u16 *vlan_id = data;
+
+ if (is_vlan_dev(lower_dev))
+ *vlan_id = vlan_dev_vlan_id(lower_dev);
+
+ /* We are interested only in first level vlan device, so
+ * always return 1 to stop iterating over next level devices.
+ */
+ return 1;
+}
+
+/**
+ * rdma_read_gid_l2_fields - Read the vlan ID and source MAC address
+ * of a GID entry.
+ *
+ * @attr: GID attribute pointer whose L2 fields to be read
+ * @vlan_id: Pointer to vlan id to fill up if the GID entry has
+ * vlan id. It is optional.
+ * @smac: Pointer to smac to fill up for a GID entry. It is optional.
+ *
+ * rdma_read_gid_l2_fields() returns 0 on success and returns vlan id
+ * (if gid entry has vlan) and source MAC, or returns error.
+ */
+int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr,
+ u16 *vlan_id, u8 *smac)
+{
+ struct net_device *ndev;
+
+ rcu_read_lock();
+ ndev = rcu_dereference(attr->ndev);
+ if (!ndev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+ if (smac)
+ ether_addr_copy(smac, ndev->dev_addr);
+ if (vlan_id) {
+ *vlan_id = 0xffff;
+ if (is_vlan_dev(ndev)) {
+ *vlan_id = vlan_dev_vlan_id(ndev);
+ } else {
+ /* If the netdev is upper device and if it's lower
+ * device is vlan device, consider vlan id of the
+ * the lower vlan device for this gid entry.
+ */
+ netdev_walk_all_lower_dev_rcu(attr->ndev,
+ get_lower_dev_vlan, vlan_id);
+ }
+ }
+ rcu_read_unlock();
+ return 0;
+}
+EXPORT_SYMBOL(rdma_read_gid_l2_fields);
static int config_non_roce_gid_cache(struct ib_device *device,
u8 port, int gid_tbl_len)
@@ -1392,7 +1478,6 @@ static void ib_cache_event(struct ib_event_handler *handler,
event->event == IB_EVENT_PORT_ACTIVE ||
event->event == IB_EVENT_LID_CHANGE ||
event->event == IB_EVENT_PKEY_CHANGE ||
- event->event == IB_EVENT_SM_CHANGE ||
event->event == IB_EVENT_CLIENT_REREGISTER ||
event->event == IB_EVENT_GID_CHANGE) {
work = kmalloc(sizeof *work, GFP_ATOMIC);
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index b9416a6fca36..da10e6ccb43c 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -52,6 +52,7 @@
#include <rdma/ib_cache.h>
#include <rdma/ib_cm.h>
#include "cm_msgs.h"
+#include "core_priv.h"
MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("InfiniBand CM");
@@ -124,7 +125,8 @@ static struct ib_cm {
struct rb_root remote_qp_table;
struct rb_root remote_id_table;
struct rb_root remote_sidr_table;
- struct idr local_id_table;
+ struct xarray local_id_table;
+ u32 local_id_next;
__be32 random_id_operand;
struct list_head timewait_list;
struct workqueue_struct *wq;
@@ -219,7 +221,6 @@ struct cm_port {
struct cm_device {
struct list_head list;
struct ib_device *ib_device;
- struct device *device;
u8 ack_delay;
int going_down;
struct cm_port *port[0];
@@ -598,35 +599,31 @@ static int cm_init_av_by_path(struct sa_path_rec *path,
static int cm_alloc_id(struct cm_id_private *cm_id_priv)
{
- unsigned long flags;
- int id;
-
- idr_preload(GFP_KERNEL);
- spin_lock_irqsave(&cm.lock, flags);
+ int err;
+ u32 id;
- id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT);
-
- spin_unlock_irqrestore(&cm.lock, flags);
- idr_preload_end();
+ err = xa_alloc_cyclic_irq(&cm.local_id_table, &id, cm_id_priv,
+ xa_limit_32b, &cm.local_id_next, GFP_KERNEL);
cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand;
- return id < 0 ? id : 0;
+ return err;
+}
+
+static u32 cm_local_id(__be32 local_id)
+{
+ return (__force u32) (local_id ^ cm.random_id_operand);
}
static void cm_free_id(__be32 local_id)
{
- spin_lock_irq(&cm.lock);
- idr_remove(&cm.local_id_table,
- (__force int) (local_id ^ cm.random_id_operand));
- spin_unlock_irq(&cm.lock);
+ xa_erase_irq(&cm.local_id_table, cm_local_id(local_id));
}
static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id)
{
struct cm_id_private *cm_id_priv;
- cm_id_priv = idr_find(&cm.local_id_table,
- (__force int) (local_id ^ cm.random_id_operand));
+ cm_id_priv = xa_load(&cm.local_id_table, cm_local_id(local_id));
if (cm_id_priv) {
if (cm_id_priv->id.remote_id == remote_id)
atomic_inc(&cm_id_priv->refcount);
@@ -1988,11 +1985,12 @@ static int cm_req_handler(struct cm_work *work)
grh = rdma_ah_read_grh(&cm_id_priv->av.ah_attr);
gid_attr = grh->sgid_attr;
- if (gid_attr && gid_attr->ndev) {
+ if (gid_attr &&
+ rdma_protocol_roce(work->port->cm_dev->ib_device,
+ work->port->port_num)) {
work->path[0].rec_type =
sa_conv_gid_to_pathrec_type(gid_attr->gid_type);
} else {
- /* If no GID attribute or ndev is null, it is not RoCE. */
cm_path_set_rec_type(work->port->cm_dev->ib_device,
work->port->port_num,
&work->path[0],
@@ -2824,9 +2822,8 @@ static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg)
spin_unlock_irq(&cm.lock);
return NULL;
}
- cm_id_priv = idr_find(&cm.local_id_table, (__force int)
- (timewait_info->work.local_id ^
- cm.random_id_operand));
+ cm_id_priv = xa_load(&cm.local_id_table,
+ cm_local_id(timewait_info->work.local_id));
if (cm_id_priv) {
if (cm_id_priv->id.remote_id == remote_id)
atomic_inc(&cm_id_priv->refcount);
@@ -4276,18 +4273,6 @@ static struct kobj_type cm_counter_obj_type = {
.default_attrs = cm_counter_default_attrs
};
-static void cm_release_port_obj(struct kobject *obj)
-{
- struct cm_port *cm_port;
-
- cm_port = container_of(obj, struct cm_port, port_obj);
- kfree(cm_port);
-}
-
-static struct kobj_type cm_port_obj_type = {
- .release = cm_release_port_obj
-};
-
static char *cm_devnode(struct device *dev, umode_t *mode)
{
if (mode)
@@ -4306,19 +4291,12 @@ static int cm_create_port_fs(struct cm_port *port)
{
int i, ret;
- ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type,
- &port->cm_dev->device->kobj,
- "%d", port->port_num);
- if (ret) {
- kfree(port);
- return ret;
- }
-
for (i = 0; i < CM_COUNTER_GROUPS; i++) {
- ret = kobject_init_and_add(&port->counter_group[i].obj,
- &cm_counter_obj_type,
- &port->port_obj,
- "%s", counter_group_names[i]);
+ ret = ib_port_register_module_stat(port->cm_dev->ib_device,
+ port->port_num,
+ &port->counter_group[i].obj,
+ &cm_counter_obj_type,
+ counter_group_names[i]);
if (ret)
goto error;
}
@@ -4327,8 +4305,7 @@ static int cm_create_port_fs(struct cm_port *port)
error:
while (i--)
- kobject_put(&port->counter_group[i].obj);
- kobject_put(&port->port_obj);
+ ib_port_unregister_module_stat(&port->counter_group[i].obj);
return ret;
}
@@ -4338,9 +4315,8 @@ static void cm_remove_port_fs(struct cm_port *port)
int i;
for (i = 0; i < CM_COUNTER_GROUPS; i++)
- kobject_put(&port->counter_group[i].obj);
+ ib_port_unregister_module_stat(&port->counter_group[i].obj);
- kobject_put(&port->port_obj);
}
static void cm_add_one(struct ib_device *ib_device)
@@ -4367,13 +4343,6 @@ static void cm_add_one(struct ib_device *ib_device)
cm_dev->ib_device = ib_device;
cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
cm_dev->going_down = 0;
- cm_dev->device = device_create(&cm_class, &ib_device->dev,
- MKDEV(0, 0), NULL,
- "%s", dev_name(&ib_device->dev));
- if (IS_ERR(cm_dev->device)) {
- kfree(cm_dev);
- return;
- }
set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
for (i = 1; i <= ib_device->phys_port_cnt; i++) {
@@ -4440,7 +4409,6 @@ error1:
cm_remove_port_fs(port);
}
free:
- device_unregister(cm_dev->device);
kfree(cm_dev);
}
@@ -4494,7 +4462,6 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
cm_remove_port_fs(port);
}
- device_unregister(cm_dev->device);
kfree(cm_dev);
}
@@ -4502,7 +4469,6 @@ static int __init ib_cm_init(void)
{
int ret;
- memset(&cm, 0, sizeof cm);
INIT_LIST_HEAD(&cm.device_list);
rwlock_init(&cm.device_lock);
spin_lock_init(&cm.lock);
@@ -4512,7 +4478,7 @@ static int __init ib_cm_init(void)
cm.remote_id_table = RB_ROOT;
cm.remote_qp_table = RB_ROOT;
cm.remote_sidr_table = RB_ROOT;
- idr_init(&cm.local_id_table);
+ xa_init_flags(&cm.local_id_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
INIT_LIST_HEAD(&cm.timewait_list);
@@ -4538,7 +4504,6 @@ error3:
error2:
class_unregister(&cm_class);
error1:
- idr_destroy(&cm.local_id_table);
return ret;
}
@@ -4560,9 +4525,8 @@ static void __exit ib_cm_cleanup(void)
}
class_unregister(&cm_class);
- idr_destroy(&cm.local_id_table);
+ WARN_ON(!xa_empty(&cm.local_id_table));
}
module_init(ib_cm_init);
module_exit(ib_cm_cleanup);
-
diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h
index 476d4309576d..3d16d614aff6 100644
--- a/drivers/infiniband/core/cm_msgs.h
+++ b/drivers/infiniband/core/cm_msgs.h
@@ -98,7 +98,7 @@ struct cm_req_msg {
u32 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE / sizeof(u32)];
-} __attribute__ ((packed));
+} __packed;
static inline __be32 cm_req_get_local_qpn(struct cm_req_msg *req_msg)
{
@@ -423,7 +423,7 @@ enum cm_msg_response {
u8 private_data[IB_CM_MRA_PRIVATE_DATA_SIZE];
-} __attribute__ ((packed));
+} __packed;
static inline u8 cm_mra_get_msg_mraed(struct cm_mra_msg *mra_msg)
{
@@ -461,7 +461,7 @@ struct cm_rej_msg {
u8 private_data[IB_CM_REJ_PRIVATE_DATA_SIZE];
-} __attribute__ ((packed));
+} __packed;
static inline u8 cm_rej_get_msg_rejected(struct cm_rej_msg *rej_msg)
{
@@ -506,7 +506,7 @@ struct cm_rep_msg {
u8 private_data[IB_CM_REP_PRIVATE_DATA_SIZE];
-} __attribute__ ((packed));
+} __packed;
static inline __be32 cm_rep_get_local_qpn(struct cm_rep_msg *rep_msg)
{
@@ -614,7 +614,7 @@ struct cm_rtu_msg {
u8 private_data[IB_CM_RTU_PRIVATE_DATA_SIZE];
-} __attribute__ ((packed));
+} __packed;
struct cm_dreq_msg {
struct ib_mad_hdr hdr;
@@ -626,7 +626,7 @@ struct cm_dreq_msg {
u8 private_data[IB_CM_DREQ_PRIVATE_DATA_SIZE];
-} __attribute__ ((packed));
+} __packed;
static inline __be32 cm_dreq_get_remote_qpn(struct cm_dreq_msg *dreq_msg)
{
@@ -647,7 +647,7 @@ struct cm_drep_msg {
u8 private_data[IB_CM_DREP_PRIVATE_DATA_SIZE];
-} __attribute__ ((packed));
+} __packed;
struct cm_lap_msg {
struct ib_mad_hdr hdr;
@@ -675,7 +675,7 @@ struct cm_lap_msg {
u8 offset63;
u8 private_data[IB_CM_LAP_PRIVATE_DATA_SIZE];
-} __attribute__ ((packed));
+} __packed;
static inline __be32 cm_lap_get_remote_qpn(struct cm_lap_msg *lap_msg)
{
@@ -784,7 +784,7 @@ struct cm_apr_msg {
u8 info[IB_CM_APR_INFO_LENGTH];
u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE];
-} __attribute__ ((packed));
+} __packed;
struct cm_sidr_req_msg {
struct ib_mad_hdr hdr;
@@ -795,7 +795,7 @@ struct cm_sidr_req_msg {
__be64 service_id;
u32 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE / sizeof(u32)];
-} __attribute__ ((packed));
+} __packed;
struct cm_sidr_rep_msg {
struct ib_mad_hdr hdr;
@@ -811,7 +811,7 @@ struct cm_sidr_rep_msg {
u8 info[IB_CM_SIDR_REP_INFO_LENGTH];
u8 private_data[IB_CM_SIDR_REP_PRIVATE_DATA_SIZE];
-} __attribute__ ((packed));
+} __packed;
static inline __be32 cm_sidr_rep_get_qpn(struct cm_sidr_rep_msg *sidr_rep_msg)
{
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 68c997be2429..19f1730a4f24 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -39,7 +39,7 @@
#include <linux/mutex.h>
#include <linux/random.h>
#include <linux/igmp.h>
-#include <linux/idr.h>
+#include <linux/xarray.h>
#include <linux/inetdevice.h>
#include <linux/slab.h>
#include <linux/module.h>
@@ -191,10 +191,10 @@ static struct workqueue_struct *cma_wq;
static unsigned int cma_pernet_id;
struct cma_pernet {
- struct idr tcp_ps;
- struct idr udp_ps;
- struct idr ipoib_ps;
- struct idr ib_ps;
+ struct xarray tcp_ps;
+ struct xarray udp_ps;
+ struct xarray ipoib_ps;
+ struct xarray ib_ps;
};
static struct cma_pernet *cma_pernet(struct net *net)
@@ -202,7 +202,8 @@ static struct cma_pernet *cma_pernet(struct net *net)
return net_generic(net, cma_pernet_id);
}
-static struct idr *cma_pernet_idr(struct net *net, enum rdma_ucm_port_space ps)
+static
+struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps)
{
struct cma_pernet *pernet = cma_pernet(net);
@@ -247,25 +248,25 @@ struct class_port_info_context {
static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps,
struct rdma_bind_list *bind_list, int snum)
{
- struct idr *idr = cma_pernet_idr(net, ps);
+ struct xarray *xa = cma_pernet_xa(net, ps);
- return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL);
+ return xa_insert(xa, snum, bind_list, GFP_KERNEL);
}
static struct rdma_bind_list *cma_ps_find(struct net *net,
enum rdma_ucm_port_space ps, int snum)
{
- struct idr *idr = cma_pernet_idr(net, ps);
+ struct xarray *xa = cma_pernet_xa(net, ps);
- return idr_find(idr, snum);
+ return xa_load(xa, snum);
}
static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps,
int snum)
{
- struct idr *idr = cma_pernet_idr(net, ps);
+ struct xarray *xa = cma_pernet_xa(net, ps);
- idr_remove(idr, snum);
+ xa_erase(xa, snum);
}
enum {
@@ -615,6 +616,9 @@ cma_validate_port(struct ib_device *device, u8 port,
int dev_type = dev_addr->dev_type;
struct net_device *ndev = NULL;
+ if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net))
+ return ERR_PTR(-ENODEV);
+
if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port))
return ERR_PTR(-ENODEV);
@@ -1173,18 +1177,31 @@ static inline bool cma_any_addr(const struct sockaddr *addr)
return cma_zero_addr(addr) || cma_loopback_addr(addr);
}
-static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst)
+static int cma_addr_cmp(const struct sockaddr *src, const struct sockaddr *dst)
{
if (src->sa_family != dst->sa_family)
return -1;
switch (src->sa_family) {
case AF_INET:
- return ((struct sockaddr_in *) src)->sin_addr.s_addr !=
- ((struct sockaddr_in *) dst)->sin_addr.s_addr;
- case AF_INET6:
- return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr,
- &((struct sockaddr_in6 *) dst)->sin6_addr);
+ return ((struct sockaddr_in *)src)->sin_addr.s_addr !=
+ ((struct sockaddr_in *)dst)->sin_addr.s_addr;
+ case AF_INET6: {
+ struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)src;
+ struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst;
+ bool link_local;
+
+ if (ipv6_addr_cmp(&src_addr6->sin6_addr,
+ &dst_addr6->sin6_addr))
+ return 1;
+ link_local = ipv6_addr_type(&dst_addr6->sin6_addr) &
+ IPV6_ADDR_LINKLOCAL;
+ /* Link local must match their scope_ids */
+ return link_local ? (src_addr6->sin6_scope_id !=
+ dst_addr6->sin6_scope_id) :
+ 0;
+ }
+
default:
return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr,
&((struct sockaddr_ib *) dst)->sib_addr);
@@ -1469,6 +1486,7 @@ static struct net_device *
roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event)
{
const struct ib_gid_attr *sgid_attr = NULL;
+ struct net_device *ndev;
if (ib_event->event == IB_CM_REQ_RECEIVED)
sgid_attr = ib_event->param.req_rcvd.ppath_sgid_attr;
@@ -1477,8 +1495,15 @@ roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event)
if (!sgid_attr)
return NULL;
- dev_hold(sgid_attr->ndev);
- return sgid_attr->ndev;
+
+ rcu_read_lock();
+ ndev = rdma_read_gid_attr_ndev_rcu(sgid_attr);
+ if (IS_ERR(ndev))
+ ndev = NULL;
+ else
+ dev_hold(ndev);
+ rcu_read_unlock();
+ return ndev;
}
static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event,
@@ -3247,7 +3272,7 @@ static int cma_alloc_port(enum rdma_ucm_port_space ps,
goto err;
bind_list->ps = ps;
- bind_list->port = (unsigned short)ret;
+ bind_list->port = snum;
cma_bind_port(bind_list, id_priv);
return 0;
err:
@@ -4655,10 +4680,10 @@ static int cma_init_net(struct net *net)
{
struct cma_pernet *pernet = cma_pernet(net);
- idr_init(&pernet->tcp_ps);
- idr_init(&pernet->udp_ps);
- idr_init(&pernet->ipoib_ps);
- idr_init(&pernet->ib_ps);
+ xa_init(&pernet->tcp_ps);
+ xa_init(&pernet->udp_ps);
+ xa_init(&pernet->ipoib_ps);
+ xa_init(&pernet->ib_ps);
return 0;
}
@@ -4667,10 +4692,10 @@ static void cma_exit_net(struct net *net)
{
struct cma_pernet *pernet = cma_pernet(net);
- idr_destroy(&pernet->tcp_ps);
- idr_destroy(&pernet->udp_ps);
- idr_destroy(&pernet->ipoib_ps);
- idr_destroy(&pernet->ib_ps);
+ WARN_ON(!xa_empty(&pernet->tcp_ps));
+ WARN_ON(!xa_empty(&pernet->udp_ps));
+ WARN_ON(!xa_empty(&pernet->ipoib_ps));
+ WARN_ON(!xa_empty(&pernet->ib_ps));
}
static struct pernet_operations cma_pernet_operations = {
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 08c690249594..ff40a450b5d2 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -55,6 +55,7 @@ struct pkey_index_qp_list {
};
extern const struct attribute_group ib_dev_attr_group;
+extern bool ib_devices_shared_netns;
int ib_device_register_sysfs(struct ib_device *device);
void ib_device_unregister_sysfs(struct ib_device *device);
@@ -279,7 +280,8 @@ static inline void ib_mad_agent_security_change(void)
}
#endif
-struct ib_device *ib_device_get_by_index(u32 ifindex);
+struct ib_device *ib_device_get_by_index(const struct net *net, u32 index);
+
/* RDMA device netlink */
void nldev_init(void);
void nldev_exit(void);
@@ -302,6 +304,7 @@ static inline struct ib_qp *_ib_create_qp(struct ib_device *dev,
qp->device = dev;
qp->pd = pd;
qp->uobject = uobj;
+ qp->real_qp = qp;
/*
* We don't track XRC QPs for now, because they don't have PD
* and more importantly they are created internaly by driver,
@@ -336,4 +339,17 @@ int roce_resolve_route_from_path(struct sa_path_rec *rec,
const struct ib_gid_attr *attr);
struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr);
+
+void ib_free_port_attrs(struct ib_core_device *coredev);
+int ib_setup_port_attrs(struct ib_core_device *coredev);
+
+int rdma_compatdev_set(u8 enable);
+
+int ib_port_register_module_stat(struct ib_device *device, u8 port_num,
+ struct kobject *kobj, struct kobj_type *ktype,
+ const char *name);
+void ib_port_unregister_module_stat(struct kobject *kobj);
+
+int ib_device_set_netns_put(struct sk_buff *skb,
+ struct ib_device *dev, u32 ns_fd);
#endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index d61e5e1427c2..a4c81992267c 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -128,15 +128,17 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
* @comp_vector: HCA completion vectors for this CQ
* @poll_ctx: context to poll the CQ from.
* @caller: module owner name.
+ * @udata: Valid user data or NULL for kernel object
*
* This is the proper interface to allocate a CQ for in-kernel users. A
* CQ allocated with this interface will automatically be polled from the
* specified context. The ULP must use wr->wr_cqe instead of wr->wr_id
* to use this CQ abstraction.
*/
-struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private,
- int nr_cqe, int comp_vector,
- enum ib_poll_context poll_ctx, const char *caller)
+struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
+ int nr_cqe, int comp_vector,
+ enum ib_poll_context poll_ctx,
+ const char *caller, struct ib_udata *udata)
{
struct ib_cq_init_attr cq_attr = {
.cqe = nr_cqe,
@@ -145,7 +147,7 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private,
struct ib_cq *cq;
int ret = -ENOMEM;
- cq = dev->ops.create_cq(dev, &cq_attr, NULL, NULL);
+ cq = dev->ops.create_cq(dev, &cq_attr, NULL);
if (IS_ERR(cq))
return cq;
@@ -193,16 +195,17 @@ out_free_wc:
kfree(cq->wc);
rdma_restrack_del(&cq->res);
out_destroy_cq:
- cq->device->ops.destroy_cq(cq);
+ cq->device->ops.destroy_cq(cq, udata);
return ERR_PTR(ret);
}
-EXPORT_SYMBOL(__ib_alloc_cq);
+EXPORT_SYMBOL(__ib_alloc_cq_user);
/**
* ib_free_cq - free a completion queue
* @cq: completion queue to free.
+ * @udata: User data or NULL for kernel object
*/
-void ib_free_cq(struct ib_cq *cq)
+void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
{
int ret;
@@ -225,7 +228,7 @@ void ib_free_cq(struct ib_cq *cq)
kfree(cq->wc);
rdma_restrack_del(&cq->res);
- ret = cq->device->ops.destroy_cq(cq);
+ ret = cq->device->ops.destroy_cq(cq, udata);
WARN_ON_ONCE(ret);
}
-EXPORT_SYMBOL(ib_free_cq);
+EXPORT_SYMBOL(ib_free_cq_user);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 7421ec4883fb..78dc07c6ac4b 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -38,6 +38,8 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/netdevice.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/hashtable.h>
@@ -101,6 +103,54 @@ static DECLARE_RWSEM(clients_rwsem);
* be registered.
*/
#define CLIENT_DATA_REGISTERED XA_MARK_1
+
+/**
+ * struct rdma_dev_net - rdma net namespace metadata for a net
+ * @net: Pointer to owner net namespace
+ * @id: xarray id to identify the net namespace.
+ */
+struct rdma_dev_net {
+ possible_net_t net;
+ u32 id;
+};
+
+static unsigned int rdma_dev_net_id;
+
+/*
+ * A list of net namespaces is maintained in an xarray. This is necessary
+ * because we can't get the locking right using the existing net ns list. We
+ * would require a init_net callback after the list is updated.
+ */
+static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
+/*
+ * rwsem to protect accessing the rdma_nets xarray entries.
+ */
+static DECLARE_RWSEM(rdma_nets_rwsem);
+
+bool ib_devices_shared_netns = true;
+module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
+MODULE_PARM_DESC(netns_mode,
+ "Share device among net namespaces; default=1 (shared)");
+/**
+ * rdma_dev_access_netns() - Return whether a rdma device can be accessed
+ * from a specified net namespace or not.
+ * @device: Pointer to rdma device which needs to be checked
+ * @net: Pointer to net namesapce for which access to be checked
+ *
+ * rdma_dev_access_netns() - Return whether a rdma device can be accessed
+ * from a specified net namespace or not. When
+ * rdma device is in shared mode, it ignores the
+ * net namespace. When rdma device is exclusive
+ * to a net namespace, rdma device net namespace is
+ * checked against the specified one.
+ */
+bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
+{
+ return (ib_devices_shared_netns ||
+ net_eq(read_pnet(&dev->coredev.rdma_net), net));
+}
+EXPORT_SYMBOL(rdma_dev_access_netns);
+
/*
* xarray has this behavior where it won't iterate over NULL values stored in
* allocated arrays. So we need our own iterator to see all values stored in
@@ -147,10 +197,73 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event,
static void ib_policy_change_task(struct work_struct *work);
static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task);
+static void __ibdev_printk(const char *level, const struct ib_device *ibdev,
+ struct va_format *vaf)
+{
+ if (ibdev && ibdev->dev.parent)
+ dev_printk_emit(level[1] - '0',
+ ibdev->dev.parent,
+ "%s %s %s: %pV",
+ dev_driver_string(ibdev->dev.parent),
+ dev_name(ibdev->dev.parent),
+ dev_name(&ibdev->dev),
+ vaf);
+ else if (ibdev)
+ printk("%s%s: %pV",
+ level, dev_name(&ibdev->dev), vaf);
+ else
+ printk("%s(NULL ib_device): %pV", level, vaf);
+}
+
+void ibdev_printk(const char *level, const struct ib_device *ibdev,
+ const char *format, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, format);
+
+ vaf.fmt = format;
+ vaf.va = &args;
+
+ __ibdev_printk(level, ibdev, &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL(ibdev_printk);
+
+#define define_ibdev_printk_level(func, level) \
+void func(const struct ib_device *ibdev, const char *fmt, ...) \
+{ \
+ struct va_format vaf; \
+ va_list args; \
+ \
+ va_start(args, fmt); \
+ \
+ vaf.fmt = fmt; \
+ vaf.va = &args; \
+ \
+ __ibdev_printk(level, ibdev, &vaf); \
+ \
+ va_end(args); \
+} \
+EXPORT_SYMBOL(func);
+
+define_ibdev_printk_level(ibdev_emerg, KERN_EMERG);
+define_ibdev_printk_level(ibdev_alert, KERN_ALERT);
+define_ibdev_printk_level(ibdev_crit, KERN_CRIT);
+define_ibdev_printk_level(ibdev_err, KERN_ERR);
+define_ibdev_printk_level(ibdev_warn, KERN_WARNING);
+define_ibdev_printk_level(ibdev_notice, KERN_NOTICE);
+define_ibdev_printk_level(ibdev_info, KERN_INFO);
+
static struct notifier_block ibdev_lsm_nb = {
.notifier_call = ib_security_change,
};
+static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
+ struct net *net);
+
/* Pointer to the RCU head at the start of the ib_port_data array */
struct ib_port_data_rcu {
struct rcu_head rcu_head;
@@ -200,16 +313,22 @@ static int ib_device_check_mandatory(struct ib_device *device)
* Caller must perform ib_device_put() to return the device reference count
* when ib_device_get_by_index() returns valid device pointer.
*/
-struct ib_device *ib_device_get_by_index(u32 index)
+struct ib_device *ib_device_get_by_index(const struct net *net, u32 index)
{
struct ib_device *device;
down_read(&devices_rwsem);
device = xa_load(&devices, index);
if (device) {
+ if (!rdma_dev_access_netns(device, net)) {
+ device = NULL;
+ goto out;
+ }
+
if (!ib_device_try_get(device))
device = NULL;
}
+out:
up_read(&devices_rwsem);
return device;
}
@@ -268,6 +387,26 @@ struct ib_device *ib_device_get_by_name(const char *name,
}
EXPORT_SYMBOL(ib_device_get_by_name);
+static int rename_compat_devs(struct ib_device *device)
+{
+ struct ib_core_device *cdev;
+ unsigned long index;
+ int ret = 0;
+
+ mutex_lock(&device->compat_devs_mutex);
+ xa_for_each (&device->compat_devs, index, cdev) {
+ ret = device_rename(&cdev->dev, dev_name(&device->dev));
+ if (ret) {
+ dev_warn(&cdev->dev,
+ "Fail to rename compatdev to new name %s\n",
+ dev_name(&device->dev));
+ break;
+ }
+ }
+ mutex_unlock(&device->compat_devs_mutex);
+ return ret;
+}
+
int ib_device_rename(struct ib_device *ibdev, const char *name)
{
int ret;
@@ -287,6 +426,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name)
if (ret)
goto out;
strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
+ ret = rename_compat_devs(ibdev);
out:
up_write(&devices_rwsem);
return ret;
@@ -336,6 +476,7 @@ static void ib_device_release(struct device *device)
WARN_ON(refcount_read(&dev->refcount));
ib_cache_release_one(dev);
ib_security_release_port_pkey_list(dev);
+ xa_destroy(&dev->compat_devs);
xa_destroy(&dev->client_data);
if (dev->port_data)
kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
@@ -357,12 +498,42 @@ static int ib_device_uevent(struct device *device,
return 0;
}
+static const void *net_namespace(struct device *d)
+{
+ struct ib_core_device *coredev =
+ container_of(d, struct ib_core_device, dev);
+
+ return read_pnet(&coredev->rdma_net);
+}
+
static struct class ib_class = {
.name = "infiniband",
.dev_release = ib_device_release,
.dev_uevent = ib_device_uevent,
+ .ns_type = &net_ns_type_operations,
+ .namespace = net_namespace,
};
+static void rdma_init_coredev(struct ib_core_device *coredev,
+ struct ib_device *dev, struct net *net)
+{
+ /* This BUILD_BUG_ON is intended to catch layout change
+ * of union of ib_core_device and device.
+ * dev must be the first element as ib_core and providers
+ * driver uses it. Adding anything in ib_core_device before
+ * device will break this assumption.
+ */
+ BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) !=
+ offsetof(struct ib_device, dev));
+
+ coredev->dev.class = &ib_class;
+ coredev->dev.groups = dev->groups;
+ device_initialize(&coredev->dev);
+ coredev->owner = dev;
+ INIT_LIST_HEAD(&coredev->port_list);
+ write_pnet(&coredev->rdma_net, net);
+}
+
/**
* _ib_alloc_device - allocate an IB device struct
* @size:size of structure to allocate
@@ -389,10 +560,8 @@ struct ib_device *_ib_alloc_device(size_t size)
return NULL;
}
- device->dev.class = &ib_class;
device->groups[0] = &ib_dev_attr_group;
- device->dev.groups = device->groups;
- device_initialize(&device->dev);
+ rdma_init_coredev(&device->coredev, device, &init_net);
INIT_LIST_HEAD(&device->event_handler_list);
spin_lock_init(&device->event_handler_lock);
@@ -403,7 +572,8 @@ struct ib_device *_ib_alloc_device(size_t size)
*/
xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
init_rwsem(&device->client_data_rwsem);
- INIT_LIST_HEAD(&device->port_list);
+ xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC);
+ mutex_init(&device->compat_devs_mutex);
init_completion(&device->unreg_completion);
INIT_WORK(&device->unregistration_work, ib_unregister_work);
@@ -436,6 +606,7 @@ void ib_dealloc_device(struct ib_device *device)
/* Expedite releasing netdev references */
free_netdevs(device);
+ WARN_ON(!xa_empty(&device->compat_devs));
WARN_ON(!xa_empty(&device->client_data));
WARN_ON(refcount_read(&device->refcount));
rdma_restrack_clean(device);
@@ -644,6 +815,283 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event,
return NOTIFY_OK;
}
+static void compatdev_release(struct device *dev)
+{
+ struct ib_core_device *cdev =
+ container_of(dev, struct ib_core_device, dev);
+
+ kfree(cdev);
+}
+
+static int add_one_compat_dev(struct ib_device *device,
+ struct rdma_dev_net *rnet)
+{
+ struct ib_core_device *cdev;
+ int ret;
+
+ lockdep_assert_held(&rdma_nets_rwsem);
+ if (!ib_devices_shared_netns)
+ return 0;
+
+ /*
+ * Create and add compat device in all namespaces other than where it
+ * is currently bound to.
+ */
+ if (net_eq(read_pnet(&rnet->net),
+ read_pnet(&device->coredev.rdma_net)))
+ return 0;
+
+ /*
+ * The first of init_net() or ib_register_device() to take the
+ * compat_devs_mutex wins and gets to add the device. Others will wait
+ * for completion here.
+ */
+ mutex_lock(&device->compat_devs_mutex);
+ cdev = xa_load(&device->compat_devs, rnet->id);
+ if (cdev) {
+ ret = 0;
+ goto done;
+ }
+ ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL);
+ if (ret)
+ goto done;
+
+ cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
+ if (!cdev) {
+ ret = -ENOMEM;
+ goto cdev_err;
+ }
+
+ cdev->dev.parent = device->dev.parent;
+ rdma_init_coredev(cdev, device, read_pnet(&rnet->net));
+ cdev->dev.release = compatdev_release;
+ dev_set_name(&cdev->dev, "%s", dev_name(&device->dev));
+
+ ret = device_add(&cdev->dev);
+ if (ret)
+ goto add_err;
+ ret = ib_setup_port_attrs(cdev);
+ if (ret)
+ goto port_err;
+
+ ret = xa_err(xa_store(&device->compat_devs, rnet->id,
+ cdev, GFP_KERNEL));
+ if (ret)
+ goto insert_err;
+
+ mutex_unlock(&device->compat_devs_mutex);
+ return 0;
+
+insert_err:
+ ib_free_port_attrs(cdev);
+port_err:
+ device_del(&cdev->dev);
+add_err:
+ put_device(&cdev->dev);
+cdev_err:
+ xa_release(&device->compat_devs, rnet->id);
+done:
+ mutex_unlock(&device->compat_devs_mutex);
+ return ret;
+}
+
+static void remove_one_compat_dev(struct ib_device *device, u32 id)
+{
+ struct ib_core_device *cdev;
+
+ mutex_lock(&device->compat_devs_mutex);
+ cdev = xa_erase(&device->compat_devs, id);
+ mutex_unlock(&device->compat_devs_mutex);
+ if (cdev) {
+ ib_free_port_attrs(cdev);
+ device_del(&cdev->dev);
+ put_device(&cdev->dev);
+ }
+}
+
+static void remove_compat_devs(struct ib_device *device)
+{
+ struct ib_core_device *cdev;
+ unsigned long index;
+
+ xa_for_each (&device->compat_devs, index, cdev)
+ remove_one_compat_dev(device, index);
+}
+
+static int add_compat_devs(struct ib_device *device)
+{
+ struct rdma_dev_net *rnet;
+ unsigned long index;
+ int ret = 0;
+
+ lockdep_assert_held(&devices_rwsem);
+
+ down_read(&rdma_nets_rwsem);
+ xa_for_each (&rdma_nets, index, rnet) {
+ ret = add_one_compat_dev(device, rnet);
+ if (ret)
+ break;
+ }
+ up_read(&rdma_nets_rwsem);
+ return ret;
+}
+
+static void remove_all_compat_devs(void)
+{
+ struct ib_compat_device *cdev;
+ struct ib_device *dev;
+ unsigned long index;
+
+ down_read(&devices_rwsem);
+ xa_for_each (&devices, index, dev) {
+ unsigned long c_index = 0;
+
+ /* Hold nets_rwsem so that any other thread modifying this
+ * system param can sync with this thread.
+ */
+ down_read(&rdma_nets_rwsem);
+ xa_for_each (&dev->compat_devs, c_index, cdev)
+ remove_one_compat_dev(dev, c_index);
+ up_read(&rdma_nets_rwsem);
+ }
+ up_read(&devices_rwsem);
+}
+
+static int add_all_compat_devs(void)
+{
+ struct rdma_dev_net *rnet;
+ struct ib_device *dev;
+ unsigned long index;
+ int ret = 0;
+
+ down_read(&devices_rwsem);
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+ unsigned long net_index = 0;
+
+ /* Hold nets_rwsem so that any other thread modifying this
+ * system param can sync with this thread.
+ */
+ down_read(&rdma_nets_rwsem);
+ xa_for_each (&rdma_nets, net_index, rnet) {
+ ret = add_one_compat_dev(dev, rnet);
+ if (ret)
+ break;
+ }
+ up_read(&rdma_nets_rwsem);
+ }
+ up_read(&devices_rwsem);
+ if (ret)
+ remove_all_compat_devs();
+ return ret;
+}
+
+int rdma_compatdev_set(u8 enable)
+{
+ struct rdma_dev_net *rnet;
+ unsigned long index;
+ int ret = 0;
+
+ down_write(&rdma_nets_rwsem);
+ if (ib_devices_shared_netns == enable) {
+ up_write(&rdma_nets_rwsem);
+ return 0;
+ }
+
+ /* enable/disable of compat devices is not supported
+ * when more than default init_net exists.
+ */
+ xa_for_each (&rdma_nets, index, rnet) {
+ ret++;
+ break;
+ }
+ if (!ret)
+ ib_devices_shared_netns = enable;
+ up_write(&rdma_nets_rwsem);
+ if (ret)
+ return -EBUSY;
+
+ if (enable)
+ ret = add_all_compat_devs();
+ else
+ remove_all_compat_devs();
+ return ret;
+}
+
+static void rdma_dev_exit_net(struct net *net)
+{
+ struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id);
+ struct ib_device *dev;
+ unsigned long index;
+ int ret;
+
+ down_write(&rdma_nets_rwsem);
+ /*
+ * Prevent the ID from being re-used and hide the id from xa_for_each.
+ */
+ ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL));
+ WARN_ON(ret);
+ up_write(&rdma_nets_rwsem);
+
+ down_read(&devices_rwsem);
+ xa_for_each (&devices, index, dev) {
+ get_device(&dev->dev);
+ /*
+ * Release the devices_rwsem so that pontentially blocking
+ * device_del, doesn't hold the devices_rwsem for too long.
+ */
+ up_read(&devices_rwsem);
+
+ remove_one_compat_dev(dev, rnet->id);
+
+ /*
+ * If the real device is in the NS then move it back to init.
+ */
+ rdma_dev_change_netns(dev, net, &init_net);
+
+ put_device(&dev->dev);
+ down_read(&devices_rwsem);
+ }
+ up_read(&devices_rwsem);
+
+ xa_erase(&rdma_nets, rnet->id);
+}
+
+static __net_init int rdma_dev_init_net(struct net *net)
+{
+ struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id);
+ unsigned long index;
+ struct ib_device *dev;
+ int ret;
+
+ /* No need to create any compat devices in default init_net. */
+ if (net_eq(net, &init_net))
+ return 0;
+
+ write_pnet(&rnet->net, net);
+
+ ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ down_read(&devices_rwsem);
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+ /* Hold nets_rwsem so that netlink command cannot change
+ * system configuration for device sharing mode.
+ */
+ down_read(&rdma_nets_rwsem);
+ ret = add_one_compat_dev(dev, rnet);
+ up_read(&rdma_nets_rwsem);
+ if (ret)
+ break;
+ }
+ up_read(&devices_rwsem);
+
+ if (ret)
+ rdma_dev_exit_net(net);
+
+ return ret;
+}
+
/*
* Assign the unique string device name and the unique device index. This is
* undone by ib_dealloc_device.
@@ -711,6 +1159,9 @@ static void setup_dma_device(struct ib_device *device)
WARN_ON_ONCE(!parent);
device->dma_device = parent;
}
+ /* Setup default max segment size for all IB devices */
+ dma_set_max_seg_size(device->dma_device, SZ_2G);
+
}
/*
@@ -765,8 +1216,12 @@ static void disable_device(struct ib_device *device)
ib_device_put(device);
wait_for_completion(&device->unreg_completion);
- /* Expedite removing unregistered pointers from the hash table */
- free_netdevs(device);
+ /*
+ * compat devices must be removed after device refcount drops to zero.
+ * Otherwise init_net() may add more compatdevs after removing compat
+ * devices and before device is disabled.
+ */
+ remove_compat_devs(device);
}
/*
@@ -807,7 +1262,8 @@ static int enable_device_and_get(struct ib_device *device)
break;
}
up_read(&clients_rwsem);
-
+ if (!ret)
+ ret = add_compat_devs(device);
out:
up_read(&devices_rwsem);
return ret;
@@ -847,6 +1303,11 @@ int ib_register_device(struct ib_device *device, const char *name)
ib_device_register_rdmacg(device);
+ /*
+ * Ensure that ADD uevent is not fired because it
+ * is too early amd device is not initialized yet.
+ */
+ dev_set_uevent_suppress(&device->dev, true);
ret = device_add(&device->dev);
if (ret)
goto cg_cleanup;
@@ -859,6 +1320,9 @@ int ib_register_device(struct ib_device *device, const char *name)
}
ret = enable_device_and_get(device);
+ dev_set_uevent_suppress(&device->dev, false);
+ /* Mark for userspace that device is ready */
+ kobject_uevent(&device->dev.kobj, KOBJ_ADD);
if (ret) {
void (*dealloc_fn)(struct ib_device *);
@@ -887,6 +1351,7 @@ int ib_register_device(struct ib_device *device, const char *name)
dev_cleanup:
device_del(&device->dev);
cg_cleanup:
+ dev_set_uevent_suppress(&device->dev, false);
ib_device_unregister_rdmacg(device);
ib_cache_cleanup_one(device);
return ret;
@@ -908,6 +1373,10 @@ static void __ib_unregister_device(struct ib_device *ib_dev)
goto out;
disable_device(ib_dev);
+
+ /* Expedite removing unregistered pointers from the hash table */
+ free_netdevs(ib_dev);
+
ib_device_unregister_sysfs(ib_dev);
device_del(&ib_dev->dev);
ib_device_unregister_rdmacg(ib_dev);
@@ -1038,6 +1507,126 @@ void ib_unregister_device_queued(struct ib_device *ib_dev)
}
EXPORT_SYMBOL(ib_unregister_device_queued);
+/*
+ * The caller must pass in a device that has the kref held and the refcount
+ * released. If the device is in cur_net and still registered then it is moved
+ * into net.
+ */
+static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
+ struct net *net)
+{
+ int ret2 = -EINVAL;
+ int ret;
+
+ mutex_lock(&device->unregistration_lock);
+
+ /*
+ * If a device not under ib_device_get() or if the unregistration_lock
+ * is not held, the namespace can be changed, or it can be unregistered.
+ * Check again under the lock.
+ */
+ if (refcount_read(&device->refcount) == 0 ||
+ !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ kobject_uevent(&device->dev.kobj, KOBJ_REMOVE);
+ disable_device(device);
+
+ /*
+ * At this point no one can be using the device, so it is safe to
+ * change the namespace.
+ */
+ write_pnet(&device->coredev.rdma_net, net);
+
+ down_read(&devices_rwsem);
+ /*
+ * Currently rdma devices are system wide unique. So the device name
+ * is guaranteed free in the new namespace. Publish the new namespace
+ * at the sysfs level.
+ */
+ ret = device_rename(&device->dev, dev_name(&device->dev));
+ up_read(&devices_rwsem);
+ if (ret) {
+ dev_warn(&device->dev,
+ "%s: Couldn't rename device after namespace change\n",
+ __func__);
+ /* Try and put things back and re-enable the device */
+ write_pnet(&device->coredev.rdma_net, cur_net);
+ }
+
+ ret2 = enable_device_and_get(device);
+ if (ret2) {
+ /*
+ * This shouldn't really happen, but if it does, let the user
+ * retry at later point. So don't disable the device.
+ */
+ dev_warn(&device->dev,
+ "%s: Couldn't re-enable device after namespace change\n",
+ __func__);
+ }
+ kobject_uevent(&device->dev.kobj, KOBJ_ADD);
+
+ ib_device_put(device);
+out:
+ mutex_unlock(&device->unregistration_lock);
+ if (ret)
+ return ret;
+ return ret2;
+}
+
+int ib_device_set_netns_put(struct sk_buff *skb,
+ struct ib_device *dev, u32 ns_fd)
+{
+ struct net *net;
+ int ret;
+
+ net = get_net_ns_by_fd(ns_fd);
+ if (IS_ERR(net)) {
+ ret = PTR_ERR(net);
+ goto net_err;
+ }
+
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ goto ns_err;
+ }
+
+ /*
+ * Currently supported only for those providers which support
+ * disassociation and don't do port specific sysfs init. Once a
+ * port_cleanup infrastructure is implemented, this limitation will be
+ * removed.
+ */
+ if (!dev->ops.disassociate_ucontext || dev->ops.init_port ||
+ ib_devices_shared_netns) {
+ ret = -EOPNOTSUPP;
+ goto ns_err;
+ }
+
+ get_device(&dev->dev);
+ ib_device_put(dev);
+ ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net);
+ put_device(&dev->dev);
+
+ put_net(net);
+ return ret;
+
+ns_err:
+ put_net(net);
+net_err:
+ ib_device_put(dev);
+ return ret;
+}
+
+static struct pernet_operations rdma_dev_net_ops = {
+ .init = rdma_dev_init_net,
+ .exit = rdma_dev_exit_net,
+ .id = &rdma_dev_net_id,
+ .size = sizeof(struct rdma_dev_net),
+};
+
static int assign_client_id(struct ib_client *client)
{
int ret;
@@ -1515,6 +2104,9 @@ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
down_read(&devices_rwsem);
xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+ if (!rdma_dev_access_netns(dev, sock_net(skb->sk)))
+ continue;
+
ret = nldev_cb(dev, skb, cb, idx);
if (ret)
break;
@@ -1787,6 +2379,14 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, get_vf_config);
SET_DEVICE_OP(dev_ops, get_vf_stats);
SET_DEVICE_OP(dev_ops, init_port);
+ SET_DEVICE_OP(dev_ops, iw_accept);
+ SET_DEVICE_OP(dev_ops, iw_add_ref);
+ SET_DEVICE_OP(dev_ops, iw_connect);
+ SET_DEVICE_OP(dev_ops, iw_create_listen);
+ SET_DEVICE_OP(dev_ops, iw_destroy_listen);
+ SET_DEVICE_OP(dev_ops, iw_get_qp);
+ SET_DEVICE_OP(dev_ops, iw_reject);
+ SET_DEVICE_OP(dev_ops, iw_rem_ref);
SET_DEVICE_OP(dev_ops, map_mr_sg);
SET_DEVICE_OP(dev_ops, map_phys_fmr);
SET_DEVICE_OP(dev_ops, mmap);
@@ -1823,7 +2423,9 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, set_vf_link_state);
SET_DEVICE_OP(dev_ops, unmap_fmr);
+ SET_OBJ_SIZE(dev_ops, ib_ah);
SET_OBJ_SIZE(dev_ops, ib_pd);
+ SET_OBJ_SIZE(dev_ops, ib_srq);
SET_OBJ_SIZE(dev_ops, ib_ucontext);
}
EXPORT_SYMBOL(ib_set_device_ops);
@@ -1903,12 +2505,20 @@ static int __init ib_core_init(void)
goto err_sa;
}
+ ret = register_pernet_device(&rdma_dev_net_ops);
+ if (ret) {
+ pr_warn("Couldn't init compat dev. ret %d\n", ret);
+ goto err_compat;
+ }
+
nldev_init();
rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
roce_gid_mgmt_init();
return 0;
+err_compat:
+ unregister_lsm_notifier(&ibdev_lsm_nb);
err_sa:
ib_sa_cleanup();
err_mad:
@@ -1933,6 +2543,7 @@ static void __exit ib_core_cleanup(void)
roce_gid_mgmt_cleanup();
nldev_exit();
rdma_nl_unregister(RDMA_NL_LS);
+ unregister_pernet_device(&rdma_dev_net_ops);
unregister_lsm_notifier(&ibdev_lsm_nb);
ib_sa_cleanup();
ib_mad_cleanup();
@@ -1950,5 +2561,8 @@ static void __exit ib_core_cleanup(void)
MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);
-subsys_initcall(ib_core_init);
+/* ib core relies on netdev stack to first register net_ns_type_operations
+ * ns kobject type before ib_core initialization.
+ */
+fs_initcall(ib_core_init);
module_exit(ib_core_cleanup);
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index 732637c913d9..72141c5b7c95 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -394,7 +394,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
cm_id_priv->state = IW_CM_STATE_DESTROYING;
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
/* destroy the listening endpoint */
- cm_id->device->iwcm->destroy_listen(cm_id);
+ cm_id->device->ops.iw_destroy_listen(cm_id);
spin_lock_irqsave(&cm_id_priv->lock, flags);
break;
case IW_CM_STATE_ESTABLISHED:
@@ -417,7 +417,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
*/
cm_id_priv->state = IW_CM_STATE_DESTROYING;
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- cm_id->device->iwcm->reject(cm_id, NULL, 0);
+ cm_id->device->ops.iw_reject(cm_id, NULL, 0);
spin_lock_irqsave(&cm_id_priv->lock, flags);
break;
case IW_CM_STATE_CONN_SENT:
@@ -427,7 +427,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
break;
}
if (cm_id_priv->qp) {
- cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);
cm_id_priv->qp = NULL;
}
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
@@ -504,7 +504,7 @@ static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr,
static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
{
const char *devname = dev_name(&cm_id->device->dev);
- const char *ifname = cm_id->device->iwcm->ifname;
+ const char *ifname = cm_id->device->iw_ifname;
struct iwpm_dev_data pm_reg_msg = {};
struct iwpm_sa_data pm_msg;
int status;
@@ -526,7 +526,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
cm_id->mapped = true;
pm_msg.loc_addr = cm_id->local_addr;
pm_msg.rem_addr = cm_id->remote_addr;
- pm_msg.flags = (cm_id->device->iwcm->driver_flags & IW_F_NO_PORT_MAP) ?
+ pm_msg.flags = (cm_id->device->iw_driver_flags & IW_F_NO_PORT_MAP) ?
IWPM_FLAGS_NO_PORT_MAP : 0;
if (active)
status = iwpm_add_and_query_mapping(&pm_msg,
@@ -577,7 +577,8 @@ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
ret = iw_cm_map(cm_id, false);
if (!ret)
- ret = cm_id->device->iwcm->create_listen(cm_id, backlog);
+ ret = cm_id->device->ops.iw_create_listen(cm_id,
+ backlog);
if (ret)
cm_id_priv->state = IW_CM_STATE_IDLE;
spin_lock_irqsave(&cm_id_priv->lock, flags);
@@ -617,7 +618,7 @@ int iw_cm_reject(struct iw_cm_id *cm_id,
cm_id_priv->state = IW_CM_STATE_IDLE;
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- ret = cm_id->device->iwcm->reject(cm_id, private_data,
+ ret = cm_id->device->ops.iw_reject(cm_id, private_data,
private_data_len);
clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
@@ -653,25 +654,25 @@ int iw_cm_accept(struct iw_cm_id *cm_id,
return -EINVAL;
}
/* Get the ib_qp given the QPN */
- qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+ qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn);
if (!qp) {
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
wake_up_all(&cm_id_priv->connect_wait);
return -EINVAL;
}
- cm_id->device->iwcm->add_ref(qp);
+ cm_id->device->ops.iw_add_ref(qp);
cm_id_priv->qp = qp;
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- ret = cm_id->device->iwcm->accept(cm_id, iw_param);
+ ret = cm_id->device->ops.iw_accept(cm_id, iw_param);
if (ret) {
/* An error on accept precludes provider events */
BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
cm_id_priv->state = IW_CM_STATE_IDLE;
spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id_priv->qp) {
- cm_id->device->iwcm->rem_ref(qp);
+ cm_id->device->ops.iw_rem_ref(qp);
cm_id_priv->qp = NULL;
}
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
@@ -712,25 +713,25 @@ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
}
/* Get the ib_qp given the QPN */
- qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+ qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn);
if (!qp) {
ret = -EINVAL;
goto err;
}
- cm_id->device->iwcm->add_ref(qp);
+ cm_id->device->ops.iw_add_ref(qp);
cm_id_priv->qp = qp;
cm_id_priv->state = IW_CM_STATE_CONN_SENT;
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
ret = iw_cm_map(cm_id, true);
if (!ret)
- ret = cm_id->device->iwcm->connect(cm_id, iw_param);
+ ret = cm_id->device->ops.iw_connect(cm_id, iw_param);
if (!ret)
return 0; /* success */
spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id_priv->qp) {
- cm_id->device->iwcm->rem_ref(qp);
+ cm_id->device->ops.iw_rem_ref(qp);
cm_id_priv->qp = NULL;
}
cm_id_priv->state = IW_CM_STATE_IDLE;
@@ -895,7 +896,7 @@ static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
} else {
/* REJECTED or RESET */
- cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);
cm_id_priv->qp = NULL;
cm_id_priv->state = IW_CM_STATE_IDLE;
}
@@ -946,7 +947,7 @@ static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id_priv->qp) {
- cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);
cm_id_priv->qp = NULL;
}
switch (cm_id_priv->state) {
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index e742a6a2c138..cc99479b2c09 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -3,7 +3,7 @@
* Copyright (c) 2005 Intel Corporation. All rights reserved.
* Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
* Copyright (c) 2009 HNR Consulting. All rights reserved.
- * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ * Copyright (c) 2014,2018 Intel Corporation. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -38,10 +38,10 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/dma-mapping.h>
-#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/security.h>
+#include <linux/xarray.h>
#include <rdma/ib_cache.h>
#include "mad_priv.h"
@@ -51,6 +51,32 @@
#include "opa_smi.h"
#include "agent.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/ib_mad.h>
+
+#ifdef CONFIG_TRACEPOINTS
+static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_qp_info *qp_info,
+ struct trace_event_raw_ib_mad_send_template *entry)
+{
+ u16 pkey;
+ struct ib_device *dev = qp_info->port_priv->device;
+ u8 pnum = qp_info->port_priv->port_num;
+ struct ib_ud_wr *wr = &mad_send_wr->send_wr;
+ struct rdma_ah_attr attr = {};
+
+ rdma_query_ah(wr->ah, &attr);
+
+ /* These are common */
+ entry->sl = attr.sl;
+ ib_query_pkey(dev, pnum, wr->pkey_index, &pkey);
+ entry->pkey = pkey;
+ entry->rqpn = wr->remote_qpn;
+ entry->rqkey = wr->remote_qkey;
+ entry->dlid = rdma_ah_get_dlid(&attr);
+}
+#endif
+
static int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
static int mad_recvq_size = IB_MAD_QP_RECV_SIZE;
@@ -59,12 +85,9 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests
module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
-/*
- * The mlx4 driver uses the top byte to distinguish which virtual function
- * generated the MAD, so we must avoid using it.
- */
-#define AGENT_ID_LIMIT (1 << 24)
-static DEFINE_IDR(ib_mad_clients);
+/* Client ID 0 is used for snoop-only clients */
+static DEFINE_XARRAY_ALLOC1(ib_mad_clients);
+static u32 ib_mad_client_next;
static struct list_head ib_mad_port_list;
/* Port list lock */
@@ -389,18 +412,17 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
goto error4;
}
- idr_preload(GFP_KERNEL);
- idr_lock(&ib_mad_clients);
- ret2 = idr_alloc_cyclic(&ib_mad_clients, mad_agent_priv, 0,
- AGENT_ID_LIMIT, GFP_ATOMIC);
- idr_unlock(&ib_mad_clients);
- idr_preload_end();
-
+ /*
+ * The mlx4 driver uses the top byte to distinguish which virtual
+ * function generated the MAD, so we must avoid using it.
+ */
+ ret2 = xa_alloc_cyclic(&ib_mad_clients, &mad_agent_priv->agent.hi_tid,
+ mad_agent_priv, XA_LIMIT(0, (1 << 24) - 1),
+ &ib_mad_client_next, GFP_KERNEL);
if (ret2 < 0) {
ret = ERR_PTR(ret2);
goto error5;
}
- mad_agent_priv->agent.hi_tid = ret2;
/*
* Make sure MAD registration (if supplied)
@@ -445,12 +467,11 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
}
spin_unlock_irq(&port_priv->reg_lock);
+ trace_ib_mad_create_agent(mad_agent_priv);
return &mad_agent_priv->agent;
error6:
spin_unlock_irq(&port_priv->reg_lock);
- idr_lock(&ib_mad_clients);
- idr_remove(&ib_mad_clients, mad_agent_priv->agent.hi_tid);
- idr_unlock(&ib_mad_clients);
+ xa_erase(&ib_mad_clients, mad_agent_priv->agent.hi_tid);
error5:
ib_mad_agent_security_cleanup(&mad_agent_priv->agent);
error4:
@@ -602,6 +623,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
struct ib_mad_port_private *port_priv;
/* Note that we could still be handling received MADs */
+ trace_ib_mad_unregister_agent(mad_agent_priv);
/*
* Canceling all sends results in dropping received response
@@ -614,9 +636,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
spin_lock_irq(&port_priv->reg_lock);
remove_mad_reg_req(mad_agent_priv);
spin_unlock_irq(&port_priv->reg_lock);
- idr_lock(&ib_mad_clients);
- idr_remove(&ib_mad_clients, mad_agent_priv->agent.hi_tid);
- idr_unlock(&ib_mad_clients);
+ xa_erase(&ib_mad_clients, mad_agent_priv->agent.hi_tid);
flush_workqueue(port_priv->wq);
ib_cancel_rmpp_recvs(mad_agent_priv);
@@ -821,6 +841,8 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
if (opa && smp->class_version == OPA_SM_CLASS_VERSION) {
u32 opa_drslid;
+ trace_ib_mad_handle_out_opa_smi(opa_smp);
+
if ((opa_get_smp_direction(opa_smp)
? opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) ==
OPA_LID_PERMISSIVE &&
@@ -846,6 +868,8 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
opa_smi_check_local_returning_smp(opa_smp, device) == IB_SMI_DISCARD)
goto out;
} else {
+ trace_ib_mad_handle_out_ib_smi(smp);
+
if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) ==
IB_LID_PERMISSIVE &&
smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) ==
@@ -1223,6 +1247,7 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
spin_lock_irqsave(&qp_info->send_queue.lock, flags);
if (qp_info->send_queue.count < qp_info->send_queue.max_active) {
+ trace_ib_mad_ib_send_mad(mad_send_wr, qp_info);
ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr.wr,
NULL);
list = &qp_info->send_queue.list;
@@ -1756,7 +1781,7 @@ find_mad_agent(struct ib_mad_port_private *port_priv,
*/
hi_tid = be64_to_cpu(mad_hdr->tid) >> 32;
rcu_read_lock();
- mad_agent = idr_find(&ib_mad_clients, hi_tid);
+ mad_agent = xa_load(&ib_mad_clients, hi_tid);
if (mad_agent && !atomic_inc_not_zero(&mad_agent->refcount))
mad_agent = NULL;
rcu_read_unlock();
@@ -2077,6 +2102,8 @@ static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv
enum smi_forward_action retsmi;
struct ib_smp *smp = (struct ib_smp *)recv->mad;
+ trace_ib_mad_handle_ib_smi(smp);
+
if (smi_handle_dr_smp_recv(smp,
rdma_cap_ib_switch(port_priv->device),
port_num,
@@ -2162,6 +2189,8 @@ handle_opa_smi(struct ib_mad_port_private *port_priv,
enum smi_forward_action retsmi;
struct opa_smp *smp = (struct opa_smp *)recv->mad;
+ trace_ib_mad_handle_opa_smi(smp);
+
if (opa_smi_handle_dr_smp_recv(smp,
rdma_cap_ib_switch(port_priv->device),
port_num,
@@ -2286,6 +2315,9 @@ static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc)
if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa))
goto out;
+ trace_ib_mad_recv_done_handler(qp_info, wc,
+ (struct ib_mad_hdr *)recv->mad);
+
mad_size = recv->mad_size;
response = alloc_mad_private(mad_size, GFP_KERNEL);
if (!response)
@@ -2332,6 +2364,7 @@ static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc)
mad_agent = find_mad_agent(port_priv, (const struct ib_mad_hdr *)recv->mad);
if (mad_agent) {
+ trace_ib_mad_recv_done_agent(mad_agent);
ib_mad_complete_recv(mad_agent, &recv->header.recv_wc);
/*
* recv is freed up in error cases in ib_mad_complete_recv
@@ -2496,6 +2529,9 @@ static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc)
send_queue = mad_list->mad_queue;
qp_info = send_queue->qp_info;
+ trace_ib_mad_send_done_agent(mad_send_wr->mad_agent_priv);
+ trace_ib_mad_send_done_handler(mad_send_wr, wc);
+
retry:
ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
mad_send_wr->header_mapping,
@@ -2527,6 +2563,7 @@ retry:
ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
if (queued_send_wr) {
+ trace_ib_mad_send_done_resend(queued_send_wr, qp_info);
ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr.wr,
NULL);
if (ret) {
@@ -2574,6 +2611,7 @@ static bool ib_mad_send_error(struct ib_mad_port_private *port_priv,
if (mad_send_wr->retry) {
/* Repost send */
mad_send_wr->retry = 0;
+ trace_ib_mad_error_handler(mad_send_wr, qp_info);
ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr,
NULL);
if (!ret)
@@ -3356,9 +3394,6 @@ int ib_mad_init(void)
INIT_LIST_HEAD(&ib_mad_port_list);
- /* Client ID 0 is used for snoop-only clients */
- idr_alloc(&ib_mad_clients, NULL, 0, 0, GFP_KERNEL);
-
if (ib_register_client(&mad_client)) {
pr_err("Couldn't register ib_mad client\n");
return -EINVAL;
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 216509036aa8..956b3a7dfed7 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -73,14 +73,14 @@ struct ib_mad_private_header {
struct ib_mad_recv_wc recv_wc;
struct ib_wc wc;
u64 mapping;
-} __attribute__ ((packed));
+} __packed;
struct ib_mad_private {
struct ib_mad_private_header header;
size_t mad_size;
struct ib_grh grh;
u8 mad[0];
-} __attribute__ ((packed));
+} __packed;
struct ib_rmpp_segment {
struct list_head list;
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index d50ff70bb24b..cd338ddc4a39 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -804,7 +804,6 @@ static void mcast_event_handler(struct ib_event_handler *handler,
switch (event->event) {
case IB_EVENT_PORT_ERR:
case IB_EVENT_LID_CHANGE:
- case IB_EVENT_SM_CHANGE:
case IB_EVENT_CLIENT_REREGISTER:
mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR);
break;
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 85324012bf07..98eadd3089ce 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -116,6 +116,10 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
[RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 },
[RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING,
.len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
+ [RDMA_NLDEV_SYS_ATTR_NETNS_MODE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_DEV_PROTOCOL] = { .type = NLA_NUL_STRING,
+ .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
+ [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 },
};
static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -198,6 +202,8 @@ static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device)
static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)
{
char fw[IB_FW_VERSION_NAME_MAX];
+ int ret = 0;
+ u8 port;
if (fill_nldev_handle(msg, device))
return -EMSGSIZE;
@@ -226,7 +232,25 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)
return -EMSGSIZE;
if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type))
return -EMSGSIZE;
- return 0;
+
+ /*
+ * Link type is determined on first port and mlx4 device
+ * which can potentially have two different link type for the same
+ * IB device is considered as better to be avoided in the future,
+ */
+ port = rdma_start_port(device);
+ if (rdma_cap_opa_mad(device, port))
+ ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "opa");
+ else if (rdma_protocol_ib(device, port))
+ ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "ib");
+ else if (rdma_protocol_iwarp(device, port))
+ ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "iw");
+ else if (rdma_protocol_roce(device, port))
+ ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "roce");
+ else if (rdma_protocol_usnic(device, port))
+ ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL,
+ "usnic");
+ return ret;
}
static int fill_port_info(struct sk_buff *msg,
@@ -615,7 +639,7 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
- device = ib_device_get_by_index(index);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
if (!device)
return -EINVAL;
@@ -659,7 +683,7 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
return -EINVAL;
index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
- device = ib_device_get_by_index(index);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
if (!device)
return -EINVAL;
@@ -669,9 +693,20 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
nla_strlcpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
IB_DEVICE_NAME_MAX);
err = ib_device_rename(device, name);
+ goto done;
}
+ if (tb[RDMA_NLDEV_NET_NS_FD]) {
+ u32 ns_fd;
+
+ ns_fd = nla_get_u32(tb[RDMA_NLDEV_NET_NS_FD]);
+ err = ib_device_set_netns_put(skb, device, ns_fd);
+ goto put_done;
+ }
+
+done:
ib_device_put(device);
+put_done:
return err;
}
@@ -707,7 +742,7 @@ static int nldev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
/*
* There is no need to take lock, because
- * we are relying on ib_core's lists_rwsem
+ * we are relying on ib_core's locking.
*/
return ib_enum_all_devs(_nldev_get_dumpit, skb, cb);
}
@@ -730,7 +765,7 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
return -EINVAL;
index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
- device = ib_device_get_by_index(index);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
if (!device)
return -EINVAL;
@@ -784,7 +819,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb,
return -EINVAL;
ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
- device = ib_device_get_by_index(ifindex);
+ device = ib_device_get_by_index(sock_net(skb->sk), ifindex);
if (!device)
return -EINVAL;
@@ -839,7 +874,7 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
return -EINVAL;
index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
- device = ib_device_get_by_index(index);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
if (!device)
return -EINVAL;
@@ -887,7 +922,6 @@ static int _nldev_res_get_dumpit(struct ib_device *device,
nlmsg_cancel(skb, nlh);
goto out;
}
-
nlmsg_end(skb, nlh);
idx++;
@@ -988,7 +1022,7 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
return -EINVAL;
index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
- device = ib_device_get_by_index(index);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
if (!device)
return -EINVAL;
@@ -1085,7 +1119,7 @@ static int res_get_common_dumpit(struct sk_buff *skb,
return -EINVAL;
index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
- device = ib_device_get_by_index(index);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
if (!device)
return -EINVAL;
@@ -1300,7 +1334,7 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
return -EINVAL;
index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
- device = ib_device_get_by_index(index);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
if (!device)
return -EINVAL;
@@ -1313,6 +1347,55 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
return 0;
}
+static int nldev_get_sys_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct nlmsghdr *nlh;
+ int err;
+
+ err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NULL);
+ if (err)
+ return err;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+ RDMA_NLDEV_CMD_SYS_GET),
+ 0, 0);
+
+ err = nla_put_u8(skb, RDMA_NLDEV_SYS_ATTR_NETNS_MODE,
+ (u8)ib_devices_shared_netns);
+ if (err) {
+ nlmsg_cancel(skb, nlh);
+ return err;
+ }
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+}
+
+static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ u8 enable;
+ int err;
+
+ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (err || !tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE])
+ return -EINVAL;
+
+ enable = nla_get_u8(tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]);
+ /* Only 0 and 1 are supported */
+ if (enable > 1)
+ return -EINVAL;
+
+ err = rdma_compatdev_set(enable);
+ return err;
+}
+
static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
[RDMA_NLDEV_CMD_GET] = {
.doit = nldev_get_doit,
@@ -1358,6 +1441,13 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
.doit = nldev_res_get_pd_doit,
.dump = nldev_res_get_pd_dumpit,
},
+ [RDMA_NLDEV_CMD_SYS_GET] = {
+ .dump = nldev_get_sys_get_dumpit,
+ },
+ [RDMA_NLDEV_CMD_SYS_SET] = {
+ .doit = nldev_set_sys_set_doit,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
};
void __init nldev_init(void)
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index 778375ff664e..ccf4d069c25c 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -125,9 +125,10 @@ static void assert_uverbs_usecnt(struct ib_uobject *uobj,
* and consumes the kref on the uobj.
*/
static int uverbs_destroy_uobject(struct ib_uobject *uobj,
- enum rdma_remove_reason reason)
+ enum rdma_remove_reason reason,
+ struct uverbs_attr_bundle *attrs)
{
- struct ib_uverbs_file *ufile = uobj->ufile;
+ struct ib_uverbs_file *ufile = attrs->ufile;
unsigned long flags;
int ret;
@@ -135,7 +136,8 @@ static int uverbs_destroy_uobject(struct ib_uobject *uobj,
assert_uverbs_usecnt(uobj, UVERBS_LOOKUP_WRITE);
if (uobj->object) {
- ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason);
+ ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason,
+ attrs);
if (ret) {
if (ib_is_destroy_retryable(ret, reason, uobj))
return ret;
@@ -196,9 +198,9 @@ static int uverbs_destroy_uobject(struct ib_uobject *uobj,
* version requires the caller to have already obtained an
* LOOKUP_DESTROY uobject kref.
*/
-int uobj_destroy(struct ib_uobject *uobj)
+int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs)
{
- struct ib_uverbs_file *ufile = uobj->ufile;
+ struct ib_uverbs_file *ufile = attrs->ufile;
int ret;
down_read(&ufile->hw_destroy_rwsem);
@@ -207,7 +209,7 @@ int uobj_destroy(struct ib_uobject *uobj)
if (ret)
goto out_unlock;
- ret = uverbs_destroy_uobject(uobj, RDMA_REMOVE_DESTROY);
+ ret = uverbs_destroy_uobject(uobj, RDMA_REMOVE_DESTROY, attrs);
if (ret) {
atomic_set(&uobj->usecnt, 0);
goto out_unlock;
@@ -224,18 +226,17 @@ out_unlock:
* uverbs_put_destroy.
*/
struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj,
- u32 id,
- const struct uverbs_attr_bundle *attrs)
+ u32 id, struct uverbs_attr_bundle *attrs)
{
struct ib_uobject *uobj;
int ret;
uobj = rdma_lookup_get_uobject(obj, attrs->ufile, id,
- UVERBS_LOOKUP_DESTROY);
+ UVERBS_LOOKUP_DESTROY, attrs);
if (IS_ERR(uobj))
return uobj;
- ret = uobj_destroy(uobj);
+ ret = uobj_destroy(uobj, attrs);
if (ret) {
rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_DESTROY);
return ERR_PTR(ret);
@@ -249,7 +250,7 @@ struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj,
* (negative errno on failure). For use by callers that do not need the uobj.
*/
int __uobj_perform_destroy(const struct uverbs_api_object *obj, u32 id,
- const struct uverbs_attr_bundle *attrs)
+ struct uverbs_attr_bundle *attrs)
{
struct ib_uobject *uobj;
@@ -296,25 +297,13 @@ static struct ib_uobject *alloc_uobj(struct ib_uverbs_file *ufile,
static int idr_add_uobj(struct ib_uobject *uobj)
{
- int ret;
-
- idr_preload(GFP_KERNEL);
- spin_lock(&uobj->ufile->idr_lock);
-
- /*
- * We start with allocating an idr pointing to NULL. This represents an
- * object which isn't initialized yet. We'll replace it later on with
- * the real object once we commit.
- */
- ret = idr_alloc(&uobj->ufile->idr, NULL, 0,
- min_t(unsigned long, U32_MAX - 1, INT_MAX), GFP_NOWAIT);
- if (ret >= 0)
- uobj->id = ret;
-
- spin_unlock(&uobj->ufile->idr_lock);
- idr_preload_end();
-
- return ret < 0 ? ret : 0;
+ /*
+ * We start with allocating an idr pointing to NULL. This represents an
+ * object which isn't initialized yet. We'll replace it later on with
+ * the real object once we commit.
+ */
+ return xa_alloc(&uobj->ufile->idr, &uobj->id, NULL, xa_limit_32b,
+ GFP_KERNEL);
}
/* Returns the ib_uobject or an error. The caller should check for IS_ERR. */
@@ -324,29 +313,20 @@ lookup_get_idr_uobject(const struct uverbs_api_object *obj,
enum rdma_lookup_mode mode)
{
struct ib_uobject *uobj;
- unsigned long idrno = id;
if (id < 0 || id > ULONG_MAX)
return ERR_PTR(-EINVAL);
rcu_read_lock();
- /* object won't be released as we're protected in rcu */
- uobj = idr_find(&ufile->idr, idrno);
- if (!uobj) {
- uobj = ERR_PTR(-ENOENT);
- goto free;
- }
-
/*
* The idr_find is guaranteed to return a pointer to something that
* isn't freed yet, or NULL, as the free after idr_remove goes through
* kfree_rcu(). However the object may still have been released and
* kfree() could be called at any time.
*/
- if (!kref_get_unless_zero(&uobj->ref))
+ uobj = xa_load(&ufile->idr, id);
+ if (!uobj || !kref_get_unless_zero(&uobj->ref))
uobj = ERR_PTR(-ENOENT);
-
-free:
rcu_read_unlock();
return uobj;
}
@@ -393,12 +373,13 @@ lookup_get_fd_uobject(const struct uverbs_api_object *obj,
struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj,
struct ib_uverbs_file *ufile, s64 id,
- enum rdma_lookup_mode mode)
+ enum rdma_lookup_mode mode,
+ struct uverbs_attr_bundle *attrs)
{
struct ib_uobject *uobj;
int ret;
- if (IS_ERR(obj) && PTR_ERR(obj) == -ENOMSG) {
+ if (obj == ERR_PTR(-ENOMSG)) {
/* must be UVERBS_IDR_ANY_OBJECT, see uapi_get_object() */
uobj = lookup_get_idr_uobject(NULL, ufile, id, mode);
if (IS_ERR(uobj))
@@ -431,6 +412,8 @@ struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj,
ret = uverbs_try_lock_object(uobj, mode);
if (ret)
goto free;
+ if (attrs)
+ attrs->context = uobj->context;
return uobj;
free:
@@ -438,38 +421,6 @@ free:
uverbs_uobject_put(uobj);
return ERR_PTR(ret);
}
-struct ib_uobject *_uobj_get_read(enum uverbs_default_objects type,
- u32 object_id,
- struct uverbs_attr_bundle *attrs)
-{
- struct ib_uobject *uobj;
-
- uobj = rdma_lookup_get_uobject(uobj_get_type(attrs, type), attrs->ufile,
- object_id, UVERBS_LOOKUP_READ);
- if (IS_ERR(uobj))
- return uobj;
-
- attrs->context = uobj->context;
-
- return uobj;
-}
-
-struct ib_uobject *_uobj_get_write(enum uverbs_default_objects type,
- u32 object_id,
- struct uverbs_attr_bundle *attrs)
-{
- struct ib_uobject *uobj;
-
- uobj = rdma_lookup_get_uobject(uobj_get_type(attrs, type), attrs->ufile,
- object_id, UVERBS_LOOKUP_WRITE);
-
- if (IS_ERR(uobj))
- return uobj;
-
- attrs->context = uobj->context;
-
- return uobj;
-}
static struct ib_uobject *
alloc_begin_idr_uobject(const struct uverbs_api_object *obj,
@@ -489,14 +440,12 @@ alloc_begin_idr_uobject(const struct uverbs_api_object *obj,
ret = ib_rdmacg_try_charge(&uobj->cg_obj, uobj->context->device,
RDMACG_RESOURCE_HCA_OBJECT);
if (ret)
- goto idr_remove;
+ goto remove;
return uobj;
-idr_remove:
- spin_lock(&ufile->idr_lock);
- idr_remove(&ufile->idr, uobj->id);
- spin_unlock(&ufile->idr_lock);
+remove:
+ xa_erase(&ufile->idr, uobj->id);
uobj_put:
uverbs_uobject_put(uobj);
return ERR_PTR(ret);
@@ -526,7 +475,8 @@ alloc_begin_fd_uobject(const struct uverbs_api_object *obj,
}
struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj,
- struct ib_uverbs_file *ufile)
+ struct ib_uverbs_file *ufile,
+ struct uverbs_attr_bundle *attrs)
{
struct ib_uobject *ret;
@@ -546,6 +496,8 @@ struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj,
up_read(&ufile->hw_destroy_rwsem);
return ret;
}
+ if (attrs)
+ attrs->context = ret->context;
return ret;
}
@@ -554,18 +506,17 @@ static void alloc_abort_idr_uobject(struct ib_uobject *uobj)
ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,
RDMACG_RESOURCE_HCA_OBJECT);
- spin_lock(&uobj->ufile->idr_lock);
- idr_remove(&uobj->ufile->idr, uobj->id);
- spin_unlock(&uobj->ufile->idr_lock);
+ xa_erase(&uobj->ufile->idr, uobj->id);
}
static int __must_check destroy_hw_idr_uobject(struct ib_uobject *uobj,
- enum rdma_remove_reason why)
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
{
const struct uverbs_obj_idr_type *idr_type =
container_of(uobj->uapi_object->type_attrs,
struct uverbs_obj_idr_type, type);
- int ret = idr_type->destroy_object(uobj, why);
+ int ret = idr_type->destroy_object(uobj, why, attrs);
/*
* We can only fail gracefully if the user requested to destroy the
@@ -586,9 +537,7 @@ static int __must_check destroy_hw_idr_uobject(struct ib_uobject *uobj,
static void remove_handle_idr_uobject(struct ib_uobject *uobj)
{
- spin_lock(&uobj->ufile->idr_lock);
- idr_remove(&uobj->ufile->idr, uobj->id);
- spin_unlock(&uobj->ufile->idr_lock);
+ xa_erase(&uobj->ufile->idr, uobj->id);
/* Matches the kref in alloc_commit_idr_uobject */
uverbs_uobject_put(uobj);
}
@@ -599,7 +548,8 @@ static void alloc_abort_fd_uobject(struct ib_uobject *uobj)
}
static int __must_check destroy_hw_fd_uobject(struct ib_uobject *uobj,
- enum rdma_remove_reason why)
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
{
const struct uverbs_obj_fd_type *fd_type = container_of(
uobj->uapi_object->type_attrs, struct uverbs_obj_fd_type, type);
@@ -618,17 +568,17 @@ static void remove_handle_fd_uobject(struct ib_uobject *uobj)
static int alloc_commit_idr_uobject(struct ib_uobject *uobj)
{
struct ib_uverbs_file *ufile = uobj->ufile;
+ void *old;
- spin_lock(&ufile->idr_lock);
/*
* We already allocated this IDR with a NULL object, so
* this shouldn't fail.
*
- * NOTE: Once we set the IDR we loose ownership of our kref on uobj.
+ * NOTE: Storing the uobj transfers our kref on uobj to the XArray.
* It will be put by remove_commit_idr_uobject()
*/
- WARN_ON(idr_replace(&ufile->idr, uobj, uobj->id));
- spin_unlock(&ufile->idr_lock);
+ old = xa_store(&ufile->idr, uobj->id, uobj, GFP_KERNEL);
+ WARN_ON(old != NULL);
return 0;
}
@@ -675,15 +625,16 @@ static int alloc_commit_fd_uobject(struct ib_uobject *uobj)
* caller can no longer assume uobj is valid. If this function fails it
* destroys the uboject, including the attached HW object.
*/
-int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj)
+int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj,
+ struct uverbs_attr_bundle *attrs)
{
- struct ib_uverbs_file *ufile = uobj->ufile;
+ struct ib_uverbs_file *ufile = attrs->ufile;
int ret;
/* alloc_commit consumes the uobj kref */
ret = uobj->uapi_object->type_class->alloc_commit(uobj);
if (ret) {
- uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT);
+ uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT, attrs);
up_read(&ufile->hw_destroy_rwsem);
return ret;
}
@@ -707,12 +658,13 @@ int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj)
* This consumes the kref for uobj. It is up to the caller to unwind the HW
* object and anything else connected to uobj before calling this.
*/
-void rdma_alloc_abort_uobject(struct ib_uobject *uobj)
+void rdma_alloc_abort_uobject(struct ib_uobject *uobj,
+ struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_file *ufile = uobj->ufile;
uobj->object = NULL;
- uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT);
+ uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT, attrs);
/* Matches the down_read in rdma_alloc_begin_uobject */
up_read(&ufile->hw_destroy_rwsem);
@@ -760,29 +712,28 @@ void rdma_lookup_put_uobject(struct ib_uobject *uobj,
void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile)
{
- spin_lock_init(&ufile->idr_lock);
- idr_init(&ufile->idr);
+ xa_init_flags(&ufile->idr, XA_FLAGS_ALLOC);
}
void release_ufile_idr_uobject(struct ib_uverbs_file *ufile)
{
struct ib_uobject *entry;
- int id;
+ unsigned long id;
/*
* At this point uverbs_cleanup_ufile() is guaranteed to have run, and
- * there are no HW objects left, however the IDR is still populated
+ * there are no HW objects left, however the xarray is still populated
* with anything that has not been cleaned up by userspace. Since the
* kref on ufile is 0, nothing is allowed to call lookup_get.
*
* This is an optimized equivalent to remove_handle_idr_uobject
*/
- idr_for_each_entry(&ufile->idr, entry, id) {
+ xa_for_each(&ufile->idr, id, entry) {
WARN_ON(entry->object);
uverbs_uobject_put(entry);
}
- idr_destroy(&ufile->idr);
+ xa_destroy(&ufile->idr);
}
const struct uverbs_obj_type_class uverbs_idr_class = {
@@ -814,6 +765,10 @@ void uverbs_close_fd(struct file *f)
{
struct ib_uobject *uobj = f->private_data;
struct ib_uverbs_file *ufile = uobj->ufile;
+ struct uverbs_attr_bundle attrs = {
+ .context = uobj->context,
+ .ufile = ufile,
+ };
if (down_read_trylock(&ufile->hw_destroy_rwsem)) {
/*
@@ -823,7 +778,7 @@ void uverbs_close_fd(struct file *f)
* write lock here, or we have a kernel bug.
*/
WARN_ON(uverbs_try_lock_object(uobj, UVERBS_LOOKUP_WRITE));
- uverbs_destroy_uobject(uobj, RDMA_REMOVE_CLOSE);
+ uverbs_destroy_uobject(uobj, RDMA_REMOVE_CLOSE, &attrs);
up_read(&ufile->hw_destroy_rwsem);
}
@@ -872,6 +827,7 @@ static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile,
{
struct ib_uobject *obj, *next_obj;
int ret = -EINVAL;
+ struct uverbs_attr_bundle attrs = { .ufile = ufile };
/*
* This shouldn't run while executing other commands on this
@@ -883,12 +839,13 @@ static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile,
* other threads (which might still use the FDs) chance to run.
*/
list_for_each_entry_safe(obj, next_obj, &ufile->uobjects, list) {
+ attrs.context = obj->context;
/*
* if we hit this WARN_ON, that means we are
* racing with a lookup_get.
*/
WARN_ON(uverbs_try_lock_object(obj, UVERBS_LOOKUP_WRITE));
- if (!uverbs_destroy_uobject(obj, reason))
+ if (!uverbs_destroy_uobject(obj, reason, &attrs))
ret = 0;
else
atomic_set(&obj->usecnt, 0);
@@ -967,26 +924,25 @@ const struct uverbs_obj_type_class uverbs_fd_class = {
EXPORT_SYMBOL(uverbs_fd_class);
struct ib_uobject *
-uverbs_get_uobject_from_file(u16 object_id,
- struct ib_uverbs_file *ufile,
- enum uverbs_obj_access access, s64 id)
+uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access,
+ s64 id, struct uverbs_attr_bundle *attrs)
{
const struct uverbs_api_object *obj =
- uapi_get_object(ufile->device->uapi, object_id);
+ uapi_get_object(attrs->ufile->device->uapi, object_id);
switch (access) {
case UVERBS_ACCESS_READ:
- return rdma_lookup_get_uobject(obj, ufile, id,
- UVERBS_LOOKUP_READ);
+ return rdma_lookup_get_uobject(obj, attrs->ufile, id,
+ UVERBS_LOOKUP_READ, attrs);
case UVERBS_ACCESS_DESTROY:
/* Actual destruction is done inside uverbs_handle_method */
- return rdma_lookup_get_uobject(obj, ufile, id,
- UVERBS_LOOKUP_DESTROY);
+ return rdma_lookup_get_uobject(obj, attrs->ufile, id,
+ UVERBS_LOOKUP_DESTROY, attrs);
case UVERBS_ACCESS_WRITE:
- return rdma_lookup_get_uobject(obj, ufile, id,
- UVERBS_LOOKUP_WRITE);
+ return rdma_lookup_get_uobject(obj, attrs->ufile, id,
+ UVERBS_LOOKUP_WRITE, attrs);
case UVERBS_ACCESS_NEW:
- return rdma_alloc_begin_uobject(obj, ufile);
+ return rdma_alloc_begin_uobject(obj, attrs->ufile, attrs);
default:
WARN_ON(true);
return ERR_PTR(-EOPNOTSUPP);
@@ -994,8 +950,8 @@ uverbs_get_uobject_from_file(u16 object_id,
}
int uverbs_finalize_object(struct ib_uobject *uobj,
- enum uverbs_obj_access access,
- bool commit)
+ enum uverbs_obj_access access, bool commit,
+ struct uverbs_attr_bundle *attrs)
{
int ret = 0;
@@ -1018,9 +974,9 @@ int uverbs_finalize_object(struct ib_uobject *uobj,
break;
case UVERBS_ACCESS_NEW:
if (commit)
- ret = rdma_alloc_commit_uobject(uobj);
+ ret = rdma_alloc_commit_uobject(uobj, attrs);
else
- rdma_alloc_abort_uobject(uobj);
+ rdma_alloc_abort_uobject(uobj, attrs);
break;
default:
WARN_ON(true);
diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h
index 69f8db66925e..5445323629b5 100644
--- a/drivers/infiniband/core/rdma_core.h
+++ b/drivers/infiniband/core/rdma_core.h
@@ -48,7 +48,7 @@ struct ib_uverbs_device;
void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile,
enum rdma_remove_reason reason);
-int uobj_destroy(struct ib_uobject *uobj);
+int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs);
/*
* uverbs_uobject_get is called in order to increase the reference count on
@@ -83,9 +83,8 @@ void uverbs_close_fd(struct file *f);
* uverbs_finalize_objects are called.
*/
struct ib_uobject *
-uverbs_get_uobject_from_file(u16 object_id,
- struct ib_uverbs_file *ufile,
- enum uverbs_obj_access access, s64 id);
+uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access,
+ s64 id, struct uverbs_attr_bundle *attrs);
/*
* Note that certain finalize stages could return a status:
@@ -103,8 +102,8 @@ uverbs_get_uobject_from_file(u16 object_id,
* object.
*/
int uverbs_finalize_object(struct ib_uobject *uobj,
- enum uverbs_obj_access access,
- bool commit);
+ enum uverbs_obj_access access, bool commit,
+ struct uverbs_attr_bundle *attrs);
int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx);
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index bb534959abf0..7d8071c7e564 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -40,7 +40,7 @@
#include <linux/slab.h>
#include <linux/dma-mapping.h>
#include <linux/kref.h>
-#include <linux/idr.h>
+#include <linux/xarray.h>
#include <linux/workqueue.h>
#include <uapi/linux/if_ether.h>
#include <rdma/ib_pack.h>
@@ -183,8 +183,7 @@ static struct ib_client sa_client = {
.remove = ib_sa_remove_one
};
-static DEFINE_SPINLOCK(idr_lock);
-static DEFINE_IDR(query_idr);
+static DEFINE_XARRAY_FLAGS(queries, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
static DEFINE_SPINLOCK(tid_lock);
static u32 tid;
@@ -1180,14 +1179,14 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)
struct ib_mad_agent *agent;
struct ib_mad_send_buf *mad_buf;
- spin_lock_irqsave(&idr_lock, flags);
- if (idr_find(&query_idr, id) != query) {
- spin_unlock_irqrestore(&idr_lock, flags);
+ xa_lock_irqsave(&queries, flags);
+ if (xa_load(&queries, id) != query) {
+ xa_unlock_irqrestore(&queries, flags);
return;
}
agent = query->port->agent;
mad_buf = query->mad_buf;
- spin_unlock_irqrestore(&idr_lock, flags);
+ xa_unlock_irqrestore(&queries, flags);
/*
* If the query is still on the netlink request list, schedule
@@ -1363,21 +1362,14 @@ static void init_mad(struct ib_sa_query *query, struct ib_mad_agent *agent)
static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms,
gfp_t gfp_mask)
{
- bool preload = gfpflags_allow_blocking(gfp_mask);
unsigned long flags;
int ret, id;
- if (preload)
- idr_preload(gfp_mask);
- spin_lock_irqsave(&idr_lock, flags);
-
- id = idr_alloc(&query_idr, query, 0, 0, GFP_NOWAIT);
-
- spin_unlock_irqrestore(&idr_lock, flags);
- if (preload)
- idr_preload_end();
- if (id < 0)
- return id;
+ xa_lock_irqsave(&queries, flags);
+ ret = __xa_alloc(&queries, &id, query, xa_limit_32b, gfp_mask);
+ xa_unlock_irqrestore(&queries, flags);
+ if (ret < 0)
+ return ret;
query->mad_buf->timeout_ms = timeout_ms;
query->mad_buf->context[0] = query;
@@ -1394,9 +1386,9 @@ static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms,
ret = ib_post_send_mad(query->mad_buf, NULL);
if (ret) {
- spin_lock_irqsave(&idr_lock, flags);
- idr_remove(&query_idr, id);
- spin_unlock_irqrestore(&idr_lock, flags);
+ xa_lock_irqsave(&queries, flags);
+ __xa_erase(&queries, id);
+ xa_unlock_irqrestore(&queries, flags);
}
/*
@@ -2188,9 +2180,9 @@ static void send_handler(struct ib_mad_agent *agent,
break;
}
- spin_lock_irqsave(&idr_lock, flags);
- idr_remove(&query_idr, query->id);
- spin_unlock_irqrestore(&idr_lock, flags);
+ xa_lock_irqsave(&queries, flags);
+ __xa_erase(&queries, query->id);
+ xa_unlock_irqrestore(&queries, flags);
free_mad(query);
if (query->client)
@@ -2475,5 +2467,5 @@ void ib_sa_cleanup(void)
destroy_workqueue(ib_nl_wq);
mcast_cleanup();
ib_unregister_client(&sa_client);
- idr_destroy(&query_idr);
+ WARN_ON(!xa_empty(&queries));
}
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 9b6a065bdfa5..c78d0c9646ae 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -349,10 +349,15 @@ static struct attribute *port_default_attrs[] = {
static size_t print_ndev(const struct ib_gid_attr *gid_attr, char *buf)
{
- if (!gid_attr->ndev)
- return -EINVAL;
-
- return sprintf(buf, "%s\n", gid_attr->ndev->name);
+ struct net_device *ndev;
+ size_t ret = -EINVAL;
+
+ rcu_read_lock();
+ ndev = rcu_dereference(gid_attr->ndev);
+ if (ndev)
+ ret = sprintf(buf, "%s\n", ndev->name);
+ rcu_read_unlock();
+ return ret;
}
static size_t print_gid_type(const struct ib_gid_attr *gid_attr, char *buf)
@@ -1015,8 +1020,10 @@ err_free_stats:
return;
}
-static int add_port(struct ib_device *device, int port_num)
+static int add_port(struct ib_core_device *coredev, int port_num)
{
+ struct ib_device *device = rdma_device_to_ibdev(&coredev->dev);
+ bool is_full_dev = &device->coredev == coredev;
struct ib_port *p;
struct ib_port_attr attr;
int i;
@@ -1034,7 +1041,7 @@ static int add_port(struct ib_device *device, int port_num)
p->port_num = port_num;
ret = kobject_init_and_add(&p->kobj, &port_type,
- device->ports_kobj,
+ coredev->ports_kobj,
"%d", port_num);
if (ret) {
kfree(p);
@@ -1055,7 +1062,7 @@ static int add_port(struct ib_device *device, int port_num)
goto err_put;
}
- if (device->ops.process_mad) {
+ if (device->ops.process_mad && is_full_dev) {
p->pma_table = get_counter_table(device, port_num);
ret = sysfs_create_group(&p->kobj, p->pma_table);
if (ret)
@@ -1111,7 +1118,7 @@ static int add_port(struct ib_device *device, int port_num)
if (ret)
goto err_free_pkey;
- if (device->ops.init_port) {
+ if (device->ops.init_port && is_full_dev) {
ret = device->ops.init_port(device, port_num, &p->kobj);
if (ret)
goto err_remove_pkey;
@@ -1122,10 +1129,10 @@ static int add_port(struct ib_device *device, int port_num)
* port, so holder should be device. Therefore skip per port conunter
* initialization.
*/
- if (device->ops.alloc_hw_stats && port_num)
+ if (device->ops.alloc_hw_stats && port_num && is_full_dev)
setup_hw_stats(device, p, port_num);
- list_add_tail(&p->kobj.entry, &device->port_list);
+ list_add_tail(&p->kobj.entry, &coredev->port_list);
kobject_uevent(&p->kobj, KOBJ_ADD);
return 0;
@@ -1194,6 +1201,7 @@ static ssize_t node_type_show(struct device *device,
case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type);
case RDMA_NODE_USNIC: return sprintf(buf, "%d: usNIC\n", dev->node_type);
case RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type);
+ case RDMA_NODE_UNSPECIFIED: return sprintf(buf, "%d: unspecified\n", dev->node_type);
case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type);
case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type);
default: return sprintf(buf, "%d: <unknown>\n", dev->node_type);
@@ -1279,11 +1287,11 @@ const struct attribute_group ib_dev_attr_group = {
.attrs = ib_dev_attrs,
};
-static void ib_free_port_attrs(struct ib_device *device)
+void ib_free_port_attrs(struct ib_core_device *coredev)
{
struct kobject *p, *t;
- list_for_each_entry_safe(p, t, &device->port_list, entry) {
+ list_for_each_entry_safe(p, t, &coredev->port_list, entry) {
struct ib_port *port = container_of(p, struct ib_port, kobj);
list_del(&p->entry);
@@ -1303,20 +1311,22 @@ static void ib_free_port_attrs(struct ib_device *device)
kobject_put(p);
}
- kobject_put(device->ports_kobj);
+ kobject_put(coredev->ports_kobj);
}
-static int ib_setup_port_attrs(struct ib_device *device)
+int ib_setup_port_attrs(struct ib_core_device *coredev)
{
+ struct ib_device *device = rdma_device_to_ibdev(&coredev->dev);
unsigned int port;
int ret;
- device->ports_kobj = kobject_create_and_add("ports", &device->dev.kobj);
- if (!device->ports_kobj)
+ coredev->ports_kobj = kobject_create_and_add("ports",
+ &coredev->dev.kobj);
+ if (!coredev->ports_kobj)
return -ENOMEM;
rdma_for_each_port (device, port) {
- ret = add_port(device, port);
+ ret = add_port(coredev, port);
if (ret)
goto err_put;
}
@@ -1324,7 +1334,7 @@ static int ib_setup_port_attrs(struct ib_device *device)
return 0;
err_put:
- ib_free_port_attrs(device);
+ ib_free_port_attrs(coredev);
return ret;
}
@@ -1332,7 +1342,7 @@ int ib_device_register_sysfs(struct ib_device *device)
{
int ret;
- ret = ib_setup_port_attrs(device);
+ ret = ib_setup_port_attrs(&device->coredev);
if (ret)
return ret;
@@ -1348,5 +1358,48 @@ void ib_device_unregister_sysfs(struct ib_device *device)
free_hsag(&device->dev.kobj, device->hw_stats_ag);
kfree(device->hw_stats);
- ib_free_port_attrs(device);
+ ib_free_port_attrs(&device->coredev);
+}
+
+/**
+ * ib_port_register_module_stat - add module counters under relevant port
+ * of IB device.
+ *
+ * @device: IB device to add counters
+ * @port_num: valid port number
+ * @kobj: pointer to the kobject to initialize
+ * @ktype: pointer to the ktype for this kobject.
+ * @name: the name of the kobject
+ */
+int ib_port_register_module_stat(struct ib_device *device, u8 port_num,
+ struct kobject *kobj, struct kobj_type *ktype,
+ const char *name)
+{
+ struct kobject *p, *t;
+ int ret;
+
+ list_for_each_entry_safe(p, t, &device->coredev.port_list, entry) {
+ struct ib_port *port = container_of(p, struct ib_port, kobj);
+
+ if (port->port_num != port_num)
+ continue;
+
+ ret = kobject_init_and_add(kobj, ktype, &port->kobj, "%s",
+ name);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_port_register_module_stat);
+
+/**
+ * ib_port_unregister_module_stat - release module counters
+ * @kobj: pointer to the kobject to release
+ */
+void ib_port_unregister_module_stat(struct kobject *kobj)
+{
+ kobject_put(kobj);
}
+EXPORT_SYMBOL(ib_port_unregister_module_stat);
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
index 65c3230f5663..8e7da2d41fd8 100644
--- a/drivers/infiniband/core/ucm.c
+++ b/drivers/infiniband/core/ucm.c
@@ -42,7 +42,7 @@
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/cdev.h>
-#include <linux/idr.h>
+#include <linux/xarray.h>
#include <linux/mutex.h>
#include <linux/slab.h>
@@ -125,23 +125,22 @@ static struct ib_client ucm_client = {
.remove = ib_ucm_remove_one
};
-static DEFINE_MUTEX(ctx_id_mutex);
-static DEFINE_IDR(ctx_id_table);
+static DEFINE_XARRAY_ALLOC(ctx_id_table);
static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES);
static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id)
{
struct ib_ucm_context *ctx;
- mutex_lock(&ctx_id_mutex);
- ctx = idr_find(&ctx_id_table, id);
+ xa_lock(&ctx_id_table);
+ ctx = xa_load(&ctx_id_table, id);
if (!ctx)
ctx = ERR_PTR(-ENOENT);
else if (ctx->file != file)
ctx = ERR_PTR(-EINVAL);
else
atomic_inc(&ctx->ref);
- mutex_unlock(&ctx_id_mutex);
+ xa_unlock(&ctx_id_table);
return ctx;
}
@@ -194,10 +193,7 @@ static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file)
ctx->file = file;
INIT_LIST_HEAD(&ctx->events);
- mutex_lock(&ctx_id_mutex);
- ctx->id = idr_alloc(&ctx_id_table, ctx, 0, 0, GFP_KERNEL);
- mutex_unlock(&ctx_id_mutex);
- if (ctx->id < 0)
+ if (xa_alloc(&ctx_id_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL))
goto error;
list_add_tail(&ctx->file_list, &file->ctxs);
@@ -514,9 +510,7 @@ static ssize_t ib_ucm_create_id(struct ib_ucm_file *file,
err2:
ib_destroy_cm_id(ctx->cm_id);
err1:
- mutex_lock(&ctx_id_mutex);
- idr_remove(&ctx_id_table, ctx->id);
- mutex_unlock(&ctx_id_mutex);
+ xa_erase(&ctx_id_table, ctx->id);
kfree(ctx);
return result;
}
@@ -536,15 +530,15 @@ static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file,
if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
return -EFAULT;
- mutex_lock(&ctx_id_mutex);
- ctx = idr_find(&ctx_id_table, cmd.id);
+ xa_lock(&ctx_id_table);
+ ctx = xa_load(&ctx_id_table, cmd.id);
if (!ctx)
ctx = ERR_PTR(-ENOENT);
else if (ctx->file != file)
ctx = ERR_PTR(-EINVAL);
else
- idr_remove(&ctx_id_table, ctx->id);
- mutex_unlock(&ctx_id_mutex);
+ __xa_erase(&ctx_id_table, ctx->id);
+ xa_unlock(&ctx_id_table);
if (IS_ERR(ctx))
return PTR_ERR(ctx);
@@ -1189,10 +1183,7 @@ static int ib_ucm_close(struct inode *inode, struct file *filp)
struct ib_ucm_context, file_list);
mutex_unlock(&file->file_mutex);
- mutex_lock(&ctx_id_mutex);
- idr_remove(&ctx_id_table, ctx->id);
- mutex_unlock(&ctx_id_mutex);
-
+ xa_erase(&ctx_id_table, ctx->id);
ib_destroy_cm_id(ctx->cm_id);
ib_ucm_cleanup_events(ctx);
kfree(ctx);
@@ -1352,7 +1343,7 @@ static void __exit ib_ucm_cleanup(void)
class_remove_file(&cm_class, &class_attr_abi_version.attr);
unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR);
unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR);
- idr_destroy(&ctx_id_table);
+ WARN_ON(!xa_empty(&ctx_id_table));
}
module_init(ib_ucm_init);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index fe5551562dbc..0a23048db523 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -37,27 +37,23 @@
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/export.h>
-#include <linux/hugetlb.h>
#include <linux/slab.h>
+#include <linux/pagemap.h>
#include <rdma/ib_umem_odp.h>
#include "uverbs.h"
-
static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
- struct scatterlist *sg;
+ struct sg_page_iter sg_iter;
struct page *page;
- int i;
if (umem->nmap > 0)
- ib_dma_unmap_sg(dev, umem->sg_head.sgl,
- umem->npages,
+ ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
DMA_BIDIRECTIONAL);
- for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
-
- page = sg_page(sg);
+ for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
+ page = sg_page_iter_page(&sg_iter);
if (!PageDirty(page) && umem->writable && dirty)
set_page_dirty_lock(page);
put_page(page);
@@ -66,6 +62,124 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
sg_free_table(&umem->sg_head);
}
+/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
+ *
+ * sg: current scatterlist entry
+ * page_list: array of npage struct page pointers
+ * npages: number of pages in page_list
+ * max_seg_sz: maximum segment size in bytes
+ * nents: [out] number of entries in the scatterlist
+ *
+ * Return new end of scatterlist
+ */
+static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
+ struct page **page_list,
+ unsigned long npages,
+ unsigned int max_seg_sz,
+ int *nents)
+{
+ unsigned long first_pfn;
+ unsigned long i = 0;
+ bool update_cur_sg = false;
+ bool first = !sg_page(sg);
+
+ /* Check if new page_list is contiguous with end of previous page_list.
+ * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0.
+ */
+ if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
+ page_to_pfn(page_list[0])))
+ update_cur_sg = true;
+
+ while (i != npages) {
+ unsigned long len;
+ struct page *first_page = page_list[i];
+
+ first_pfn = page_to_pfn(first_page);
+
+ /* Compute the number of contiguous pages we have starting
+ * at i
+ */
+ for (len = 0; i != npages &&
+ first_pfn + len == page_to_pfn(page_list[i]) &&
+ len < (max_seg_sz >> PAGE_SHIFT);
+ len++)
+ i++;
+
+ /* Squash N contiguous pages from page_list into current sge */
+ if (update_cur_sg) {
+ if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT)) {
+ sg_set_page(sg, sg_page(sg),
+ sg->length + (len << PAGE_SHIFT),
+ 0);
+ update_cur_sg = false;
+ continue;
+ }
+ update_cur_sg = false;
+ }
+
+ /* Squash N contiguous pages into next sge or first sge */
+ if (!first)
+ sg = sg_next(sg);
+
+ (*nents)++;
+ sg_set_page(sg, first_page, len << PAGE_SHIFT, 0);
+ first = false;
+ }
+
+ return sg;
+}
+
+/**
+ * ib_umem_find_best_pgsz - Find best HW page size to use for this MR
+ *
+ * @umem: umem struct
+ * @pgsz_bitmap: bitmap of HW supported page sizes
+ * @virt: IOVA
+ *
+ * This helper is intended for HW that support multiple page
+ * sizes but can do only a single page size in an MR.
+ *
+ * Returns 0 if the umem requires page sizes not supported by
+ * the driver to be mapped. Drivers always supporting PAGE_SIZE
+ * or smaller will never see a 0 result.
+ */
+unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
+ unsigned long pgsz_bitmap,
+ unsigned long virt)
+{
+ struct scatterlist *sg;
+ unsigned int best_pg_bit;
+ unsigned long va, pgoff;
+ dma_addr_t mask;
+ int i;
+
+ /* At minimum, drivers must support PAGE_SIZE or smaller */
+ if (WARN_ON(!(pgsz_bitmap & GENMASK(PAGE_SHIFT, 0))))
+ return 0;
+
+ va = virt;
+ /* max page size not to exceed MR length */
+ mask = roundup_pow_of_two(umem->length);
+ /* offset into first SGL */
+ pgoff = umem->address & ~PAGE_MASK;
+
+ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
+ /* Walk SGL and reduce max page size if VA/PA bits differ
+ * for any address.
+ */
+ mask |= (sg_dma_address(sg) + pgoff) ^ va;
+ if (i && i != (umem->nmap - 1))
+ /* restrict by length as well for interior SGEs */
+ mask |= sg_dma_len(sg);
+ va += sg_dma_len(sg) - pgoff;
+ pgoff = 0;
+ }
+ best_pg_bit = rdma_find_pg_bit(mask, pgsz_bitmap);
+
+ return BIT_ULL(best_pg_bit);
+}
+EXPORT_SYMBOL(ib_umem_find_best_pgsz);
+
/**
* ib_umem_get - Pin and DMA map userspace memory.
*
@@ -84,16 +198,14 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
struct ib_ucontext *context;
struct ib_umem *umem;
struct page **page_list;
- struct vm_area_struct **vma_list;
unsigned long lock_limit;
unsigned long new_pinned;
unsigned long cur_base;
struct mm_struct *mm;
unsigned long npages;
int ret;
- int i;
unsigned long dma_attrs = 0;
- struct scatterlist *sg, *sg_list_start;
+ struct scatterlist *sg;
unsigned int gup_flags = FOLL_WRITE;
if (!udata)
@@ -138,29 +250,23 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
mmgrab(mm);
if (access & IB_ACCESS_ON_DEMAND) {
+ if (WARN_ON_ONCE(!context->invalidate_range)) {
+ ret = -EINVAL;
+ goto umem_kfree;
+ }
+
ret = ib_umem_odp_get(to_ib_umem_odp(umem), access);
if (ret)
goto umem_kfree;
return umem;
}
- /* We assume the memory is from hugetlb until proved otherwise */
- umem->hugetlb = 1;
-
page_list = (struct page **) __get_free_page(GFP_KERNEL);
if (!page_list) {
ret = -ENOMEM;
goto umem_kfree;
}
- /*
- * if we can't alloc the vma_list, it's not so bad;
- * just assume the memory is not hugetlb memory
- */
- vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
- if (!vma_list)
- umem->hugetlb = 0;
-
npages = ib_umem_num_pages(umem);
if (npages == 0 || npages > UINT_MAX) {
ret = -EINVAL;
@@ -185,41 +291,34 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
if (!umem->writable)
gup_flags |= FOLL_FORCE;
- sg_list_start = umem->sg_head.sgl;
+ sg = umem->sg_head.sgl;
while (npages) {
down_read(&mm->mmap_sem);
ret = get_user_pages_longterm(cur_base,
min_t(unsigned long, npages,
PAGE_SIZE / sizeof (struct page *)),
- gup_flags, page_list, vma_list);
+ gup_flags, page_list, NULL);
if (ret < 0) {
up_read(&mm->mmap_sem);
goto umem_release;
}
- umem->npages += ret;
cur_base += ret * PAGE_SIZE;
npages -= ret;
- /* Continue to hold the mmap_sem as vma_list access
- * needs to be protected.
- */
- for_each_sg(sg_list_start, sg, ret, i) {
- if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
- umem->hugetlb = 0;
+ sg = ib_umem_add_sg_table(sg, page_list, ret,
+ dma_get_max_seg_size(context->device->dma_device),
+ &umem->sg_nents);
- sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
- }
up_read(&mm->mmap_sem);
-
- /* preparing for next loop */
- sg_list_start = sg;
}
+ sg_mark_end(sg);
+
umem->nmap = ib_dma_map_sg_attrs(context->device,
umem->sg_head.sgl,
- umem->npages,
+ umem->sg_nents,
DMA_BIDIRECTIONAL,
dma_attrs);
@@ -236,8 +335,6 @@ umem_release:
vma:
atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
out:
- if (vma_list)
- free_page((unsigned long) vma_list);
free_page((unsigned long) page_list);
umem_kfree:
if (ret) {
@@ -315,7 +412,7 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
return -EINVAL;
}
- ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length,
+ ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->sg_nents, dst, length,
offset + ib_umem_offset(umem));
if (ret < 0)
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index e6ec79ad9cc8..c7226cf52acc 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -241,7 +241,7 @@ static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx,
per_mm->mm = mm;
per_mm->umem_tree = RB_ROOT_CACHED;
init_rwsem(&per_mm->umem_rwsem);
- per_mm->active = ctx->invalidate_range;
+ per_mm->active = true;
rcu_read_lock();
per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
@@ -417,9 +417,6 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)
h = hstate_vma(vma);
umem->page_shift = huge_page_shift(h);
up_read(&mm->mmap_sem);
- umem->hugetlb = 1;
- } else {
- umem->hugetlb = 0;
}
mutex_init(&umem_odp->umem_mutex);
@@ -503,7 +500,6 @@ static int ib_umem_odp_map_dma_single_page(
struct ib_umem *umem = &umem_odp->umem;
struct ib_device *dev = umem->context->device;
dma_addr_t dma_addr;
- int stored_page = 0;
int remove_existing_mapping = 0;
int ret = 0;
@@ -527,8 +523,7 @@ static int ib_umem_odp_map_dma_single_page(
}
umem_odp->dma_list[page_index] = dma_addr | access_mask;
umem_odp->page_list[page_index] = page;
- umem->npages++;
- stored_page = 1;
+ umem_odp->npages++;
} else if (umem_odp->page_list[page_index] == page) {
umem_odp->dma_list[page_index] |= access_mask;
} else {
@@ -540,11 +535,9 @@ static int ib_umem_odp_map_dma_single_page(
}
out:
- /* On Demand Paging - avoid pinning the page */
- if (umem->context->invalidate_range || !stored_page)
- put_page(page);
+ put_page(page);
- if (remove_existing_mapping && umem->context->invalidate_range) {
+ if (remove_existing_mapping) {
ib_umem_notifier_start_account(umem_odp);
umem->context->invalidate_range(
umem_odp,
@@ -754,12 +747,9 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
*/
set_page_dirty(head_page);
}
- /* on demand pinning support */
- if (!umem->context->invalidate_range)
- put_page(page);
umem_odp->page_list[idx] = NULL;
umem_odp->dma_list[idx] = 0;
- umem->npages--;
+ umem_odp->npages--;
}
}
mutex_unlock(&umem_odp->umem_mutex);
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index b58b07c03cfb..671f07ba1fad 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -129,6 +129,9 @@ struct ib_umad_packet {
struct ib_user_mad mad;
};
+#define CREATE_TRACE_POINTS
+#include <trace/events/ib_umad.h>
+
static const dev_t base_umad_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) +
IB_UMAD_NUM_FIXED_MINOR;
@@ -334,6 +337,9 @@ static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf,
return -EFAULT;
}
}
+
+ trace_ib_umad_read_recv(file, &packet->mad.hdr, &recv_buf->mad->mad_hdr);
+
return hdr_size(file) + packet->length;
}
@@ -353,6 +359,9 @@ static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf,
if (copy_to_user(buf, packet->mad.data, packet->length))
return -EFAULT;
+ trace_ib_umad_read_send(file, &packet->mad.hdr,
+ (struct ib_mad_hdr *)&packet->mad.data);
+
return size;
}
@@ -508,6 +517,9 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
mutex_lock(&file->mutex);
+ trace_ib_umad_write(file, &packet->mad.hdr,
+ (struct ib_mad_hdr *)&packet->mad.data);
+
agent = __get_agent(file, packet->mad.hdr.id);
if (!agent) {
ret = -EINVAL;
@@ -968,6 +980,11 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
goto out;
}
+ if (!rdma_dev_access_netns(port->ib_dev, current->nsproxy->net_ns)) {
+ ret = -EPERM;
+ goto out;
+ }
+
file = kzalloc(sizeof(*file), GFP_KERNEL);
if (!file) {
ret = -ENOMEM;
@@ -1061,6 +1078,11 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp)
}
}
+ if (!rdma_dev_access_netns(port->ib_dev, current->nsproxy->net_ns)) {
+ ret = -EPERM;
+ goto err_up_sem;
+ }
+
ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
if (ret)
goto err_up_sem;
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 32cc8fe7902f..1e5aeb39f774 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -162,9 +162,7 @@ struct ib_uverbs_file {
struct list_head umaps;
struct page *disassociate_page;
- struct idr idr;
- /* spinlock protects write access to idr */
- spinlock_t idr_lock;
+ struct xarray idr;
};
struct ib_uverbs_event {
@@ -241,7 +239,8 @@ void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
void ib_uverbs_event_handler(struct ib_event_handler *handler,
struct ib_event *event);
int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd,
- enum rdma_remove_reason why);
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs);
int uverbs_dealloc_mw(struct ib_mw *mw);
void ib_uverbs_detach_umcast(struct ib_qp *qp,
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 062a86c04123..5a3a1780ceea 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -162,7 +162,7 @@ static const void __user *uverbs_request_next_ptr(struct uverbs_req_iter *iter,
const void __user *res = iter->cur;
if (iter->cur + len > iter->end)
- return ERR_PTR(-ENOSPC);
+ return (void __force __user *)ERR_PTR(-ENOSPC);
iter->cur += len;
return res;
}
@@ -175,7 +175,7 @@ static int uverbs_request_finish(struct uverbs_req_iter *iter)
}
static struct ib_uverbs_completion_event_file *
-_ib_uverbs_lookup_comp_file(s32 fd, const struct uverbs_attr_bundle *attrs)
+_ib_uverbs_lookup_comp_file(s32 fd, struct uverbs_attr_bundle *attrs)
{
struct ib_uobject *uobj = ufd_get_read(UVERBS_OBJECT_COMP_CHANNEL,
fd, attrs);
@@ -230,6 +230,8 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
goto err_alloc;
}
+ attrs->context = ucontext;
+
ucontext->res.type = RDMA_RESTRACK_CTX;
ucontext->device = ib_dev;
ucontext->cg_obj = cg_obj;
@@ -423,7 +425,7 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs)
atomic_set(&pd->usecnt, 0);
pd->res.type = RDMA_RESTRACK_PD;
- ret = ib_dev->ops.alloc_pd(pd, uobj->context, &attrs->driver_udata);
+ ret = ib_dev->ops.alloc_pd(pd, &attrs->driver_udata);
if (ret)
goto err_alloc;
@@ -436,15 +438,15 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs)
if (ret)
goto err_copy;
- return uobj_alloc_commit(uobj);
+ return uobj_alloc_commit(uobj, attrs);
err_copy:
- ib_dealloc_pd(pd);
+ ib_dealloc_pd_user(pd, &attrs->driver_udata);
pd = NULL;
err_alloc:
kfree(pd);
err:
- uobj_alloc_abort(uobj);
+ uobj_alloc_abort(uobj, attrs);
return ret;
}
@@ -594,8 +596,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
}
if (!xrcd) {
- xrcd = ib_dev->ops.alloc_xrcd(ib_dev, obj->uobject.context,
- &attrs->driver_udata);
+ xrcd = ib_dev->ops.alloc_xrcd(ib_dev, &attrs->driver_udata);
if (IS_ERR(xrcd)) {
ret = PTR_ERR(xrcd);
goto err;
@@ -633,7 +634,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
mutex_unlock(&ibudev->xrcd_tree_mutex);
- return uobj_alloc_commit(&obj->uobject);
+ return uobj_alloc_commit(&obj->uobject, attrs);
err_copy:
if (inode) {
@@ -643,10 +644,10 @@ err_copy:
}
err_dealloc_xrcd:
- ib_dealloc_xrcd(xrcd);
+ ib_dealloc_xrcd(xrcd, &attrs->driver_udata);
err:
- uobj_alloc_abort(&obj->uobject);
+ uobj_alloc_abort(&obj->uobject, attrs);
err_tree_mutex_unlock:
if (f.file)
@@ -669,19 +670,19 @@ static int ib_uverbs_close_xrcd(struct uverbs_attr_bundle *attrs)
return uobj_perform_destroy(UVERBS_OBJECT_XRCD, cmd.xrcd_handle, attrs);
}
-int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject,
- struct ib_xrcd *xrcd,
- enum rdma_remove_reason why)
+int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
{
struct inode *inode;
int ret;
- struct ib_uverbs_device *dev = uobject->context->ufile->device;
+ struct ib_uverbs_device *dev = attrs->ufile->device;
inode = xrcd->inode;
if (inode && !atomic_dec_and_test(&xrcd->usecnt))
return 0;
- ret = ib_dealloc_xrcd(xrcd);
+ ret = ib_dealloc_xrcd(xrcd, &attrs->driver_udata);
if (ib_is_destroy_retryable(ret, why, uobject)) {
atomic_inc(&xrcd->usecnt);
@@ -763,16 +764,16 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)
uobj_put_obj_read(pd);
- return uobj_alloc_commit(uobj);
+ return uobj_alloc_commit(uobj, attrs);
err_copy:
- ib_dereg_mr(mr);
+ ib_dereg_mr_user(mr, &attrs->driver_udata);
err_put:
uobj_put_obj_read(pd);
err_free:
- uobj_alloc_abort(uobj);
+ uobj_alloc_abort(uobj, attrs);
return ret;
}
@@ -917,14 +918,14 @@ static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs)
goto err_copy;
uobj_put_obj_read(pd);
- return uobj_alloc_commit(uobj);
+ return uobj_alloc_commit(uobj, attrs);
err_copy:
uverbs_dealloc_mw(mw);
err_put:
uobj_put_obj_read(pd);
err_free:
- uobj_alloc_abort(uobj);
+ uobj_alloc_abort(uobj, attrs);
return ret;
}
@@ -965,11 +966,11 @@ static int ib_uverbs_create_comp_channel(struct uverbs_attr_bundle *attrs)
ret = uverbs_response(attrs, &resp, sizeof(resp));
if (ret) {
- uobj_alloc_abort(uobj);
+ uobj_alloc_abort(uobj, attrs);
return ret;
}
- return uobj_alloc_commit(uobj);
+ return uobj_alloc_commit(uobj, attrs);
}
static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,
@@ -1009,8 +1010,7 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,
attr.comp_vector = cmd->comp_vector;
attr.flags = cmd->flags;
- cq = ib_dev->ops.create_cq(ib_dev, &attr, obj->uobject.context,
- &attrs->driver_udata);
+ cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata);
if (IS_ERR(cq)) {
ret = PTR_ERR(cq);
goto err_file;
@@ -1036,7 +1036,7 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,
if (ret)
goto err_cb;
- ret = uobj_alloc_commit(&obj->uobject);
+ ret = uobj_alloc_commit(&obj->uobject, attrs);
if (ret)
return ERR_PTR(ret);
return obj;
@@ -1049,7 +1049,7 @@ err_file:
ib_uverbs_release_ucq(attrs->ufile, ev_file, obj);
err:
- uobj_alloc_abort(&obj->uobject);
+ uobj_alloc_abort(&obj->uobject, attrs);
return ERR_PTR(ret);
}
@@ -1418,7 +1418,6 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
if (ret)
goto err_cb;
- qp->real_qp = qp;
qp->pd = pd;
qp->send_cq = attr.send_cq;
qp->recv_cq = attr.recv_cq;
@@ -1477,7 +1476,7 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
if (ind_tbl)
uobj_put_obj_read(ind_tbl);
- return uobj_alloc_commit(&obj->uevent.uobject);
+ return uobj_alloc_commit(&obj->uevent.uobject, attrs);
err_cb:
ib_destroy_qp(qp);
@@ -1495,7 +1494,7 @@ err_put:
if (ind_tbl)
uobj_put_obj_read(ind_tbl);
- uobj_alloc_abort(&obj->uevent.uobject);
+ uobj_alloc_abort(&obj->uevent.uobject, attrs);
return ret;
}
@@ -1609,14 +1608,14 @@ static int ib_uverbs_open_qp(struct uverbs_attr_bundle *attrs)
qp->uobject = &obj->uevent.uobject;
uobj_put_read(xrcd_uobj);
- return uobj_alloc_commit(&obj->uevent.uobject);
+ return uobj_alloc_commit(&obj->uevent.uobject, attrs);
err_destroy:
ib_destroy_qp(qp);
err_xrcd:
uobj_put_read(xrcd_uobj);
err_put:
- uobj_alloc_abort(&obj->uevent.uobject);
+ uobj_alloc_abort(&obj->uevent.uobject, attrs);
return ret;
}
@@ -2451,7 +2450,7 @@ static int ib_uverbs_create_ah(struct uverbs_attr_bundle *attrs)
goto err_copy;
uobj_put_obj_read(pd);
- return uobj_alloc_commit(uobj);
+ return uobj_alloc_commit(uobj, attrs);
err_copy:
rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE);
@@ -2460,7 +2459,7 @@ err_put:
uobj_put_obj_read(pd);
err:
- uobj_alloc_abort(uobj);
+ uobj_alloc_abort(uobj, attrs);
return ret;
}
@@ -2962,16 +2961,16 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs)
uobj_put_obj_read(pd);
uobj_put_obj_read(cq);
- return uobj_alloc_commit(&obj->uevent.uobject);
+ return uobj_alloc_commit(&obj->uevent.uobject, attrs);
err_copy:
- ib_destroy_wq(wq);
+ ib_destroy_wq(wq, &attrs->driver_udata);
err_put_cq:
uobj_put_obj_read(cq);
err_put_pd:
uobj_put_obj_read(pd);
err_uobj:
- uobj_alloc_abort(&obj->uevent.uobject);
+ uobj_alloc_abort(&obj->uevent.uobject, attrs);
return err;
}
@@ -3136,12 +3135,12 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs)
for (j = 0; j < num_read_wqs; j++)
uobj_put_obj_read(wqs[j]);
- return uobj_alloc_commit(uobj);
+ return uobj_alloc_commit(uobj, attrs);
err_copy:
ib_destroy_rwq_ind_table(rwq_ind_tbl);
err_uobj:
- uobj_alloc_abort(uobj);
+ uobj_alloc_abort(uobj, attrs);
put_wqs:
for (j = 0; j < num_read_wqs; j++)
uobj_put_obj_read(wqs[j]);
@@ -3314,7 +3313,7 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs)
kfree(flow_attr);
if (cmd.flow_attr.num_of_specs)
kfree(kern_flow_attr);
- return uobj_alloc_commit(uobj);
+ return uobj_alloc_commit(uobj, attrs);
err_copy:
if (!qp->device->ops.destroy_flow(flow_id))
atomic_dec(&qp->usecnt);
@@ -3325,7 +3324,7 @@ err_free_flow_attr:
err_put:
uobj_put_obj_read(qp);
err_uobj:
- uobj_alloc_abort(uobj);
+ uobj_alloc_abort(uobj, attrs);
err_free_attr:
if (cmd.flow_attr.num_of_specs)
kfree(kern_flow_attr);
@@ -3411,9 +3410,9 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs,
obj->uevent.events_reported = 0;
INIT_LIST_HEAD(&obj->uevent.event_list);
- srq = pd->device->ops.create_srq(pd, &attr, udata);
- if (IS_ERR(srq)) {
- ret = PTR_ERR(srq);
+ srq = rdma_zalloc_drv_obj(ib_dev, ib_srq);
+ if (!srq) {
+ ret = -ENOMEM;
goto err_put;
}
@@ -3424,6 +3423,10 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs,
srq->event_handler = attr.event_handler;
srq->srq_context = attr.srq_context;
+ ret = pd->device->ops.create_srq(srq, &attr, udata);
+ if (ret)
+ goto err_free;
+
if (ib_srq_has_cq(cmd->srq_type)) {
srq->ext.cq = attr.ext.cq;
atomic_inc(&attr.ext.cq->usecnt);
@@ -3458,11 +3461,13 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs,
uobj_put_obj_read(attr.ext.cq);
uobj_put_obj_read(pd);
- return uobj_alloc_commit(&obj->uevent.uobject);
+ return uobj_alloc_commit(&obj->uevent.uobject, attrs);
err_copy:
- ib_destroy_srq(srq);
+ ib_destroy_srq_user(srq, &attrs->driver_udata);
+err_free:
+ kfree(srq);
err_put:
uobj_put_obj_read(pd);
@@ -3477,7 +3482,7 @@ err_put_xrcd:
}
err:
- uobj_alloc_abort(&obj->uevent.uobject);
+ uobj_alloc_abort(&obj->uevent.uobject, attrs);
return ret;
}
diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c
index e1379949e663..829b0c6944d8 100644
--- a/drivers/infiniband/core/uverbs_ioctl.c
+++ b/drivers/infiniband/core/uverbs_ioctl.c
@@ -207,13 +207,12 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle,
for (i = 0; i != array_len; i++) {
attr->uobjects[i] = uverbs_get_uobject_from_file(
- spec->u2.objs_arr.obj_type, pbundle->bundle.ufile,
- spec->u2.objs_arr.access, idr_vals[i]);
+ spec->u2.objs_arr.obj_type, spec->u2.objs_arr.access,
+ idr_vals[i], &pbundle->bundle);
if (IS_ERR(attr->uobjects[i])) {
ret = PTR_ERR(attr->uobjects[i]);
break;
}
- pbundle->bundle.context = attr->uobjects[i]->context;
}
attr->len = i;
@@ -223,7 +222,7 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle,
static int uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi,
struct uverbs_objs_arr_attr *attr,
- bool commit)
+ bool commit, struct uverbs_attr_bundle *attrs)
{
const struct uverbs_attr_spec *spec = &attr_uapi->spec;
int current_ret;
@@ -231,8 +230,9 @@ static int uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi,
size_t i;
for (i = 0; i != attr->len; i++) {
- current_ret = uverbs_finalize_object(
- attr->uobjects[i], spec->u2.objs_arr.access, commit);
+ current_ret = uverbs_finalize_object(attr->uobjects[i],
+ spec->u2.objs_arr.access,
+ commit, attrs);
if (!ret)
ret = current_ret;
}
@@ -325,13 +325,10 @@ static int uverbs_process_attr(struct bundle_priv *pbundle,
* IDR implementation today rejects negative IDs
*/
o_attr->uobject = uverbs_get_uobject_from_file(
- spec->u.obj.obj_type,
- pbundle->bundle.ufile,
- spec->u.obj.access,
- uattr->data_s64);
+ spec->u.obj.obj_type, spec->u.obj.access,
+ uattr->data_s64, &pbundle->bundle);
if (IS_ERR(o_attr->uobject))
return PTR_ERR(o_attr->uobject);
- pbundle->bundle.context = o_attr->uobject->context;
__set_bit(attr_bkey, pbundle->uobj_finalize);
if (spec->u.obj.access == UVERBS_ACCESS_NEW) {
@@ -456,12 +453,14 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle,
uverbs_fill_udata(&pbundle->bundle,
&pbundle->bundle.driver_udata,
UVERBS_ATTR_UHW_IN, UVERBS_ATTR_UHW_OUT);
+ else
+ pbundle->bundle.driver_udata = (struct ib_udata){};
if (destroy_bkey != UVERBS_API_ATTR_BKEY_LEN) {
struct uverbs_obj_attr *destroy_attr =
&pbundle->bundle.attrs[destroy_bkey].obj_attr;
- ret = uobj_destroy(destroy_attr->uobject);
+ ret = uobj_destroy(destroy_attr->uobject, &pbundle->bundle);
if (ret)
return ret;
__clear_bit(destroy_bkey, pbundle->uobj_finalize);
@@ -512,7 +511,8 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit)
current_ret = uverbs_finalize_object(
attr->obj_attr.uobject,
- attr->obj_attr.attr_elm->spec.u.obj.access, commit);
+ attr->obj_attr.attr_elm->spec.u.obj.access, commit,
+ &pbundle->bundle);
if (!ret)
ret = current_ret;
}
@@ -535,7 +535,8 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit)
if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) {
current_ret = uverbs_free_idrs_array(
- attr_uapi, &attr->objs_arr_attr, commit);
+ attr_uapi, &attr->objs_arr_attr, commit,
+ &pbundle->bundle);
if (!ret)
ret = current_ret;
}
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 8b43dd96d3b2..84a5e9a6d483 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -723,7 +723,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
* then the command request structure starts
* with a '__aligned u64 response' member.
*/
- ret = get_user(response, (const u64 *)buf);
+ ret = get_user(response, (const u64 __user *)buf);
if (ret)
goto out_unlock;
@@ -926,43 +926,32 @@ static const struct vm_operations_struct rdma_umap_ops = {
.fault = rdma_umap_fault,
};
-static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext,
- struct vm_area_struct *vma,
- unsigned long size)
+/*
+ * Map IO memory into a process. This is to be called by drivers as part of
+ * their mmap() functions if they wish to send something like PCI-E BAR memory
+ * to userspace.
+ */
+int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
{
struct ib_uverbs_file *ufile = ucontext->ufile;
struct rdma_umap_priv *priv;
if (!(vma->vm_flags & VM_SHARED))
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
if (vma->vm_end - vma->vm_start != size)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
/* Driver is using this wrong, must be called by ib_uverbs_mmap */
if (WARN_ON(!vma->vm_file ||
vma->vm_file->private_data != ufile))
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
lockdep_assert_held(&ufile->device->disassociate_srcu);
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (!priv)
- return ERR_PTR(-ENOMEM);
- return priv;
-}
-
-/*
- * Map IO memory into a process. This is to be called by drivers as part of
- * their mmap() functions if they wish to send something like PCI-E BAR memory
- * to userspace.
- */
-int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
- unsigned long pfn, unsigned long size, pgprot_t prot)
-{
- struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size);
-
- if (IS_ERR(priv))
- return PTR_ERR(priv);
+ return -ENOMEM;
vma->vm_page_prot = prot;
if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) {
@@ -975,35 +964,6 @@ int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
}
EXPORT_SYMBOL(rdma_user_mmap_io);
-/*
- * The page case is here for a slightly different reason, the driver expects
- * to be able to free the page it is sharing to user space when it destroys
- * its ucontext, which means we need to zap the user space references.
- *
- * We could handle this differently by providing an API to allocate a shared
- * page and then only freeing the shared page when the last ufile is
- * destroyed.
- */
-int rdma_user_mmap_page(struct ib_ucontext *ucontext,
- struct vm_area_struct *vma, struct page *page,
- unsigned long size)
-{
- struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size);
-
- if (IS_ERR(priv))
- return PTR_ERR(priv);
-
- if (remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), size,
- vma->vm_page_prot)) {
- kfree(priv);
- return -EAGAIN;
- }
-
- rdma_umap_priv_init(priv, vma);
- return 0;
-}
-EXPORT_SYMBOL(rdma_user_mmap_page);
-
void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
{
struct rdma_umap_priv *priv, *next_priv;
@@ -1094,6 +1054,11 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
goto err;
}
+ if (!rdma_dev_access_netns(ib_dev, current->nsproxy->net_ns)) {
+ ret = -EPERM;
+ goto err;
+ }
+
/* In case IB device supports disassociate ucontext, there is no hard
* dependency between uverbs device and its low level device.
*/
diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c
index f224cb727224..35b2e2c640cc 100644
--- a/drivers/infiniband/core/uverbs_std_types.c
+++ b/drivers/infiniband/core/uverbs_std_types.c
@@ -40,14 +40,17 @@
#include "uverbs.h"
static int uverbs_free_ah(struct ib_uobject *uobject,
- enum rdma_remove_reason why)
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
{
- return rdma_destroy_ah((struct ib_ah *)uobject->object,
- RDMA_DESTROY_AH_SLEEPABLE);
+ return rdma_destroy_ah_user((struct ib_ah *)uobject->object,
+ RDMA_DESTROY_AH_SLEEPABLE,
+ &attrs->driver_udata);
}
static int uverbs_free_flow(struct ib_uobject *uobject,
- enum rdma_remove_reason why)
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
{
struct ib_flow *flow = (struct ib_flow *)uobject->object;
struct ib_uflow_object *uflow =
@@ -66,13 +69,15 @@ static int uverbs_free_flow(struct ib_uobject *uobject,
}
static int uverbs_free_mw(struct ib_uobject *uobject,
- enum rdma_remove_reason why)
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
{
return uverbs_dealloc_mw((struct ib_mw *)uobject->object);
}
static int uverbs_free_qp(struct ib_uobject *uobject,
- enum rdma_remove_reason why)
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
{
struct ib_qp *qp = uobject->object;
struct ib_uqp_object *uqp =
@@ -93,19 +98,20 @@ static int uverbs_free_qp(struct ib_uobject *uobject,
ib_uverbs_detach_umcast(qp, uqp);
}
- ret = ib_destroy_qp(qp);
+ ret = ib_destroy_qp_user(qp, &attrs->driver_udata);
if (ib_is_destroy_retryable(ret, why, uobject))
return ret;
if (uqp->uxrcd)
atomic_dec(&uqp->uxrcd->refcnt);
- ib_uverbs_release_uevent(uobject->context->ufile, &uqp->uevent);
+ ib_uverbs_release_uevent(attrs->ufile, &uqp->uevent);
return ret;
}
static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject,
- enum rdma_remove_reason why)
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
{
struct ib_rwq_ind_table *rwq_ind_tbl = uobject->object;
struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl;
@@ -120,23 +126,25 @@ static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject,
}
static int uverbs_free_wq(struct ib_uobject *uobject,
- enum rdma_remove_reason why)
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
{
struct ib_wq *wq = uobject->object;
struct ib_uwq_object *uwq =
container_of(uobject, struct ib_uwq_object, uevent.uobject);
int ret;
- ret = ib_destroy_wq(wq);
+ ret = ib_destroy_wq(wq, &attrs->driver_udata);
if (ib_is_destroy_retryable(ret, why, uobject))
return ret;
- ib_uverbs_release_uevent(uobject->context->ufile, &uwq->uevent);
+ ib_uverbs_release_uevent(attrs->ufile, &uwq->uevent);
return ret;
}
static int uverbs_free_srq(struct ib_uobject *uobject,
- enum rdma_remove_reason why)
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
{
struct ib_srq *srq = uobject->object;
struct ib_uevent_object *uevent =
@@ -144,7 +152,7 @@ static int uverbs_free_srq(struct ib_uobject *uobject,
enum ib_srq_type srq_type = srq->srq_type;
int ret;
- ret = ib_destroy_srq(srq);
+ ret = ib_destroy_srq_user(srq, &attrs->driver_udata);
if (ib_is_destroy_retryable(ret, why, uobject))
return ret;
@@ -155,12 +163,13 @@ static int uverbs_free_srq(struct ib_uobject *uobject,
atomic_dec(&us->uxrcd->refcnt);
}
- ib_uverbs_release_uevent(uobject->context->ufile, uevent);
+ ib_uverbs_release_uevent(attrs->ufile, uevent);
return ret;
}
static int uverbs_free_xrcd(struct ib_uobject *uobject,
- enum rdma_remove_reason why)
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
{
struct ib_xrcd *xrcd = uobject->object;
struct ib_uxrcd_object *uxrcd =
@@ -171,15 +180,16 @@ static int uverbs_free_xrcd(struct ib_uobject *uobject,
if (ret)
return ret;
- mutex_lock(&uobject->context->ufile->device->xrcd_tree_mutex);
- ret = ib_uverbs_dealloc_xrcd(uobject, xrcd, why);
- mutex_unlock(&uobject->context->ufile->device->xrcd_tree_mutex);
+ mutex_lock(&attrs->ufile->device->xrcd_tree_mutex);
+ ret = ib_uverbs_dealloc_xrcd(uobject, xrcd, why, attrs);
+ mutex_unlock(&attrs->ufile->device->xrcd_tree_mutex);
return ret;
}
static int uverbs_free_pd(struct ib_uobject *uobject,
- enum rdma_remove_reason why)
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
{
struct ib_pd *pd = uobject->object;
int ret;
@@ -188,7 +198,7 @@ static int uverbs_free_pd(struct ib_uobject *uobject,
if (ret)
return ret;
- ib_dealloc_pd(pd);
+ ib_dealloc_pd_user(pd, &attrs->driver_udata);
return 0;
}
diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c b/drivers/infiniband/core/uverbs_std_types_counters.c
index 309c5e80988d..9f013304e677 100644
--- a/drivers/infiniband/core/uverbs_std_types_counters.c
+++ b/drivers/infiniband/core/uverbs_std_types_counters.c
@@ -31,11 +31,13 @@
* SOFTWARE.
*/
+#include "rdma_core.h"
#include "uverbs.h"
#include <rdma/uverbs_std_types.h>
static int uverbs_free_counters(struct ib_uobject *uobject,
- enum rdma_remove_reason why)
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
{
struct ib_counters *counters = uobject->object;
int ret;
@@ -52,7 +54,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)(
{
struct ib_uobject *uobj = uverbs_attr_get_uobject(
attrs, UVERBS_ATTR_CREATE_COUNTERS_HANDLE);
- struct ib_device *ib_dev = uobj->context->device;
+ struct ib_device *ib_dev = attrs->context->device;
struct ib_counters *counters;
int ret;
diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c
index a59ea89e3f2b..db5c46a1bb2d 100644
--- a/drivers/infiniband/core/uverbs_std_types_cq.c
+++ b/drivers/infiniband/core/uverbs_std_types_cq.c
@@ -35,7 +35,8 @@
#include "uverbs.h"
static int uverbs_free_cq(struct ib_uobject *uobject,
- enum rdma_remove_reason why)
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
{
struct ib_cq *cq = uobject->object;
struct ib_uverbs_event_queue *ev_queue = cq->cq_context;
@@ -43,12 +44,12 @@ static int uverbs_free_cq(struct ib_uobject *uobject,
container_of(uobject, struct ib_ucq_object, uobject);
int ret;
- ret = ib_destroy_cq(cq);
+ ret = ib_destroy_cq_user(cq, &attrs->driver_udata);
if (ib_is_destroy_retryable(ret, why, uobject))
return ret;
ib_uverbs_release_ucq(
- uobject->context->ufile,
+ attrs->ufile,
ev_queue ? container_of(ev_queue,
struct ib_uverbs_completion_event_file,
ev_queue) :
@@ -63,7 +64,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
struct ib_ucq_object *obj = container_of(
uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE),
typeof(*obj), uobject);
- struct ib_device *ib_dev = obj->uobject.context->device;
+ struct ib_device *ib_dev = attrs->context->device;
int ret;
u64 user_handle;
struct ib_cq_init_attr attr = {};
@@ -110,8 +111,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
INIT_LIST_HEAD(&obj->comp_list);
INIT_LIST_HEAD(&obj->async_list);
- cq = ib_dev->ops.create_cq(ib_dev, &attr, obj->uobject.context,
- &attrs->driver_udata);
+ cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata);
if (IS_ERR(cq)) {
ret = PTR_ERR(cq);
goto err_event_file;
diff --git a/drivers/infiniband/core/uverbs_std_types_dm.c b/drivers/infiniband/core/uverbs_std_types_dm.c
index 2ef70637bee1..d5a1de33c2c9 100644
--- a/drivers/infiniband/core/uverbs_std_types_dm.c
+++ b/drivers/infiniband/core/uverbs_std_types_dm.c
@@ -30,11 +30,13 @@
* SOFTWARE.
*/
+#include "rdma_core.h"
#include "uverbs.h"
#include <rdma/uverbs_std_types.h>
static int uverbs_free_dm(struct ib_uobject *uobject,
- enum rdma_remove_reason why)
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
{
struct ib_dm *dm = uobject->object;
int ret;
@@ -43,7 +45,7 @@ static int uverbs_free_dm(struct ib_uobject *uobject,
if (ret)
return ret;
- return dm->device->ops.dealloc_dm(dm);
+ return dm->device->ops.dealloc_dm(dm, attrs);
}
static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(
@@ -53,7 +55,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(
struct ib_uobject *uobj =
uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DM_HANDLE)
->obj_attr.uobject;
- struct ib_device *ib_dev = uobj->context->device;
+ struct ib_device *ib_dev = attrs->context->device;
struct ib_dm *dm;
int ret;
@@ -70,7 +72,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(
if (ret)
return ret;
- dm = ib_dev->ops.alloc_dm(ib_dev, uobj->context, &attr, attrs);
+ dm = ib_dev->ops.alloc_dm(ib_dev, attrs->context, &attr, attrs);
if (IS_ERR(dm))
return PTR_ERR(dm);
diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c
index 4962b87fa600..459cf165b231 100644
--- a/drivers/infiniband/core/uverbs_std_types_flow_action.c
+++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c
@@ -30,11 +30,13 @@
* SOFTWARE.
*/
+#include "rdma_core.h"
#include "uverbs.h"
#include <rdma/uverbs_std_types.h>
static int uverbs_free_flow_action(struct ib_uobject *uobject,
- enum rdma_remove_reason why)
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
{
struct ib_flow_action *action = uobject->object;
int ret;
@@ -308,7 +310,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(
{
struct ib_uobject *uobj = uverbs_attr_get_uobject(
attrs, UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE);
- struct ib_device *ib_dev = uobj->context->device;
+ struct ib_device *ib_dev = attrs->context->device;
int ret;
struct ib_flow_action *action;
struct ib_flow_action_esp_attr esp_attr = {};
diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c
index 4d4be0c2b752..610d3b9f7654 100644
--- a/drivers/infiniband/core/uverbs_std_types_mr.c
+++ b/drivers/infiniband/core/uverbs_std_types_mr.c
@@ -30,13 +30,16 @@
* SOFTWARE.
*/
+#include "rdma_core.h"
#include "uverbs.h"
#include <rdma/uverbs_std_types.h>
static int uverbs_free_mr(struct ib_uobject *uobject,
- enum rdma_remove_reason why)
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
{
- return ib_dereg_mr((struct ib_mr *)uobject->object);
+ return ib_dereg_mr_user((struct ib_mr *)uobject->object,
+ &attrs->driver_udata);
}
static int UVERBS_HANDLER(UVERBS_METHOD_ADVISE_MR)(
@@ -145,7 +148,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(
return 0;
err_dereg:
- ib_dereg_mr(mr);
+ ib_dereg_mr_user(mr, &attrs->driver_udata);
return ret;
}
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 5a5e83f5f0fc..e666a1f7608d 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -218,6 +218,8 @@ rdma_node_get_transport(enum rdma_node_type node_type)
return RDMA_TRANSPORT_USNIC_UDP;
if (node_type == RDMA_NODE_RNIC)
return RDMA_TRANSPORT_IWARP;
+ if (node_type == RDMA_NODE_UNSPECIFIED)
+ return RDMA_TRANSPORT_UNSPECIFIED;
return RDMA_TRANSPORT_IB;
}
@@ -269,7 +271,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
pd->res.type = RDMA_RESTRACK_PD;
rdma_restrack_set_task(&pd->res, caller);
- ret = device->ops.alloc_pd(pd, NULL, NULL);
+ ret = device->ops.alloc_pd(pd, NULL);
if (ret) {
kfree(pd);
return ERR_PTR(ret);
@@ -316,17 +318,18 @@ EXPORT_SYMBOL(__ib_alloc_pd);
/**
* ib_dealloc_pd - Deallocates a protection domain.
* @pd: The protection domain to deallocate.
+ * @udata: Valid user data or NULL for kernel object
*
* It is an error to call this function while any resources in the pd still
* exist. The caller is responsible to synchronously destroy them and
* guarantee no new allocations will happen.
*/
-void ib_dealloc_pd(struct ib_pd *pd)
+void ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata)
{
int ret;
if (pd->__internal_mr) {
- ret = pd->device->ops.dereg_mr(pd->__internal_mr);
+ ret = pd->device->ops.dereg_mr(pd->__internal_mr, NULL);
WARN_ON(ret);
pd->__internal_mr = NULL;
}
@@ -336,10 +339,10 @@ void ib_dealloc_pd(struct ib_pd *pd)
WARN_ON(atomic_read(&pd->usecnt));
rdma_restrack_del(&pd->res);
- pd->device->ops.dealloc_pd(pd);
+ pd->device->ops.dealloc_pd(pd, udata);
kfree(pd);
}
-EXPORT_SYMBOL(ib_dealloc_pd);
+EXPORT_SYMBOL(ib_dealloc_pd_user);
/* Address handles */
@@ -495,25 +498,33 @@ static struct ib_ah *_rdma_create_ah(struct ib_pd *pd,
u32 flags,
struct ib_udata *udata)
{
+ struct ib_device *device = pd->device;
struct ib_ah *ah;
+ int ret;
might_sleep_if(flags & RDMA_CREATE_AH_SLEEPABLE);
- if (!pd->device->ops.create_ah)
+ if (!device->ops.create_ah)
return ERR_PTR(-EOPNOTSUPP);
- ah = pd->device->ops.create_ah(pd, ah_attr, flags, udata);
+ ah = rdma_zalloc_drv_obj_gfp(
+ device, ib_ah,
+ (flags & RDMA_CREATE_AH_SLEEPABLE) ? GFP_KERNEL : GFP_ATOMIC);
+ if (!ah)
+ return ERR_PTR(-ENOMEM);
- if (!IS_ERR(ah)) {
- ah->device = pd->device;
- ah->pd = pd;
- ah->uobject = NULL;
- ah->type = ah_attr->type;
- ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL);
+ ah->device = device;
+ ah->pd = pd;
+ ah->type = ah_attr->type;
+ ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL);
- atomic_inc(&pd->usecnt);
+ ret = device->ops.create_ah(ah, ah_attr, flags, udata);
+ if (ret) {
+ kfree(ah);
+ return ERR_PTR(ret);
}
+ atomic_inc(&pd->usecnt);
return ah;
}
@@ -930,25 +941,24 @@ int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr)
}
EXPORT_SYMBOL(rdma_query_ah);
-int rdma_destroy_ah(struct ib_ah *ah, u32 flags)
+int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata)
{
const struct ib_gid_attr *sgid_attr = ah->sgid_attr;
struct ib_pd *pd;
- int ret;
might_sleep_if(flags & RDMA_DESTROY_AH_SLEEPABLE);
pd = ah->pd;
- ret = ah->device->ops.destroy_ah(ah, flags);
- if (!ret) {
- atomic_dec(&pd->usecnt);
- if (sgid_attr)
- rdma_put_gid_attr(sgid_attr);
- }
- return ret;
+ ah->device->ops.destroy_ah(ah, flags);
+ atomic_dec(&pd->usecnt);
+ if (sgid_attr)
+ rdma_put_gid_attr(sgid_attr);
+
+ kfree(ah);
+ return 0;
}
-EXPORT_SYMBOL(rdma_destroy_ah);
+EXPORT_SYMBOL(rdma_destroy_ah_user);
/* Shared receive queues */
@@ -956,29 +966,40 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd,
struct ib_srq_init_attr *srq_init_attr)
{
struct ib_srq *srq;
+ int ret;
if (!pd->device->ops.create_srq)
return ERR_PTR(-EOPNOTSUPP);
- srq = pd->device->ops.create_srq(pd, srq_init_attr, NULL);
-
- if (!IS_ERR(srq)) {
- srq->device = pd->device;
- srq->pd = pd;
- srq->uobject = NULL;
- srq->event_handler = srq_init_attr->event_handler;
- srq->srq_context = srq_init_attr->srq_context;
- srq->srq_type = srq_init_attr->srq_type;
- if (ib_srq_has_cq(srq->srq_type)) {
- srq->ext.cq = srq_init_attr->ext.cq;
- atomic_inc(&srq->ext.cq->usecnt);
- }
- if (srq->srq_type == IB_SRQT_XRC) {
- srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
- atomic_inc(&srq->ext.xrc.xrcd->usecnt);
- }
- atomic_inc(&pd->usecnt);
- atomic_set(&srq->usecnt, 0);
+ srq = rdma_zalloc_drv_obj(pd->device, ib_srq);
+ if (!srq)
+ return ERR_PTR(-ENOMEM);
+
+ srq->device = pd->device;
+ srq->pd = pd;
+ srq->event_handler = srq_init_attr->event_handler;
+ srq->srq_context = srq_init_attr->srq_context;
+ srq->srq_type = srq_init_attr->srq_type;
+
+ if (ib_srq_has_cq(srq->srq_type)) {
+ srq->ext.cq = srq_init_attr->ext.cq;
+ atomic_inc(&srq->ext.cq->usecnt);
+ }
+ if (srq->srq_type == IB_SRQT_XRC) {
+ srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
+ atomic_inc(&srq->ext.xrc.xrcd->usecnt);
+ }
+ atomic_inc(&pd->usecnt);
+
+ ret = pd->device->ops.create_srq(srq, srq_init_attr, NULL);
+ if (ret) {
+ atomic_dec(&srq->pd->usecnt);
+ if (srq->srq_type == IB_SRQT_XRC)
+ atomic_dec(&srq->ext.xrc.xrcd->usecnt);
+ if (ib_srq_has_cq(srq->srq_type))
+ atomic_dec(&srq->ext.cq->usecnt);
+ kfree(srq);
+ return ERR_PTR(ret);
}
return srq;
@@ -1003,36 +1024,23 @@ int ib_query_srq(struct ib_srq *srq,
}
EXPORT_SYMBOL(ib_query_srq);
-int ib_destroy_srq(struct ib_srq *srq)
+int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata)
{
- struct ib_pd *pd;
- enum ib_srq_type srq_type;
- struct ib_xrcd *uninitialized_var(xrcd);
- struct ib_cq *uninitialized_var(cq);
- int ret;
-
if (atomic_read(&srq->usecnt))
return -EBUSY;
- pd = srq->pd;
- srq_type = srq->srq_type;
- if (ib_srq_has_cq(srq_type))
- cq = srq->ext.cq;
- if (srq_type == IB_SRQT_XRC)
- xrcd = srq->ext.xrc.xrcd;
+ srq->device->ops.destroy_srq(srq, udata);
- ret = srq->device->ops.destroy_srq(srq);
- if (!ret) {
- atomic_dec(&pd->usecnt);
- if (srq_type == IB_SRQT_XRC)
- atomic_dec(&xrcd->usecnt);
- if (ib_srq_has_cq(srq_type))
- atomic_dec(&cq->usecnt);
- }
+ atomic_dec(&srq->pd->usecnt);
+ if (srq->srq_type == IB_SRQT_XRC)
+ atomic_dec(&srq->ext.xrc.xrcd->usecnt);
+ if (ib_srq_has_cq(srq->srq_type))
+ atomic_dec(&srq->ext.cq->usecnt);
+ kfree(srq);
- return ret;
+ return 0;
}
-EXPORT_SYMBOL(ib_destroy_srq);
+EXPORT_SYMBOL(ib_destroy_srq_user);
/* Queue pairs */
@@ -1111,8 +1119,9 @@ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
}
EXPORT_SYMBOL(ib_open_qp);
-static struct ib_qp *create_xrc_qp(struct ib_qp *qp,
- struct ib_qp_init_attr *qp_init_attr)
+static struct ib_qp *create_xrc_qp_user(struct ib_qp *qp,
+ struct ib_qp_init_attr *qp_init_attr,
+ struct ib_udata *udata)
{
struct ib_qp *real_qp = qp;
@@ -1134,8 +1143,9 @@ static struct ib_qp *create_xrc_qp(struct ib_qp *qp,
return qp;
}
-struct ib_qp *ib_create_qp(struct ib_pd *pd,
- struct ib_qp_init_attr *qp_init_attr)
+struct ib_qp *ib_create_qp_user(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr,
+ struct ib_udata *udata)
{
struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device;
struct ib_qp *qp;
@@ -1164,7 +1174,6 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
if (ret)
goto err;
- qp->real_qp = qp;
qp->qp_type = qp_init_attr->qp_type;
qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl;
@@ -1176,7 +1185,8 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
qp->port = 0;
if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
- struct ib_qp *xrc_qp = create_xrc_qp(qp, qp_init_attr);
+ struct ib_qp *xrc_qp =
+ create_xrc_qp_user(qp, qp_init_attr, udata);
if (IS_ERR(xrc_qp)) {
ret = PTR_ERR(xrc_qp);
@@ -1230,7 +1240,7 @@ err:
return ERR_PTR(ret);
}
-EXPORT_SYMBOL(ib_create_qp);
+EXPORT_SYMBOL(ib_create_qp_user);
static const struct {
int valid;
@@ -1837,7 +1847,7 @@ static int __ib_destroy_shared_qp(struct ib_qp *qp)
return 0;
}
-int ib_destroy_qp(struct ib_qp *qp)
+int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata)
{
const struct ib_gid_attr *alt_path_sgid_attr = qp->alt_path_sgid_attr;
const struct ib_gid_attr *av_sgid_attr = qp->av_sgid_attr;
@@ -1869,7 +1879,7 @@ int ib_destroy_qp(struct ib_qp *qp)
rdma_rw_cleanup_mrs(qp);
rdma_restrack_del(&qp->res);
- ret = qp->device->ops.destroy_qp(qp);
+ ret = qp->device->ops.destroy_qp(qp, udata);
if (!ret) {
if (alt_path_sgid_attr)
rdma_put_gid_attr(alt_path_sgid_attr);
@@ -1894,7 +1904,7 @@ int ib_destroy_qp(struct ib_qp *qp)
return ret;
}
-EXPORT_SYMBOL(ib_destroy_qp);
+EXPORT_SYMBOL(ib_destroy_qp_user);
/* Completion queues */
@@ -1907,7 +1917,7 @@ struct ib_cq *__ib_create_cq(struct ib_device *device,
{
struct ib_cq *cq;
- cq = device->ops.create_cq(device, cq_attr, NULL, NULL);
+ cq = device->ops.create_cq(device, cq_attr, NULL);
if (!IS_ERR(cq)) {
cq->device = device;
@@ -1933,15 +1943,15 @@ int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period)
}
EXPORT_SYMBOL(rdma_set_cq_moderation);
-int ib_destroy_cq(struct ib_cq *cq)
+int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata)
{
if (atomic_read(&cq->usecnt))
return -EBUSY;
rdma_restrack_del(&cq->res);
- return cq->device->ops.destroy_cq(cq);
+ return cq->device->ops.destroy_cq(cq, udata);
}
-EXPORT_SYMBOL(ib_destroy_cq);
+EXPORT_SYMBOL(ib_destroy_cq_user);
int ib_resize_cq(struct ib_cq *cq, int cqe)
{
@@ -1952,14 +1962,14 @@ EXPORT_SYMBOL(ib_resize_cq);
/* Memory regions */
-int ib_dereg_mr(struct ib_mr *mr)
+int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
{
struct ib_pd *pd = mr->pd;
struct ib_dm *dm = mr->dm;
int ret;
rdma_restrack_del(&mr->res);
- ret = mr->device->ops.dereg_mr(mr);
+ ret = mr->device->ops.dereg_mr(mr, udata);
if (!ret) {
atomic_dec(&pd->usecnt);
if (dm)
@@ -1968,13 +1978,14 @@ int ib_dereg_mr(struct ib_mr *mr)
return ret;
}
-EXPORT_SYMBOL(ib_dereg_mr);
+EXPORT_SYMBOL(ib_dereg_mr_user);
/**
* ib_alloc_mr() - Allocates a memory region
* @pd: protection domain associated with the region
* @mr_type: memory region type
* @max_num_sg: maximum sg entries available for registration.
+ * @udata: user data or null for kernel objects
*
* Notes:
* Memory registeration page/sg lists must not exceed max_num_sg.
@@ -1982,16 +1993,15 @@ EXPORT_SYMBOL(ib_dereg_mr);
* max_num_sg * used_page_size.
*
*/
-struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
- enum ib_mr_type mr_type,
- u32 max_num_sg)
+struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type,
+ u32 max_num_sg, struct ib_udata *udata)
{
struct ib_mr *mr;
if (!pd->device->ops.alloc_mr)
return ERR_PTR(-EOPNOTSUPP);
- mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg);
+ mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg, udata);
if (!IS_ERR(mr)) {
mr->device = pd->device;
mr->pd = pd;
@@ -2005,7 +2015,7 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
return mr;
}
-EXPORT_SYMBOL(ib_alloc_mr);
+EXPORT_SYMBOL(ib_alloc_mr_user);
/* "Fast" memory regions */
@@ -2138,7 +2148,7 @@ struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller)
if (!device->ops.alloc_xrcd)
return ERR_PTR(-EOPNOTSUPP);
- xrcd = device->ops.alloc_xrcd(device, NULL, NULL);
+ xrcd = device->ops.alloc_xrcd(device, NULL);
if (!IS_ERR(xrcd)) {
xrcd->device = device;
xrcd->inode = NULL;
@@ -2151,7 +2161,7 @@ struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller)
}
EXPORT_SYMBOL(__ib_alloc_xrcd);
-int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+int ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata)
{
struct ib_qp *qp;
int ret;
@@ -2166,7 +2176,7 @@ int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
return ret;
}
- return xrcd->device->ops.dealloc_xrcd(xrcd);
+ return xrcd->device->ops.dealloc_xrcd(xrcd, udata);
}
EXPORT_SYMBOL(ib_dealloc_xrcd);
@@ -2210,10 +2220,11 @@ struct ib_wq *ib_create_wq(struct ib_pd *pd,
EXPORT_SYMBOL(ib_create_wq);
/**
- * ib_destroy_wq - Destroys the specified WQ.
+ * ib_destroy_wq - Destroys the specified user WQ.
* @wq: The WQ to destroy.
+ * @udata: Valid user data
*/
-int ib_destroy_wq(struct ib_wq *wq)
+int ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)
{
int err;
struct ib_cq *cq = wq->cq;
@@ -2222,7 +2233,7 @@ int ib_destroy_wq(struct ib_wq *wq)
if (atomic_read(&wq->usecnt))
return -EBUSY;
- err = wq->device->ops.destroy_wq(wq);
+ err = wq->device->ops.destroy_wq(wq, udata);
if (!err) {
atomic_dec(&pd->usecnt);
atomic_dec(&cq->usecnt);
@@ -2701,3 +2712,37 @@ int rdma_init_netdev(struct ib_device *device, u8 port_num,
netdev, params.param);
}
EXPORT_SYMBOL(rdma_init_netdev);
+
+void __rdma_block_iter_start(struct ib_block_iter *biter,
+ struct scatterlist *sglist, unsigned int nents,
+ unsigned long pgsz)
+{
+ memset(biter, 0, sizeof(struct ib_block_iter));
+ biter->__sg = sglist;
+ biter->__sg_nents = nents;
+
+ /* Driver provides best block size to use */
+ biter->__pg_bit = __fls(pgsz);
+}
+EXPORT_SYMBOL(__rdma_block_iter_start);
+
+bool __rdma_block_iter_next(struct ib_block_iter *biter)
+{
+ unsigned int block_offset;
+
+ if (!biter->__sg_nents || !biter->__sg)
+ return false;
+
+ biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance;
+ block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1);
+ biter->__sg_advance += BIT_ULL(biter->__pg_bit) - block_offset;
+
+ if (biter->__sg_advance >= sg_dma_len(biter->__sg)) {
+ biter->__sg_advance = 0;
+ biter->__sg = sg_next(biter->__sg);
+ biter->__sg_nents--;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(__rdma_block_iter_next);