Diffstat (limited to 'drivers/infiniband/core/cache.c')
 -rw-r--r--  drivers/infiniband/core/cache.c | 1731
 1 file changed, 1059 insertions, 672 deletions
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index efc94304dee3..81cf3c902e81 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -33,7 +33,7 @@
  * SOFTWARE.
  */
 
-#include <linux/module.h>
+#include <linux/if_vlan.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
@@ -46,21 +46,18 @@
 
 struct ib_pkey_cache {
 	int		table_len;
-	u16		table[0];
+	u16		table[] __counted_by(table_len);
 };
 
 struct ib_update_work {
 	struct work_struct work;
-	struct ib_device  *device;
-	u8		   port_num;
-	bool		   enforce_security;
+	struct ib_event    event;
+	bool		   enforce_security;
 };
 
 union ib_gid zgid;
 EXPORT_SYMBOL(zgid);
 
-static const struct ib_gid_attr zattr;
-
 enum gid_attr_find_mask {
 	GID_ATTR_FIND_MASK_GID		= 1UL << 0,
 	GID_ATTR_FIND_MASK_NETDEV	= 1UL << 1,
@@ -68,29 +65,39 @@ enum gid_attr_find_mask {
 	GID_ATTR_FIND_MASK_GID_TYPE	= 1UL << 3,
 };
 
-enum gid_table_entry_props {
-	GID_TABLE_ENTRY_INVALID		= 1UL << 0,
-	GID_TABLE_ENTRY_DEFAULT		= 1UL << 1,
+enum gid_table_entry_state {
+	GID_TABLE_ENTRY_INVALID		= 1,
+	GID_TABLE_ENTRY_VALID		= 2,
+	/*
+	 * Indicates that entry is pending to be removed, there may
+	 * be active users of this GID entry.
+	 * When last user of the GID entry releases reference to it,
+	 * GID entry is detached from the table.
+	 */
+	GID_TABLE_ENTRY_PENDING_DEL	= 3,
 };
 
-enum gid_table_write_action {
-	GID_TABLE_WRITE_ACTION_ADD,
-	GID_TABLE_WRITE_ACTION_DEL,
-	/* MODIFY only updates the GID table. Currently only used by
-	 * ib_cache_update.
-	 */
-	GID_TABLE_WRITE_ACTION_MODIFY
+struct roce_gid_ndev_storage {
+	struct rcu_head rcu_head;
+	struct net_device *ndev;
 };
 
 struct ib_gid_table_entry {
-	unsigned long	    props;
-	union ib_gid	    gid;
-	struct ib_gid_attr  attr;
-	void		   *context;
+	struct kref			kref;
+	struct work_struct		del_work;
+	struct ib_gid_attr		attr;
+	void				*context;
+	/* Store the ndev pointer to release reference later on in
+	 * call_rcu context because by that time gid_table_entry
+	 * and attr might be already freed. So keep a copy of it.
+	 * ndev_storage is freed by rcu callback.
+	 */
+	struct roce_gid_ndev_storage	*ndev_storage;
+	enum gid_table_entry_state	state;
 };
 
 struct ib_gid_table {
-	int		     sz;
+	int				sz;
 	/* In RoCE, adding a GID to the table requires:
 	 * (a) Find if this GID is already exists.
 	 * (b) Find a free space.
@@ -100,35 +107,37 @@ struct ib_gid_table {
 	 * (a) Find the GID
 	 * (b) Delete it.
 	 *
-	 * Add/delete should be carried out atomically.
-	 * This is done by locking this mutex from multiple
-	 * writers. We don't need this lock for IB, as the MAD
-	 * layer replaces all entries. All data_vec entries
-	 * are locked by this lock.
 	 **/
-	struct mutex	     lock;
-	/* This lock protects the table entries from being
-	 * read and written simultaneously.
+	/* Any writer to data_vec must hold this lock and the write side of
+	 * rwlock. Readers must hold only rwlock. All writers must be in a
+	 * sleepable context.
+	 */
+	struct mutex			lock;
+	/* rwlock protects data_vec[ix]->state and entry pointer.
+	 */
-	rwlock_t	     rwlock;
-	struct ib_gid_table_entry *data_vec;
+	rwlock_t			rwlock;
+	struct ib_gid_table_entry	**data_vec;
+	/* bit field, each bit indicates the index of default GID */
+	u32				default_gid_indices;
 };
 
-static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port)
+static void dispatch_gid_change_event(struct ib_device *ib_dev, u32 port)
 {
-	if (rdma_cap_roce_gid_table(ib_dev, port)) {
-		struct ib_event event;
+	struct ib_event event;
 
-		event.device		= ib_dev;
-		event.element.port_num	= port;
-		event.event		= IB_EVENT_GID_CHANGE;
+	event.device		= ib_dev;
+	event.element.port_num	= port;
+	event.event		= IB_EVENT_GID_CHANGE;
 
-		ib_dispatch_event(&event);
-	}
+	ib_dispatch_event_clients(&event);
 }
 
 static const char * const gid_type_str[] = {
+	/* IB/RoCE v1 value is set for IB_GID_TYPE_IB and IB_GID_TYPE_ROCE for
+	 * user space compatibility reasons.
+	 */
 	[IB_GID_TYPE_IB]		= "IB/RoCE v1",
+	[IB_GID_TYPE_ROCE]		= "IB/RoCE v1",
 	[IB_GID_TYPE_ROCE_UDP_ENCAP]	= "RoCE v2",
 };
@@ -141,6 +150,29 @@ const char *ib_cache_gid_type_str(enum ib_gid_type gid_type)
 }
 EXPORT_SYMBOL(ib_cache_gid_type_str);
 
+/** rdma_is_zero_gid - Check if given GID is zero or not.
+ * @gid:	GID to check
+ * Returns true if given GID is zero, returns false otherwise.
+ */
+bool rdma_is_zero_gid(const union ib_gid *gid)
+{
+	return !memcmp(gid, &zgid, sizeof(*gid));
+}
+EXPORT_SYMBOL(rdma_is_zero_gid);
+
+/** is_gid_index_default - Check if a given index belongs to
+ * reserved default GIDs or not.
+ * @table:	GID table pointer
+ * @index:	Index to check in GID table
+ * Returns true if index is one of the reserved default GID index otherwise
+ * returns false.
+ */
+static bool is_gid_index_default(const struct ib_gid_table *table,
+				 unsigned int index)
+{
+	return index < 32 && (BIT(index) & table->default_gid_indices);
+}
+
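Review note: default GID slots are now tracked by index in the default_gid_indices bit mask instead of a per-entry GID_TABLE_ENTRY_DEFAULT flag. A small standalone sketch of the same bit arithmetic (values are hypothetical, this is not part of the patch):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define BIT(nr) (1UL << (nr))

    /* Mirrors is_gid_index_default(): a slot is "default" iff its bit
     * is set in the table's default_gid_indices mask. */
    static bool is_default_index(uint32_t default_gid_indices,
                                 unsigned int index)
    {
            return index < 32 && (BIT(index) & default_gid_indices);
    }

    int main(void)
    {
            /* two supported RoCE GID types -> slots 0 and 1 reserved,
             * as gid_table_reserve_default() does later in this patch */
            uint32_t mask = BIT(0) | BIT(1);

            printf("index 1 default? %d\n", is_default_index(mask, 1));
            printf("index 5 default? %d\n", is_default_index(mask, 5));
            return 0;
    }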
 int ib_cache_gid_parse_type_str(const char *buf)
 {
 	unsigned int i;
@@ -165,94 +197,272 @@ int ib_cache_gid_parse_type_str(const char *buf)
 }
 EXPORT_SYMBOL(ib_cache_gid_parse_type_str);
 
-/* This function expects that rwlock will be write locked in all
- * scenarios and that lock will be locked in sleep-able (RoCE)
- * scenarios.
- */
-static int write_gid(struct ib_device *ib_dev, u8 port,
-		     struct ib_gid_table *table, int ix,
-		     const union ib_gid *gid,
-		     const struct ib_gid_attr *attr,
-		     enum gid_table_write_action action,
-		     bool  default_gid)
-	__releases(&table->rwlock) __acquires(&table->rwlock)
+static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u32 port)
 {
-	int ret = 0;
-	struct net_device *old_net_dev;
-	enum ib_gid_type old_gid_type;
+	return device->port_data[port].cache.gid;
+}
+
+static bool is_gid_entry_free(const struct ib_gid_table_entry *entry)
+{
+	return !entry;
+}
+
+static bool is_gid_entry_valid(const struct ib_gid_table_entry *entry)
+{
+	return entry && entry->state == GID_TABLE_ENTRY_VALID;
+}
 
-	/* in rdma_cap_roce_gid_table, this function should be protected by a
-	 * sleep-able lock.
+static void schedule_free_gid(struct kref *kref)
+{
+	struct ib_gid_table_entry *entry =
+			container_of(kref, struct ib_gid_table_entry, kref);
+
+	queue_work(ib_wq, &entry->del_work);
+}
+
+static void put_gid_ndev(struct rcu_head *head)
+{
+	struct roce_gid_ndev_storage *storage =
+		container_of(head, struct roce_gid_ndev_storage, rcu_head);
+
+	WARN_ON(!storage->ndev);
+	/* At this point it's safe to release netdev reference,
+	 * as all callers working on gid_attr->ndev are done
+	 * using this netdev.
 	 */
+	dev_put(storage->ndev);
+	kfree(storage);
+}
 
-	if (rdma_cap_roce_gid_table(ib_dev, port)) {
-		table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID;
-		write_unlock_irq(&table->rwlock);
-		/* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by
-		 * RoCE providers and thus only updates the cache.
-		 */
-		if (action == GID_TABLE_WRITE_ACTION_ADD)
-			ret = ib_dev->add_gid(ib_dev, port, ix, gid, attr,
-					      &table->data_vec[ix].context);
-		else if (action == GID_TABLE_WRITE_ACTION_DEL)
-			ret = ib_dev->del_gid(ib_dev, port, ix,
-					      &table->data_vec[ix].context);
-		write_lock_irq(&table->rwlock);
-	}
+static void free_gid_entry_locked(struct ib_gid_table_entry *entry)
+{
+	struct ib_device *device = entry->attr.device;
+	u32 port_num = entry->attr.port_num;
+	struct ib_gid_table *table = rdma_gid_table(device, port_num);
 
-	old_net_dev = table->data_vec[ix].attr.ndev;
-	old_gid_type = table->data_vec[ix].attr.gid_type;
-	if (old_net_dev && old_net_dev != attr->ndev)
-		dev_put(old_net_dev);
-	/* if modify_gid failed, just delete the old gid */
-	if (ret || action == GID_TABLE_WRITE_ACTION_DEL) {
-		gid = &zgid;
-		attr = &zattr;
-		table->data_vec[ix].context = NULL;
-	}
+	dev_dbg(&device->dev, "%s port=%u index=%u gid %pI6\n", __func__,
+		port_num, entry->attr.index, entry->attr.gid.raw);
+
+	write_lock_irq(&table->rwlock);
+
+	/*
+	 * The only way to avoid overwriting NULL in table is
+	 * by comparing if it is same entry in table or not!
+	 * If new entry in table is added by the time we free here,
+	 * don't overwrite the table entry.
+	 */
+	if (entry == table->data_vec[entry->attr.index])
+		table->data_vec[entry->attr.index] = NULL;
+	/* Now this index is ready to be allocated */
+	write_unlock_irq(&table->rwlock);
+
+	if (entry->ndev_storage)
+		call_rcu(&entry->ndev_storage->rcu_head, put_gid_ndev);
+	kfree(entry);
+}
+
+static void free_gid_entry(struct kref *kref)
+{
+	struct ib_gid_table_entry *entry =
+			container_of(kref, struct ib_gid_table_entry, kref);
+
+	free_gid_entry_locked(entry);
+}
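Review note: there are deliberately two release paths. A hedged sketch of the distinction (kernel context assumed; these are the static helpers introduced above, the example callers are hypothetical):

    /* Safe with table->rwlock held or under RCU: the final kref_put()
     * only calls schedule_free_gid(), which queues del_work on ib_wq;
     * the actual free later runs in free_gid_work() under table->lock. */
    static void example_put_from_atomic(struct ib_gid_table_entry *entry)
    {
            put_gid_entry(entry);
    }

    /* For sleepable callers that already hold table->lock: the final
     * kref_put() frees synchronously via free_gid_entry(). */
    static void example_put_locked(struct ib_gid_table_entry *entry)
    {
            put_gid_entry_locked(entry);
    }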
+/**
+ * free_gid_work - Release reference to the GID entry
+ * @work: Work structure to refer to GID entry which needs to be
+ * deleted.
+ *
+ * free_gid_work() frees the entry from the HCA's hardware table
+ * if provider supports it. It releases reference to netdevice.
+ */
+static void free_gid_work(struct work_struct *work)
+{
+	struct ib_gid_table_entry *entry =
+		container_of(work, struct ib_gid_table_entry, del_work);
+	struct ib_device *device = entry->attr.device;
+	u32 port_num = entry->attr.port_num;
+	struct ib_gid_table *table = rdma_gid_table(device, port_num);
+
+	mutex_lock(&table->lock);
+	free_gid_entry_locked(entry);
+	mutex_unlock(&table->lock);
+}
+
+static struct ib_gid_table_entry *
+alloc_gid_entry(const struct ib_gid_attr *attr)
+{
+	struct ib_gid_table_entry *entry;
+	struct net_device *ndev;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return NULL;
+
+	ndev = rcu_dereference_protected(attr->ndev, 1);
+	if (ndev) {
+		entry->ndev_storage = kzalloc(sizeof(*entry->ndev_storage),
+					      GFP_KERNEL);
+		if (!entry->ndev_storage) {
+			kfree(entry);
+			return NULL;
+		}
+		dev_hold(ndev);
+		entry->ndev_storage->ndev = ndev;
 	}
-	if (table->data_vec[ix].attr.ndev &&
-	    table->data_vec[ix].attr.ndev != old_net_dev)
-		dev_hold(table->data_vec[ix].attr.ndev);
+	kref_init(&entry->kref);
+	memcpy(&entry->attr, attr, sizeof(*attr));
+	INIT_WORK(&entry->del_work, free_gid_work);
+	entry->state = GID_TABLE_ENTRY_INVALID;
+	return entry;
+}
+
+static void store_gid_entry(struct ib_gid_table *table,
+			    struct ib_gid_table_entry *entry)
+{
+	entry->state = GID_TABLE_ENTRY_VALID;
 
-	table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID;
+	dev_dbg(&entry->attr.device->dev, "%s port=%u index=%u gid %pI6\n",
+		__func__, entry->attr.port_num, entry->attr.index,
+		entry->attr.gid.raw);
 
-	return ret;
+	lockdep_assert_held(&table->lock);
+	write_lock_irq(&table->rwlock);
+	table->data_vec[entry->attr.index] = entry;
+	write_unlock_irq(&table->rwlock);
 }
 
-static int add_gid(struct ib_device *ib_dev, u8 port,
-		   struct ib_gid_table *table, int ix,
-		   const union ib_gid *gid,
-		   const struct ib_gid_attr *attr,
-		   bool  default_gid) {
-	return write_gid(ib_dev, port, table, ix, gid, attr,
-			 GID_TABLE_WRITE_ACTION_ADD, default_gid);
+static void get_gid_entry(struct ib_gid_table_entry *entry)
+{
+	kref_get(&entry->kref);
 }
 
-static int modify_gid(struct ib_device *ib_dev, u8 port,
-		      struct ib_gid_table *table, int ix,
-		      const union ib_gid *gid,
-		      const struct ib_gid_attr *attr,
-		      bool  default_gid) {
-	return write_gid(ib_dev, port, table, ix, gid, attr,
-			 GID_TABLE_WRITE_ACTION_MODIFY, default_gid);
+static void put_gid_entry(struct ib_gid_table_entry *entry)
+{
+	kref_put(&entry->kref, schedule_free_gid);
+}
+
+static void put_gid_entry_locked(struct ib_gid_table_entry *entry)
+{
+	kref_put(&entry->kref, free_gid_entry);
 }
 
-static int del_gid(struct ib_device *ib_dev, u8 port,
-		   struct ib_gid_table *table, int ix,
-		   bool  default_gid) {
-	return write_gid(ib_dev, port, table, ix, &zgid, &zattr,
-			 GID_TABLE_WRITE_ACTION_DEL, default_gid);
+static int add_roce_gid(struct ib_gid_table_entry *entry)
+{
+	const struct ib_gid_attr *attr = &entry->attr;
+	int ret;
+
+	if (!attr->ndev) {
+		dev_err(&attr->device->dev, "%s NULL netdev port=%u index=%u\n",
+			__func__, attr->port_num, attr->index);
+		return -EINVAL;
+	}
+	if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) {
+		ret = attr->device->ops.add_gid(attr, &entry->context);
+		if (ret) {
+			dev_err(&attr->device->dev,
+				"%s GID add failed port=%u index=%u\n",
				__func__, attr->port_num, attr->index);
+			return ret;
+		}
+	}
+	return 0;
 }
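Review note: add_roce_gid() hands the entry to the driver through ops.add_gid, and del_gid() below returns the same attr plus the stored context to ops.del_gid. A hypothetical provider implementation (driver name and hardware calls invented for illustration):

    #include <rdma/ib_verbs.h>

    /* Hypothetical RoCE provider hooks; the core keeps *context in the
     * GID table entry and passes it back on deletion. */
    static int mydrv_add_gid(const struct ib_gid_attr *attr, void **context)
    {
            /* program attr->gid / attr->gid_type into HW slot attr->index */
            *context = NULL;        /* or a driver-private cookie */
            return 0;
    }

    static int mydrv_del_gid(const struct ib_gid_attr *attr, void **context)
    {
            /* invalidate HW slot attr->index; *context as stored above */
            return 0;
    }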
+/**
+ * del_gid - Delete GID table entry
+ *
+ * @ib_dev:	IB device whose GID entry to be deleted
+ * @port:	Port number of the IB device
+ * @table:	GID table of the IB device for a port
+ * @ix:		GID entry index to delete
+ *
+ */
+static void del_gid(struct ib_device *ib_dev, u32 port,
+		    struct ib_gid_table *table, int ix)
+{
+	struct roce_gid_ndev_storage *ndev_storage;
+	struct ib_gid_table_entry *entry;
+
+	lockdep_assert_held(&table->lock);
+
+	dev_dbg(&ib_dev->dev, "%s port=%u index=%d gid %pI6\n", __func__, port,
+		ix, table->data_vec[ix]->attr.gid.raw);
+
+	write_lock_irq(&table->rwlock);
+	entry = table->data_vec[ix];
+	entry->state = GID_TABLE_ENTRY_PENDING_DEL;
+	/*
+	 * For non RoCE protocol, GID entry slot is ready to use.
+	 */
+	if (!rdma_protocol_roce(ib_dev, port))
+		table->data_vec[ix] = NULL;
+	write_unlock_irq(&table->rwlock);
+
+	if (rdma_cap_roce_gid_table(ib_dev, port))
+		ib_dev->ops.del_gid(&entry->attr, &entry->context);
+
+	ndev_storage = entry->ndev_storage;
+	if (ndev_storage) {
+		entry->ndev_storage = NULL;
+		rcu_assign_pointer(entry->attr.ndev, NULL);
+		call_rcu(&ndev_storage->rcu_head, put_gid_ndev);
+	}
+
+	put_gid_entry_locked(entry);
+}
+
+/**
+ * add_modify_gid - Add or modify GID table entry
+ *
+ * @table:	GID table in which GID to be added or modified
+ * @attr:	Attributes of the GID
+ *
+ * Returns 0 on success or appropriate error code. It accepts zero
+ * GID addition for non RoCE ports for HCA's who report them as valid
+ * GID. However such zero GIDs are not added to the cache.
+ */
+static int add_modify_gid(struct ib_gid_table *table,
+			  const struct ib_gid_attr *attr)
+{
+	struct ib_gid_table_entry *entry;
+	int ret = 0;
+
+	/*
+	 * Invalidate any old entry in the table to make it safe to write to
+	 * this index.
+	 */
+	if (is_gid_entry_valid(table->data_vec[attr->index]))
+		del_gid(attr->device, attr->port_num, table, attr->index);
+
+	/*
+	 * Some HCA's report multiple GID entries with only one valid GID, and
+	 * leave other unused entries as the zero GID. Convert zero GIDs to
+	 * empty table entries instead of storing them.
+	 */
+	if (rdma_is_zero_gid(&attr->gid))
+		return 0;
+
+	entry = alloc_gid_entry(attr);
+	if (!entry)
+		return -ENOMEM;
+
+	if (rdma_protocol_roce(attr->device, attr->port_num)) {
+		ret = add_roce_gid(entry);
+		if (ret)
+			goto done;
+	}
+
+	store_gid_entry(table, entry);
+	return 0;
+
+done:
+	put_gid_entry(entry);
+	return ret;
+}
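Review note: add_modify_gid() is del-then-add under the table mutex, and a zero GID simply leaves the slot empty. A condensed caller pattern, as config_non_roce_gid_cache() uses later in this patch (sketch, kernel context assumed; helper name is hypothetical):

    static void example_refresh_slot(struct ib_device *device, u32 port,
                                     struct ib_gid_table *table,
                                     const union ib_gid *gid, int index)
    {
            struct ib_gid_attr attr = {
                    .device = device, .port_num = port, .index = index,
            };

            attr.gid = *gid;
            mutex_lock(&table->lock);      /* writers must hold the mutex */
            add_modify_gid(table, &attr);  /* zero GID -> slot left empty */
            mutex_unlock(&table->lock);
    }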
+/* rwlock should be read locked, or lock should be held */
 static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
 		    const struct ib_gid_attr *val, bool default_gid,
 		    unsigned long mask, int *pempty)
@@ -262,30 +472,52 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
 	int empty = pempty ? -1 : 0;
 
 	while (i < table->sz && (found < 0 || empty < 0)) {
-		struct ib_gid_table_entry *data = &table->data_vec[i];
-		struct ib_gid_attr *attr = &data->attr;
+		struct ib_gid_table_entry *data = table->data_vec[i];
+		struct ib_gid_attr *attr;
 		int curr_index = i;
 
 		i++;
 
-		if (data->props & GID_TABLE_ENTRY_INVALID)
-			continue;
-
-		if (empty < 0)
-			if (!memcmp(&data->gid, &zgid, sizeof(*gid)) &&
-			    !memcmp(attr, &zattr, sizeof(*attr)) &&
-			    !data->props)
+		/* find_gid() is used during GID addition where it is expected
+		 * to return a free entry slot which is not duplicate.
+		 * Free entry slot is requested and returned if pempty is set,
+		 * so lookup free slot only if requested.
+		 */
+		if (pempty && empty < 0) {
+			if (is_gid_entry_free(data) &&
+			    default_gid ==
+				is_gid_index_default(table, curr_index)) {
+				/*
+				 * Found an invalid (free) entry; allocate it.
+				 * If default GID is requested, then our
+				 * found slot must be one of the DEFAULT
+				 * reserved slots or we fail.
+				 * This ensures that only DEFAULT reserved
+				 * slots are used for default property GIDs.
+				 */
 				empty = curr_index;
+			}
+		}
+
+		/*
+		 * Additionally find_gid() is used to find valid entry during
+		 * lookup operation; so ignore the entries which are marked as
+		 * pending for removal and the entries which are marked as
+		 * invalid.
+		 */
+		if (!is_gid_entry_valid(data))
+			continue;
 
 		if (found >= 0)
 			continue;
 
+		attr = &data->attr;
 		if (mask & GID_ATTR_FIND_MASK_GID_TYPE &&
 		    attr->gid_type != val->gid_type)
 			continue;
 
 		if (mask & GID_ATTR_FIND_MASK_GID &&
-		    memcmp(gid, &data->gid, sizeof(*gid)))
+		    memcmp(gid, &data->attr.gid, sizeof(*gid)))
 			continue;
 
 		if (mask & GID_ATTR_FIND_MASK_NETDEV &&
@@ -293,8 +525,7 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
 			continue;
 
 		if (mask & GID_ATTR_FIND_MASK_DEFAULT &&
-		    !!(data->props & GID_TABLE_ENTRY_DEFAULT) !=
-		    default_gid)
+		    is_gid_index_default(table, curr_index) != default_gid)
 			continue;
 
 		found = curr_index;
@@ -312,42 +543,27 @@ static void make_default_gid(struct net_device *dev, union ib_gid *gid)
 	addrconf_ifid_eui48(&gid->raw[8], dev);
 }
 
-int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
-		     union ib_gid *gid, struct ib_gid_attr *attr)
+static int __ib_cache_gid_add(struct ib_device *ib_dev, u32 port,
+			      union ib_gid *gid, struct ib_gid_attr *attr,
+			      unsigned long mask, bool default_gid)
 {
 	struct ib_gid_table *table;
-	int ix;
 	int ret = 0;
-	struct net_device *idev;
 	int empty;
+	int ix;
 
-	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
-
-	if (!memcmp(gid, &zgid, sizeof(*gid)))
+	/* Do not allow adding zero GID in support of
+	 * IB spec version 1.3 section 4.1.1 point (6) and
+	 * section 12.7.10 and section 12.7.20
+	 */
+	if (rdma_is_zero_gid(gid))
 		return -EINVAL;
 
-	if (ib_dev->get_netdev) {
-		idev = ib_dev->get_netdev(ib_dev, port);
-		if (idev && attr->ndev != idev) {
-			union ib_gid default_gid;
-
-			/* Adding default GIDs in not permitted */
-			make_default_gid(idev, &default_gid);
-			if (!memcmp(gid, &default_gid, sizeof(*gid))) {
-				dev_put(idev);
-				return -EPERM;
-			}
-		}
-		if (idev)
-			dev_put(idev);
-	}
+	table = rdma_gid_table(ib_dev, port);
 
 	mutex_lock(&table->lock);
-	write_lock_irq(&table->rwlock);
 
-	ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID |
-		      GID_ATTR_FIND_MASK_GID_TYPE |
-		      GID_ATTR_FIND_MASK_NETDEV, &empty);
+	ix = find_gid(table, gid, attr, default_gid, mask, &empty);
 	if (ix >= 0)
 		goto out_unlock;
 
@@ -355,66 +571,92 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
 		ret = -ENOSPC;
 		goto out_unlock;
 	}
-
-	ret = add_gid(ib_dev, port, table, empty, gid, attr, false);
+	attr->device = ib_dev;
+	attr->index = empty;
+	attr->port_num = port;
+	attr->gid = *gid;
+	ret = add_modify_gid(table, attr);
 	if (!ret)
 		dispatch_gid_change_event(ib_dev, port);
 
 out_unlock:
-	write_unlock_irq(&table->rwlock);
 	mutex_unlock(&table->lock);
+	if (ret)
+		pr_warn_ratelimited("%s: unable to add gid %pI6 error=%d\n",
+				    __func__, gid->raw, ret);
 	return ret;
 }
 
-int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+int ib_cache_gid_add(struct ib_device *ib_dev, u32 port,
 		     union ib_gid *gid, struct ib_gid_attr *attr)
 {
+	unsigned long mask = GID_ATTR_FIND_MASK_GID |
+			     GID_ATTR_FIND_MASK_GID_TYPE |
+			     GID_ATTR_FIND_MASK_NETDEV;
+
+	return __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false);
+}
+
+static int
+_ib_cache_gid_del(struct ib_device *ib_dev, u32 port,
+		  union ib_gid *gid,
+		  struct ib_gid_attr *attr,
+		  unsigned long mask, bool default_gid)
+{
 	struct ib_gid_table *table;
+	int ret = 0;
 	int ix;
 
-	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
+	table = rdma_gid_table(ib_dev, port);
 
 	mutex_lock(&table->lock);
-	write_lock_irq(&table->rwlock);
 
-	ix = find_gid(table, gid, attr, false,
-		      GID_ATTR_FIND_MASK_GID	  |
-		      GID_ATTR_FIND_MASK_GID_TYPE |
-		      GID_ATTR_FIND_MASK_NETDEV	  |
-		      GID_ATTR_FIND_MASK_DEFAULT,
-		      NULL);
-	if (ix < 0)
+	ix = find_gid(table, gid, attr, default_gid, mask, NULL);
+	if (ix < 0) {
+		ret = -EINVAL;
 		goto out_unlock;
+	}
 
-	if (!del_gid(ib_dev, port, table, ix, false))
-		dispatch_gid_change_event(ib_dev, port);
+	del_gid(ib_dev, port, table, ix);
+	dispatch_gid_change_event(ib_dev, port);
 
 out_unlock:
-	write_unlock_irq(&table->rwlock);
 	mutex_unlock(&table->lock);
-	return 0;
+	if (ret)
+		pr_debug("%s: can't delete gid %pI6 error=%d\n",
+			 __func__, gid->raw, ret);
+	return ret;
 }
 
-int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+int ib_cache_gid_del(struct ib_device *ib_dev, u32 port,
+		     union ib_gid *gid, struct ib_gid_attr *attr)
+{
+	unsigned long mask = GID_ATTR_FIND_MASK_GID	  |
+			     GID_ATTR_FIND_MASK_GID_TYPE |
+			     GID_ATTR_FIND_MASK_DEFAULT  |
+			     GID_ATTR_FIND_MASK_NETDEV;
+
+	return _ib_cache_gid_del(ib_dev, port, gid, attr, mask, false);
}
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u32 port,
 				     struct net_device *ndev)
 {
 	struct ib_gid_table *table;
 	int ix;
 	bool deleted = false;
 
-	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
+	table = rdma_gid_table(ib_dev, port);
 
 	mutex_lock(&table->lock);
-	write_lock_irq(&table->rwlock);
 
-	for (ix = 0; ix < table->sz; ix++)
-		if (table->data_vec[ix].attr.ndev == ndev)
-			if (!del_gid(ib_dev, port, table, ix,
-				     !!(table->data_vec[ix].props &
-					GID_TABLE_ENTRY_DEFAULT)))
-				deleted = true;
+	for (ix = 0; ix < table->sz; ix++) {
+		if (is_gid_entry_valid(table->data_vec[ix]) &&
+		    table->data_vec[ix]->attr.ndev == ndev) {
+			del_gid(ib_dev, port, table, ix);
+			deleted = true;
+		}
+	}
 
-	write_unlock_irq(&table->rwlock);
 	mutex_unlock(&table->lock);
 
 	if (deleted)
@@ -423,92 +665,38 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
 	return 0;
 }
 
-static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index,
-			      union ib_gid *gid, struct ib_gid_attr *attr)
-{
-	struct ib_gid_table *table;
-
-	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
-
-	if (index < 0 || index >= table->sz)
-		return -EINVAL;
-
-	if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID)
-		return -EAGAIN;
-
-	memcpy(gid, &table->data_vec[index].gid, sizeof(*gid));
-	if (attr) {
-		memcpy(attr, &table->data_vec[index].attr, sizeof(*attr));
-		if (attr->ndev)
-			dev_hold(attr->ndev);
-	}
-
-	return 0;
-}
-
-static int _ib_cache_gid_table_find(struct ib_device *ib_dev,
-				    const union ib_gid *gid,
-				    const struct ib_gid_attr *val,
-				    unsigned long mask,
-				    u8 *port, u16 *index)
-{
-	struct ib_gid_table *table;
-	u8 p;
-	int local_index;
-	unsigned long flags;
-
-	for (p = 0; p < ib_dev->phys_port_cnt; p++) {
-		table = ib_dev->cache.ports[p].gid;
-		read_lock_irqsave(&table->rwlock, flags);
-		local_index = find_gid(table, gid, val, false, mask, NULL);
-		if (local_index >= 0) {
-			if (index)
-				*index = local_index;
-			if (port)
-				*port = p + rdma_start_port(ib_dev);
-			read_unlock_irqrestore(&table->rwlock, flags);
-			return 0;
-		}
-		read_unlock_irqrestore(&table->rwlock, flags);
-	}
-
-	return -ENOENT;
-}
-static int ib_cache_gid_find(struct ib_device *ib_dev,
-			     const union ib_gid *gid,
-			     enum ib_gid_type gid_type,
-			     struct net_device *ndev, u8 *port,
-			     u16 *index)
-{
-	unsigned long mask = GID_ATTR_FIND_MASK_GID |
-			     GID_ATTR_FIND_MASK_GID_TYPE;
-	struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type};
-
-	if (ndev)
-		mask |= GID_ATTR_FIND_MASK_NETDEV;
-
-	return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val,
-					mask, port, index);
-}
-
-int ib_find_cached_gid_by_port(struct ib_device *ib_dev,
-			       const union ib_gid *gid,
-			       enum ib_gid_type gid_type,
-			       u8 port, struct net_device *ndev,
-			       u16 *index)
+/**
+ * rdma_find_gid_by_port - Returns the GID entry attributes when it finds
+ * a valid GID entry for given search parameters. It searches for the specified
+ * GID value in the local software cache.
+ * @ib_dev: The device to query.
+ * @gid: The GID value to search for.
+ * @gid_type: The GID type to search for.
+ * @port: The port number of the device where the GID value should be searched.
+ * @ndev: In RoCE, the net device of the device. NULL means ignore.
+ *
+ * Returns sgid attributes if the GID is found with valid reference or
+ * returns ERR_PTR for the error.
+ * The caller must invoke rdma_put_gid_attr() to release the reference.
+ */
+const struct ib_gid_attr *
+rdma_find_gid_by_port(struct ib_device *ib_dev,
+		      const union ib_gid *gid,
+		      enum ib_gid_type gid_type,
+		      u32 port, struct net_device *ndev)
 {
 	int local_index;
 	struct ib_gid_table *table;
 	unsigned long mask = GID_ATTR_FIND_MASK_GID |
 			     GID_ATTR_FIND_MASK_GID_TYPE;
 	struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type};
+	const struct ib_gid_attr *attr;
 	unsigned long flags;
 
 	if (!rdma_is_port_valid(ib_dev, port))
-		return -ENOENT;
+		return ERR_PTR(-ENOENT);
 
-	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
+	table = rdma_gid_table(ib_dev, port);
 
 	if (ndev)
 		mask |= GID_ATTR_FIND_MASK_NETDEV;
@@ -516,91 +704,74 @@ int ib_find_cached_gid_by_port(struct ib_device *ib_dev,
 	read_lock_irqsave(&table->rwlock, flags);
 	local_index = find_gid(table, gid, &val, false, mask, NULL);
 	if (local_index >= 0) {
-		if (index)
-			*index = local_index;
+		get_gid_entry(table->data_vec[local_index]);
+		attr = &table->data_vec[local_index]->attr;
 		read_unlock_irqrestore(&table->rwlock, flags);
-		return 0;
+		return attr;
 	}
 
 	read_unlock_irqrestore(&table->rwlock, flags);
-	return -ENOENT;
+	return ERR_PTR(-ENOENT);
 }
-EXPORT_SYMBOL(ib_find_cached_gid_by_port);
+EXPORT_SYMBOL(rdma_find_gid_by_port);
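Review note: the lookup now returns a referenced ib_gid_attr instead of a table index, so every successful call must be paired with rdma_put_gid_attr(). A usage sketch (hypothetical caller; assumes the <rdma/ib_cache.h> declarations):

    #include <rdma/ib_cache.h>

    static int example_lookup(struct ib_device *dev, const union ib_gid *gid)
    {
            const struct ib_gid_attr *attr;

            attr = rdma_find_gid_by_port(dev, gid, IB_GID_TYPE_ROCE_UDP_ENCAP,
                                         1, NULL);
            if (IS_ERR(attr))
                    return PTR_ERR(attr);

            pr_info("gid found at index %u\n", attr->index);
            rdma_put_gid_attr(attr);   /* drop the kref taken by the lookup */
            return 0;
    }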
 
 /**
- * ib_find_gid_by_filter - Returns the GID table index where a specified
- * GID value occurs
- * @device: The device to query.
+ * rdma_find_gid_by_filter - Returns the GID table attribute where a
+ * specified GID value occurs
+ * @ib_dev: The device to query.
  * @gid: The GID value to search for.
- * @port_num: The port number of the device where the GID value could be
+ * @port: The port number of the device where the GID value could be
  * searched.
  * @filter: The filter function is executed on any matching GID in the table.
  * If the filter function returns true, the corresponding index is returned,
  * otherwise, we continue searching the GID table. It's guaranteed that
  * while filter is executed, ndev field is valid and the structure won't
  * change. filter is executed in an atomic context. filter must not be NULL.
- * @index: The index into the cached GID table where the GID was found. This
- * parameter may be NULL.
+ * @context: Private data to pass into the call-back.
  *
- * ib_cache_gid_find_by_filter() searches for the specified GID value
+ * rdma_find_gid_by_filter() searches for the specified GID value
  * of which the filter function returns true in the port's GID table.
- * This function is only supported on RoCE ports.
  *
  */
-static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev,
-				       const union ib_gid *gid,
-				       u8 port,
-				       bool (*filter)(const union ib_gid *,
						      const struct ib_gid_attr *,
						      void *),
-				       void *context,
-				       u16 *index)
+const struct ib_gid_attr *rdma_find_gid_by_filter(
+	struct ib_device *ib_dev, const union ib_gid *gid, u32 port,
+	bool (*filter)(const union ib_gid *gid, const struct ib_gid_attr *,
		       void *),
+	void *context)
 {
+	const struct ib_gid_attr *res = ERR_PTR(-ENOENT);
 	struct ib_gid_table *table;
-	unsigned int i;
 	unsigned long flags;
-	bool found = false;
-
+	unsigned int i;
 
-	if (!rdma_is_port_valid(ib_dev, port) ||
-	    !rdma_protocol_roce(ib_dev, port))
-		return -EPROTONOSUPPORT;
+	if (!rdma_is_port_valid(ib_dev, port))
+		return ERR_PTR(-EINVAL);
 
-	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
+	table = rdma_gid_table(ib_dev, port);
 
 	read_lock_irqsave(&table->rwlock, flags);
 	for (i = 0; i < table->sz; i++) {
-		struct ib_gid_attr attr;
-
-		if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID)
-			goto next;
+		struct ib_gid_table_entry *entry = table->data_vec[i];
 
-		if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid)))
-			goto next;
-
-		memcpy(&attr, &table->data_vec[i].attr, sizeof(attr));
+		if (!is_gid_entry_valid(entry))
+			continue;
 
-		if (filter(gid, &attr, context))
-			found = true;
+		if (memcmp(gid, &entry->attr.gid, sizeof(*gid)))
+			continue;
 
-next:
-		if (found)
+		if (filter(gid, &entry->attr, context)) {
+			get_gid_entry(entry);
+			res = &entry->attr;
 			break;
+		}
 	}
 	read_unlock_irqrestore(&table->rwlock, flags);
-
-	if (!found)
-		return -ENOENT;
-
-	if (index)
-		*index = i;
-	return 0;
+	return res;
 }
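Review note: the filter runs under the table rwlock in atomic context, so it must not sleep. A minimal hypothetical filter and caller (illustration only):

    static bool rocev2_only(const union ib_gid *gid,
                            const struct ib_gid_attr *attr, void *context)
    {
            return attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP;
    }

    static void example_filtered_lookup(struct ib_device *dev,
                                        const union ib_gid *gid, u32 port)
    {
            const struct ib_gid_attr *attr;

            attr = rdma_find_gid_by_filter(dev, gid, port, rocev2_only, NULL);
            if (!IS_ERR(attr))
                    rdma_put_gid_attr(attr);  /* lookup took a reference */
    }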
 
 static struct ib_gid_table *alloc_gid_table(int sz)
 {
-	struct ib_gid_table *table =
-		kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL);
+	struct ib_gid_table *table = kzalloc(sizeof(*table), GFP_KERNEL);
 
 	if (!table)
 		return NULL;
@@ -613,7 +784,6 @@ static struct ib_gid_table *alloc_gid_table(int sz)
 
 	table->sz = sz;
 	rwlock_init(&table->rwlock);
-
 	return table;
 
 err_free_table:
@@ -621,198 +791,131 @@ err_free_table:
 	return NULL;
 }
 
-static void release_gid_table(struct ib_gid_table *table)
+static void release_gid_table(struct ib_device *device,
+			      struct ib_gid_table *table)
 {
-	if (table) {
-		kfree(table->data_vec);
-		kfree(table);
+	int i;
+
+	if (!table)
+		return;
+
+	for (i = 0; i < table->sz; i++) {
+		if (is_gid_entry_free(table->data_vec[i]))
+			continue;
+
+		WARN_ONCE(true,
			  "GID entry ref leak for dev %s index %d ref=%u\n",
			  dev_name(&device->dev), i,
			  kref_read(&table->data_vec[i]->kref));
 	}
+
+	mutex_destroy(&table->lock);
+	kfree(table->data_vec);
+	kfree(table);
 }
 
-static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
+static void cleanup_gid_table_port(struct ib_device *ib_dev, u32 port,
 				   struct ib_gid_table *table)
 {
 	int i;
-	bool deleted = false;
 
 	if (!table)
 		return;
 
-	write_lock_irq(&table->rwlock);
+	mutex_lock(&table->lock);
 	for (i = 0; i < table->sz; ++i) {
-		if (memcmp(&table->data_vec[i].gid, &zgid,
			   sizeof(table->data_vec[i].gid)))
-			if (!del_gid(ib_dev, port, table, i,
				     table->data_vec[i].props &
				     GID_ATTR_FIND_MASK_DEFAULT))
-				deleted = true;
+		if (is_gid_entry_valid(table->data_vec[i]))
+			del_gid(ib_dev, port, table, i);
 	}
-	write_unlock_irq(&table->rwlock);
-
-	if (deleted)
-		dispatch_gid_change_event(ib_dev, port);
+	mutex_unlock(&table->lock);
 }
 
-void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u32 port,
 				  struct net_device *ndev,
 				  unsigned long gid_type_mask,
 				  enum ib_cache_gid_default_mode mode)
 {
-	union ib_gid gid;
+	union ib_gid gid = { };
 	struct ib_gid_attr gid_attr;
-	struct ib_gid_attr zattr_type = zattr;
-	struct ib_gid_table *table;
 	unsigned int gid_type;
+	unsigned long mask;
 
-	table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid;
-
-	make_default_gid(ndev, &gid);
+	mask = GID_ATTR_FIND_MASK_GID_TYPE |
	       GID_ATTR_FIND_MASK_DEFAULT |
	       GID_ATTR_FIND_MASK_NETDEV;
 	memset(&gid_attr, 0, sizeof(gid_attr));
 	gid_attr.ndev = ndev;
 
 	for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) {
-		int ix;
-		union ib_gid current_gid;
-		struct ib_gid_attr current_gid_attr = {};
-
 		if (1UL << gid_type & ~gid_type_mask)
 			continue;
 
 		gid_attr.gid_type = gid_type;
 
-		mutex_lock(&table->lock);
-		write_lock_irq(&table->rwlock);
-		ix = find_gid(table, NULL, &gid_attr, true,
			      GID_ATTR_FIND_MASK_GID_TYPE |
			      GID_ATTR_FIND_MASK_DEFAULT,
			      NULL);
-
-		/* Coudn't find default GID location */
-		if (WARN_ON(ix < 0))
-			goto release;
-
-		zattr_type.gid_type = gid_type;
-
-		if (!__ib_cache_gid_get(ib_dev, port, ix,
					&current_gid, &current_gid_attr) &&
-		    mode == IB_CACHE_GID_DEFAULT_MODE_SET &&
-		    !memcmp(&gid, &current_gid, sizeof(gid)) &&
-		    !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr)))
-			goto release;
-
-		if (memcmp(&current_gid, &zgid, sizeof(current_gid)) ||
-		    memcmp(&current_gid_attr, &zattr_type,
			   sizeof(current_gid_attr))) {
-			if (del_gid(ib_dev, port, table, ix, true)) {
-				pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n",
					ix, gid.raw);
-				goto release;
-			} else {
-				dispatch_gid_change_event(ib_dev, port);
-			}
-		}
-
 		if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) {
-			if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true))
-				pr_warn("ib_cache_gid: unable to add default gid %pI6\n",
					gid.raw);
-			else
-				dispatch_gid_change_event(ib_dev, port);
+			make_default_gid(ndev, &gid);
+			__ib_cache_gid_add(ib_dev, port, &gid,
+					   &gid_attr, mask, true);
+		} else if (mode == IB_CACHE_GID_DEFAULT_MODE_DELETE) {
+			_ib_cache_gid_del(ib_dev, port, &gid,
+					  &gid_attr, mask, true);
 		}
-
-release:
-		if (current_gid_attr.ndev)
-			dev_put(current_gid_attr.ndev);
-		write_unlock_irq(&table->rwlock);
-		mutex_unlock(&table->lock);
 	}
 }
 
-static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
-				     struct ib_gid_table *table)
+static void gid_table_reserve_default(struct ib_device *ib_dev, u32 port,
+				      struct ib_gid_table *table)
 {
 	unsigned int i;
 	unsigned long roce_gid_type_mask;
 	unsigned int num_default_gids;
-	unsigned int current_gid = 0;
 
 	roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
 	num_default_gids = hweight_long(roce_gid_type_mask);
-	for (i = 0; i < num_default_gids && i < table->sz; i++) {
-		struct ib_gid_table_entry *entry =
-			&table->data_vec[i];
-
-		entry->props |= GID_TABLE_ENTRY_DEFAULT;
-		current_gid = find_next_bit(&roce_gid_type_mask,
					    BITS_PER_LONG,
					    current_gid);
-		entry->attr.gid_type = current_gid++;
-	}
+	/* Reserve starting indices for default GIDs */
+	for (i = 0; i < num_default_gids && i < table->sz; i++)
+		table->default_gid_indices |= BIT(i);
+}
 
-	return 0;
+
+static void gid_table_release_one(struct ib_device *ib_dev)
+{
+	u32 p;
+
+	rdma_for_each_port (ib_dev, p) {
+		release_gid_table(ib_dev, ib_dev->port_data[p].cache.gid);
+		ib_dev->port_data[p].cache.gid = NULL;
+	}
 }
 
 static int _gid_table_setup_one(struct ib_device *ib_dev)
 {
-	u8 port;
 	struct ib_gid_table *table;
-	int err = 0;
-
-	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
-		u8 rdma_port = port + rdma_start_port(ib_dev);
+	u32 rdma_port;
 
-		table =
-			alloc_gid_table(
				ib_dev->port_immutable[rdma_port].gid_tbl_len);
-		if (!table) {
-			err = -ENOMEM;
+	rdma_for_each_port (ib_dev, rdma_port) {
+		table = alloc_gid_table(
			ib_dev->port_data[rdma_port].immutable.gid_tbl_len);
+		if (!table)
 			goto rollback_table_setup;
-		}
 
-		err = gid_table_reserve_default(ib_dev,
						port + rdma_start_port(ib_dev),
						table);
-		if (err)
-			goto rollback_table_setup;
-		ib_dev->cache.ports[port].gid = table;
+		gid_table_reserve_default(ib_dev, rdma_port, table);
+		ib_dev->port_data[rdma_port].cache.gid = table;
 	}
-
 	return 0;
 
 rollback_table_setup:
-	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
-		table = ib_dev->cache.ports[port].gid;
-
-		cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
				       table);
-		release_gid_table(table);
-	}
-
-	return err;
-}
-
-static void gid_table_release_one(struct ib_device *ib_dev)
-{
-	struct ib_gid_table *table;
-	u8 port;
-
-	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
-		table = ib_dev->cache.ports[port].gid;
-		release_gid_table(table);
-		ib_dev->cache.ports[port].gid = NULL;
-	}
+	gid_table_release_one(ib_dev);
+	return -ENOMEM;
 }
 
 static void gid_table_cleanup_one(struct ib_device *ib_dev)
 {
-	struct ib_gid_table *table;
-	u8 port;
+	u32 p;
 
-	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
-		table = ib_dev->cache.ports[port].gid;
-		cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
				       table);
-	}
+	rdma_for_each_port (ib_dev, p)
+		cleanup_gid_table_port(ib_dev, p,
				       ib_dev->port_data[p].cache.gid);
 }
 
 static int gid_table_setup_one(struct ib_device *ib_dev)
@@ -824,69 +927,125 @@ static int gid_table_setup_one(struct ib_device *ib_dev)
 	if (err)
 		return err;
 
-	err = roce_rescan_device(ib_dev);
-
-	if (err) {
-		gid_table_cleanup_one(ib_dev);
-		gid_table_release_one(ib_dev);
-	}
+	rdma_roce_rescan_device(ib_dev);
 
 	return err;
 }
 
-int ib_get_cached_gid(struct ib_device *device,
-		      u8 port_num,
-		      int index,
-		      union ib_gid *gid,
-		      struct ib_gid_attr *gid_attr)
+/**
+ * rdma_query_gid - Read the GID content from the GID software cache
+ * @device:		Device to query the GID
+ * @port_num:		Port number of the device
+ * @index:		Index of the GID table entry to read
+ * @gid:		Pointer to GID where to store the entry's GID
+ *
+ * rdma_query_gid() only reads the GID entry content for requested device,
+ * port and index. It reads for IB, RoCE and iWarp link layers. It doesn't
+ * hold any reference to the GID table entry in the HCA or software cache.
+ *
+ * Returns 0 on success or appropriate error code.
+ *
+ */
+int rdma_query_gid(struct ib_device *device, u32 port_num,
+		   int index, union ib_gid *gid)
 {
-	int res;
-	unsigned long flags;
 	struct ib_gid_table *table;
+	unsigned long flags;
+	int res;
 
 	if (!rdma_is_port_valid(device, port_num))
 		return -EINVAL;
 
-	table = device->cache.ports[port_num - rdma_start_port(device)].gid;
+	table = rdma_gid_table(device, port_num);
 	read_lock_irqsave(&table->rwlock, flags);
-	res = __ib_cache_gid_get(device, port_num, index, gid, gid_attr);
-	read_unlock_irqrestore(&table->rwlock, flags);
+
+	if (index < 0 || index >= table->sz) {
+		res = -EINVAL;
+		goto done;
+	}
+
+	if (!is_gid_entry_valid(table->data_vec[index])) {
+		res = -ENOENT;
+		goto done;
+	}
+
+	memcpy(gid, &table->data_vec[index]->attr.gid, sizeof(*gid));
+	res = 0;
+
+done:
+	read_unlock_irqrestore(&table->rwlock, flags);
 	return res;
 }
-EXPORT_SYMBOL(ib_get_cached_gid);
-
-int ib_find_cached_gid(struct ib_device *device,
-		       const union ib_gid *gid,
-		       enum ib_gid_type gid_type,
-		       struct net_device *ndev,
-		       u8 *port_num,
-		       u16 *index)
+EXPORT_SYMBOL(rdma_query_gid);
+
+/**
+ * rdma_read_gid_hw_context - Read the HW GID context from GID attribute
+ * @attr:		Pointer to the GID attribute
+ *
+ * rdma_read_gid_hw_context() reads the drivers GID HW context corresponding
+ * to the SGID attr. Callers are required to already be holding the reference
+ * to an existing GID entry.
+ *
+ * Returns the HW GID context
+ *
+ */
+void *rdma_read_gid_hw_context(const struct ib_gid_attr *attr)
 {
-	return ib_cache_gid_find(device, gid, gid_type, ndev, port_num, index);
+	return container_of(attr, struct ib_gid_table_entry, attr)->context;
 }
-EXPORT_SYMBOL(ib_find_cached_gid);
-
-int ib_find_gid_by_filter(struct ib_device *device,
-			  const union ib_gid *gid,
-			  u8 port_num,
-			  bool (*filter)(const union ib_gid *gid,
					 const struct ib_gid_attr *,
					 void *),
-			  void *context, u16 *index)
+EXPORT_SYMBOL(rdma_read_gid_hw_context);
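Review note: rdma_query_gid() is the reference-free read; it copies the GID out under the rwlock and holds nothing afterwards. A caller sketch (hypothetical):

    static void example_query(struct ib_device *device, u32 port)
    {
            union ib_gid gid;

            if (!rdma_query_gid(device, port, 0, &gid) &&
                !rdma_is_zero_gid(&gid))
                    pr_info("port %u gid[0] %pI6\n", port, gid.raw);
    }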
+
+/**
+ * rdma_find_gid - Returns SGID attributes if the matching GID is found.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @gid_type: The GID type to search for.
+ * @ndev: In RoCE, the net device of the device. NULL means ignore.
+ *
+ * rdma_find_gid() searches for the specified GID value in the software cache.
+ *
+ * Returns GID attributes if a valid GID is found or returns ERR_PTR for the
+ * error. The caller must invoke rdma_put_gid_attr() to release the reference.
+ *
+ */
+const struct ib_gid_attr *rdma_find_gid(struct ib_device *device,
+					const union ib_gid *gid,
+					enum ib_gid_type gid_type,
+					struct net_device *ndev)
 {
-	/* Only RoCE GID table supports filter function */
-	if (!rdma_cap_roce_gid_table(device, port_num) && filter)
-		return -EPROTONOSUPPORT;
+	unsigned long mask = GID_ATTR_FIND_MASK_GID |
+			     GID_ATTR_FIND_MASK_GID_TYPE;
+	struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type};
+	u32 p;
+
+	if (ndev)
+		mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+	rdma_for_each_port(device, p) {
+		struct ib_gid_table *table;
+		unsigned long flags;
+		int index;
+
+		table = device->port_data[p].cache.gid;
+		read_lock_irqsave(&table->rwlock, flags);
+		index = find_gid(table, gid, &gid_attr_val, false, mask, NULL);
+		if (index >= 0) {
+			const struct ib_gid_attr *attr;
+
+			get_gid_entry(table->data_vec[index]);
+			attr = &table->data_vec[index]->attr;
+			read_unlock_irqrestore(&table->rwlock, flags);
+			return attr;
+		}
+		read_unlock_irqrestore(&table->rwlock, flags);
+	}
 
-	return ib_cache_gid_find_by_filter(device, gid,
					   port_num, filter,
					   context, index);
+	return ERR_PTR(-ENOENT);
 }
-EXPORT_SYMBOL(ib_find_gid_by_filter);
+EXPORT_SYMBOL(rdma_find_gid);
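Review note: unlike rdma_find_gid_by_port(), rdma_find_gid() scans every port via rdma_for_each_port() and reports the matching port in the returned attribute. Usage sketch (hypothetical caller):

    static void example_find(struct ib_device *dev, const union ib_gid *gid)
    {
            const struct ib_gid_attr *attr;

            attr = rdma_find_gid(dev, gid, IB_GID_TYPE_IB, NULL);
            if (IS_ERR(attr))
                    return;

            pr_info("gid on port %u index %u\n", attr->port_num, attr->index);
            rdma_put_gid_attr(attr);
    }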
 
 int ib_get_cached_pkey(struct ib_device *device,
-		       u8  port_num,
+		       u32 port_num,
 		       int index,
 		       u16 *pkey)
 {
@@ -897,45 +1056,34 @@ int ib_get_cached_pkey(struct ib_device *device,
 	if (!rdma_is_port_valid(device, port_num))
 		return -EINVAL;
 
-	read_lock_irqsave(&device->cache.lock, flags);
+	read_lock_irqsave(&device->cache_lock, flags);
 
-	cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
+	cache = device->port_data[port_num].cache.pkey;
 
-	if (index < 0 || index >= cache->table_len)
+	if (!cache || index < 0 || index >= cache->table_len)
 		ret = -EINVAL;
 	else
 		*pkey = cache->table[index];
 
-	read_unlock_irqrestore(&device->cache.lock, flags);
+	read_unlock_irqrestore(&device->cache_lock, flags);
 
 	return ret;
 }
 EXPORT_SYMBOL(ib_get_cached_pkey);
 
-int ib_get_cached_subnet_prefix(struct ib_device *device,
-				u8 port_num,
-				u64 *sn_pfx)
+void ib_get_cached_subnet_prefix(struct ib_device *device, u32 port_num,
				 u64 *sn_pfx)
 {
 	unsigned long flags;
-	int p;
-
-	if (port_num < rdma_start_port(device) ||
	    port_num > rdma_end_port(device))
-		return -EINVAL;
-
-	p = port_num - rdma_start_port(device);
-	read_lock_irqsave(&device->cache.lock, flags);
-	*sn_pfx = device->cache.ports[p].subnet_prefix;
-	read_unlock_irqrestore(&device->cache.lock, flags);
-	return 0;
+	read_lock_irqsave(&device->cache_lock, flags);
+	*sn_pfx = device->port_data[port_num].cache.subnet_prefix;
+	read_unlock_irqrestore(&device->cache_lock, flags);
 }
 EXPORT_SYMBOL(ib_get_cached_subnet_prefix);
 
-int ib_find_cached_pkey(struct ib_device *device,
-			u8 port_num,
-			u16 pkey,
-			u16 *index)
+int ib_find_cached_pkey(struct ib_device *device, u32 port_num,
			u16 pkey, u16 *index)
 {
 	struct ib_pkey_cache *cache;
 	unsigned long flags;
@@ -946,9 +1094,13 @@ int ib_find_cached_pkey(struct ib_device *device,
 	if (!rdma_is_port_valid(device, port_num))
 		return -EINVAL;
 
-	read_lock_irqsave(&device->cache.lock, flags);
+	read_lock_irqsave(&device->cache_lock, flags);
 
-	cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
+	cache = device->port_data[port_num].cache.pkey;
+	if (!cache) {
+		ret = -EINVAL;
+		goto err;
+	}
 
 	*index = -1;
 
@@ -958,8 +1110,9 @@ int ib_find_cached_pkey(struct ib_device *device,
 			*index = i;
 			ret = 0;
 			break;
-		} else
+		} else {
 			partial_ix = i;
+		}
 	}
 
 	if (ret && partial_ix >= 0) {
@@ -967,267 +1120,509 @@ int ib_find_cached_pkey(struct ib_device *device,
 		ret = 0;
 	}
 
-	read_unlock_irqrestore(&device->cache.lock, flags);
+err:
+	read_unlock_irqrestore(&device->cache_lock, flags);
 
 	return ret;
 }
 EXPORT_SYMBOL(ib_find_cached_pkey);
 
-int ib_find_exact_cached_pkey(struct ib_device *device,
-			      u8 port_num,
-			      u16 pkey,
-			      u16 *index)
+int ib_get_cached_lmc(struct ib_device *device, u32 port_num, u8 *lmc)
 {
-	struct ib_pkey_cache *cache;
 	unsigned long flags;
-	int i;
-	int ret = -ENOENT;
+	int ret = 0;
 
 	if (!rdma_is_port_valid(device, port_num))
 		return -EINVAL;
 
-	read_lock_irqsave(&device->cache.lock, flags);
+	read_lock_irqsave(&device->cache_lock, flags);
+	*lmc = device->port_data[port_num].cache.lmc;
+	read_unlock_irqrestore(&device->cache_lock, flags);
 
-	cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
+	return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_lmc);
 
-	*index = -1;
+int ib_get_cached_port_state(struct ib_device *device, u32 port_num,
			     enum ib_port_state *port_state)
+{
+	unsigned long flags;
+	int ret = 0;
 
-	for (i = 0; i < cache->table_len; ++i)
-		if (cache->table[i] == pkey) {
-			*index = i;
-			ret = 0;
-			break;
-		}
+	if (!rdma_is_port_valid(device, port_num))
+		return -EINVAL;
 
-	read_unlock_irqrestore(&device->cache.lock, flags);
+	read_lock_irqsave(&device->cache_lock, flags);
+	*port_state = device->port_data[port_num].cache.port_state;
+	read_unlock_irqrestore(&device->cache_lock, flags);
 
 	return ret;
 }
-EXPORT_SYMBOL(ib_find_exact_cached_pkey);
+EXPORT_SYMBOL(ib_get_cached_port_state);
 
-int ib_get_cached_lmc(struct ib_device *device,
-		      u8 port_num,
-		      u8 *lmc)
+/**
+ * rdma_get_gid_attr - Returns GID attributes for a port of a device
+ * at a requested gid_index, if a valid GID entry exists.
+ * @device:		The device to query.
+ * @port_num:		The port number on the device where the GID value
+ *			is to be queried.
+ * @index:		Index of the GID table entry whose attributes are to
+ *			be queried.
+ *
+ * rdma_get_gid_attr() acquires reference count of gid attributes from the
+ * cached GID table. Caller must invoke rdma_put_gid_attr() to release
+ * reference to gid attribute regardless of link layer.
+ *
+ * Returns pointer to valid gid attribute or ERR_PTR for the appropriate error
+ * code.
+ */
+const struct ib_gid_attr *
+rdma_get_gid_attr(struct ib_device *device, u32 port_num, int index)
 {
+	const struct ib_gid_attr *attr = ERR_PTR(-ENODATA);
+	struct ib_gid_table *table;
 	unsigned long flags;
-	int ret = 0;
 
 	if (!rdma_is_port_valid(device, port_num))
-		return -EINVAL;
+		return ERR_PTR(-EINVAL);
+
+	table = rdma_gid_table(device, port_num);
+	if (index < 0 || index >= table->sz)
+		return ERR_PTR(-EINVAL);
 
-	read_lock_irqsave(&device->cache.lock, flags);
-	*lmc = device->cache.ports[port_num - rdma_start_port(device)].lmc;
-	read_unlock_irqrestore(&device->cache.lock, flags);
+	read_lock_irqsave(&table->rwlock, flags);
+	if (!is_gid_entry_valid(table->data_vec[index]))
+		goto done;
+
+	get_gid_entry(table->data_vec[index]);
+	attr = &table->data_vec[index]->attr;
+done:
+	read_unlock_irqrestore(&table->rwlock, flags);
+	return attr;
+}
+EXPORT_SYMBOL(rdma_get_gid_attr);
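Review note: rdma_get_gid_attr() is the by-index counterpart to the lookups above; while the reference is held the entry cannot be detached and freed. Sketch (hypothetical caller):

    static void example_get_by_index(struct ib_device *device, u32 port,
                                     int index)
    {
            const struct ib_gid_attr *attr;

            attr = rdma_get_gid_attr(device, port, index);
            if (IS_ERR(attr))
                    return;

            /* attr->gid, attr->gid_type etc. are stable while held */
            rdma_put_gid_attr(attr);
    }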
+
+/**
+ * rdma_query_gid_table - Reads GID table entries of all the ports of a
+ *			  device up to max_entries.
+ * @device: The device to query.
+ * @entries: Entries where GID entries are returned.
+ * @max_entries: Maximum number of entries that can be returned.
+ * Entries array must be allocated to hold max_entries number of entries.
+ *
+ * Returns number of entries on success or appropriate error code.
+ */
+ssize_t rdma_query_gid_table(struct ib_device *device,
+			     struct ib_uverbs_gid_entry *entries,
+			     size_t max_entries)
+{
+	const struct ib_gid_attr *gid_attr;
+	ssize_t num_entries = 0, ret;
+	struct ib_gid_table *table;
+	u32 port_num, i;
+	struct net_device *ndev;
+	unsigned long flags;
+
+	rdma_for_each_port(device, port_num) {
+		table = rdma_gid_table(device, port_num);
+		read_lock_irqsave(&table->rwlock, flags);
+		for (i = 0; i < table->sz; i++) {
+			if (!is_gid_entry_valid(table->data_vec[i]))
+				continue;
+			if (num_entries >= max_entries) {
+				ret = -EINVAL;
+				goto err;
+			}
+
+			gid_attr = &table->data_vec[i]->attr;
+
+			memcpy(&entries->gid, &gid_attr->gid,
			       sizeof(gid_attr->gid));
+			entries->gid_index = gid_attr->index;
+			entries->port_num = gid_attr->port_num;
+			entries->gid_type = gid_attr->gid_type;
+			ndev = rcu_dereference_protected(
					gid_attr->ndev,
					lockdep_is_held(&table->rwlock));
+			if (ndev)
+				entries->netdev_ifindex = ndev->ifindex;
+
+			num_entries++;
+			entries++;
+		}
+		read_unlock_irqrestore(&table->rwlock, flags);
+	}
+
+	return num_entries;
+err:
+	read_unlock_irqrestore(&table->rwlock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_query_gid_table);
+
+/**
+ * rdma_put_gid_attr - Release reference to the GID attribute
+ * @attr:		Pointer to the GID attribute whose reference
+ *			needs to be released.
+ *
+ * rdma_put_gid_attr() must be used to release reference whose
+ * reference is acquired using rdma_get_gid_attr() or any APIs
+ * which returns a pointer to the ib_gid_attr regardless of link layer
+ * of IB or RoCE.
+ *
+ */
+void rdma_put_gid_attr(const struct ib_gid_attr *attr)
+{
+	struct ib_gid_table_entry *entry =
+		container_of(attr, struct ib_gid_table_entry, attr);
+
+	put_gid_entry(entry);
+}
+EXPORT_SYMBOL(rdma_put_gid_attr);
+
+/**
+ * rdma_hold_gid_attr - Get reference to existing GID attribute
+ *
+ * @attr:		Pointer to the GID attribute whose reference
+ *			needs to be taken.
+ *
+ * Increase the reference count to a GID attribute to keep it from being
+ * freed. Callers are required to already be holding a reference to attribute.
+ *
+ */
+void rdma_hold_gid_attr(const struct ib_gid_attr *attr)
+{
+	struct ib_gid_table_entry *entry =
+		container_of(attr, struct ib_gid_table_entry, attr);
+
+	get_gid_entry(entry);
+}
+EXPORT_SYMBOL(rdma_hold_gid_attr);
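Review note: rdma_query_gid_table() fills a caller-allocated array and returns -EINVAL when it runs out of room. Hedged sketch (assuming the ib_uverbs_gid_entry layout used above):

    static ssize_t example_dump_gids(struct ib_device *device,
                                     size_t max_entries)
    {
            struct ib_uverbs_gid_entry *entries;
            ssize_t n;

            entries = kcalloc(max_entries, sizeof(*entries), GFP_KERNEL);
            if (!entries)
                    return -ENOMEM;

            n = rdma_query_gid_table(device, entries, max_entries);
            /* n is the number of valid entries, or a negative errno */
            kfree(entries);
            return n;
    }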
+
+/**
+ * rdma_read_gid_attr_ndev_rcu - Read GID attribute netdevice
+ * which must be in UP state.
+ *
+ * @attr:Pointer to the GID attribute
+ *
+ * Returns pointer to netdevice if the netdevice was attached to GID and
+ * netdevice is in UP state. Caller must hold RCU lock as this API
+ * reads the netdev flags which can change while netdevice migrates to
+ * different net namespace. Returns ERR_PTR with error code otherwise.
+ *
+ */
+struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr)
 {
+	struct ib_gid_table_entry *entry =
+		container_of(attr, struct ib_gid_table_entry, attr);
+	struct ib_device *device = entry->attr.device;
+	struct net_device *ndev = ERR_PTR(-EINVAL);
+	u32 port_num = entry->attr.port_num;
+	struct ib_gid_table *table;
 	unsigned long flags;
+	bool valid;
+
+	table = rdma_gid_table(device, port_num);
+
+	read_lock_irqsave(&table->rwlock, flags);
+	valid = is_gid_entry_valid(table->data_vec[attr->index]);
+	if (valid) {
+		ndev = rcu_dereference(attr->ndev);
+		if (!ndev)
+			ndev = ERR_PTR(-ENODEV);
+	}
+	read_unlock_irqrestore(&table->rwlock, flags);
+	return ndev;
+}
+EXPORT_SYMBOL(rdma_read_gid_attr_ndev_rcu);
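Review note: the returned netdev is only guaranteed while the RCU read lock is held, so callers bracket the call accordingly. Sketch (hypothetical; the IFF_UP test is one typical use of the result):

    #include <linux/netdevice.h>

    static bool example_gid_ndev_is_up(const struct ib_gid_attr *attr)
    {
            struct net_device *ndev;
            bool up = false;

            rcu_read_lock();
            ndev = rdma_read_gid_attr_ndev_rcu(attr);
            if (!IS_ERR(ndev))
                    up = !!(ndev->flags & IFF_UP);
            rcu_read_unlock();
            return up;
    }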
+
+static int get_lower_dev_vlan(struct net_device *lower_dev,
+			      struct netdev_nested_priv *priv)
+{
+	u16 *vlan_id = (u16 *)priv->data;
+
+	if (is_vlan_dev(lower_dev))
+		*vlan_id = vlan_dev_vlan_id(lower_dev);
+
+	/* We are interested only in first level vlan device, so
+	 * always return 1 to stop iterating over next level devices.
+	 */
+	return 1;
+}
+
+/**
+ * rdma_read_gid_l2_fields - Read the vlan ID and source MAC address
+ *			     of a GID entry.
+ *
+ * @attr:	GID attribute pointer whose L2 fields to be read
+ * @vlan_id:	Pointer to vlan id to fill up if the GID entry has
+ *		vlan id. It is optional.
+ * @smac:	Pointer to smac to fill up for a GID entry. It is optional.
+ *
+ * rdma_read_gid_l2_fields() returns 0 on success and returns vlan id
+ * (if gid entry has vlan) and source MAC, or returns error.
+ */
+int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr,
+			    u16 *vlan_id, u8 *smac)
+{
+	struct netdev_nested_priv priv = {
+		.data = (void *)vlan_id,
+	};
+	struct net_device *ndev;
+
+	rcu_read_lock();
+	ndev = rcu_dereference(attr->ndev);
+	if (!ndev) {
+		rcu_read_unlock();
+		return -ENODEV;
+	}
+	if (smac)
+		ether_addr_copy(smac, ndev->dev_addr);
+	if (vlan_id) {
+		*vlan_id = 0xffff;
+		if (is_vlan_dev(ndev)) {
+			*vlan_id = vlan_dev_vlan_id(ndev);
+		} else {
+			/* If the netdev is upper device and if it's lower
+			 * device is vlan device, consider vlan id of
+			 * the lower vlan device for this gid entry.
+			 */
+			netdev_walk_all_lower_dev_rcu(attr->ndev,
						      get_lower_dev_vlan,
						      &priv);
+		}
+	}
+	rcu_read_unlock();
+	return 0;
+}
+EXPORT_SYMBOL(rdma_read_gid_l2_fields);
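Review note: callers of rdma_read_gid_l2_fields() must already hold a reference to attr; a vlan_id of 0xffff means "no vlan". Sketch (hypothetical):

    #include <linux/if_ether.h>

    static void example_read_l2(const struct ib_gid_attr *attr)
    {
            u8 smac[ETH_ALEN];
            u16 vlan_id;

            if (rdma_read_gid_l2_fields(attr, &vlan_id, smac))
                    return;         /* no netdev attached to this GID */

            if (vlan_id != 0xffff)
                    pr_info("gid is on vlan %u\n", vlan_id);
    }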
+
+static int config_non_roce_gid_cache(struct ib_device *device,
+				     u32 port, struct ib_port_attr *tprops)
+{
+	struct ib_gid_attr gid_attr = {};
+	struct ib_gid_table *table;
 	int ret = 0;
+	int i;
 
-	if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
-		return -EINVAL;
+	gid_attr.device = device;
+	gid_attr.port_num = port;
+	table = rdma_gid_table(device, port);
 
-	read_lock_irqsave(&device->cache.lock, flags);
-	*port_state = device->cache.ports[port_num
		- rdma_start_port(device)].port_state;
-	read_unlock_irqrestore(&device->cache.lock, flags);
+	mutex_lock(&table->lock);
+	for (i = 0; i < tprops->gid_tbl_len; ++i) {
+		if (!device->ops.query_gid)
+			continue;
+		ret = device->ops.query_gid(device, port, i, &gid_attr.gid);
+		if (ret) {
+			dev_warn(&device->dev,
				 "query_gid failed (%d) for index %d\n", ret,
				 i);
+			goto err;
+		}
+
+		if (rdma_protocol_iwarp(device, port)) {
+			struct net_device *ndev;
+
+			ndev = ib_device_get_netdev(device, port);
+			if (!ndev)
+				continue;
+			RCU_INIT_POINTER(gid_attr.ndev, ndev);
+			dev_put(ndev);
+		}
+
+		gid_attr.index = i;
+		tprops->subnet_prefix =
			be64_to_cpu(gid_attr.gid.global.subnet_prefix);
+		add_modify_gid(table, &gid_attr);
+	}
+err:
+	mutex_unlock(&table->lock);
 	return ret;
 }
-EXPORT_SYMBOL(ib_get_cached_port_state);
 
-static void ib_cache_update(struct ib_device *device,
-			    u8 port,
-			    bool enforce_security)
+static int
+ib_cache_update(struct ib_device *device, u32 port, bool update_gids,
+		bool update_pkeys, bool enforce_security)
 {
 	struct ib_port_attr       *tprops = NULL;
-	struct ib_pkey_cache      *pkey_cache = NULL, *old_pkey_cache;
-	struct ib_gid_cache {
-		int             table_len;
-		union ib_gid    table[0];
-	}			  *gid_cache = NULL;
+	struct ib_pkey_cache      *pkey_cache = NULL;
+	struct ib_pkey_cache      *old_pkey_cache = NULL;
 	int                        i;
 	int                        ret;
-	struct ib_gid_table	  *table;
-	bool			   use_roce_gid_table =
					rdma_cap_roce_gid_table(device, port);
 
 	if (!rdma_is_port_valid(device, port))
-		return;
-
-	table = device->cache.ports[port - rdma_start_port(device)].gid;
+		return -EINVAL;
 
 	tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
 	if (!tprops)
-		return;
+		return -ENOMEM;
 
 	ret = ib_query_port(device, port, tprops);
 	if (ret) {
-		pr_warn("ib_query_port failed (%d) for %s\n",
			ret, device->name);
+		dev_warn(&device->dev, "ib_query_port failed (%d)\n", ret);
 		goto err;
 	}
 
-	pkey_cache = kmalloc(sizeof *pkey_cache + tprops->pkey_tbl_len *
			     sizeof *pkey_cache->table, GFP_KERNEL);
-	if (!pkey_cache)
-		goto err;
-
-	pkey_cache->table_len = tprops->pkey_tbl_len;
-
-	if (!use_roce_gid_table) {
-		gid_cache = kmalloc(sizeof(*gid_cache) + tprops->gid_tbl_len *
				    sizeof(*gid_cache->table), GFP_KERNEL);
-		if (!gid_cache)
+	if (!rdma_protocol_roce(device, port) && update_gids) {
+		ret = config_non_roce_gid_cache(device, port,
						tprops);
+		if (ret)
 			goto err;
-
-		gid_cache->table_len = tprops->gid_tbl_len;
 	}
 
-	for (i = 0; i < pkey_cache->table_len; ++i) {
-		ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
-		if (ret) {
-			pr_warn("ib_query_pkey failed (%d) for %s (index %d)\n",
				ret, device->name, i);
+	update_pkeys &= !!tprops->pkey_tbl_len;
+
+	if (update_pkeys) {
+		pkey_cache = kmalloc(struct_size(pkey_cache, table,
						 tprops->pkey_tbl_len),
				     GFP_KERNEL);
+		if (!pkey_cache) {
+			ret = -ENOMEM;
 			goto err;
 		}
-	}
 
-	if (!use_roce_gid_table) {
-		for (i = 0; i < gid_cache->table_len; ++i) {
-			ret = ib_query_gid(device, port, i,
					   gid_cache->table + i, NULL);
+		pkey_cache->table_len = tprops->pkey_tbl_len;
+
+		for (i = 0; i < pkey_cache->table_len; ++i) {
+			ret = ib_query_pkey(device, port, i,
					    pkey_cache->table + i);
 			if (ret) {
-				pr_warn("ib_query_gid failed (%d) for %s (index %d)\n",
					ret, device->name, i);
+				dev_warn(&device->dev,
					 "ib_query_pkey failed (%d) for index %d\n",
					 ret, i);
 				goto err;
 			}
 		}
 	}
 
-	write_lock_irq(&device->cache.lock);
-
-	old_pkey_cache = device->cache.ports[port -
		rdma_start_port(device)].pkey;
+	write_lock_irq(&device->cache_lock);
 
-	device->cache.ports[port - rdma_start_port(device)].pkey = pkey_cache;
-	if (!use_roce_gid_table) {
-		write_lock(&table->rwlock);
-		for (i = 0; i < gid_cache->table_len; i++) {
-			modify_gid(device, port, table, i, gid_cache->table + i,
				   &zattr, false);
-		}
-		write_unlock(&table->rwlock);
+	if (update_pkeys) {
+		old_pkey_cache = device->port_data[port].cache.pkey;
+		device->port_data[port].cache.pkey = pkey_cache;
 	}
+	device->port_data[port].cache.lmc = tprops->lmc;
+
+	if (device->port_data[port].cache.port_state != IB_PORT_NOP &&
+	    device->port_data[port].cache.port_state != tprops->state)
+		ibdev_info(device, "Port: %d Link %s\n", port,
			   ib_port_state_to_str(tprops->state));
 
-	device->cache.ports[port - rdma_start_port(device)].lmc = tprops->lmc;
-	device->cache.ports[port - rdma_start_port(device)].port_state =
		tprops->state;
+	device->port_data[port].cache.port_state = tprops->state;
 
-	device->cache.ports[port - rdma_start_port(device)].subnet_prefix =
		tprops->subnet_prefix;
-	write_unlock_irq(&device->cache.lock);
+	device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix;
+	write_unlock_irq(&device->cache_lock);
 
 	if (enforce_security)
 		ib_security_cache_change(device, port, tprops->subnet_prefix);
 
-	kfree(gid_cache);
 	kfree(old_pkey_cache);
 	kfree(tprops);
-	return;
+	return 0;
 
 err:
 	kfree(pkey_cache);
-	kfree(gid_cache);
 	kfree(tprops);
+	return ret;
 }
 
-static void ib_cache_task(struct work_struct *_work)
+static void ib_cache_event_task(struct work_struct *_work)
 {
 	struct ib_update_work *work =
 		container_of(_work, struct ib_update_work, work);
+	int ret;
+
+	/* Before distributing the cache update event, first sync
+	 * the cache.
+	 */
+	ret = ib_cache_update(work->event.device, work->event.element.port_num,
			      work->event.event == IB_EVENT_GID_CHANGE,
			      work->event.event == IB_EVENT_PKEY_CHANGE,
			      work->enforce_security);
+
+	/* GID event is notified already for individual GID entries by
+	 * dispatch_gid_change_event(). Hence, notify for rest of the
+	 * events.
+	 */
+	if (!ret && work->event.event != IB_EVENT_GID_CHANGE)
+		ib_dispatch_event_clients(&work->event);
 
-	ib_cache_update(work->device,
			work->port_num,
			work->enforce_security);
 	kfree(work);
 }
 
-static void ib_cache_event(struct ib_event_handler *handler,
			   struct ib_event *event)
+static void ib_generic_event_task(struct work_struct *_work)
+{
+	struct ib_update_work *work =
+		container_of(_work, struct ib_update_work, work);
+
+	ib_dispatch_event_clients(&work->event);
+	kfree(work);
+}
+
+static bool is_cache_update_event(const struct ib_event *event)
+{
+	return (event->event == IB_EVENT_PORT_ERR    ||
+		event->event == IB_EVENT_PORT_ACTIVE ||
+		event->event == IB_EVENT_LID_CHANGE  ||
+		event->event == IB_EVENT_PKEY_CHANGE ||
+		event->event == IB_EVENT_CLIENT_REREGISTER ||
+		event->event == IB_EVENT_GID_CHANGE);
+}
+
+/**
+ * ib_dispatch_event - Dispatch an asynchronous event
+ * @event:Event to dispatch
+ *
+ * Low-level drivers must call ib_dispatch_event() to dispatch the
+ * event to all registered event handlers when an asynchronous event
+ * occurs.
+ */
+void ib_dispatch_event(const struct ib_event *event)
 {
 	struct ib_update_work *work;
 
-	if (event->event == IB_EVENT_PORT_ERR    ||
	    event->event == IB_EVENT_PORT_ACTIVE ||
	    event->event == IB_EVENT_LID_CHANGE  ||
	    event->event == IB_EVENT_PKEY_CHANGE ||
	    event->event == IB_EVENT_SM_CHANGE   ||
	    event->event == IB_EVENT_CLIENT_REREGISTER ||
	    event->event == IB_EVENT_GID_CHANGE) {
-		work = kmalloc(sizeof *work, GFP_ATOMIC);
-		if (work) {
-			INIT_WORK(&work->work, ib_cache_task);
-			work->device   = event->device;
-			work->port_num = event->element.port_num;
-			if (event->event == IB_EVENT_PKEY_CHANGE ||
			    event->event == IB_EVENT_GID_CHANGE)
-				work->enforce_security = true;
-			else
-				work->enforce_security = false;
-
-			queue_work(ib_wq, &work->work);
-		}
-	}
+	work = kzalloc(sizeof(*work), GFP_ATOMIC);
+	if (!work)
+		return;
+
+	if (is_cache_update_event(event))
+		INIT_WORK(&work->work, ib_cache_event_task);
+	else
+		INIT_WORK(&work->work, ib_generic_event_task);
+
+	work->event = *event;
+	if (event->event == IB_EVENT_PKEY_CHANGE ||
+	    event->event == IB_EVENT_GID_CHANGE)
+		work->enforce_security = true;
+
+	queue_work(ib_wq, &work->work);
 }
+EXPORT_SYMBOL(ib_dispatch_event);
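Review note: with this change a driver's ib_dispatch_event() call never blocks on cache refresh; cache-update events are synced on ib_wq before clients are notified. Typical driver-side dispatch (sketch; device and port are the driver's own):

    static void example_report_port_active(struct ib_device *ibdev, u32 port)
    {
            struct ib_event event = {
                    .device = ibdev,
                    .element.port_num = port,
                    .event = IB_EVENT_PORT_ACTIVE,
            };

            /* PORT_ACTIVE is a cache-update event: ib_cache_event_task()
             * refreshes this port's cache, then notifies clients. */
            ib_dispatch_event(&event);
    }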
 
 int ib_cache_setup_one(struct ib_device *device)
 {
-	int p;
+	u32 p;
 	int err;
 
-	rwlock_init(&device->cache.lock);
-
-	device->cache.ports =
-		kzalloc(sizeof(*device->cache.ports) *
			(rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL);
-	if (!device->cache.ports) {
-		err = -ENOMEM;
-		goto out;
-	}
-
 	err = gid_table_setup_one(device);
 	if (err)
-		goto out;
-
-	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
-		ib_cache_update(device, p + rdma_start_port(device), true);
+		return err;
 
-	INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
			      device, ib_cache_event);
-	err = ib_register_event_handler(&device->cache.event_handler);
-	if (err)
-		goto err;
+	rdma_for_each_port (device, p) {
+		err = ib_cache_update(device, p, true, true, true);
+		if (err) {
+			gid_table_cleanup_one(device);
+			return err;
+		}
+	}
 
 	return 0;
-
-err:
-	gid_table_cleanup_one(device);
-out:
-	return err;
 }
 
 void ib_cache_release_one(struct ib_device *device)
 {
-	int p;
+	u32 p;
 
 	/*
 	 * The release function frees all the cache elements.
@@ -1235,33 +1630,25 @@ void ib_cache_release_one(struct ib_device *device)
 	 * all the device's resources when the cache could no
 	 * longer be accessed.
 	 */
-	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
-		kfree(device->cache.ports[p].pkey);
+	rdma_for_each_port (device, p)
+		kfree(device->port_data[p].cache.pkey);
 
 	gid_table_release_one(device);
-	kfree(device->cache.ports);
 }
 
 void ib_cache_cleanup_one(struct ib_device *device)
 {
-	/* The cleanup function unregisters the event handler,
-	 * waits for all in-progress workqueue elements and cleans
-	 * up the GID cache. This function should be called after
-	 * the device was removed from the devices list and all
-	 * clients were removed, so the cache exists but is
+	/* The cleanup function waits for all in-progress workqueue
+	 * elements and cleans up the GID cache. This function should be
+	 * called after the device was removed from the devices list and
+	 * all clients were removed, so the cache exists but is
 	 * non-functional and shouldn't be updated anymore.
 	 */
-	ib_unregister_event_handler(&device->cache.event_handler);
 	flush_workqueue(ib_wq);
 	gid_table_cleanup_one(device);
-}
 
-void __init ib_cache_setup(void)
-{
-	roce_gid_mgmt_init();
-}
-
-void __exit ib_cache_cleanup(void)
-{
-	roce_gid_mgmt_cleanup();
+	/*
+	 * Flush the wq second time for any pending GID delete work.
+	 */
+	flush_workqueue(ib_wq);
 }
