Diffstat (limited to 'drivers/infiniband/hw/hfi1/user_exp_rcv.c')
-rw-r--r--	drivers/infiniband/hw/hfi1/user_exp_rcv.c	| 779
1 file changed, 338 insertions(+), 441 deletions(-)
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
index a8f0aa4722f6..62b4f16dab27 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
@@ -1,212 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 /*
- * Copyright(c) 2015-2017 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
+ * Copyright(c) 2020 Cornelis Networks, Inc.
+ * Copyright(c) 2015-2018 Intel Corporation.
  */
 #include <asm/page.h>
 #include <linux/string.h>
 
+#include "mmu_rb.h"
 #include "user_exp_rcv.h"
 #include "trace.h"
-#include "mmu_rb.h"
-
-struct tid_group {
-	struct list_head list;
-	u32 base;
-	u8 size;
-	u8 used;
-	u8 map;
-};
-
-struct tid_rb_node {
-	struct mmu_rb_node mmu;
-	unsigned long phys;
-	struct tid_group *grp;
-	u32 rcventry;
-	dma_addr_t dma_addr;
-	bool freed;
-	unsigned npages;
-	struct page *pages[0];
-};
-
-struct tid_pageset {
-	u16 idx;
-	u16 count;
-};
-
-#define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list))
-
-#define num_user_pages(vaddr, len)				       \
-	(1 + (((((unsigned long)(vaddr) +			       \
-		 (unsigned long)(len) - 1) & PAGE_MASK) -	       \
-	       ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))
-
 static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
 			    struct exp_tid_set *set,
 			    struct hfi1_filedata *fd);
-static u32 find_phys_blocks(struct page **pages, unsigned npages,
-			    struct tid_pageset *list);
-static int set_rcvarray_entry(struct hfi1_filedata *fd, unsigned long vaddr,
+static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
+static int set_rcvarray_entry(struct hfi1_filedata *fd,
+			      struct tid_user_buf *tbuf,
 			      u32 rcventry, struct tid_group *grp,
-			      struct page **pages, unsigned npages);
-static int tid_rb_insert(void *arg, struct mmu_rb_node *node);
+			      u16 pageidx, unsigned int npages);
 static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
 				    struct tid_rb_node *tnode);
-static void tid_rb_remove(void *arg, struct mmu_rb_node *node);
-static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode);
-static int program_rcvarray(struct hfi1_filedata *fd, unsigned long vaddr,
-			    struct tid_group *grp, struct tid_pageset *sets,
-			    unsigned start, u16 count, struct page **pages,
-			    u32 *tidlist, unsigned *tididx, unsigned *pmapped);
-static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
-			      struct tid_group **grp);
+static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
+			      const struct mmu_notifier_range *range,
+			      unsigned long cur_seq);
+static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
+				 const struct mmu_notifier_range *range,
+				 unsigned long cur_seq);
+static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
+			    struct tid_group *grp, u16 count,
+			    u32 *tidlist, unsigned int *tididx,
+			    unsigned int *pmapped);
+static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo);
+static void __clear_tid_node(struct hfi1_filedata *fd,
+			     struct tid_rb_node *node);
 static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
 
-static struct mmu_rb_ops tid_rb_ops = {
-	.insert = tid_rb_insert,
-	.remove = tid_rb_remove,
-	.invalidate = tid_rb_invalidate
+static const struct mmu_interval_notifier_ops tid_mn_ops = {
+	.invalidate = tid_rb_invalidate,
+};
+static const struct mmu_interval_notifier_ops tid_cover_ops = {
+	.invalidate = tid_cover_invalidate,
 };
-
-static inline u32 rcventry2tidinfo(u32 rcventry)
-{
-	u32 pair = rcventry & ~0x1;
-
-	return EXP_TID_SET(IDX, pair >> 1) |
-		EXP_TID_SET(CTRL, 1 << (rcventry - pair));
-}
-
-static inline void exp_tid_group_init(struct exp_tid_set *set)
-{
-	INIT_LIST_HEAD(&set->list);
-	set->count = 0;
-}
-
-static inline void tid_group_remove(struct tid_group *grp,
-				    struct exp_tid_set *set)
-{
-	list_del_init(&grp->list);
-	set->count--;
-}
-
-static inline void tid_group_add_tail(struct tid_group *grp,
-				      struct exp_tid_set *set)
-{
-	list_add_tail(&grp->list, &set->list);
-	set->count++;
-}
-
-static inline struct tid_group *tid_group_pop(struct exp_tid_set *set)
-{
-	struct tid_group *grp =
-		list_first_entry(&set->list, struct tid_group, list);
-	list_del_init(&grp->list);
-	set->count--;
-	return grp;
-}
-
-static inline void tid_group_move(struct tid_group *group,
-				  struct exp_tid_set *s1,
-				  struct exp_tid_set *s2)
-{
-	tid_group_remove(group, s1);
-	tid_group_add_tail(group, s2);
-}
-
-int hfi1_user_exp_rcv_grp_init(struct hfi1_filedata *fd)
-{
-	struct hfi1_ctxtdata *uctxt = fd->uctxt;
-	struct hfi1_devdata *dd = fd->dd;
-	u32 tidbase;
-	u32 i;
-	struct tid_group *grp, *gptr;
-
-	exp_tid_group_init(&uctxt->tid_group_list);
-	exp_tid_group_init(&uctxt->tid_used_list);
-	exp_tid_group_init(&uctxt->tid_full_list);
-
-	tidbase = uctxt->expected_base;
-	for (i = 0; i < uctxt->expected_count /
-		     dd->rcv_entries.group_size; i++) {
-		grp = kzalloc(sizeof(*grp), GFP_KERNEL);
-		if (!grp)
-			goto grp_failed;
-
-		grp->size = dd->rcv_entries.group_size;
-		grp->base = tidbase;
-		tid_group_add_tail(grp, &uctxt->tid_group_list);
-		tidbase += dd->rcv_entries.group_size;
-	}
-
-	return 0;
-
-grp_failed:
-	list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list,
-				 list) {
-		list_del_init(&grp->list);
-		kfree(grp);
-	}
-
-	return -ENOMEM;
-}
 
 /*
  * Initialize context and file private data needed for Expected
  * receive caching. This needs to be done after the context has
  * been configured with the eager/expected RcvEntry counts.
  */
-int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd)
+int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
+			   struct hfi1_ctxtdata *uctxt)
 {
-	struct hfi1_ctxtdata *uctxt = fd->uctxt;
-	struct hfi1_devdata *dd = uctxt->dd;
 	int ret = 0;
 
-	spin_lock_init(&fd->tid_lock);
-	spin_lock_init(&fd->invalid_lock);
-
 	fd->entry_to_rb = kcalloc(uctxt->expected_count,
-				  sizeof(struct rb_node *),
+				  sizeof(*fd->entry_to_rb),
 				  GFP_KERNEL);
 	if (!fd->entry_to_rb)
 		return -ENOMEM;
@@ -221,20 +68,7 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd)
 		fd->entry_to_rb = NULL;
 		return -ENOMEM;
 	}
-
-	/*
-	 * Register MMU notifier callbacks. If the registration
-	 * fails, continue without TID caching for this context.
-	 */
-	ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops,
-				   dd->pport->hfi1_wq,
-				   &fd->handler);
-	if (ret) {
-		dd_dev_info(dd,
-			    "Failed MMU notifier registration %d\n",
-			    ret);
-		ret = 0;
-	}
+		fd->use_mn = true;
 	}
 
 	/*
@@ -251,7 +85,7 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd)
 	 * init.
 	 */
 	spin_lock(&fd->tid_lock);
-	if (uctxt->subctxt_cnt && fd->handler) {
+	if (uctxt->subctxt_cnt && fd->use_mn) {
 		u16 remainder;
 
 		fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
@@ -266,34 +100,16 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd)
 	return ret;
 }
 
-void hfi1_user_exp_rcv_grp_free(struct hfi1_ctxtdata *uctxt)
-{
-	struct tid_group *grp, *gptr;
-
-	list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list,
-				 list) {
-		list_del_init(&grp->list);
-		kfree(grp);
-	}
-	hfi1_clear_tids(uctxt);
-}
-
 void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
 {
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 
-	/*
-	 * The notifier would have been removed when the process'es mm
-	 * was freed.
-	 */
-	if (fd->handler) {
-		hfi1_mmu_rb_unregister(fd->handler);
-	} else {
-		if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
-			unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
-		if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
-			unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
-	}
+	mutex_lock(&uctxt->exp_mutex);
+	if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
+		unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
+	if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
+		unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
+	mutex_unlock(&uctxt->exp_mutex);
 
 	kfree(fd->invalid_tids);
 	fd->invalid_tids = NULL;
@@ -303,20 +119,79 @@ void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
 }
 
 /*
- * Write an "empty" RcvArray entry.
- * This function exists so the TID registaration code can use it
- * to write to unused/unneeded entries and still take advantage
- * of the WC performance improvements. The HFI will ignore this
- * write to the RcvArray entry.
+ * Release pinned receive buffer pages.
+ *
+ * @mapped: true if the pages have been DMA mapped. false otherwise.
+ * @idx: Index of the first page to unpin.
+ * @npages: No of pages to unpin.
+ *
+ * If the pages have been DMA mapped (indicated by mapped parameter), their
+ * info will be passed via a struct tid_rb_node. If they haven't been mapped,
+ * their info will be passed via a struct tid_user_buf.
  */
-static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
+static void unpin_rcv_pages(struct hfi1_filedata *fd,
+			    struct tid_user_buf *tidbuf,
+			    struct tid_rb_node *node,
+			    unsigned int idx,
+			    unsigned int npages,
+			    bool mapped)
 {
+	struct page **pages;
+	struct hfi1_devdata *dd = fd->uctxt->dd;
+	struct mm_struct *mm;
+
+	if (mapped) {
+		dma_unmap_single(&dd->pcidev->dev, node->dma_addr,
+				 node->npages * PAGE_SIZE, DMA_FROM_DEVICE);
+		pages = &node->pages[idx];
+		mm = mm_from_tid_node(node);
+	} else {
+		pages = &tidbuf->pages[idx];
+		mm = current->mm;
+	}
+	hfi1_release_user_pages(mm, pages, npages, mapped);
+	fd->tid_n_pinned -= npages;
+}
+
+/*
+ * Pin receive buffer pages.
+ */
+static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
+{
+	int pinned;
+	unsigned int npages = tidbuf->npages;
+	unsigned long vaddr = tidbuf->vaddr;
+	struct page **pages = NULL;
+	struct hfi1_devdata *dd = fd->uctxt->dd;
+
+	if (npages > fd->uctxt->expected_count) {
+		dd_dev_err(dd, "Expected buffer too big\n");
+		return -EINVAL;
+	}
+
+	/* Allocate the array of struct page pointers needed for pinning */
+	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
 	/*
-	 * Doing the WC fill writes only makes sense if the device is
-	 * present and the RcvArray has been mapped as WC memory.
+	 * Pin all the pages of the user buffer. If we can't pin all the
+	 * pages, accept the amount pinned so far and program only that.
+	 * User space knows how to deal with partially programmed buffers.
 	 */
-	if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc)
-		writeq(0, dd->rcvarray_wc + (index * 8));
+	if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
+		kfree(pages);
+		return -ENOMEM;
+	}
+
+	pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
+	if (pinned <= 0) {
+		kfree(pages);
+		return pinned;
+	}
+	tidbuf->pages = pages;
+	fd->tid_n_pinned += pinned;
+	return pinned;
 }
 
 /*
@@ -374,83 +249,70 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
 	int ret = 0, need_group = 0, pinned;
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_devdata *dd = uctxt->dd;
-	unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets,
+	unsigned int ngroups, pageset_count,
 		tididx = 0, mapped, mapped_pages = 0;
-	unsigned long vaddr = tinfo->vaddr;
-	struct page **pages = NULL;
 	u32 *tidlist = NULL;
-	struct tid_pageset *pagesets = NULL;
+	struct tid_user_buf *tidbuf;
+	unsigned long mmu_seq = 0;
 
-	/* Get the number of pages the user buffer spans */
-	npages = num_user_pages(vaddr, tinfo->length);
-	if (!npages)
+	if (!PAGE_ALIGNED(tinfo->vaddr))
 		return -EINVAL;
-
-	if (npages > uctxt->expected_count) {
-		dd_dev_err(dd, "Expected buffer too big\n");
+	if (tinfo->length == 0)
 		return -EINVAL;
-	}
-
-	/* Verify that access is OK for the user buffer */
-	if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
-		       npages * PAGE_SIZE)) {
-		dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
-			   (void *)vaddr, npages);
-		return -EFAULT;
-	}
 
-	pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets),
-			   GFP_KERNEL);
-	if (!pagesets)
+	tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
+	if (!tidbuf)
 		return -ENOMEM;
 
-	/* Allocate the array of struct page pointers needed for pinning */
-	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
-	if (!pages) {
+	mutex_init(&tidbuf->cover_mutex);
+	tidbuf->vaddr = tinfo->vaddr;
+	tidbuf->length = tinfo->length;
+	tidbuf->npages = num_user_pages(tidbuf->vaddr, tidbuf->length);
+	tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
+				GFP_KERNEL);
+	if (!tidbuf->psets) {
 		ret = -ENOMEM;
-		goto bail;
+		goto fail_release_mem;
 	}
 
-	/*
-	 * Pin all the pages of the user buffer. If we can't pin all the
-	 * pages, accept the amount pinned so far and program only that.
-	 * User space knows how to deal with partially programmed buffers.
-	 */
-	if (!hfi1_can_pin_pages(dd, fd->mm, fd->tid_n_pinned, npages)) {
-		ret = -ENOMEM;
-		goto bail;
+	if (fd->use_mn) {
+		ret = mmu_interval_notifier_insert(
+			&tidbuf->notifier, current->mm,
+			tidbuf->vaddr, tidbuf->npages * PAGE_SIZE,
+			&tid_cover_ops);
+		if (ret)
+			goto fail_release_mem;
+		mmu_seq = mmu_interval_read_begin(&tidbuf->notifier);
 	}
 
-	pinned = hfi1_acquire_user_pages(fd->mm, vaddr, npages, true, pages);
+	pinned = pin_rcv_pages(fd, tidbuf);
 	if (pinned <= 0) {
-		ret = pinned;
-		goto bail;
+		ret = (pinned < 0) ? pinned : -ENOSPC;
+		goto fail_unpin;
 	}
-	fd->tid_n_pinned += npages;
 
 	/* Find sets of physically contiguous pages */
-	npagesets = find_phys_blocks(pages, pinned, pagesets);
+	tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);
 
-	/*
-	 * We don't need to access this under a lock since tid_used is per
-	 * process and the same process cannot be in hfi1_user_exp_rcv_clear()
-	 * and hfi1_user_exp_rcv_setup() at the same time.
-	 */
+	/* Reserve the number of expected tids to be used. */
 	spin_lock(&fd->tid_lock);
-	if (fd->tid_used + npagesets > fd->tid_limit)
+	if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
 		pageset_count = fd->tid_limit - fd->tid_used;
 	else
-		pageset_count = npagesets;
+		pageset_count = tidbuf->n_psets;
+	fd->tid_used += pageset_count;
 	spin_unlock(&fd->tid_lock);
 
-	if (!pageset_count)
-		goto bail;
+	if (!pageset_count) {
+		ret = -ENOSPC;
+		goto fail_unreserve;
+	}
 
 	ngroups = pageset_count / dd->rcv_entries.group_size;
 	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
 	if (!tidlist) {
 		ret = -ENOMEM;
-		goto nomem;
+		goto fail_unreserve;
 	}
 
 	tididx = 0;
@@ -459,7 +321,7 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
 	 * From this point on, we are going to be using shared (between master
 	 * and subcontexts) context resources. We need to take the lock.
 	 */
-	mutex_lock(&uctxt->exp_lock);
+	mutex_lock(&uctxt->exp_mutex);
 	/*
 	 * The first step is to program the RcvArray entries which are complete
 	 * groups.
@@ -468,9 +330,9 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
 		struct tid_group *grp =
 			tid_group_pop(&uctxt->tid_group_list);
 
-		ret = program_rcvarray(fd, vaddr, grp, pagesets,
-				       pageidx, dd->rcv_entries.group_size,
-				       pages, tidlist, &tididx, &mapped);
+		ret = program_rcvarray(fd, tidbuf, grp,
+				       dd->rcv_entries.group_size,
+				       tidlist, &tididx, &mapped);
 		/*
 		 * If there was a failure to program the RcvArray
 		 * entries for the entire group, reset the grp fields
@@ -485,11 +347,10 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
 		tid_group_add_tail(grp, &uctxt->tid_full_list);
 		ngroups--;
 
-		pageidx += ret;
 		mapped_pages += mapped;
 	}
 
-	while (pageidx < pageset_count) {
+	while (tididx < pageset_count) {
 		struct tid_group *grp, *ptr;
 		/*
 		 * If we don't have any partially used tid groups, check
@@ -511,28 +372,26 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
 		 */
 		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
 					 list) {
-			unsigned use = min_t(unsigned, pageset_count - pageidx,
+			unsigned use = min_t(unsigned, pageset_count - tididx,
 					     grp->size - grp->used);
 
-			ret = program_rcvarray(fd, vaddr, grp, pagesets,
-					       pageidx, use, pages, tidlist,
+			ret = program_rcvarray(fd, tidbuf, grp,
+					       use, tidlist,
 					       &tididx, &mapped);
 			if (ret < 0) {
 				hfi1_cdbg(TID,
 					  "Failed to program RcvArray entries %d",
 					  ret);
-				ret = -EFAULT;
 				goto unlock;
 			} else if (ret > 0) {
 				if (grp->used == grp->size)
 					tid_group_move(grp,
 						       &uctxt->tid_used_list,
 						       &uctxt->tid_full_list);
-				pageidx += ret;
 				mapped_pages += mapped;
 				need_group = 0;
 				/* Check if we are done so we break out early */
-				if (pageidx >= pageset_count)
+				if (tididx >= pageset_count)
 					break;
 			} else if (WARN_ON(ret == 0)) {
 				/*
@@ -546,46 +405,79 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
 			}
 		}
 	}
unlock:
-	mutex_unlock(&uctxt->exp_lock);
-nomem:
+	mutex_unlock(&uctxt->exp_mutex);
 	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
 		  mapped_pages, ret);
-	if (tididx) {
-		spin_lock(&fd->tid_lock);
-		fd->tid_used += tididx;
-		spin_unlock(&fd->tid_lock);
-		tinfo->tidcnt = tididx;
-		tinfo->length = mapped_pages * PAGE_SIZE;
-
-		if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
-				 tidlist, sizeof(tidlist[0]) * tididx)) {
-			/*
-			 * On failure to copy to the user level, we need to undo
-			 * everything done so far so we don't leak resources.
-			 */
-			tinfo->tidlist = (unsigned long)&tidlist;
-			hfi1_user_exp_rcv_clear(fd, tinfo);
-			tinfo->tidlist = 0;
-			ret = -EFAULT;
-			goto bail;
+
+	/* fail if nothing was programmed, set error if none provided */
+	if (tididx == 0) {
+		if (ret >= 0)
+			ret = -ENOSPC;
+		goto fail_unreserve;
+	}
+
+	/* adjust reserved tid_used to actual count */
+	spin_lock(&fd->tid_lock);
+	fd->tid_used -= pageset_count - tididx;
+	spin_unlock(&fd->tid_lock);
+
+	/* unpin all pages not covered by a TID */
+	unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages,
+			false);
+
+	if (fd->use_mn) {
+		/* check for an invalidate during setup */
+		bool fail = false;
+
+		mutex_lock(&tidbuf->cover_mutex);
+		fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq);
+		mutex_unlock(&tidbuf->cover_mutex);
+
+		if (fail) {
+			ret = -EBUSY;
+			goto fail_unprogram;
 		}
 	}
-	/*
-	 * If not everything was mapped (due to insufficient RcvArray entries,
-	 * for example), unpin all unmapped pages so we can pin them nex time.
-	 */
-	if (mapped_pages != pinned) {
-		hfi1_release_user_pages(fd->mm, &pages[mapped_pages],
-					pinned - mapped_pages,
-					false);
-		fd->tid_n_pinned -= pinned - mapped_pages;
+
+	tinfo->tidcnt = tididx;
+	tinfo->length = mapped_pages * PAGE_SIZE;
+
+	if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
+			 tidlist, sizeof(tidlist[0]) * tididx)) {
+		ret = -EFAULT;
+		goto fail_unprogram;
 	}
-bail:
-	kfree(pagesets);
-	kfree(pages);
+
+	if (fd->use_mn)
+		mmu_interval_notifier_remove(&tidbuf->notifier);
+	kfree(tidbuf->pages);
+	kfree(tidbuf->psets);
+	kfree(tidbuf);
 	kfree(tidlist);
-	return ret > 0 ? 0 : ret;
+	return 0;
+
+fail_unprogram:
+	/* unprogram, unmap, and unpin all allocated TIDs */
+	tinfo->tidlist = (unsigned long)tidlist;
+	hfi1_user_exp_rcv_clear(fd, tinfo);
+	tinfo->tidlist = 0;
+	pinned = 0;		/* nothing left to unpin */
+	pageset_count = 0;	/* nothing left reserved */
+fail_unreserve:
+	spin_lock(&fd->tid_lock);
+	fd->tid_used -= pageset_count;
+	spin_unlock(&fd->tid_lock);
+fail_unpin:
+	if (fd->use_mn)
+		mmu_interval_notifier_remove(&tidbuf->notifier);
+	if (pinned > 0)
+		unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false);
+fail_release_mem:
+	kfree(tidbuf->pages);
+	kfree(tidbuf->psets);
+	kfree(tidbuf);
+	kfree(tidlist);
+	return ret;
 }
 
 int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
@@ -599,14 +491,14 @@ int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
 	if (unlikely(tinfo->tidcnt > fd->tid_used))
 		return -EINVAL;
 
-	tidinfo = memdup_user((void __user *)(unsigned long)tinfo->tidlist,
-			      sizeof(tidinfo[0]) * tinfo->tidcnt);
+	tidinfo = memdup_array_user(u64_to_user_ptr(tinfo->tidlist),
+				    tinfo->tidcnt, sizeof(tidinfo[0]));
 	if (IS_ERR(tidinfo))
 		return PTR_ERR(tidinfo);
 
-	mutex_lock(&uctxt->exp_lock);
+	mutex_lock(&uctxt->exp_mutex);
 	for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
-		ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
+		ret = unprogram_rcvarray(fd, tidinfo[tididx]);
 		if (ret) {
 			hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
 				  ret);
@@ -617,7 +509,7 @@ int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
 	fd->tid_used -= tididx;
 	spin_unlock(&fd->tid_lock);
 	tinfo->tidcnt = tididx;
-	mutex_unlock(&uctxt->exp_lock);
+	mutex_unlock(&uctxt->exp_mutex);
 
 	kfree(tidinfo);
 	return ret;
@@ -628,14 +520,10 @@ int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
 {
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	unsigned long *ev = uctxt->dd->events +
-		(((uctxt->ctxt - uctxt->dd->first_dyn_alloc_ctxt) *
-		  HFI1_MAX_SHARED_CTXTS) + fd->subctxt);
+		(uctxt_offset(uctxt) + fd->subctxt);
 	u32 *array;
 	int ret = 0;
 
-	if (!fd->invalid_tids)
-		return -EINVAL;
-
 	/*
 	 * copy_to_user() can sleep, which will leave the invalid_lock
 	 * locked and cause the MMU notifier to be blocked on the lock
@@ -674,11 +562,12 @@ int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
 	return ret;
 }
 
-static u32 find_phys_blocks(struct page **pages, unsigned npages,
-			    struct tid_pageset *list)
+static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
 {
 	unsigned pagecount, pageidx, setcount = 0, i;
 	unsigned long pfn, this_pfn;
+	struct page **pages = tidbuf->pages;
+	struct tid_pageset *list = tidbuf->psets;
 
 	if (!npages)
 		return 0;
@@ -741,13 +630,12 @@ static u32 find_phys_blocks(struct page **pages, unsigned npages,
 /**
  * program_rcvarray() - program an RcvArray group with receive buffers
  * @fd: filedata pointer
- * @vaddr: starting user virtual address
+ * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
+ *	  virtual address, buffer length, page pointers, pagesets (array of
+ *	  struct tid_pageset holding information on physically contiguous
+ *	  chunks from the user buffer), and other fields.
 * @grp: RcvArray group
- * @sets: array of struct tid_pageset holding information on physically
- *        contiguous chunks from the user buffer
- * @start: starting index into sets array
 * @count: number of struct tid_pageset's to program
- * @pages: an array of struct page * for the user buffer
 * @tidlist: the array of u32 elements when the information about the
 *           programmed RcvArray entries is to be encoded.
 * @tididx: starting offset into tidlist
@@ -765,15 +653,15 @@ static u32 find_phys_blocks(struct page **pages, unsigned npages,
 *         -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 *         number of RcvArray entries programmed.
 */
-static int program_rcvarray(struct hfi1_filedata *fd, unsigned long vaddr,
-			    struct tid_group *grp,
-			    struct tid_pageset *sets,
-			    unsigned start, u16 count, struct page **pages,
-			    u32 *tidlist, unsigned *tididx, unsigned *pmapped)
+static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
+			    struct tid_group *grp, u16 count,
+			    u32 *tidlist, unsigned int *tididx,
+			    unsigned int *pmapped)
 {
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_devdata *dd = uctxt->dd;
 	u16 idx;
+	unsigned int start = *tididx;
 	u32 tidinfo = 0, rcventry, useidx = 0;
 	int mapped = 0;
 
@@ -808,18 +696,17 @@ static int program_rcvarray(struct hfi1_filedata *fd, unsigned long vaddr,
 		}
 
 		rcventry = grp->base + useidx;
-		npages = sets[setidx].count;
-		pageidx = sets[setidx].idx;
+		npages = tbuf->psets[setidx].count;
+		pageidx = tbuf->psets[setidx].idx;
 
-		ret = set_rcvarray_entry(fd, vaddr + (pageidx * PAGE_SIZE),
-					 rcventry, grp, pages + pageidx,
+		ret = set_rcvarray_entry(fd, tbuf,
+					 rcventry, grp, pageidx,
 					 npages);
 		if (ret)
 			return ret;
 		mapped += npages;
 
-		tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
-			EXP_TID_SET(LEN, npages);
+		tidinfo = create_tid(rcventry - uctxt->expected_base, npages);
 		tidlist[(*tididx)++] = tidinfo;
 		grp->used++;
 		grp->map |= 1 << useidx++;
@@ -833,28 +720,28 @@ static int program_rcvarray(struct hfi1_filedata *fd, unsigned long vaddr,
 	return idx;
 }
 
-static int set_rcvarray_entry(struct hfi1_filedata *fd, unsigned long vaddr,
+static int set_rcvarray_entry(struct hfi1_filedata *fd,
+			      struct tid_user_buf *tbuf,
 			      u32 rcventry, struct tid_group *grp,
-			      struct page **pages, unsigned npages)
+			      u16 pageidx, unsigned int npages)
 {
 	int ret;
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct tid_rb_node *node;
 	struct hfi1_devdata *dd = uctxt->dd;
 	dma_addr_t phys;
+	struct page **pages = tbuf->pages + pageidx;
 
 	/*
 	 * Allocate the node first so we can handle a potential
 	 * failure before we've programmed anything.
 	 */
-	node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
-		       GFP_KERNEL);
+	node = kzalloc(struct_size(node, pages, npages), GFP_KERNEL);
 	if (!node)
 		return -ENOMEM;
 
-	phys = pci_map_single(dd->pcidev,
-			      __va(page_to_phys(pages[0])),
-			      npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
+	phys = dma_map_single(&dd->pcidev->dev, __va(page_to_phys(pages[0])),
+			      npages * PAGE_SIZE, DMA_FROM_DEVICE);
 	if (dma_mapping_error(&dd->pcidev->dev, phys)) {
 		dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
 			   phys);
@@ -862,90 +749,100 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd, unsigned long vaddr,
 		return -EFAULT;
 	}
 
-	node->mmu.addr = vaddr;
-	node->mmu.len = npages * PAGE_SIZE;
+	node->fdata = fd;
+	mutex_init(&node->invalidate_mutex);
 	node->phys = page_to_phys(pages[0]);
 	node->npages = npages;
 	node->rcventry = rcventry;
 	node->dma_addr = phys;
 	node->grp = grp;
 	node->freed = false;
-	memcpy(node->pages, pages, sizeof(struct page *) * npages);
+	memcpy(node->pages, pages, flex_array_size(node, pages, npages));
 
-	if (!fd->handler)
-		ret = tid_rb_insert(fd, &node->mmu);
-	else
-		ret = hfi1_mmu_rb_insert(fd->handler, &node->mmu);
-
-	if (ret) {
-		hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
-			  node->rcventry, node->mmu.addr, node->phys, ret);
-		pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
-				 PCI_DMA_FROMDEVICE);
-		kfree(node);
-		return -EFAULT;
+	if (fd->use_mn) {
+		ret = mmu_interval_notifier_insert(
+			&node->notifier, current->mm,
+			tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
+			&tid_mn_ops);
+		if (ret)
+			goto out_unmap;
 	}
+	fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;
 
 	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
 	trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
-			       node->mmu.addr, node->phys, phys);
+			       node->notifier.interval_tree.start, node->phys,
+			       phys);
 	return 0;
+
+out_unmap:
+	hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
+		  node->rcventry, node->notifier.interval_tree.start,
+		  node->phys, ret);
+	dma_unmap_single(&dd->pcidev->dev, phys, npages * PAGE_SIZE,
+			 DMA_FROM_DEVICE);
+	kfree(node);
+	return -EFAULT;
 }
 
-static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
-			      struct tid_group **grp)
+static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo)
 {
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_devdata *dd = uctxt->dd;
 	struct tid_rb_node *node;
-	u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
+	u32 tidctrl = EXP_TID_GET(tidinfo, CTRL);
 	u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
 
-	if (tididx >= uctxt->expected_count) {
-		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
-			   tididx, uctxt->ctxt);
-		return -EINVAL;
-	}
-
-	if (tidctrl == 0x3)
+	if (tidctrl == 0x3 || tidctrl == 0x0)
 		return -EINVAL;
 
 	rcventry = tididx + (tidctrl - 1);
 
+	if (rcventry >= uctxt->expected_count) {
+		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
+			   rcventry, uctxt->ctxt);
+		return -EINVAL;
+	}
+
 	node = fd->entry_to_rb[rcventry];
 	if (!node || node->rcventry != (uctxt->expected_base + rcventry))
 		return -EBADF;
 
-	if (grp)
-		*grp = node->grp;
-
-	if (!fd->handler)
-		cacheless_tid_rb_remove(fd, node);
-	else
-		hfi1_mmu_rb_remove(fd->handler, &node->mmu);
+	if (fd->use_mn)
+		mmu_interval_notifier_remove(&node->notifier);
+	cacheless_tid_rb_remove(fd, node);
 
 	return 0;
 }
 
-static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
+static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
 {
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_devdata *dd = uctxt->dd;
 
+	mutex_lock(&node->invalidate_mutex);
+	if (node->freed)
+		goto done;
+	node->freed = true;
+
 	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
-				 node->npages, node->mmu.addr, node->phys,
+				 node->npages,
+				 node->notifier.interval_tree.start, node->phys,
 				 node->dma_addr);
 
-	hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0);
-	/*
-	 * Make sure device has seen the write before we unpin the
-	 * pages.
-	 */
-	flush_wc();
+	/* Make sure device has seen the write before pages are unpinned */
+	hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);
+
+	unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);
+done:
+	mutex_unlock(&node->invalidate_mutex);
+}
+
+static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
+{
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 
-	pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len,
-			 PCI_DMA_FROMDEVICE);
-	hfi1_release_user_pages(fd->mm, node->pages, node->npages, true);
-	fd->tid_n_pinned -= node->npages;
+	__clear_tid_node(fd, node);
 
 	node->grp->used--;
 	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
@@ -983,39 +880,43 @@ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
 			if (!node || node->rcventry != rcventry)
 				continue;
 
+			if (fd->use_mn)
+				mmu_interval_notifier_remove(
+							&node->notifier);
 			cacheless_tid_rb_remove(fd, node);
 		}
 	}
 }
 
-/*
- * Always return 0 from this function. A non-zero return indicates that the
- * remove operation will be called and that memory should be unpinned.
- * However, the driver cannot unpin out from under PSM. Instead, retain the
- * memory (by returning 0) and inform PSM that the memory is going away. PSM
- * will call back later when it has removed the memory from its list.
- */
-static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
+static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
+			      const struct mmu_notifier_range *range,
+			      unsigned long cur_seq)
 {
-	struct hfi1_filedata *fdata = arg;
-	struct hfi1_ctxtdata *uctxt = fdata->uctxt;
 	struct tid_rb_node *node =
-		container_of(mnode, struct tid_rb_node, mmu);
+		container_of(mni, struct tid_rb_node, notifier);
+	struct hfi1_filedata *fdata = node->fdata;
+	struct hfi1_ctxtdata *uctxt = fdata->uctxt;
 
 	if (node->freed)
-		return 0;
+		return true;
 
-	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->mmu.addr,
+	/* take action only if unmapping */
+	if (range->event != MMU_NOTIFY_UNMAP)
+		return true;
+
+	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
+				 node->notifier.interval_tree.start,
 				 node->rcventry, node->npages, node->dma_addr);
-	node->freed = true;
+
+	/* clear the hardware rcvarray entry */
+	__clear_tid_node(fdata, node);
 
 	spin_lock(&fdata->invalid_lock);
 	if (fdata->invalid_tid_idx < uctxt->expected_count) {
 		fdata->invalid_tids[fdata->invalid_tid_idx] =
-			rcventry2tidinfo(node->rcventry - uctxt->expected_base);
-		fdata->invalid_tids[fdata->invalid_tid_idx] |=
-			EXP_TID_SET(LEN, node->npages);
+			create_tid(node->rcventry - uctxt->expected_base,
+				   node->npages);
 		if (!fdata->invalid_tid_idx) {
 			unsigned long *ev;
 
@@ -1029,25 +930,30 @@ static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
 			 * process in question.
 			 */
 			ev = uctxt->dd->events +
-				(((uctxt->ctxt - uctxt->dd->first_dyn_alloc_ctxt) *
-				  HFI1_MAX_SHARED_CTXTS) + fdata->subctxt);
+				(uctxt_offset(uctxt) + fdata->subctxt);
 			set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
 		}
 		fdata->invalid_tid_idx++;
 	}
 	spin_unlock(&fdata->invalid_lock);
-	return 0;
+	return true;
 }
 
-static int tid_rb_insert(void *arg, struct mmu_rb_node *node)
+static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
+				 const struct mmu_notifier_range *range,
+				 unsigned long cur_seq)
 {
-	struct hfi1_filedata *fdata = arg;
-	struct tid_rb_node *tnode =
-		container_of(node, struct tid_rb_node, mmu);
-	u32 base = fdata->uctxt->expected_base;
+	struct tid_user_buf *tidbuf =
+		container_of(mni, struct tid_user_buf, notifier);
+
+	/* take action only if unmapping */
+	if (range->event == MMU_NOTIFY_UNMAP) {
+		mutex_lock(&tidbuf->cover_mutex);
+		mmu_interval_set_seq(mni, cur_seq);
+		mutex_unlock(&tidbuf->cover_mutex);
+	}
 
-	fdata->entry_to_rb[tnode->rcventry - base] = tnode;
-	return 0;
+	return true;
 }
 
 static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
@@ -1058,12 +964,3 @@ static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
 	fdata->entry_to_rb[tnode->rcventry - base] = NULL;
 	clear_tid_node(fdata, tnode);
 }
-
-static void tid_rb_remove(void *arg, struct mmu_rb_node *node)
-{
-	struct hfi1_filedata *fdata = arg;
-	struct tid_rb_node *tnode =
-		container_of(node, struct tid_rb_node, mmu);
-
-	cacheless_tid_rb_remove(fdata, tnode);
-}
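The functional core of this change is visible in the hunks above: the driver drops its private mmu_rb rbtree callbacks (tid_rb_insert/tid_rb_remove/tid_rb_invalidate) and registers directly with the kernel's mmu_interval_notifier API, both per TID node (tid_mn_ops) and as a "cover" range over the whole user buffer (tid_cover_ops). A minimal sketch of the insert/read_begin/read_retry pattern that the new hfi1_user_exp_rcv_setup() follows is shown below; my_buf, my_invalidate(), my_ops, and my_setup() are hypothetical names, and the real code additionally serializes the retry check against the invalidate callback with cover_mutex.

#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <linux/errno.h>

struct my_buf {
	struct mmu_interval_notifier notifier;
};

/* Called by the mm core whenever any part of the interval changes. */
static bool my_invalidate(struct mmu_interval_notifier *mni,
			  const struct mmu_notifier_range *range,
			  unsigned long cur_seq)
{
	if (range->event == MMU_NOTIFY_UNMAP)
		mmu_interval_set_seq(mni, cur_seq); /* mark interval stale */
	return true; /* never block the invalidation */
}

static const struct mmu_interval_notifier_ops my_ops = {
	.invalidate = my_invalidate,
};

static int my_setup(struct my_buf *buf, unsigned long vaddr, unsigned long len)
{
	unsigned long seq;
	int ret;

	ret = mmu_interval_notifier_insert(&buf->notifier, current->mm,
					   vaddr, len, &my_ops);
	if (ret)
		return ret;

	seq = mmu_interval_read_begin(&buf->notifier);
	/* ... pin pages and program hardware here ... */
	if (mmu_interval_read_retry(&buf->notifier, seq)) {
		/* an unmap raced with setup; undo and report busy */
		mmu_interval_notifier_remove(&buf->notifier);
		return -EBUSY;
	}
	return 0;
}

Returning true from the invalidate callback tells the core not to block the invalidation; marking the sequence with mmu_interval_set_seq() is what later makes mmu_interval_read_retry() report that the interval was invalidated, which the setup path above (like the diff's fail_unprogram path) turns into -EBUSY.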
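The reworked unprogram_rcvarray() also tightens TID validation: RcvArray entries come in pairs, the IDX field of a TID selects the pair, and CTRL must be 1 or 2 to select one entry of that pair (the new code rejects 0 and 3 before range-checking the entry index). A standalone illustration of that pairing arithmetic from the hunk above, using plain integers:

#include <stdio.h>

int main(void)
{
	unsigned int idx, ctrl;

	/* rcventry = (idx << 1) + (ctrl - 1), as in unprogram_rcvarray() */
	for (idx = 0; idx < 3; idx++)
		for (ctrl = 1; ctrl <= 2; ctrl++)
			printf("idx=%u ctrl=%u -> rcventry=%u\n",
			       idx, ctrl, (idx << 1) + (ctrl - 1));
	return 0;
}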
