mm: device exclusive memory access

Some devices require exclusive write access to shared virtual memory (SVM) ranges to perform atomic operations on that memory. This requires CPU page tables to be updated to deny access whilst atomic operations are occurring. In order to do this introduce a new swap entry type (SWP_DEVICE_EXCLUSIVE). When a SVM range needs to be marked for exclusive access by a device all page table mappings for the particular range are replaced with device exclusive swap entries. This causes any CPU access to the page to result in a fault. Faults are resovled by replacing the faulting entry with the original mapping. This results in MMU notifiers being called which a driver uses to update access permissions such as revoking atomic access. After notifiers have been called the device will no longer have exclusive access to the region. Walking of the page tables to find the target pages is handled by get_user_pages() rather than a direct page table walk. A direct page table walk similar to what migrate_vma_collect()/unmap() does could also have been utilised. However this resulted in more code similar in functionality to what get_user_pages() provides as page faulting is required to make the PTEs present and to break COW. [dan.carpenter@oracle.com: fix signedness bug in make_device_exclusive_range()] Link: https://lkml.kernel.org/r/YNIz5NVnZ5GiZ3u1@mwanda Link: https://lkml.kernel.org/r/20210616105937.23201-8-apopple@nvidia.com Signed-off-by: Alistair Popple <apopple@nvidia.com> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Cc: Ben Skeggs <bskeggs@redhat.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> Cc: Peter Xu <peterx@redhat.com> Cc: Ralph Campbell <rcampbell@nvidia.com> Cc: Shakeel Butt <shakeelb@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Alistair Popple <apopple@nvidia.com> 2021-06-30 18:54:25 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2021-07-01 11:06:03 -0700
commit: b756a3b5e7ead8f6f4b03cea8ac22478ce04c8a8 (patch)
tree: fc6e3f788e1144e48ddc15475dc18d44448d6e22 /mm/memory.c
parent: 9a5cc85c407402ae66128d31f0422a3a7ffa5c5c (diff)
1 files changed, 123 insertions, 4 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 75feddbf0190..747a01d495f2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -699,6 +699,68 @@ out:
 }
 #endif
 
+static void restore_exclusive_pte(struct vm_area_struct *vma,
+				  struct page *page, unsigned long address,
+				  pte_t *ptep)
+{
+	pte_t pte;
+	swp_entry_t entry;
+
+	pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
+	if (pte_swp_soft_dirty(*ptep))
+		pte = pte_mksoft_dirty(pte);
+
+	entry = pte_to_swp_entry(*ptep);
+	if (pte_swp_uffd_wp(*ptep))
+		pte = pte_mkuffd_wp(pte);
+	else if (is_writable_device_exclusive_entry(entry))
+		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+
+	set_pte_at(vma->vm_mm, address, ptep, pte);
+
+	/*
+	 * No need to take a page reference as one was already
+	 * created when the swap entry was made.
+	 */
+	if (PageAnon(page))
+		page_add_anon_rmap(page, vma, address, false);
+	else
+		/*
+		 * Currently device exclusive access only supports anonymous
+		 * memory so the entry shouldn't point to a filebacked page.
+		 */
+		WARN_ON_ONCE(!PageAnon(page));
+
+	if (vma->vm_flags & VM_LOCKED)
+		mlock_vma_page(page);
+
+	/*
+	 * No need to invalidate - it was non-present before. However
+	 * secondary CPUs may have mappings that need invalidating.
+	 */
+	update_mmu_cache(vma, address, ptep);
+}
+
+/*
+ * Tries to restore an exclusive pte if the page lock can be acquired without
+ * sleeping.
+ */
+static int
+try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
+			unsigned long addr)
+{
+	swp_entry_t entry = pte_to_swp_entry(*src_pte);
+	struct page *page = pfn_swap_entry_to_page(entry);
+
+	if (trylock_page(page)) {
+		restore_exclusive_pte(vma, page, addr, src_pte);
+		unlock_page(page);
+		return 0;
+	}
+
+	return -EBUSY;
+}
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
@@ -780,6 +842,17 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 				pte = pte_swp_mkuffd_wp(pte);
 			set_pte_at(src_mm, addr, src_pte, pte);
 		}
+	} else if (is_device_exclusive_entry(entry)) {
+		/*
+		 * Make device exclusive entries present by restoring the
+		 * original entry then copying as for a present pte. Device
+		 * exclusive entries currently only support private writable
+		 * (ie. COW) mappings.
+		 */
+		VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
+		if (try_restore_exclusive_pte(src_pte, src_vma, addr))
+			return -EBUSY;
+		return -ENOENT;
 	}
 	if (!userfaultfd_wp(dst_vma))
 		pte = pte_swp_clear_uffd_wp(pte);
@@ -980,9 +1053,18 @@ again:
 			if (ret == -EIO) {
 				entry = pte_to_swp_entry(*src_pte);
 				break;
+			} else if (ret == -EBUSY) {
+				break;
+			} else if (!ret) {
+				progress += 8;
+				continue;
 			}
-			progress += 8;
-			continue;
+
+			/*
+			 * Device exclusive entry restored, continue by copying
+			 * the now present pte.
+			 */
+			WARN_ON_ONCE(ret != -ENOENT);
 		}
 		/* copy_present_pte() will clear `*prealloc' if consumed */
 		ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
@@ -1020,6 +1102,8 @@ again:
 			goto out;
 		}
 		entry.val = 0;
+	} else if (ret == -EBUSY) {
+		goto out;
 	} else if (ret ==  -EAGAIN) {
 		prealloc = page_copy_prealloc(src_mm, src_vma, addr);
 		if (!prealloc)
@@ -1287,7 +1371,8 @@ again:
 		}
 
 		entry = pte_to_swp_entry(ptent);
-		if (is_device_private_entry(entry)) {
+		if (is_device_private_entry(entry) ||
+		    is_device_exclusive_entry(entry)) {
 			struct page *page = pfn_swap_entry_to_page(entry);
 
 			if (unlikely(details && details->check_mapping)) {
@@ -1303,7 +1388,10 @@ again:
 
 			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 			rss[mm_counter(page)]--;
-			page_remove_rmap(page, false);
+
+			if (is_device_private_entry(entry))
+				page_remove_rmap(page, false);
+
 			put_page(page);
 			continue;
 		}
@@ -3352,6 +3440,34 @@ void unmap_mapping_range(struct address_space *mapping,
 EXPORT_SYMBOL(unmap_mapping_range);
 
 /*
+ * Restore a potential device exclusive pte to a working pte entry
+ */
+static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct vm_area_struct *vma = vmf->vma;
+	struct mmu_notifier_range range;
+
+	if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
+		return VM_FAULT_RETRY;
+	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
+				vma->vm_mm, vmf->address & PAGE_MASK,
+				(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
+	mmu_notifier_invalidate_range_start(&range);
+
+	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+				&vmf->ptl);
+	if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+		restore_exclusive_pte(vma, page, vmf->address, vmf->pte);
+
+	pte_unmap_unlock(vmf->pte, vmf->ptl);
+	unlock_page(page);
+
+	mmu_notifier_invalidate_range_end(&range);
+	return 0;
+}
+
+/*
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with pte unmapped and unlocked.
@@ -3379,6 +3495,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		if (is_migration_entry(entry)) {
 			migration_entry_wait(vma->vm_mm, vmf->pmd,
 					     vmf->address);
+		} else if (is_device_exclusive_entry(entry)) {
+			vmf->page = pfn_swap_entry_to_page(entry);
+			ret = remove_device_exclusive_entry(vmf);
 		} else if (is_device_private_entry(entry)) {
 			vmf->page = pfn_swap_entry_to_page(entry);
 			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
author	Alistair Popple <apopple@nvidia.com>	2021-06-30 18:54:25 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2021-07-01 11:06:03 -0700
commit	b756a3b5e7ead8f6f4b03cea8ac22478ce04c8a8 (patch)
tree	fc6e3f788e1144e48ddc15475dc18d44448d6e22 /mm/memory.c
parent	9a5cc85c407402ae66128d31f0422a3a7ffa5c5c (diff)