diff options
Diffstat (limited to 'fs/hugetlbfs/inode.c')
| -rw-r--r-- | fs/hugetlbfs/inode.c | 172 |
1 files changed, 87 insertions, 85 deletions
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a4441fb77f7c..3b4c152c5c73 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -96,10 +96,16 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = { #define PGOFF_LOFFT_MAX \ (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) -static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma) { + /* Unfortunate we have to reassign vma->vm_private_data. */ + return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma); +} + +static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) +{ + struct file *file = desc->file; struct inode *inode = file_inode(file); - struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); @@ -113,12 +119,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap unwinds (may be important on powerpc * and ia64). */ - vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND | VM_MTE_ALLOWED); - vma->vm_ops = &hugetlb_vm_ops; - - ret = seal_check_write(info->seals, vma); - if (ret) - return ret; + desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; + desc->vm_ops = &hugetlb_vm_ops; /* * page based offset in vm_pgoff could be sufficiently large to @@ -127,16 +129,16 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * sizeof(unsigned long). So, only check in those instances. */ if (sizeof(unsigned long) == sizeof(loff_t)) { - if (vma->vm_pgoff & PGOFF_LOFFT_MAX) + if (desc->pgoff & PGOFF_LOFFT_MAX) return -EINVAL; } /* must be huge page aligned */ - if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) + if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; - vma_len = (loff_t)(vma->vm_end - vma->vm_start); - len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + vma_len = (loff_t)vma_desc_size(desc); + len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT); /* check for overflow */ if (len < vma_len) return -EINVAL; @@ -146,7 +148,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; - vm_flags = vma->vm_flags; + vm_flags = desc->vm_flags; /* * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip * reserving here. Note: only for SHM hugetlbfs file, the inode @@ -155,18 +157,31 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (inode->i_flags & S_PRIVATE) vm_flags |= VM_NORESERVE; - if (!hugetlb_reserve_pages(inode, - vma->vm_pgoff >> huge_page_order(h), - len >> huge_page_shift(h), vma, - vm_flags)) + if (hugetlb_reserve_pages(inode, + desc->pgoff >> huge_page_order(h), + len >> huge_page_shift(h), desc, + vm_flags) < 0) goto out; ret = 0; - if (vma->vm_flags & VM_WRITE && inode->i_size < len) + if ((desc->vm_flags & VM_WRITE) && inode->i_size < len) i_size_write(inode, len); out: inode_unlock(inode); + if (!ret) { + /* Allocate the VMA lock after we set it up. */ + desc->action.success_hook = hugetlb_file_mmap_prepare_success; + /* + * We cannot permit the rmap finding this VMA in the time + * between the VMA being inserted into the VMA tree and the + * completion/success hook being invoked. + * + * This is because we establish a per-VMA hugetlb lock which can + * be raced by rmap. + */ + desc->action.hide_from_rmap_until_complete = true; + } return ret; } @@ -184,52 +199,37 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (len & ~huge_page_mask(h)) return -EINVAL; - if (flags & MAP_FIXED) { - if (addr & ~huge_page_mask(h)) - return -EINVAL; - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - } + if ((flags & MAP_FIXED) && (addr & ~huge_page_mask(h))) + return -EINVAL; if (addr) addr0 = ALIGN(addr, huge_page_size(h)); - return mm_get_unmapped_area_vmflags(current->mm, file, addr0, len, pgoff, - flags, 0); + return mm_get_unmapped_area_vmflags(file, addr0, len, pgoff, flags, 0); } /* - * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset. + * Someone wants to read @bytes from a HWPOISON hugetlb @folio from @offset. * Returns the maximum number of bytes one can read without touching the 1st raw - * HWPOISON subpage. - * - * The implementation borrows the iteration logic from copy_page_to_iter*. + * HWPOISON page. */ -static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes) +static size_t adjust_range_hwpoison(struct folio *folio, size_t offset, + size_t bytes) { - size_t n = 0; - size_t res = 0; + struct page *page = folio_page(folio, offset / PAGE_SIZE); + size_t safe_bytes; - /* First subpage to start the loop. */ - page = nth_page(page, offset / PAGE_SIZE); - offset %= PAGE_SIZE; - while (1) { - if (is_raw_hwpoison_page_in_hugepage(page)) - break; + if (is_raw_hwpoison_page_in_hugepage(page)) + return 0; + /* Safe to read the remaining bytes in this page. */ + safe_bytes = PAGE_SIZE - (offset % PAGE_SIZE); + page++; - /* Safe to read n bytes without touching HWPOISON subpage. */ - n = min(bytes, (size_t)PAGE_SIZE - offset); - res += n; - bytes -= n; - if (!bytes || !n) + /* Check each remaining page as long as we are not done yet. */ + for (; safe_bytes < bytes; safe_bytes += PAGE_SIZE, page++) + if (is_raw_hwpoison_page_in_hugepage(page)) break; - offset += n; - if (offset == PAGE_SIZE) { - page = nth_page(page, 1); - offset = 0; - } - } - return res; + return min(safe_bytes, bytes); } /* @@ -283,10 +283,10 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) else { /* * Adjust how many bytes safe to read without - * touching the 1st raw HWPOISON subpage after + * touching the 1st raw HWPOISON page after * offset. */ - want = adjust_range_hwpoison(&folio->page, offset, nr); + want = adjust_range_hwpoison(folio, offset, nr); if (want == 0) { folio_put(folio); retval = -EIO; @@ -314,7 +314,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) return retval; } -static int hugetlbfs_write_begin(struct file *file, +static int hugetlbfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) @@ -322,9 +322,10 @@ static int hugetlbfs_write_begin(struct file *file, return -EINVAL; } -static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int hugetlbfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { BUG(); return -EINVAL; @@ -343,8 +344,8 @@ static void hugetlb_delete_from_page_cache(struct folio *folio) * mutex for the page in the mapping. So, we can not race with page being * faulted into the vma. */ -static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, - unsigned long addr, struct page *page) +static bool hugetlb_vma_maps_pfn(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn) { pte_t *ptep, pte; @@ -356,7 +357,7 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, if (huge_pte_none(pte) || !pte_present(pte)) return false; - if (pte_page(pte) == page) + if (pte_pfn(pte) == pfn) return true; return false; @@ -401,7 +402,7 @@ static void hugetlb_unmap_file_folio(struct hstate *h, { struct rb_root_cached *root = &mapping->i_mmap; struct hugetlb_vma_lock *vma_lock; - struct page *page = &folio->page; + unsigned long pfn = folio_pfn(folio); struct vm_area_struct *vma; unsigned long v_start; unsigned long v_end; @@ -417,7 +418,7 @@ retry: v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); - if (!hugetlb_vma_maps_page(vma, v_start, page)) + if (!hugetlb_vma_maps_pfn(vma, v_start, pfn)) continue; if (!hugetlb_vma_trylock_write(vma)) { @@ -467,7 +468,7 @@ retry: */ v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); - if (hugetlb_vma_maps_page(vma, v_start, page)) + if (hugetlb_vma_maps_pfn(vma, v_start, pfn)) unmap_hugepage_range(vma, v_start, v_end, NULL, ZAP_FLAG_DROP_MARKER); @@ -523,14 +524,16 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, /* * If folio is mapped, it was faulted in after being - * unmapped in caller. Unmap (again) while holding - * the fault mutex. The mutex will prevent faults - * until we finish removing the folio. + * unmapped in caller or hugetlb_vmdelete_list() skips + * unmapping it due to fail to grab lock. Unmap (again) + * while holding the fault mutex. The mutex will prevent + * faults until we finish removing the folio. Hold folio + * lock to guarantee no concurrent migration. */ + folio_lock(folio); if (unlikely(folio_mapped(folio))) hugetlb_unmap_file_folio(h, mapping, folio, index); - folio_lock(folio); /* * We must remove the folio from page cache before removing * the region/ reserve map (hugetlb_unreserve_pages). In @@ -819,13 +822,13 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ - folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); + folio = alloc_hugetlb_folio(&pseudo_vma, addr, false); if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); error = PTR_ERR(folio); goto out; } - folio_zero_user(folio, ALIGN_DOWN(addr, hpage_size)); + folio_zero_user(folio, addr); __folio_mark_uptodate(folio); error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) { @@ -991,19 +994,18 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, if (!inode) return -ENOSPC; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); - d_instantiate(dentry, inode); - dget(dentry);/* Extra count - pin the dentry in core */ + d_make_persistent(dentry, inode); return 0; } -static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int retval = hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); - return retval; + return ERR_PTR(retval); } static int hugetlbfs_create(struct mnt_idmap *idmap, @@ -1039,10 +1041,9 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap, if (inode) { int l = strlen(symname)+1; error = page_symlink(inode, symname, l); - if (!error) { - d_instantiate(dentry, inode); - dget(dentry); - } else + if (!error) + d_make_persistent(dentry, inode); + else iput(inode); } inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); @@ -1058,7 +1059,7 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, int rc; rc = migrate_huge_page_move_mapping(mapping, dst, src); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) return rc; if (hugetlb_folio_subpool(src)) { @@ -1069,7 +1070,7 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, folio_migrate_flags(dst, src); - return MIGRATEPAGE_SUCCESS; + return 0; } #else #define hugetlbfs_migrate_folio NULL @@ -1237,7 +1238,7 @@ static void init_once(void *foo) static const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, - .mmap = hugetlbfs_file_mmap, + .mmap_prepare = hugetlbfs_file_mmap_prepare, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, .llseek = default_llseek, @@ -1436,6 +1437,7 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_blocksize_bits = huge_page_shift(ctx->hstate); sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; + sb->s_d_flags = DCACHE_DONTCACHE; sb->s_time_gran = 1; /* @@ -1498,7 +1500,7 @@ static struct file_system_type hugetlbfs_fs_type = { .name = "hugetlbfs", .init_fs_context = hugetlbfs_init_fs_context, .parameters = hugetlb_fs_parameters, - .kill_sb = kill_litter_super, + .kill_sb = kill_anon_super, .fs_flags = FS_ALLOW_IDMAP, }; @@ -1564,9 +1566,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size, inode->i_size = size; clear_nlink(inode); - if (!hugetlb_reserve_pages(inode, 0, + if (hugetlb_reserve_pages(inode, 0, size >> huge_page_shift(hstate_inode(inode)), NULL, - acctflag)) + acctflag) < 0) file = ERR_PTR(-ENOMEM); else file = alloc_file_pseudo(inode, mnt, name, O_RDWR, @@ -1590,7 +1592,7 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) } else { struct hugetlbfs_fs_context *ctx = fc->fs_private; ctx->hstate = h; - mnt = fc_mount(fc); + mnt = fc_mount_longterm(fc); put_fs_context(fc); } if (IS_ERR(mnt)) |
