diff options
Diffstat (limited to 'fs/hugetlbfs/inode.c')
-rw-r--r-- | fs/hugetlbfs/inode.c | 184 |
1 files changed, 56 insertions, 128 deletions
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6502c7e776d1..e4de5425838d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -39,8 +39,11 @@ #include <linux/uaccess.h> #include <linux/sched/mm.h> +#define CREATE_TRACE_POINTS +#include <trace/events/hugetlbfs.h> + static const struct address_space_operations hugetlbfs_aops; -const struct file_operations hugetlbfs_file_operations; +static const struct file_operations hugetlbfs_file_operations; static const struct inode_operations hugetlbfs_dir_inode_operations; static const struct inode_operations hugetlbfs_inode_operations; @@ -73,13 +76,13 @@ enum hugetlb_param { }; static const struct fs_parameter_spec hugetlb_fs_parameters[] = { - fsparam_u32 ("gid", Opt_gid), + fsparam_gid ("gid", Opt_gid), fsparam_string("min_size", Opt_min_size), fsparam_u32oct("mode", Opt_mode), fsparam_string("nr_inodes", Opt_nr_inodes), fsparam_string("pagesize", Opt_pagesize), fsparam_string("size", Opt_size), - fsparam_u32 ("uid", Opt_uid), + fsparam_uid ("uid", Opt_uid), {} }; @@ -96,7 +99,6 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = { static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); @@ -113,10 +115,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); vma->vm_ops = &hugetlb_vm_ops; - ret = seal_check_write(info->seals, vma); - if (ret) - return ret; - /* * page based offset in vm_pgoff could be sufficiently large to * overflow a loff_t when converted to byte offset. This can @@ -171,119 +169,45 @@ out: * Called under mmap_write_lock(mm). */ -static unsigned long -hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info; - - info.flags = 0; - info.length = len; - info.low_limit = current->mm->mmap_base; - info.high_limit = arch_get_mmap_end(addr, len, flags); - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; - return vm_unmapped_area(&info); -} - -static unsigned long -hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info; - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = PAGE_SIZE; - info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - info.align_offset = 0; - addr = vm_unmapped_area(&info); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. - */ - if (unlikely(offset_in_page(addr))) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = current->mm->mmap_base; - info.high_limit = arch_get_mmap_end(addr, len, flags); - addr = vm_unmapped_area(&info); - } - - return addr; -} - unsigned long -generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) +hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; + unsigned long addr0 = 0; struct hstate *h = hstate_file(file); - const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); if (len & ~huge_page_mask(h)) return -EINVAL; - if (len > TASK_SIZE) - return -ENOMEM; - if (flags & MAP_FIXED) { + if (addr & ~huge_page_mask(h)) + return -EINVAL; if (prepare_hugepage_range(file, addr, len)) return -EINVAL; - return addr; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); - if (mmap_end - len >= addr && - (!vma || addr + len <= vm_start_gap(vma))) - return addr; } + if (addr) + addr0 = ALIGN(addr, huge_page_size(h)); - /* - * Use mm->get_unmapped_area value as a hint to use topdown routine. - * If architectures have special needs, they should define their own - * version of hugetlb_get_unmapped_area. - */ - if (mm->get_unmapped_area == arch_get_unmapped_area_topdown) - return hugetlb_get_unmapped_area_topdown(file, addr, len, - pgoff, flags); - return hugetlb_get_unmapped_area_bottomup(file, addr, len, - pgoff, flags); + return mm_get_unmapped_area_vmflags(current->mm, file, addr0, len, pgoff, + flags, 0); } -#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA -static unsigned long -hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); -} -#endif - /* - * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset. + * Someone wants to read @bytes from a HWPOISON hugetlb @folio from @offset. * Returns the maximum number of bytes one can read without touching the 1st raw - * HWPOISON subpage. + * HWPOISON page. * * The implementation borrows the iteration logic from copy_page_to_iter*. */ -static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes) +static size_t adjust_range_hwpoison(struct folio *folio, size_t offset, + size_t bytes) { + struct page *page; size_t n = 0; size_t res = 0; - /* First subpage to start the loop. */ - page = nth_page(page, offset / PAGE_SIZE); + /* First page to start the loop. */ + page = folio_page(folio, offset / PAGE_SIZE); offset %= PAGE_SIZE; while (1) { if (is_raw_hwpoison_page_in_hugepage(page)) @@ -356,10 +280,10 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) else { /* * Adjust how many bytes safe to read without - * touching the 1st raw HWPOISON subpage after + * touching the 1st raw HWPOISON page after * offset. */ - want = adjust_range_hwpoison(&folio->page, offset, nr); + want = adjust_range_hwpoison(folio, offset, nr); if (want == 0) { folio_put(folio); retval = -EIO; @@ -390,14 +314,14 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) static int hugetlbfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { return -EINVAL; } static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { BUG(); return -EINVAL; @@ -416,8 +340,8 @@ static void hugetlb_delete_from_page_cache(struct folio *folio) * mutex for the page in the mapping. So, we can not race with page being * faulted into the vma. */ -static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, - unsigned long addr, struct page *page) +static bool hugetlb_vma_maps_pfn(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn) { pte_t *ptep, pte; @@ -425,11 +349,11 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, if (!ptep) return false; - pte = huge_ptep_get(ptep); + pte = huge_ptep_get(vma->vm_mm, addr, ptep); if (huge_pte_none(pte) || !pte_present(pte)) return false; - if (pte_page(pte) == page) + if (pte_pfn(pte) == pfn) return true; return false; @@ -474,7 +398,7 @@ static void hugetlb_unmap_file_folio(struct hstate *h, { struct rb_root_cached *root = &mapping->i_mmap; struct hugetlb_vma_lock *vma_lock; - struct page *page = &folio->page; + unsigned long pfn = folio_pfn(folio); struct vm_area_struct *vma; unsigned long v_start; unsigned long v_end; @@ -490,7 +414,7 @@ retry: v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); - if (!hugetlb_vma_maps_page(vma, v_start, page)) + if (!hugetlb_vma_maps_pfn(vma, v_start, pfn)) continue; if (!hugetlb_vma_trylock_write(vma)) { @@ -540,7 +464,7 @@ retry: */ v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); - if (hugetlb_vma_maps_page(vma, v_start, page)) + if (hugetlb_vma_maps_pfn(vma, v_start, pfn)) unmap_hugepage_range(vma, v_start, v_end, NULL, ZAP_FLAG_DROP_MARKER); @@ -689,6 +613,7 @@ static void hugetlbfs_evict_inode(struct inode *inode) { struct resv_map *resv_map; + trace_hugetlbfs_evict_inode(inode); remove_inode_hugepages(inode, 0, LLONG_MAX); /* @@ -816,8 +741,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; - if (mode & FALLOC_FL_PUNCH_HOLE) - return hugetlbfs_punch_hole(inode, offset, len); + if (mode & FALLOC_FL_PUNCH_HOLE) { + error = hugetlbfs_punch_hole(inode, offset, len); + goto out_nolock; + } /* * Default preallocate case. @@ -889,13 +816,13 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ - folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); + folio = alloc_hugetlb_folio(&pseudo_vma, addr, false); if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); error = PTR_ERR(folio); goto out; } - clear_huge_page(&folio->page, addr, pages_per_huge_page(h)); + folio_zero_user(folio, addr); __folio_mark_uptodate(folio); error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) { @@ -921,6 +848,9 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, inode_set_ctime_current(inode); out: inode_unlock(inode); + +out_nolock: + trace_hugetlbfs_fallocate(inode, mode, offset, len, error); return error; } @@ -937,6 +867,8 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap, if (error) return error; + trace_hugetlbfs_setattr(inode, dentry, attr); + if (ia_valid & ATTR_SIZE) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; @@ -1035,6 +967,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; } lockdep_annotate_inode_mutex_key(inode); + trace_hugetlbfs_alloc_inode(inode, dir, mode); } else { if (resv_map) kref_put(&resv_map->refs, resv_map_release); @@ -1060,14 +993,14 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, return 0; } -static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int retval = hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); - return retval; + return ERR_PTR(retval); } static int hugetlbfs_create(struct mnt_idmap *idmap, @@ -1131,10 +1064,7 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, hugetlb_set_folio_subpool(src, NULL); } - if (mode != MIGRATE_SYNC_NO_COPY) - folio_migrate_copy(dst, src); - else - folio_migrate_flags(dst, src); + folio_migrate_flags(dst, src); return MIGRATEPAGE_SUCCESS; } @@ -1277,6 +1207,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) static void hugetlbfs_free_inode(struct inode *inode) { + trace_hugetlbfs_free_inode(inode); kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); } @@ -1301,13 +1232,14 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -const struct file_operations hugetlbfs_file_operations = { +static const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, .llseek = default_llseek, .fallocate = hugetlbfs_fallocate, + .fop_flags = FOP_HUGE_PAGES, }; static const struct inode_operations hugetlbfs_dir_inode_operations = { @@ -1378,15 +1310,11 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par switch (opt) { case Opt_uid: - ctx->uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(ctx->uid)) - goto bad_val; + ctx->uid = result.uid; return 0; case Opt_gid: - ctx->gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(ctx->gid)) - goto bad_val; + ctx->gid = result.gid; return 0; case Opt_mode: |