Diffstat (limited to 'fs/hugetlbfs/inode.c')
-rw-r--r--   fs/hugetlbfs/inode.c   1047
1 file changed, 562 insertions, 485 deletions
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ef5313f9c78f..3b4c152c5c73 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -11,7 +11,6 @@
#include <linux/thread_info.h>
#include <asm/current.h>
-#include <linux/sched/signal.h> /* remove ASAP */
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/mount.h>
@@ -40,9 +39,11 @@
#include <linux/uaccess.h>
#include <linux/sched/mm.h>
-static const struct super_operations hugetlbfs_ops;
+#define CREATE_TRACE_POINTS
+#include <trace/events/hugetlbfs.h>
+
static const struct address_space_operations hugetlbfs_aops;
-const struct file_operations hugetlbfs_file_operations;
+static const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;
@@ -75,49 +76,16 @@ enum hugetlb_param {
};
static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
- fsparam_u32 ("gid", Opt_gid),
+ fsparam_gid ("gid", Opt_gid),
fsparam_string("min_size", Opt_min_size),
- fsparam_u32 ("mode", Opt_mode),
+ fsparam_u32oct("mode", Opt_mode),
fsparam_string("nr_inodes", Opt_nr_inodes),
fsparam_string("pagesize", Opt_pagesize),
fsparam_string("size", Opt_size),
- fsparam_u32 ("uid", Opt_uid),
+ fsparam_uid ("uid", Opt_uid),
{}
};
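
The table above keeps the same mount option names; only the in-kernel parsing helpers change (typed uid/gid parsing and octal "mode"). A minimal userspace sketch of passing these options, where the mount point and numeric IDs are assumptions:

/* Sketch only: mount point, uid/gid and sizes are made up. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mount.h>

int main(void)
{
	/* "mode" is parsed as octal (fsparam_u32oct); uid/gid are numeric IDs. */
	const char *opts = "uid=1000,gid=1000,mode=0770,pagesize=2M,size=64M";

	if (mount("none", "/mnt/huge", "hugetlbfs", 0, opts) != 0) {
		fprintf(stderr, "mount: %s\n", strerror(errno));
		return 1;
	}
	return 0;
}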
-#ifdef CONFIG_NUMA
-static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
- struct inode *inode, pgoff_t index)
-{
- vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
- index);
-}
-
-static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
-{
- mpol_cond_put(vma->vm_policy);
-}
-#else
-static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
- struct inode *inode, pgoff_t index)
-{
-}
-
-static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
-{
-}
-#endif
-
-static void huge_pagevec_release(struct pagevec *pvec)
-{
- int i;
-
- for (i = 0; i < pagevec_count(pvec); ++i)
- put_page(pvec->pages[i]);
-
- pagevec_reinit(pvec);
-}
-
/*
* Mask used when checking the page offset value passed in via system
* calls. This value will be converted to a loff_t which is signed.
@@ -128,23 +96,31 @@ static void huge_pagevec_release(struct pagevec *pvec)
#define PGOFF_LOFFT_MAX \
(((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1)))
-static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma)
{
+ /* Unfortunate we have to reassign vma->vm_private_data. */
+ return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma);
+}
+
+static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc)
+{
+ struct file *file = desc->file;
struct inode *inode = file_inode(file);
loff_t len, vma_len;
int ret;
struct hstate *h = hstate_file(file);
+ vm_flags_t vm_flags;
/*
* vma address alignment (but not the pgoff alignment) has
* already been checked by prepare_hugepage_range. If you add
* any error returns here, do so after setting VM_HUGETLB, so
* is_vm_hugetlb_page tests below unmap_region go the right
- * way when do_mmap_pgoff unwinds (may be important on powerpc
+ * way when do_mmap unwinds (may be important on powerpc
* and ia64).
*/
- vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
- vma->vm_ops = &hugetlb_vm_ops;
+ desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
+ desc->vm_ops = &hugetlb_vm_ops;
/*
* page based offset in vm_pgoff could be sufficiently large to
@@ -153,16 +129,16 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
* sizeof(unsigned long). So, only check in those instances.
*/
if (sizeof(unsigned long) == sizeof(loff_t)) {
- if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
+ if (desc->pgoff & PGOFF_LOFFT_MAX)
return -EINVAL;
}
/* must be huge page aligned */
- if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
+ if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
return -EINVAL;
- vma_len = (loff_t)(vma->vm_end - vma->vm_start);
- len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ vma_len = (loff_t)vma_desc_size(desc);
+ len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT);
/* check for overflow */
if (len < vma_len)
return -EINVAL;
@@ -171,18 +147,41 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
ret = -ENOMEM;
+
+ vm_flags = desc->vm_flags;
+ /*
+ * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip
+ * reserving here. Note: only for SHM hugetlbfs file, the inode
+ * flag S_PRIVATE is set.
+ */
+ if (inode->i_flags & S_PRIVATE)
+ vm_flags |= VM_NORESERVE;
+
if (hugetlb_reserve_pages(inode,
- vma->vm_pgoff >> huge_page_order(h),
- len >> huge_page_shift(h), vma,
- vma->vm_flags))
+ desc->pgoff >> huge_page_order(h),
+ len >> huge_page_shift(h), desc,
+ vm_flags) < 0)
goto out;
ret = 0;
- if (vma->vm_flags & VM_WRITE && inode->i_size < len)
+ if ((desc->vm_flags & VM_WRITE) && inode->i_size < len)
i_size_write(inode, len);
out:
inode_unlock(inode);
+ if (!ret) {
+ /* Allocate the VMA lock after we set it up. */
+ desc->action.success_hook = hugetlb_file_mmap_prepare_success;
+ /*
+ * We cannot permit the rmap finding this VMA in the time
+ * between the VMA being inserted into the VMA tree and the
+ * completion/success hook being invoked.
+ *
+ * This is because we establish a per-VMA hugetlb lock which can
+ * be raced by rmap.
+ */
+ desc->action.hide_from_rmap_until_complete = true;
+ }
return ret;
}
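
The mmap_prepare conversion above keeps the existing alignment rules: the file offset and length seen by mmap() must be multiples of the huge page size, and a writable shared mapping grows i_size. A small userspace sketch, assuming a 2 MiB default huge page size and a /mnt/huge mount point:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)	/* assumption: 2 MiB default hstate */

int main(void)
{
	int fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
	if (fd < 0) { perror("open"); return 1; }

	/* Offset and length must be huge-page aligned (see the pgoff checks). */
	void *p = mmap(NULL, 2 * HPAGE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) { perror("mmap"); return 1; }

	memset(p, 0, 2 * HPAGE_SIZE);	/* faults in the huge pages */
	munmap(p, 2 * HPAGE_SIZE);
	close(fd);
	return 0;
}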
@@ -190,128 +189,52 @@ out:
* Called under mmap_write_lock(mm).
*/
-#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
-static unsigned long
-hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
-{
- struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
-
- info.flags = 0;
- info.length = len;
- info.low_limit = current->mm->mmap_base;
- info.high_limit = TASK_SIZE;
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
- return vm_unmapped_area(&info);
-}
-
-static unsigned long
-hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
-{
- struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
-
- info.flags = VM_UNMAPPED_AREA_TOPDOWN;
- info.length = len;
- info.low_limit = max(PAGE_SIZE, mmap_min_addr);
- info.high_limit = current->mm->mmap_base;
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
- addr = vm_unmapped_area(&info);
-
- /*
- * A failed mmap() very likely causes application failure,
- * so fall back to the bottom-up function here. This scenario
- * can happen with large stack limits and large mmap()
- * allocations.
- */
- if (unlikely(offset_in_page(addr))) {
- VM_BUG_ON(addr != -ENOMEM);
- info.flags = 0;
- info.low_limit = current->mm->mmap_base;
- info.high_limit = TASK_SIZE;
- addr = vm_unmapped_area(&info);
- }
-
- return addr;
-}
-
-static unsigned long
+unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
+ unsigned long addr0 = 0;
struct hstate *h = hstate_file(file);
if (len & ~huge_page_mask(h))
return -EINVAL;
- if (len > TASK_SIZE)
- return -ENOMEM;
-
- if (flags & MAP_FIXED) {
- if (prepare_hugepage_range(file, addr, len))
- return -EINVAL;
- return addr;
- }
-
- if (addr) {
- addr = ALIGN(addr, huge_page_size(h));
- vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vm_start_gap(vma)))
- return addr;
- }
+ if ((flags & MAP_FIXED) && (addr & ~huge_page_mask(h)))
+ return -EINVAL;
+ if (addr)
+ addr0 = ALIGN(addr, huge_page_size(h));
- /*
- * Use mm->get_unmapped_area value as a hint to use topdown routine.
- * If architectures have special needs, they should define their own
- * version of hugetlb_get_unmapped_area.
- */
- if (mm->get_unmapped_area == arch_get_unmapped_area_topdown)
- return hugetlb_get_unmapped_area_topdown(file, addr, len,
- pgoff, flags);
- return hugetlb_get_unmapped_area_bottomup(file, addr, len,
- pgoff, flags);
+ return mm_get_unmapped_area_vmflags(file, addr0, len, pgoff, flags, 0);
}
-#endif
-static size_t
-hugetlbfs_read_actor(struct page *page, unsigned long offset,
- struct iov_iter *to, unsigned long size)
+/*
+ * Someone wants to read @bytes from a HWPOISON hugetlb @folio from @offset.
+ * Returns the maximum number of bytes one can read without touching the 1st raw
+ * HWPOISON page.
+ */
+static size_t adjust_range_hwpoison(struct folio *folio, size_t offset,
+ size_t bytes)
{
- size_t copied = 0;
- int i, chunksize;
-
- /* Find which 4k chunk and offset with in that chunk */
- i = offset >> PAGE_SHIFT;
- offset = offset & ~PAGE_MASK;
-
- while (size) {
- size_t n;
- chunksize = PAGE_SIZE;
- if (offset)
- chunksize -= offset;
- if (chunksize > size)
- chunksize = size;
- n = copy_page_to_iter(&page[i], offset, chunksize, to);
- copied += n;
- if (n != chunksize)
- return copied;
- offset = 0;
- size -= chunksize;
- i++;
- }
- return copied;
+ struct page *page = folio_page(folio, offset / PAGE_SIZE);
+ size_t safe_bytes;
+
+ if (is_raw_hwpoison_page_in_hugepage(page))
+ return 0;
+ /* Safe to read the remaining bytes in this page. */
+ safe_bytes = PAGE_SIZE - (offset % PAGE_SIZE);
+ page++;
+
+ /* Check each remaining page as long as we are not done yet. */
+ for (; safe_bytes < bytes; safe_bytes += PAGE_SIZE, page++)
+ if (is_raw_hwpoison_page_in_hugepage(page))
+ break;
+
+ return min(safe_bytes, bytes);
}
/*
* Support for read() - Find the page attached to f_mapping and copy out the
- * data. Its *very* similar to do_generic_mapping_read(), we can't use that
- * since it has PAGE_SIZE assumptions.
+ * data. This provides functionality similar to filemap_read().
*/
static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
@@ -326,8 +249,8 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
ssize_t retval = 0;
while (iov_iter_count(to)) {
- struct page *page;
- size_t nr, copied;
+ struct folio *folio;
+ size_t nr, copied, want;
/* nr is the maximum number of bytes to copy from this page */
nr = huge_page_size(h);
@@ -344,22 +267,38 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
nr = nr - offset;
- /* Find the page */
- page = find_lock_page(mapping, index);
- if (unlikely(page == NULL)) {
+ /* Find the folio */
+ folio = filemap_lock_hugetlb_folio(h, mapping, index);
+ if (IS_ERR(folio)) {
/*
* We have a HOLE, zero out the user-buffer for the
* length of the hole or request.
*/
copied = iov_iter_zero(nr, to);
} else {
- unlock_page(page);
+ folio_unlock(folio);
+
+ if (!folio_test_hwpoison(folio))
+ want = nr;
+ else {
+ /*
+ * Adjust how many bytes safe to read without
+ * touching the 1st raw HWPOISON page after
+ * offset.
+ */
+ want = adjust_range_hwpoison(folio, offset, nr);
+ if (want == 0) {
+ folio_put(folio);
+ retval = -EIO;
+ break;
+ }
+ }
/*
- * We have the page, copy it to user space buffer.
+ * We have the folio, copy it to user space buffer.
*/
- copied = hugetlbfs_read_actor(page, offset, to, nr);
- put_page(page);
+ copied = copy_folio_to_iter(folio, offset, want, to);
+ folio_put(folio);
}
offset += copied;
retval += copied;
@@ -375,65 +314,244 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
return retval;
}
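
With the folio conversion, the read path copies out of hugetlb folios directly; holes read back as zeroes, and a read that would touch a raw HWPOISON page fails with -EIO. A hedged userspace sketch (the file path is an assumption):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd = open("/mnt/huge/example", O_RDONLY);
	if (fd < 0) { perror("open"); return 1; }

	ssize_t n = read(fd, buf, sizeof(buf));
	if (n < 0)
		perror("read");	/* e.g. EIO when a poisoned page is hit */
	else
		printf("read %zd bytes (holes come back as zeroes)\n", n);

	close(fd);
	return 0;
}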
-static int hugetlbfs_write_begin(struct file *file,
+static int hugetlbfs_write_begin(const struct kiocb *iocb,
struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
return -EINVAL;
}
-static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+static int hugetlbfs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
BUG();
return -EINVAL;
}
-static void remove_huge_page(struct page *page)
+static void hugetlb_delete_from_page_cache(struct folio *folio)
{
- ClearPageDirty(page);
- ClearPageUptodate(page);
- delete_from_page_cache(page);
+ folio_clear_dirty(folio);
+ folio_clear_uptodate(folio);
+ filemap_remove_folio(folio);
+}
+
+/*
+ * Called with i_mmap_rwsem held for inode based vma maps. This makes
+ * sure vma (and vm_mm) will not go away. We also hold the hugetlb fault
+ * mutex for the page in the mapping. So, we can not race with page being
+ * faulted into the vma.
+ */
+static bool hugetlb_vma_maps_pfn(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long pfn)
+{
+ pte_t *ptep, pte;
+
+ ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma)));
+ if (!ptep)
+ return false;
+
+ pte = huge_ptep_get(vma->vm_mm, addr, ptep);
+ if (huge_pte_none(pte) || !pte_present(pte))
+ return false;
+
+ if (pte_pfn(pte) == pfn)
+ return true;
+
+ return false;
+}
+
+/*
+ * Can vma_offset_start/vma_offset_end overflow on 32-bit arches?
+ * No, because the interval tree returns us only those vmas
+ * which overlap the truncated area starting at pgoff,
+ * and no vma on a 32-bit arch can span beyond the 4GB.
+ */
+static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start)
+{
+ unsigned long offset = 0;
+
+ if (vma->vm_pgoff < start)
+ offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
+
+ return vma->vm_start + offset;
+}
+
+static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end)
+{
+ unsigned long t_end;
+
+ if (!end)
+ return vma->vm_end;
+
+ t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start;
+ if (t_end > vma->vm_end)
+ t_end = vma->vm_end;
+ return t_end;
+}
+
+/*
+ * Called with hugetlb fault mutex held. Therefore, no more mappings to
+ * this folio can be created while executing the routine.
+ */
+static void hugetlb_unmap_file_folio(struct hstate *h,
+ struct address_space *mapping,
+ struct folio *folio, pgoff_t index)
+{
+ struct rb_root_cached *root = &mapping->i_mmap;
+ struct hugetlb_vma_lock *vma_lock;
+ unsigned long pfn = folio_pfn(folio);
+ struct vm_area_struct *vma;
+ unsigned long v_start;
+ unsigned long v_end;
+ pgoff_t start, end;
+
+ start = index * pages_per_huge_page(h);
+ end = (index + 1) * pages_per_huge_page(h);
+
+ i_mmap_lock_write(mapping);
+retry:
+ vma_lock = NULL;
+ vma_interval_tree_foreach(vma, root, start, end - 1) {
+ v_start = vma_offset_start(vma, start);
+ v_end = vma_offset_end(vma, end);
+
+ if (!hugetlb_vma_maps_pfn(vma, v_start, pfn))
+ continue;
+
+ if (!hugetlb_vma_trylock_write(vma)) {
+ vma_lock = vma->vm_private_data;
+ /*
+ * If we can not get vma lock, we need to drop
+ * immap_sema and take locks in order. First,
+ * take a ref on the vma_lock structure so that
+ * we can be guaranteed it will not go away when
+ * dropping immap_sema.
+ */
+ kref_get(&vma_lock->refs);
+ break;
+ }
+
+ unmap_hugepage_range(vma, v_start, v_end, NULL,
+ ZAP_FLAG_DROP_MARKER);
+ hugetlb_vma_unlock_write(vma);
+ }
+
+ i_mmap_unlock_write(mapping);
+
+ if (vma_lock) {
+ /*
+ * Wait on vma_lock. We know it is still valid as we have
+ * a reference. We must 'open code' vma locking as we do
+ * not know if vma_lock is still attached to vma.
+ */
+ down_write(&vma_lock->rw_sema);
+ i_mmap_lock_write(mapping);
+
+ vma = vma_lock->vma;
+ if (!vma) {
+ /*
+ * If lock is no longer attached to vma, then just
+ * unlock, drop our reference and retry looking for
+ * other vmas.
+ */
+ up_write(&vma_lock->rw_sema);
+ kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+ goto retry;
+ }
+
+ /*
+ * vma_lock is still attached to vma. Check to see if vma
+ * still maps page and if so, unmap.
+ */
+ v_start = vma_offset_start(vma, start);
+ v_end = vma_offset_end(vma, end);
+ if (hugetlb_vma_maps_pfn(vma, v_start, pfn))
+ unmap_hugepage_range(vma, v_start, v_end, NULL,
+ ZAP_FLAG_DROP_MARKER);
+
+ kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+ hugetlb_vma_unlock_write(vma);
+
+ goto retry;
+ }
}
static void
-hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
+hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
+ zap_flags_t zap_flags)
{
struct vm_area_struct *vma;
/*
- * end == 0 indicates that the entire range after
- * start should be unmapped.
+ * end == 0 indicates that the entire range after start should be
+ * unmapped. Note, end is exclusive, whereas the interval tree takes
+ * an inclusive "last".
*/
- vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
- unsigned long v_offset;
+ vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) {
+ unsigned long v_start;
unsigned long v_end;
+ if (!hugetlb_vma_trylock_write(vma))
+ continue;
+
+ v_start = vma_offset_start(vma, start);
+ v_end = vma_offset_end(vma, end);
+
+ unmap_hugepage_range(vma, v_start, v_end, NULL, zap_flags);
+
/*
- * Can the expression below overflow on 32-bit arches?
- * No, because the interval tree returns us only those vmas
- * which overlap the truncated area starting at pgoff,
- * and no vma on a 32-bit arch can span beyond the 4GB.
+ * Note that vma lock only exists for shared/non-private
+ * vmas. Therefore, lock is not held when calling
+ * unmap_hugepage_range for private vmas.
*/
- if (vma->vm_pgoff < start)
- v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
- else
- v_offset = 0;
-
- if (!end)
- v_end = vma->vm_end;
- else {
- v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
- + vma->vm_start;
- if (v_end > vma->vm_end)
- v_end = vma->vm_end;
- }
+ hugetlb_vma_unlock_write(vma);
+ }
+}
+
+/*
+ * Called with hugetlb fault mutex held.
+ * Returns true if page was actually removed, false otherwise.
+ */
+static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
+ struct address_space *mapping,
+ struct folio *folio, pgoff_t index,
+ bool truncate_op)
+{
+ bool ret = false;
- unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
- NULL);
+ /*
+ * If folio is mapped, it was faulted in after being
+ * unmapped in caller or hugetlb_vmdelete_list() skips
+ * unmapping it due to fail to grab lock. Unmap (again)
+ * while holding the fault mutex. The mutex will prevent
+ * faults until we finish removing the folio. Hold folio
+ * lock to guarantee no concurrent migration.
+ */
+ folio_lock(folio);
+ if (unlikely(folio_mapped(folio)))
+ hugetlb_unmap_file_folio(h, mapping, folio, index);
+
+ /*
+ * We must remove the folio from page cache before removing
+ * the region/ reserve map (hugetlb_unreserve_pages). In
+ * rare out of memory conditions, removal of the region/reserve
+ * map could fail. Correspondingly, the subpool and global
+ * reserve usage count can need to be adjusted.
+ */
+ VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio);
+ hugetlb_delete_from_page_cache(folio);
+ ret = true;
+ if (!truncate_op) {
+ if (unlikely(hugetlb_unreserve_pages(inode, index,
+ index + 1, 1)))
+ hugetlb_fix_reserve_counts(inode);
}
+
+ folio_unlock(folio);
+ return ret;
}
/*
@@ -442,15 +560,15 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
*
* truncation is indicated by end of range being LLONG_MAX
* In this case, we first scan the range and release found pages.
- * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
- * maps and global counts. Page faults can not race with truncation
- * in this routine. hugetlb_no_page() holds i_mmap_rwsem and prevents
- * page faults in the truncated range by checking i_size. i_size is
- * modified while holding i_mmap_rwsem.
+ * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
+ * maps and global counts. Page faults can race with truncation.
+ * During faults, hugetlb_no_page() checks i_size before page allocation,
+ * and again after obtaining page table lock. It will 'back out'
+ * allocations in the truncated range.
* hole punch is indicated if end is not LLONG_MAX
* In the hole punch case we scan the range and release found pages.
- * Only when releasing a page is the associated region/reserv map
- * deleted. The region/reserv map for ranges without associated
+ * Only when releasing a page is the associated region/reserve map
+ * deleted. The region/reserve map for ranges without associated
* pages are not modified. Page faults can race with hole punch.
* This is indicated if we find a mapped page.
* Note: If the passed end of range value is beyond the end of file, but
@@ -461,97 +579,47 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
{
struct hstate *h = hstate_inode(inode);
struct address_space *mapping = &inode->i_data;
- const pgoff_t start = lstart >> huge_page_shift(h);
- const pgoff_t end = lend >> huge_page_shift(h);
- struct vm_area_struct pseudo_vma;
- struct pagevec pvec;
+ const pgoff_t end = lend >> PAGE_SHIFT;
+ struct folio_batch fbatch;
pgoff_t next, index;
int i, freed = 0;
bool truncate_op = (lend == LLONG_MAX);
- vma_init(&pseudo_vma, current->mm);
- pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
- pagevec_init(&pvec);
- next = start;
- while (next < end) {
- /*
- * When no more pages are found, we are done.
- */
- if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1))
- break;
-
- for (i = 0; i < pagevec_count(&pvec); ++i) {
- struct page *page = pvec.pages[i];
- u32 hash;
+ folio_batch_init(&fbatch);
+ next = lstart >> PAGE_SHIFT;
+ while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); ++i) {
+ struct folio *folio = fbatch.folios[i];
+ u32 hash = 0;
- index = page->index;
+ index = folio->index >> huge_page_order(h);
hash = hugetlb_fault_mutex_hash(mapping, index);
- if (!truncate_op) {
- /*
- * Only need to hold the fault mutex in the
- * hole punch case. This prevents races with
- * page faults. Races are not possible in the
- * case of truncation.
- */
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
- }
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
/*
- * If page is mapped, it was faulted in after being
- * unmapped in caller. Unmap (again) now after taking
- * the fault mutex. The mutex will prevent faults
- * until we finish removing the page.
- *
- * This race can only happen in the hole punch case.
- * Getting here in a truncate operation is a bug.
+ * Remove folio that was part of folio_batch.
*/
- if (unlikely(page_mapped(page))) {
- BUG_ON(truncate_op);
-
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_lock_write(mapping);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
- hugetlb_vmdelete_list(&mapping->i_mmap,
- index * pages_per_huge_page(h),
- (index + 1) * pages_per_huge_page(h));
- i_mmap_unlock_write(mapping);
- }
+ if (remove_inode_single_folio(h, inode, mapping, folio,
+ index, truncate_op))
+ freed++;
- lock_page(page);
- /*
- * We must free the huge page and remove from page
- * cache (remove_huge_page) BEFORE removing the
- * region/reserve map (hugetlb_unreserve_pages). In
- * rare out of memory conditions, removal of the
- * region/reserve map could fail. Correspondingly,
- * the subpool and global reserve usage count can need
- * to be adjusted.
- */
- VM_BUG_ON(PagePrivate(page));
- remove_huge_page(page);
- freed++;
- if (!truncate_op) {
- if (unlikely(hugetlb_unreserve_pages(inode,
- index, index + 1, 1)))
- hugetlb_fix_reserve_counts(inode);
- }
-
- unlock_page(page);
- if (!truncate_op)
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
- huge_pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
cond_resched();
}
if (truncate_op)
- (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
+ (void)hugetlb_unreserve_pages(inode,
+ lstart >> huge_page_shift(h),
+ LONG_MAX, freed);
}
static void hugetlbfs_evict_inode(struct inode *inode)
{
struct resv_map *resv_map;
+ trace_hugetlbfs_evict_inode(inode);
remove_inode_hugepages(inode, 0, LLONG_MAX);
/*
@@ -560,14 +628,14 @@ static void hugetlbfs_evict_inode(struct inode *inode)
* at inode creation time. If this is a device special inode,
* i_mapping may not point to the original address space.
*/
- resv_map = (struct resv_map *)(&inode->i_data)->private_data;
+ resv_map = (struct resv_map *)(&inode->i_data)->i_private_data;
/* Only regular and link inodes have associated reserve maps */
if (resv_map)
resv_map_release(&resv_map->refs);
clear_inode(inode);
}
-static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
+static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
pgoff_t pgoff;
struct address_space *mapping = inode->i_mapping;
@@ -576,50 +644,88 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
BUG_ON(offset & ~huge_page_mask(h));
pgoff = offset >> PAGE_SHIFT;
- i_mmap_lock_write(mapping);
i_size_write(inode, offset);
+ i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
- hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
+ hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
+ ZAP_FLAG_DROP_MARKER);
i_mmap_unlock_write(mapping);
remove_inode_hugepages(inode, offset, LLONG_MAX);
- return 0;
+}
+
+static void hugetlbfs_zero_partial_page(struct hstate *h,
+ struct address_space *mapping,
+ loff_t start,
+ loff_t end)
+{
+ pgoff_t idx = start >> huge_page_shift(h);
+ struct folio *folio;
+
+ folio = filemap_lock_hugetlb_folio(h, mapping, idx);
+ if (IS_ERR(folio))
+ return;
+
+ start = start & ~huge_page_mask(h);
+ end = end & ~huge_page_mask(h);
+ if (!end)
+ end = huge_page_size(h);
+
+ folio_zero_segment(folio, (size_t)start, (size_t)end);
+
+ folio_unlock(folio);
+ folio_put(folio);
}
static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
+ struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
+ struct address_space *mapping = inode->i_mapping;
struct hstate *h = hstate_inode(inode);
loff_t hpage_size = huge_page_size(h);
loff_t hole_start, hole_end;
/*
- * For hole punch round up the beginning offset of the hole and
- * round down the end.
+ * hole_start and hole_end indicate the full pages within the hole.
*/
hole_start = round_up(offset, hpage_size);
hole_end = round_down(offset + len, hpage_size);
- if (hole_end > hole_start) {
- struct address_space *mapping = inode->i_mapping;
- struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
+ inode_lock(inode);
- inode_lock(inode);
+ /* protected by i_rwsem */
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
+ inode_unlock(inode);
+ return -EPERM;
+ }
- /* protected by i_mutex */
- if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
- inode_unlock(inode);
- return -EPERM;
- }
+ i_mmap_lock_write(mapping);
- i_mmap_lock_write(mapping);
+ /* If range starts before first full page, zero partial page. */
+ if (offset < hole_start)
+ hugetlbfs_zero_partial_page(h, mapping,
+ offset, min(offset + len, hole_start));
+
+ /* Unmap users of full pages in the hole. */
+ if (hole_end > hole_start) {
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap,
- hole_start >> PAGE_SHIFT,
- hole_end >> PAGE_SHIFT);
- i_mmap_unlock_write(mapping);
- remove_inode_hugepages(inode, hole_start, hole_end);
- inode_unlock(inode);
+ hole_start >> PAGE_SHIFT,
+ hole_end >> PAGE_SHIFT, 0);
}
+ /* If range extends beyond last full page, zero partial page. */
+ if ((offset + len) > hole_end && (offset + len) > hole_start)
+ hugetlbfs_zero_partial_page(h, mapping,
+ hole_end, offset + len);
+
+ i_mmap_unlock_write(mapping);
+
+ /* Remove full pages from the file. */
+ if (hole_end > hole_start)
+ remove_inode_hugepages(inode, hole_start, hole_end);
+
+ inode_unlock(inode);
+
return 0;
}
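
The reworked hole punch removes only whole huge pages and zeroes the partial huge pages at either edge of the range. A minimal userspace sketch, assuming 2 MiB huge pages and an existing file under /mnt/huge:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)

int main(void)
{
	int fd = open("/mnt/huge/example", O_RDWR);
	if (fd < 0) { perror("open"); return 1; }

	/*
	 * Punch 2 MiB + 4 KiB starting at HPAGE_SIZE: the second huge page is
	 * removed outright, while the first 4 KiB of the third huge page is
	 * only zeroed in place.
	 */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      HPAGE_SIZE, HPAGE_SIZE + 4096) != 0)
		perror("fallocate");

	close(fd);
	return 0;
}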
@@ -641,8 +747,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
- if (mode & FALLOC_FL_PUNCH_HOLE)
- return hugetlbfs_punch_hole(inode, offset, len);
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ error = hugetlbfs_punch_hole(inode, offset, len);
+ goto out_nolock;
+ }
/*
* Default preallocate case.
@@ -666,11 +774,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
/*
* Initialize a pseudo vma as this is required by the huge page
- * allocation routines. If NUMA is configured, use page index
- * as input to create an allocation policy.
+ * allocation routines.
*/
vma_init(&pseudo_vma, mm);
- pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
+ vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
pseudo_vma.vm_file = file;
for (index = start; index < end; index++) {
@@ -678,9 +785,8 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
* This is supposed to be the vaddr where the page is being
* faulted in, but we have no vaddr here.
*/
- struct page *page;
+ struct folio *folio;
unsigned long addr;
- int avoid_reserve = 0;
cond_resched();
@@ -693,65 +799,69 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
break;
}
- /* Set numa allocation policy based on index */
- hugetlb_set_vma_policy(&pseudo_vma, inode, index);
-
/* addr is the offset within the file (zero based) */
addr = index * hpage_size;
- /*
- * fault mutex taken here, protects against fault path
- * and hole punch. inode_lock previously taken protects
- * against truncation.
- */
+ /* mutex taken here, fault path and hole punch */
hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/* See if already present in mapping to avoid alloc/free */
- page = find_get_page(mapping, index);
- if (page) {
- put_page(page);
+ folio = filemap_get_folio(mapping, index << huge_page_order(h));
+ if (!IS_ERR(folio)) {
+ folio_put(folio);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- hugetlb_drop_vma_policy(&pseudo_vma);
continue;
}
- /* Allocate page and add to page cache */
- page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
- hugetlb_drop_vma_policy(&pseudo_vma);
- if (IS_ERR(page)) {
+ /*
+ * Allocate folio without setting the avoid_reserve argument.
+ * There certainly are no reserves associated with the
+ * pseudo_vma. However, there could be shared mappings with
+ * reserves for the file at the inode level. If we fallocate
+ * folios in these areas, we need to consume the reserves
+ * to keep reservation accounting consistent.
+ */
+ folio = alloc_hugetlb_folio(&pseudo_vma, addr, false);
+ if (IS_ERR(folio)) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- error = PTR_ERR(page);
+ error = PTR_ERR(folio);
goto out;
}
- clear_huge_page(page, addr, pages_per_huge_page(h));
- __SetPageUptodate(page);
- error = huge_add_to_page_cache(page, mapping, index);
+ folio_zero_user(folio, addr);
+ __folio_mark_uptodate(folio);
+ error = hugetlb_add_to_page_cache(folio, mapping, index);
if (unlikely(error)) {
- put_page(page);
+ restore_reserve_on_error(h, &pseudo_vma, addr, folio);
+ folio_put(folio);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
goto out;
}
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ folio_set_hugetlb_migratable(folio);
/*
- * unlock_page because locked by add_to_page_cache()
- * page_put due to reference from alloc_huge_page()
+ * folio_unlock because locked by hugetlb_add_to_page_cache()
+ * folio_put() due to reference from alloc_hugetlb_folio()
*/
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
i_size_write(inode, offset + len);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
out:
inode_unlock(inode);
+
+out_nolock:
+ trace_hugetlbfs_fallocate(inode, mode, offset, len, error);
return error;
}
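
For the default preallocate case above, each huge page in the range is allocated, zeroed and inserted into the page cache under the fault mutex. A short userspace sketch (path and size are assumptions):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/huge/prealloc", O_CREAT | O_RDWR, 0600);
	if (fd < 0) { perror("open"); return 1; }

	/* Preallocate 8 MiB: four huge pages on a 2 MiB default hstate. */
	if (fallocate(fd, 0, 0, 8UL * 1024 * 1024) != 0)
		perror("fallocate");	/* ENOMEM if no huge pages are free */

	close(fd);
	return 0;
}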
-static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
+static int hugetlbfs_setattr(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
struct hstate *h = hstate_inode(inode);
@@ -759,28 +869,26 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
unsigned int ia_valid = attr->ia_valid;
struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
- BUG_ON(!inode);
-
- error = setattr_prepare(dentry, attr);
+ error = setattr_prepare(idmap, dentry, attr);
if (error)
return error;
+ trace_hugetlbfs_setattr(inode, dentry, attr);
+
if (ia_valid & ATTR_SIZE) {
loff_t oldsize = inode->i_size;
loff_t newsize = attr->ia_size;
if (newsize & ~huge_page_mask(h))
return -EINVAL;
- /* protected by i_mutex */
+ /* protected by i_rwsem */
if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
(newsize > oldsize && (info->seals & F_SEAL_GROW)))
return -EPERM;
- error = hugetlb_vmtruncate(inode, newsize);
- if (error)
- return error;
+ hugetlb_vmtruncate(inode, newsize);
}
- setattr_copy(inode, attr);
+ setattr_copy(idmap, inode, attr);
mark_inode_dirty(inode);
return 0;
}
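
Truncation via setattr still requires a huge-page-aligned size (the newsize & ~huge_page_mask(h) check) and, with hugetlb_vmtruncate now returning void, no longer fails once past the seal checks. A sketch of the alignment rule from userspace, with an assumed 2 MiB huge page size:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)

int main(void)
{
	int fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
	if (fd < 0) { perror("open"); return 1; }

	if (ftruncate(fd, 4096) != 0)
		perror("ftruncate(4096)");	/* expected: EINVAL, not aligned */
	if (ftruncate(fd, HPAGE_SIZE) != 0)
		perror("ftruncate(2M)");	/* aligned, should succeed */

	close(fd);
	return 0;
}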
@@ -796,7 +904,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
inode->i_mode = S_IFDIR | ctx->mode;
inode->i_uid = ctx->uid;
inode->i_gid = ctx->gid;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &hugetlbfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
@@ -815,6 +923,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
static struct inode *hugetlbfs_get_inode(struct super_block *sb,
+ struct mnt_idmap *idmap,
struct inode *dir,
umode_t mode, dev_t dev)
{
@@ -836,12 +945,12 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
inode->i_ino = get_next_ino();
- inode_init_owner(inode, dir, mode);
+ inode_init_owner(idmap, inode, dir, mode);
lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
&hugetlbfs_i_mmap_rwsem_key);
inode->i_mapping->a_ops = &hugetlbfs_aops;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
- inode->i_mapping->private_data = resv_map;
+ simple_inode_init_ts(inode);
+ inode->i_mapping->i_private_data = resv_map;
info->seals = F_SEAL_SEAL;
switch (mode & S_IFMT) {
default:
@@ -864,6 +973,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
break;
}
lockdep_annotate_inode_mutex_key(inode);
+ trace_hugetlbfs_alloc_inode(inode, dir, mode);
} else {
if (resv_map)
kref_put(&resv_map->refs, resv_map_release);
@@ -875,125 +985,100 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
/*
* File creation. Allocate an inode, and we're done..
*/
-static int do_hugetlbfs_mknod(struct inode *dir,
- struct dentry *dentry,
- umode_t mode,
- dev_t dev,
- bool tmpfile)
+static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode *inode;
- int error = -ENOSPC;
- inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
- if (inode) {
- dir->i_ctime = dir->i_mtime = current_time(dir);
- if (tmpfile) {
- d_tmpfile(dentry, inode);
- } else {
- d_instantiate(dentry, inode);
- dget(dentry);/* Extra count - pin the dentry in core */
- }
- error = 0;
- }
- return error;
-}
-
-static int hugetlbfs_mknod(struct inode *dir,
- struct dentry *dentry, umode_t mode, dev_t dev)
-{
- return do_hugetlbfs_mknod(dir, dentry, mode, dev, false);
+ inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, dev);
+ if (!inode)
+ return -ENOSPC;
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
+ d_make_persistent(dentry, inode);
+ return 0;
}
-static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static struct dentry *hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+ int retval = hugetlbfs_mknod(idmap, dir, dentry,
+ mode | S_IFDIR, 0);
if (!retval)
inc_nlink(dir);
- return retval;
+ return ERR_PTR(retval);
}
-static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
+static int hugetlbfs_create(struct mnt_idmap *idmap,
+ struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool excl)
{
- return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
+ return hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
}
-static int hugetlbfs_tmpfile(struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
+ struct inode *dir, struct file *file,
+ umode_t mode)
{
- return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true);
+ struct inode *inode;
+
+ inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode | S_IFREG, 0);
+ if (!inode)
+ return -ENOSPC;
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
+ d_tmpfile(file, inode);
+ return finish_open_simple(file, 0);
}
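
With the tmpfile conversion above, an unnamed hugetlbfs inode can be created by opening a hugetlbfs directory with O_TMPFILE. A rough sketch (mount point assumed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/huge", O_TMPFILE | O_RDWR, 0600);
	if (fd < 0) { perror("open(O_TMPFILE)"); return 1; }

	/* The unnamed inode is released when the descriptor is closed. */
	close(fd);
	return 0;
}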
-static int hugetlbfs_symlink(struct inode *dir,
- struct dentry *dentry, const char *symname)
+static int hugetlbfs_symlink(struct mnt_idmap *idmap,
+ struct inode *dir, struct dentry *dentry,
+ const char *symname)
{
+ const umode_t mode = S_IFLNK|S_IRWXUGO;
struct inode *inode;
int error = -ENOSPC;
- inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
+ inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, 0);
if (inode) {
int l = strlen(symname)+1;
error = page_symlink(inode, symname, l);
- if (!error) {
- d_instantiate(dentry, inode);
- dget(dentry);
- } else
+ if (!error)
+ d_make_persistent(dentry, inode);
+ else
iput(inode);
}
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
return error;
}
-/*
- * mark the head page dirty
- */
-static int hugetlbfs_set_page_dirty(struct page *page)
-{
- struct page *head = compound_head(page);
-
- SetPageDirty(head);
- return 0;
-}
-
-static int hugetlbfs_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page,
+#ifdef CONFIG_MIGRATION
+static int hugetlbfs_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src,
enum migrate_mode mode)
{
int rc;
- rc = migrate_huge_page_move_mapping(mapping, newpage, page);
- if (rc != MIGRATEPAGE_SUCCESS)
+ rc = migrate_huge_page_move_mapping(mapping, dst, src);
+ if (rc)
return rc;
- /*
- * page_private is subpool pointer in hugetlb pages. Transfer to
- * new page. PagePrivate is not associated with page_private for
- * hugetlb pages and can not be set here as only page_huge_active
- * pages can be migrated.
- */
- if (page_private(page)) {
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
+ if (hugetlb_folio_subpool(src)) {
+ hugetlb_set_folio_subpool(dst,
+ hugetlb_folio_subpool(src));
+ hugetlb_set_folio_subpool(src, NULL);
}
- if (mode != MIGRATE_SYNC_NO_COPY)
- migrate_page_copy(newpage, page);
- else
- migrate_page_states(newpage, page);
+ folio_migrate_flags(dst, src);
- return MIGRATEPAGE_SUCCESS;
+ return 0;
}
+#else
+#define hugetlbfs_migrate_folio NULL
+#endif
-static int hugetlbfs_error_remove_page(struct address_space *mapping,
- struct page *page)
+static int hugetlbfs_error_remove_folio(struct address_space *mapping,
+ struct folio *folio)
{
- struct inode *inode = mapping->host;
- pgoff_t index = page->index;
-
- remove_huge_page(page);
- if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
- hugetlb_fix_reserve_counts(inode);
-
return 0;
}
@@ -1041,22 +1126,24 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
struct hstate *h = hstate_inode(d_inode(dentry));
+ u64 id = huge_encode_dev(dentry->d_sb->s_dev);
+ buf->f_fsid = u64_to_fsid(id);
buf->f_type = HUGETLBFS_MAGIC;
buf->f_bsize = huge_page_size(h);
if (sbinfo) {
spin_lock(&sbinfo->stat_lock);
- /* If no limits set, just report 0 for max/free/used
+ /* If no limits set, just report 0 or -1 for max/free/used
* blocks, like simple_statfs() */
if (sbinfo->spool) {
long free_pages;
- spin_lock(&sbinfo->spool->lock);
+ spin_lock_irq(&sbinfo->spool->lock);
buf->f_blocks = sbinfo->spool->max_hpages;
free_pages = sbinfo->spool->max_hpages
- sbinfo->spool->used_hpages;
buf->f_bavail = buf->f_bfree = free_pages;
- spin_unlock(&sbinfo->spool->lock);
+ spin_unlock_irq(&sbinfo->spool->lock);
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
}
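
statfs() on a hugetlbfs mount now also reports an f_fsid derived from the device number, alongside the existing huge-page-sized f_bsize and the subpool limits. A small userspace check (mount point assumed):

#include <stdio.h>
#include <sys/statfs.h>

int main(void)
{
	struct statfs st;

	if (statfs("/mnt/huge", &st) != 0) { perror("statfs"); return 1; }

	printf("f_bsize (huge page size): %ld\n", (long)st.f_bsize);
	printf("f_blocks %ld, f_bfree %ld\n", (long)st.f_blocks, (long)st.f_bfree);
	return 0;
}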
@@ -1114,60 +1201,49 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
return NULL;
- p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
+ p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL);
if (unlikely(!p)) {
hugetlbfs_inc_free_inodes(sbinfo);
return NULL;
}
-
- /*
- * Any time after allocation, hugetlbfs_destroy_inode can be called
- * for the inode. mpol_free_shared_policy is unconditionally called
- * as part of hugetlbfs_destroy_inode. So, initialize policy here
- * in case of a quick call to destroy.
- *
- * Note that the policy is initialized even if we are creating a
- * private inode. This simplifies hugetlbfs_destroy_inode.
- */
- mpol_shared_policy_init(&p->policy, NULL);
-
return &p->vfs_inode;
}
static void hugetlbfs_free_inode(struct inode *inode)
{
+ trace_hugetlbfs_free_inode(inode);
kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}
static void hugetlbfs_destroy_inode(struct inode *inode)
{
hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
- mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
}
static const struct address_space_operations hugetlbfs_aops = {
.write_begin = hugetlbfs_write_begin,
.write_end = hugetlbfs_write_end,
- .set_page_dirty = hugetlbfs_set_page_dirty,
- .migratepage = hugetlbfs_migrate_page,
- .error_remove_page = hugetlbfs_error_remove_page,
+ .dirty_folio = noop_dirty_folio,
+ .migrate_folio = hugetlbfs_migrate_folio,
+ .error_remove_folio = hugetlbfs_error_remove_folio,
};
static void init_once(void *foo)
{
- struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
+ struct hugetlbfs_inode_info *ei = foo;
inode_init_once(&ei->vfs_inode);
}
-const struct file_operations hugetlbfs_file_operations = {
+static const struct file_operations hugetlbfs_file_operations = {
.read_iter = hugetlbfs_read_iter,
- .mmap = hugetlbfs_file_mmap,
+ .mmap_prepare = hugetlbfs_file_mmap_prepare,
.fsync = noop_fsync,
.get_unmapped_area = hugetlb_get_unmapped_area,
.llseek = default_llseek,
.fallocate = hugetlbfs_fallocate,
+ .fop_flags = FOP_HUGE_PAGES,
};
static const struct inode_operations hugetlbfs_dir_inode_operations = {
@@ -1227,6 +1303,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
{
struct hugetlbfs_fs_context *ctx = fc->fs_private;
struct fs_parse_result result;
+ struct hstate *h;
char *rest;
unsigned long ps;
int opt;
@@ -1237,15 +1314,11 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
switch (opt) {
case Opt_uid:
- ctx->uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(ctx->uid))
- goto bad_val;
+ ctx->uid = result.uid;
return 0;
case Opt_gid:
- ctx->gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(ctx->gid))
- goto bad_val;
+ ctx->gid = result.gid;
return 0;
case Opt_mode:
@@ -1254,7 +1327,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
case Opt_size:
/* memparse() will accept a K/M/G without a digit */
- if (!isdigit(param->string[0]))
+ if (!param->string || !isdigit(param->string[0]))
goto bad_val;
ctx->max_size_opt = memparse(param->string, &rest);
ctx->max_val_type = SIZE_STD;
@@ -1264,23 +1337,24 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
case Opt_nr_inodes:
/* memparse() will accept a K/M/G without a digit */
- if (!isdigit(param->string[0]))
+ if (!param->string || !isdigit(param->string[0]))
goto bad_val;
ctx->nr_inodes = memparse(param->string, &rest);
return 0;
case Opt_pagesize:
ps = memparse(param->string, &rest);
- ctx->hstate = size_to_hstate(ps);
- if (!ctx->hstate) {
- pr_err("Unsupported page size %lu MB\n", ps >> 20);
+ h = size_to_hstate(ps);
+ if (!h) {
+ pr_err("Unsupported page size %lu MB\n", ps / SZ_1M);
return -EINVAL;
}
+ ctx->hstate = h;
return 0;
case Opt_min_size:
/* memparse() will accept a K/M/G without a digit */
- if (!isdigit(param->string[0]))
+ if (!param->string || !isdigit(param->string[0]))
goto bad_val;
ctx->min_size_opt = memparse(param->string, &rest);
ctx->min_val_type = SIZE_STD;
@@ -1348,8 +1422,8 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
/*
* Allocate and initialize subpool if maximum or minimum size is
- * specified. Any needed reservations (for minimim size) are taken
- * taken when the subpool is created.
+ * specified. Any needed reservations (for minimum size) are taken
+ * when the subpool is created.
*/
if (ctx->max_hpages != -1 || ctx->min_hpages != -1) {
sbinfo->spool = hugepage_new_subpool(ctx->hstate,
@@ -1363,7 +1437,14 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_blocksize_bits = huge_page_shift(ctx->hstate);
sb->s_magic = HUGETLBFS_MAGIC;
sb->s_op = &hugetlbfs_ops;
+ sb->s_d_flags = DCACHE_DONTCACHE;
sb->s_time_gran = 1;
+
+ /*
+ * Due to the special and limited functionality of hugetlbfs, it does
+ * not work well as a stacking filesystem.
+ */
+ sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx));
if (!sb->s_root)
goto out_free;
@@ -1419,7 +1500,8 @@ static struct file_system_type hugetlbfs_fs_type = {
.name = "hugetlbfs",
.init_fs_context = hugetlbfs_init_fs_context,
.parameters = hugetlb_fs_parameters,
- .kill_sb = kill_litter_super,
+ .kill_sb = kill_anon_super,
+ .fs_flags = FS_ALLOW_IDMAP,
};
static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
@@ -1437,7 +1519,7 @@ static int get_hstate_idx(int page_size_log)
if (!h)
return -1;
- return h - hstates;
+ return hstate_index(h);
}
/*
@@ -1445,8 +1527,8 @@ static int get_hstate_idx(int page_size_log)
* otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
*/
struct file *hugetlb_file_setup(const char *name, size_t size,
- vm_flags_t acctflag, struct user_struct **user,
- int creat_flags, int page_size_log)
+ vm_flags_t acctflag, int creat_flags,
+ int page_size_log)
{
struct inode *inode;
struct vfsmount *mnt;
@@ -1457,26 +1539,25 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
if (hstate_idx < 0)
return ERR_PTR(-ENODEV);
- *user = NULL;
mnt = hugetlbfs_vfsmount[hstate_idx];
if (!mnt)
return ERR_PTR(-ENOENT);
if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
- *user = current_user();
- if (user_shm_lock(size, *user)) {
- task_lock(current);
- pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
+ struct ucounts *ucounts = current_ucounts();
+
+ if (user_shm_lock(size, ucounts)) {
+ pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n",
current->comm, current->pid);
- task_unlock(current);
- } else {
- *user = NULL;
- return ERR_PTR(-EPERM);
+ user_shm_unlock(size, ucounts);
}
+ return ERR_PTR(-EPERM);
}
file = ERR_PTR(-ENOSPC);
- inode = hugetlbfs_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0);
+ /* hugetlbfs_vfsmount[] mounts do not use idmapped mounts. */
+ inode = hugetlbfs_get_inode(mnt->mnt_sb, &nop_mnt_idmap, NULL,
+ S_IFREG | S_IRWXUGO, 0);
if (!inode)
goto out;
if (creat_flags == HUGETLB_SHMFS_INODE)
@@ -1487,7 +1568,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
if (hugetlb_reserve_pages(inode, 0,
size >> huge_page_shift(hstate_inode(inode)), NULL,
- acctflag))
+ acctflag) < 0)
file = ERR_PTR(-ENOMEM);
else
file = alloc_file_pseudo(inode, mnt, name, O_RDWR,
@@ -1497,10 +1578,6 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
iput(inode);
out:
- if (*user) {
- user_shm_unlock(size, *user);
- *user = NULL;
- }
return file;
}
@@ -1515,12 +1592,12 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
} else {
struct hugetlbfs_fs_context *ctx = fc->fs_private;
ctx->hstate = h;
- mnt = fc_mount(fc);
+ mnt = fc_mount_longterm(fc);
put_fs_context(fc);
}
if (IS_ERR(mnt))
- pr_err("Cannot mount internal hugetlbfs for page size %uK",
- 1U << (h->order + PAGE_SHIFT - 10));
+ pr_err("Cannot mount internal hugetlbfs for page size %luK",
+ huge_page_size(h) / SZ_1K);
return mnt;
}
@@ -1548,7 +1625,7 @@ static int __init init_hugetlbfs_fs(void)
goto out_free;
/* default hstate mount is required */
- mnt = mount_one_hugetlbfs(&hstates[default_hstate_idx]);
+ mnt = mount_one_hugetlbfs(&default_hstate);
if (IS_ERR(mnt)) {
error = PTR_ERR(mnt);
goto out_unreg;