Diffstat (limited to 'mm/filemap.c')
-rw-r--r--   mm/filemap.c   982
1 file changed, 652 insertions, 330 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 657bcd887fdb..ebd75684cb0a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -21,7 +21,7 @@ #include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/syscalls.h> #include <linux/mman.h> #include <linux/pagemap.h> @@ -46,7 +46,10 @@ #include <linux/pipe_fs_i.h> #include <linux/splice.h> #include <linux/rcupdate_wait.h> -#include <asm/pgalloc.h> +#include <linux/sched/mm.h> +#include <linux/sysctl.h> +#include <linux/pgalloc.h> + #include <asm/tlbflush.h> #include "internal.h" @@ -112,27 +115,17 @@ * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->i_pages lock (try_to_unmap_one) - * ->lruvec->lru_lock (follow_page->mark_page_accessed) - * ->lruvec->lru_lock (check_pte_range->isolate_lru_page) + * ->lruvec->lru_lock (follow_page_mask->mark_page_accessed) + * ->lruvec->lru_lock (check_pte_range->folio_isolate_lru) * ->private_lock (folio_remove_rmap_pte->set_page_dirty) * ->i_pages lock (folio_remove_rmap_pte->set_page_dirty) * bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty) * ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty) - * ->memcg->move_lock (folio_remove_rmap_pte->folio_memcg_lock) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->block_dirty_folio) */ -static void mapping_set_update(struct xa_state *xas, - struct address_space *mapping) -{ - if (dax_mapping(mapping) || shmem_mapping(mapping)) - return; - xas_set_update(xas, workingset_update_node); - xas_set_lru(xas, &shadow_nodes); -} - static void page_cache_delete(struct address_space *mapping, struct folio *folio, void *shadow) { @@ -150,7 +143,7 @@ static void page_cache_delete(struct address_space *mapping, xas_init_marks(&xas); folio->mapping = NULL; - /* Leave page->index set: truncation lookup relies upon it */ + /* Leave folio->index set: truncation lookup relies upon it */ mapping->nrpages -= nr; } @@ -177,7 +170,7 @@ static void filemap_unaccount_folio(struct address_space *mapping, * and we'd rather not leak it: if we're wrong, * another bad page check should catch it later. 
*/ - page_mapcount_reset(&folio->page); + atomic_set(&folio->_mapcount, -1); folio_ref_sub(folio, mapcount); } } @@ -189,15 +182,18 @@ static void filemap_unaccount_folio(struct address_space *mapping, nr = folio_nr_pages(folio); - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); + lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); if (folio_test_swapbacked(folio)) { - __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); + lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); if (folio_test_pmd_mappable(folio)) - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); + lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else if (folio_test_pmd_mappable(folio)) { - __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); + lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } + if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags)) + mod_node_page_state(folio_pgdat(folio), + NR_KERNEL_FILE_PAGES, -nr); /* * At this point folio must be either written or cleaned by @@ -235,15 +231,12 @@ void __filemap_remove_folio(struct folio *folio, void *shadow) void filemap_free_folio(struct address_space *mapping, struct folio *folio) { void (*free_folio)(struct folio *); - int refs = 1; free_folio = mapping->a_ops->free_folio; if (free_folio) free_folio(folio); - if (folio_test_large(folio)) - refs = folio_nr_pages(folio); - folio_put_refs(folio, refs); + folio_put_refs(folio, folio_nr_pages(folio)); } /** @@ -264,7 +257,7 @@ void filemap_remove_folio(struct folio *folio) __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); filemap_free_folio(mapping, folio); @@ -343,7 +336,7 @@ void delete_from_page_cache_batch(struct address_space *mapping, page_cache_delete_batch(mapping, fbatch); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); for (i = 0; i < folio_batch_count(fbatch); i++) @@ -374,80 +367,75 @@ static int filemap_check_and_keep_errors(struct address_space *mapping) return 0; } -/** - * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range - * @mapping: address space structure to write - * @wbc: the writeback_control controlling the writeout - * - * Call writepages on the mapping using the provided wbc to control the - * writeout. - * - * Return: %0 on success, negative error code otherwise. - */ -int filemap_fdatawrite_wbc(struct address_space *mapping, - struct writeback_control *wbc) +static int filemap_writeback(struct address_space *mapping, loff_t start, + loff_t end, enum writeback_sync_modes sync_mode, + long *nr_to_write) { + struct writeback_control wbc = { + .sync_mode = sync_mode, + .nr_to_write = nr_to_write ? 
*nr_to_write : LONG_MAX, + .range_start = start, + .range_end = end, + }; int ret; if (!mapping_can_writeback(mapping) || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; - wbc_attach_fdatawrite_inode(wbc, mapping->host); - ret = do_writepages(mapping, wbc); - wbc_detach_inode(wbc); + wbc_attach_fdatawrite_inode(&wbc, mapping->host); + ret = do_writepages(mapping, &wbc); + wbc_detach_inode(&wbc); + + if (!ret && nr_to_write) + *nr_to_write = wbc.nr_to_write; return ret; } -EXPORT_SYMBOL(filemap_fdatawrite_wbc); /** - * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range + * filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @start: offset in bytes where the range starts * @end: offset in bytes where the range ends (inclusive) - * @sync_mode: enable synchronous operation * * Start writeback against all of a mapping's dirty pages that lie * within the byte offsets <start, end> inclusive. * - * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as - * opposed to a regular memory cleansing writeback. The difference between - * these two operations is that if a dirty page/buffer is encountered, it must - * be waited upon, and not just skipped over. + * This is a data integrity operation that waits upon dirty or in writeback + * pages. * * Return: %0 on success, negative error code otherwise. */ -int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end, int sync_mode) -{ - struct writeback_control wbc = { - .sync_mode = sync_mode, - .nr_to_write = LONG_MAX, - .range_start = start, - .range_end = end, - }; - - return filemap_fdatawrite_wbc(mapping, &wbc); -} - -static inline int __filemap_fdatawrite(struct address_space *mapping, - int sync_mode) +int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end) { - return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); + return filemap_writeback(mapping, start, end, WB_SYNC_ALL, NULL); } +EXPORT_SYMBOL(filemap_fdatawrite_range); int filemap_fdatawrite(struct address_space *mapping) { - return __filemap_fdatawrite(mapping, WB_SYNC_ALL); + return filemap_fdatawrite_range(mapping, 0, LLONG_MAX); } EXPORT_SYMBOL(filemap_fdatawrite); -int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end) +/** + * filemap_flush_range - start writeback on a range + * @mapping: target address_space + * @start: index to start writeback on + * @end: last (inclusive) index for writeback + * + * This is a non-integrity writeback helper, to start writing back folios + * for the indicated range. + * + * Return: %0 on success, negative error code otherwise. + */ +int filemap_flush_range(struct address_space *mapping, loff_t start, + loff_t end) { - return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); + return filemap_writeback(mapping, start, end, WB_SYNC_NONE, NULL); } -EXPORT_SYMBOL(filemap_fdatawrite_range); +EXPORT_SYMBOL_GPL(filemap_flush_range); /** * filemap_flush - mostly a non-blocking flush @@ -460,10 +448,22 @@ EXPORT_SYMBOL(filemap_fdatawrite_range); */ int filemap_flush(struct address_space *mapping) { - return __filemap_fdatawrite(mapping, WB_SYNC_NONE); + return filemap_flush_range(mapping, 0, LLONG_MAX); } EXPORT_SYMBOL(filemap_flush); +/* + * Start writeback on @nr_to_write pages from @mapping. No one but the existing + * btrfs caller should be using this. 
Talk to linux-mm if you think adding a + * new caller is a good idea. + */ +int filemap_flush_nr(struct address_space *mapping, long *nr_to_write) +{ + return filemap_writeback(mapping, 0, LLONG_MAX, WB_SYNC_NONE, + nr_to_write); +} +EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs"); + /** * filemap_range_has_page - check if a page exists in range. * @mapping: address space within which to check @@ -530,7 +530,6 @@ static void __filemap_fdatawait_range(struct address_space *mapping, struct folio *folio = fbatch.folios[i]; folio_wait_writeback(folio); - folio_clear_error(folio); } folio_batch_release(&fbatch); cond_resched(); @@ -682,8 +681,7 @@ int filemap_write_and_wait_range(struct address_space *mapping, return 0; if (mapping_needs_writeback(mapping)) { - err = __filemap_fdatawrite_range(mapping, lstart, lend, - WB_SYNC_ALL); + err = filemap_fdatawrite_range(mapping, lstart, lend); /* * Even if the above returned error, the pages may be * written partially (e.g. -ENOSPC), so we wait for it. @@ -785,8 +783,7 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) return 0; if (mapping_needs_writeback(mapping)) { - err = __filemap_fdatawrite_range(mapping, lstart, lend, - WB_SYNC_ALL); + err = filemap_fdatawrite_range(mapping, lstart, lend); /* See comment of filemap_write_and_wait() */ if (err != -EIO) __filemap_fdatawait_range(mapping, lstart, lend); @@ -834,13 +831,13 @@ void replace_page_cache_folio(struct folio *old, struct folio *new) old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. */ if (!folio_test_hugetlb(old)) - __lruvec_stat_sub_folio(old, NR_FILE_PAGES); + lruvec_stat_sub_folio(old, NR_FILE_PAGES); if (!folio_test_hugetlb(new)) - __lruvec_stat_add_folio(new, NR_FILE_PAGES); + lruvec_stat_add_folio(new, NR_FILE_PAGES); if (folio_test_swapbacked(old)) - __lruvec_stat_sub_folio(old, NR_SHMEM); + lruvec_stat_sub_folio(old, NR_SHMEM); if (folio_test_swapbacked(new)) - __lruvec_stat_add_folio(new, NR_SHMEM); + lruvec_stat_add_folio(new, NR_SHMEM); xas_unlock_irq(&xas); if (free_folio) free_folio(old); @@ -851,18 +848,18 @@ EXPORT_SYMBOL_GPL(replace_page_cache_folio); noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { - XA_STATE(xas, &mapping->i_pages, index); - void *alloced_shadow = NULL; - int alloced_order = 0; + XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); bool huge; long nr; + unsigned int forder = folio_order(folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); + VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping), + folio); mapping_set_update(&xas, mapping); VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); - xas_set_order(&xas, index, folio_order(folio)); huge = folio_test_hugetlb(folio); nr = folio_nr_pages(folio); @@ -872,7 +869,7 @@ noinline int __filemap_add_folio(struct address_space *mapping, folio->index = xas.xa_index; for (;;) { - int order = -1, split_order = 0; + int order = -1; void *entry, *old = NULL; xas_lock_irq(&xas); @@ -890,21 +887,25 @@ noinline int __filemap_add_folio(struct address_space *mapping, order = xas_get_order(&xas); } - /* entry may have changed before we re-acquire the lock */ - if (alloced_order && (old != alloced_shadow || order != alloced_order)) { - xas_destroy(&xas); - alloced_order = 0; - } - if (old) { - if (order > 0 && order > folio_order(folio)) { + if (order > 0 && order > forder) { + 
unsigned int split_order = max(forder, + xas_try_split_min_order(order)); + /* How to handle large swap entries? */ BUG_ON(shmem_mapping(mapping)); - if (!alloced_order) { - split_order = order; - goto unlock; + + while (order > forder) { + xas_set_order(&xas, index, split_order); + xas_try_split(&xas, old, order); + if (xas_error(&xas)) + goto unlock; + order = split_order; + split_order = + max(xas_try_split_min_order( + split_order), + forder); } - xas_split(&xas, old, order); xas_reset(&xas); } if (shadowp) @@ -919,26 +920,15 @@ noinline int __filemap_add_folio(struct address_space *mapping, /* hugetlb pages do not participate in page cache accounting */ if (!huge) { - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); + lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); if (folio_test_pmd_mappable(folio)) - __lruvec_stat_mod_folio(folio, + lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); } unlock: xas_unlock_irq(&xas); - /* split needed, alloc here and retry. */ - if (split_order) { - xas_split_alloc(&xas, old, split_order, gfp); - if (xas_error(&xas)) - goto error; - alloced_shadow = old; - alloced_order = split_order; - xas_reset(&xas); - continue; - } - if (!xas_nomem(&xas, gfp)) break; } @@ -950,7 +940,7 @@ unlock: return 0; error: folio->mapping = NULL; - /* Leave page->index set: truncation relies upon it */ + /* Leave folio->index set: truncation relies upon it */ folio_put_refs(folio, nr); return xas_error(&xas); } @@ -961,8 +951,14 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, { void *shadow = NULL; int ret; + struct mem_cgroup *tmp; + bool kernel_file = test_bit(AS_KERNEL_FILE, &mapping->flags); + if (kernel_file) + tmp = set_active_memcg(root_mem_cgroup); ret = mem_cgroup_charge(folio, NULL, gfp); + if (kernel_file) + set_active_memcg(tmp); if (ret) return ret; @@ -984,17 +980,26 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, if (!(gfp & __GFP_WRITE) && shadow) workingset_refault(folio, shadow); folio_add_lru(folio); + if (kernel_file) + mod_node_page_state(folio_pgdat(folio), + NR_KERNEL_FILE_PAGES, + folio_nr_pages(folio)); } return ret; } EXPORT_SYMBOL_GPL(filemap_add_folio); #ifdef CONFIG_NUMA -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) +struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *policy) { int n; struct folio *folio; + if (policy) + return folio_alloc_mpol_noprof(gfp, order, policy, + NO_INTERLEAVE_INDEX, numa_node_id()); + if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { @@ -1067,6 +1072,19 @@ static wait_queue_head_t *folio_waitqueue(struct folio *folio) return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)]; } +/* How many times do we accept lock stealing from under a waiter? 
*/ +static int sysctl_page_lock_unfairness = 5; +static const struct ctl_table filemap_sysctl_table[] = { + { + .procname = "page_lock_unfairness", + .data = &sysctl_page_lock_unfairness, + .maxlen = sizeof(sysctl_page_lock_unfairness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + } +}; + void __init pagecache_init(void) { int i; @@ -1075,6 +1093,7 @@ void __init pagecache_init(void) init_waitqueue_head(&folio_wait_table[i]); page_writeback_init(); + register_sysctl_init("vm", filemap_sysctl_table); } /* @@ -1127,10 +1146,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, */ flags = wait->flags; if (flags & WQ_FLAG_EXCLUSIVE) { - if (test_bit(key->bit_nr, &key->folio->flags)) + if (test_bit(key->bit_nr, &key->folio->flags.f)) return -1; if (flags & WQ_FLAG_CUSTOM) { - if (test_and_set_bit(key->bit_nr, &key->folio->flags)) + if (test_and_set_bit(key->bit_nr, &key->folio->flags.f)) return -1; flags |= WQ_FLAG_DONE; } @@ -1213,18 +1232,15 @@ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, struct wait_queue_entry *wait) { if (wait->flags & WQ_FLAG_EXCLUSIVE) { - if (test_and_set_bit(bit_nr, &folio->flags)) + if (test_and_set_bit(bit_nr, &folio->flags.f)) return false; - } else if (test_bit(bit_nr, &folio->flags)) + } else if (test_bit(bit_nr, &folio->flags.f)) return false; wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; return true; } -/* How many times do we accept lock stealing from under a waiter? */ -int sysctl_page_lock_unfairness = 5; - static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, int state, enum behavior behavior) { @@ -1368,7 +1384,7 @@ repeat: * @ptl: already locked ptl. This function will drop the lock. * * Wait for a migration entry referencing the given page to be removed. This is - * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except + * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except * this can be called without taking a reference on the page. Instead this * should be called while holding the ptl for the migration entry referencing * the page. @@ -1378,7 +1394,7 @@ repeat: * This follows the same logic as folio_wait_bit_common() so see the comments * there. */ -void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) +void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl) { struct wait_page_queue wait_page; @@ -1387,7 +1403,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) unsigned long pflags; bool in_thrashing; wait_queue_head_t *q; - struct folio *folio = pfn_swap_entry_folio(entry); + struct folio *folio = softleaf_to_folio(entry); q = folio_waitqueue(folio); if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { @@ -1472,25 +1488,6 @@ static int folio_put_wait_locked(struct folio *folio, int state) } /** - * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue - * @folio: Folio defining the wait queue of interest - * @waiter: Waiter to add to the queue - * - * Add an arbitrary @waiter to the wait queue for the nominated @folio. 
- */ -void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) -{ - wait_queue_head_t *q = folio_waitqueue(folio); - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_entry_tail(q, waiter); - folio_set_waiters(folio); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(folio_add_wait_queue); - -/** * folio_unlock - Unlock a locked folio. * @folio: The folio. * @@ -1531,7 +1528,7 @@ void folio_end_read(struct folio *folio, bool success) /* Must be in bottom byte for x86 to work */ BUILD_BUG_ON(PG_uptodate > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio); + VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio); if (likely(success)) mask |= 1 << PG_uptodate; @@ -1598,15 +1595,54 @@ int folio_wait_private_2_killable(struct folio *folio) } EXPORT_SYMBOL(folio_wait_private_2_killable); +static void filemap_end_dropbehind(struct folio *folio) +{ + struct address_space *mapping = folio->mapping; + + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return; + if (!folio_test_clear_dropbehind(folio)) + return; + if (mapping) + folio_unmap_invalidate(mapping, folio, 0); +} + +/* + * If folio was marked as dropbehind, then pages should be dropped when writeback + * completes. Do that now. If we fail, it's likely because of a big folio - + * just reset dropbehind for that case and latter completions should invalidate. + */ +void folio_end_dropbehind(struct folio *folio) +{ + if (!folio_test_dropbehind(folio)) + return; + + /* + * Hitting !in_task() should not happen off RWF_DONTCACHE writeback, + * but can happen if normal writeback just happens to find dirty folios + * that were created as part of uncached writeback, and that writeback + * would otherwise not need non-IRQ handling. Just skip the + * invalidation in that case. + */ + if (in_task() && folio_trylock(folio)) { + filemap_end_dropbehind(folio); + folio_unlock(folio); + } +} +EXPORT_SYMBOL_GPL(folio_end_dropbehind); + /** - * folio_end_writeback - End writeback against a folio. + * folio_end_writeback_no_dropbehind - End writeback against a folio. * @folio: The folio. * * The folio must actually be under writeback. + * This call is intended for filesystems that need to defer dropbehind. * * Context: May be called from process or interrupt context. */ -void folio_end_writeback(struct folio *folio) +void folio_end_writeback_no_dropbehind(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); @@ -1622,6 +1658,25 @@ void folio_end_writeback(struct folio *folio) folio_rotate_reclaimable(folio); } + if (__folio_end_writeback(folio)) + folio_wake_bit(folio, PG_writeback); + + acct_reclaim_writeback(folio); +} +EXPORT_SYMBOL_GPL(folio_end_writeback_no_dropbehind); + +/** + * folio_end_writeback - End writeback against a folio. + * @folio: The folio. + * + * The folio must actually be under writeback. + * + * Context: May be called from process or interrupt context. + */ +void folio_end_writeback(struct folio *folio) +{ + VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); + /* * Writeback does not hold a folio reference of its own, relying * on truncation to wait for the clearing of PG_writeback. @@ -1629,9 +1684,8 @@ void folio_end_writeback(struct folio *folio) * reused before the folio_wake_bit(). 
*/ folio_get(folio); - if (__folio_end_writeback(folio)) - folio_wake_bit(folio, PG_writeback); - acct_reclaim_writeback(folio); + folio_end_writeback_no_dropbehind(folio); + folio_end_dropbehind(folio); folio_put(folio); } EXPORT_SYMBOL(folio_end_writeback); @@ -1748,16 +1802,17 @@ pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { XA_STATE(xas, &mapping->i_pages, index); + unsigned long nr = max_scan; - while (max_scan--) { + while (nr--) { void *entry = xas_next(&xas); if (!entry || xa_is_value(entry)) - break; + return xas.xa_index; if (xas.xa_index == 0) - break; + return 0; } - return xas.xa_index; + return index + max_scan; } EXPORT_SYMBOL(page_cache_next_miss); @@ -1861,11 +1916,12 @@ out: } /** - * __filemap_get_folio - Find and get a reference to a folio. + * __filemap_get_folio_mpol - Find and get a reference to a folio. * @mapping: The address_space to search. * @index: The page index. * @fgp_flags: %FGP flags modify how the folio is returned. * @gfp: Memory allocation flags to use if %FGP_CREAT is specified. + * @policy: NUMA memory allocation policy to follow. * * Looks up the page cache entry at @mapping & @index. * @@ -1876,8 +1932,8 @@ out: * * Return: The found folio or an ERR_PTR() otherwise. */ -struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, - fgf_t fgp_flags, gfp_t gfp) +struct folio *__filemap_get_folio_mpol(struct address_space *mapping, + pgoff_t index, fgf_t fgp_flags, gfp_t gfp, struct mempolicy *policy) { struct folio *folio; @@ -1919,8 +1975,10 @@ repeat: folio_wait_stable(folio); no_page: if (!folio && (fgp_flags & FGP_CREAT)) { - unsigned order = FGF_GET_ORDER(fgp_flags); + unsigned int min_order = mapping_min_folio_order(mapping); + unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags)); int err; + index = mapping_align_index(mapping, index); if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) gfp |= __GFP_WRITE; @@ -1928,15 +1986,13 @@ no_page: gfp &= ~__GFP_FS; if (fgp_flags & FGP_NOWAIT) { gfp &= ~GFP_KERNEL; - gfp |= GFP_NOWAIT | __GFP_NOWARN; + gfp |= GFP_NOWAIT; } if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; - if (!mapping_large_folio_support(mapping)) - order = 0; - if (order > MAX_PAGECACHE_ORDER) - order = MAX_PAGECACHE_ORDER; + if (order > mapping_max_folio_order(mapping)) + order = mapping_max_folio_order(mapping); /* If we're not aligned, allocate a smaller folio */ if (index & ((1UL << order) - 1)) order = __ffs(index); @@ -1945,27 +2001,40 @@ no_page: gfp_t alloc_gfp = gfp; err = -ENOMEM; - if (order > 0) + if (order > min_order) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; - folio = filemap_alloc_folio(alloc_gfp, order); + folio = filemap_alloc_folio(alloc_gfp, order, policy); if (!folio) continue; /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) __folio_set_referenced(folio); + if (fgp_flags & FGP_DONTCACHE) + __folio_set_dropbehind(folio); err = filemap_add_folio(mapping, folio, index, gfp); if (!err) break; folio_put(folio); folio = NULL; - } while (order-- > 0); + } while (order-- > min_order); if (err == -EEXIST) goto repeat; - if (err) + if (err) { + /* + * When NOWAIT I/O fails to allocate folios this could + * be due to a nonblocking memory allocation and not + * because the system actually is out of memory. + * Return -EAGAIN so that there caller retries in a + * blocking fashion instead of propagating -ENOMEM + * to the application. 
+ */ + if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM) + err = -EAGAIN; return ERR_PTR(err); + } /* * filemap_add_folio locks the page, and for mmap * we expect an unlocked page. @@ -1976,9 +2045,12 @@ no_page: if (!folio) return ERR_PTR(-ENOENT); + /* not an uncached lookup, clear uncached if set */ + if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE)) + folio_clear_dropbehind(folio); return folio; } -EXPORT_SYMBOL(__filemap_get_folio); +EXPORT_SYMBOL(__filemap_get_folio_mpol); static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max, xa_mark_t mark) @@ -2047,17 +2119,20 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, if (!folio_batch_add(fbatch, folio)) break; } - rcu_read_unlock(); if (folio_batch_count(fbatch)) { - unsigned long nr = 1; + unsigned long nr; int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; if (!xa_is_value(folio)) nr = folio_nr_pages(folio); - *start = indices[idx] + nr; + else + nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]); + *start = round_down(indices[idx] + nr, nr); } + rcu_read_unlock(); + return folio_batch_count(fbatch); } @@ -2089,10 +2164,17 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { + unsigned long base; + unsigned long nr; + if (!xa_is_value(folio)) { - if (folio->index < *start) + nr = folio_nr_pages(folio); + base = folio->index; + /* Omit large folio which begins before the start */ + if (base < *start) goto put; - if (folio_next_index(folio) - 1 > end) + /* Omit large folio which extends beyond the end */ + if (base + nr - 1 > end) goto put; if (!folio_trylock(folio)) goto put; @@ -2101,7 +2183,19 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, goto unlock; VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), folio); + } else { + nr = 1 << xas_get_order(&xas); + base = xas.xa_index & ~(nr - 1); + /* Omit order>0 value which begins before the start */ + if (base < *start) + continue; + /* Omit order>0 value which extends beyond the end */ + if (base + nr - 1 > end) + break; } + + /* Update start now so that last update is correct on return */ + *start = base + nr; indices[fbatch->nr] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; @@ -2113,15 +2207,6 @@ put: } rcu_read_unlock(); - if (folio_batch_count(fbatch)) { - unsigned long nr = 1; - int idx = folio_batch_count(fbatch) - 1; - - folio = fbatch->folios[idx]; - if (!xa_is_value(folio)) - nr = folio_nr_pages(folio); - *start = indices[idx] + nr; - } return folio_batch_count(fbatch); } @@ -2181,6 +2266,10 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, if (xa_is_value(folio)) goto update_start; + /* If we landed in the middle of a THP, continue at its end. 
*/ + if (xa_is_sibling(folio)) + goto update_start; + if (!folio_try_get(folio)) goto retry; @@ -2192,6 +2281,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, *start = folio->index + nr; goto out; } + xas_advance(&xas, folio_next_index(folio) - 1); continue; put_folio: folio_put(folio); @@ -2270,6 +2360,64 @@ out: } EXPORT_SYMBOL(filemap_get_folios_tag); +/** + * filemap_get_folios_dirty - Get a batch of dirty folios + * @mapping: The address_space to search + * @start: The starting folio index + * @end: The final folio index (inclusive) + * @fbatch: The batch to fill + * + * filemap_get_folios_dirty() works exactly like filemap_get_folios(), except + * the returned folios are presumed to be dirty or undergoing writeback. Dirty + * state is presumed because we don't block on folio lock nor want to miss + * folios. Callers that need to can recheck state upon locking the folio. + * + * This may not return all dirty folios if the batch gets filled up. + * + * Return: The number of folios found. + * Also update @start to be positioned for traversal of the next folio. + */ +unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start, + pgoff_t end, struct folio_batch *fbatch) +{ + XA_STATE(xas, &mapping->i_pages, *start); + struct folio *folio; + + rcu_read_lock(); + while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { + if (xa_is_value(folio)) + continue; + if (folio_trylock(folio)) { + bool clean = !folio_test_dirty(folio) && + !folio_test_writeback(folio); + folio_unlock(folio); + if (clean) { + folio_put(folio); + continue; + } + } + if (!folio_batch_add(fbatch, folio)) { + unsigned long nr = folio_nr_pages(folio); + *start = folio->index + nr; + goto out; + } + } + /* + * We come here when there is no folio beyond @end. We take care to not + * overflow the index @start as it confuses some of the callers. This + * breaks the iteration when there is a folio at index -1 but that is + * already broke anyway. + */ + if (end == (pgoff_t)-1) + *start = (pgoff_t)-1; + else + *start = end + 1; +out: + rcu_read_unlock(); + + return folio_batch_count(fbatch); +} + /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: @@ -2342,13 +2490,6 @@ static int filemap_read_folio(struct file *file, filler_t filler, unsigned long pflags; int error; - /* - * A previous I/O error may have been due to temporary failures, - * eg. multipath errors. PG_error will be set again if read_folio - * fails. - */ - folio_clear_error(folio); - /* Start the actual read. The read will unlock the page. 
*/ if (unlikely(workingset)) psi_memstall_enter(&pflags); @@ -2389,6 +2530,9 @@ static bool filemap_range_uptodate(struct address_space *mapping, pos -= folio_pos(folio); } + if (pos == 0 && count >= folio_size(folio)) + return false; + return mapping->a_ops->is_partially_uptodate(folio, pos, count); } @@ -2448,16 +2592,22 @@ unlock_mapping: return error; } -static int filemap_create_folio(struct file *file, - struct address_space *mapping, pgoff_t index, - struct folio_batch *fbatch) +static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch) { + struct address_space *mapping = iocb->ki_filp->f_mapping; struct folio *folio; int error; + unsigned int min_order = mapping_min_folio_order(mapping); + pgoff_t index; + + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) + return -EAGAIN; - folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0); + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order, NULL); if (!folio) return -ENOMEM; + if (iocb->ki_flags & IOCB_DONTCACHE) + __folio_set_dropbehind(folio); /* * Protect against truncate / hole punch. Grabbing invalidate_lock @@ -2473,6 +2623,7 @@ static int filemap_create_folio(struct file *file, * well to keep locking rules simple. */ filemap_invalidate_lock_shared(mapping); + index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order; error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) @@ -2480,7 +2631,8 @@ static int filemap_create_folio(struct file *file, if (error) goto error; - error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); + error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, + folio); if (error) goto error; @@ -2501,6 +2653,8 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file, if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; page_cache_async_ra(&ractl, folio, last_index - folio->index); return 0; } @@ -2510,31 +2664,36 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count, { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; - struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; struct folio *folio; + unsigned int flags; int err = 0; - /* "last_index" is the index of the page beyond the end of the read */ - last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE); + /* "last_index" is the index of the folio beyond the end of the read */ + last_index = round_up(iocb->ki_pos + count, + mapping_min_folio_nrbytes(mapping)) >> PAGE_SHIFT; retry: if (fatal_signal_pending(current)) return -EINTR; filemap_get_read_batch(mapping, index, last_index - 1, fbatch); if (!folio_batch_count(fbatch)) { + DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index); + if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; - page_cache_sync_readahead(mapping, ra, filp, index, - last_index - index); + if (iocb->ki_flags & IOCB_NOWAIT) + flags = memalloc_noio_save(); + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; + page_cache_sync_ra(&ractl, last_index - index); + if (iocb->ki_flags & IOCB_NOWAIT) + memalloc_noio_restore(flags); filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { - if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) - return -EAGAIN; - err = filemap_create_folio(filp, mapping, - iocb->ki_pos >> PAGE_SHIFT, fbatch); + err = filemap_create_folio(iocb, fbatch); if (err == 
AOP_TRUNCATED_PAGE) goto retry; return err; @@ -2547,15 +2706,17 @@ retry: goto err; } if (!folio_test_uptodate(folio)) { - if ((iocb->ki_flags & IOCB_WAITQ) && - folio_batch_count(fbatch) > 1) - iocb->ki_flags |= IOCB_NOWAIT; + if (folio_batch_count(fbatch) > 1) { + err = -EAGAIN; + goto err; + } err = filemap_update_page(iocb, mapping, count, folio, need_uptodate); if (err) goto err; } + trace_mm_filemap_get_pages(mapping, index, last_index - 1); return 0; err: if (err < 0) @@ -2574,6 +2735,18 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) return (pos1 >> shift == pos2 >> shift); } +static void filemap_end_dropbehind_read(struct folio *folio) +{ + if (!folio_test_dropbehind(folio)) + return; + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return; + if (folio_trylock(folio)) { + filemap_end_dropbehind(folio); + folio_unlock(folio); + } +} + /** * filemap_read - Read data from the page cache. * @iocb: The iocb to read. @@ -2600,12 +2773,14 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, loff_t isize, end_offset; loff_t last_pos = ra->prev_pos; + if (unlikely(iocb->ki_pos < 0)) + return -EINVAL; if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) return 0; if (unlikely(!iov_iter_count(iter))) return 0; - iov_iter_truncate(iter, inode->i_sb->s_maxbytes); + iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); folio_batch_init(&fbatch); do { @@ -2685,8 +2860,12 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, } } put_folios: - for (i = 0; i < folio_batch_count(&fbatch); i++) - folio_put(fbatch.folios[i]); + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + + filemap_end_dropbehind_read(folio); + folio_put(folio); + } folio_batch_init(&fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); @@ -2712,14 +2891,12 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count) } EXPORT_SYMBOL_GPL(kiocb_write_and_wait); -int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) +int filemap_invalidate_pages(struct address_space *mapping, + loff_t pos, loff_t end, bool nowait) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - loff_t pos = iocb->ki_pos; - loff_t end = pos + count - 1; int ret; - if (iocb->ki_flags & IOCB_NOWAIT) { + if (nowait) { /* we could block if there are any pages in the range */ if (filemap_range_has_page(mapping, pos, end)) return -EAGAIN; @@ -2738,6 +2915,15 @@ int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); } + +int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + + return filemap_invalidate_pages(mapping, iocb->ki_pos, + iocb->ki_pos + count - 1, + iocb->ki_flags & IOCB_NOWAIT); +} EXPORT_SYMBOL_GPL(kiocb_invalidate_pages); /** @@ -2820,8 +3006,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, size = min(size, folio_size(folio) - offset); offset %= PAGE_SIZE; - while (spliced < size && - !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + while (spliced < size && !pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced); @@ -2878,7 +3063,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, iocb.ki_pos = *ppos; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); 
+ used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); @@ -2938,7 +3123,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (pipe_is_full(pipe)) goto out; } @@ -2977,7 +3162,7 @@ static inline loff_t folio_seek_hole_data(struct xa_state *xas, if (ops->is_partially_uptodate(folio, offset, bsz) == seek_data) break; - start = (start + bsz) & ~(bsz - 1); + start = (start + bsz) & ~((u64)bsz - 1); offset += bsz; } while (offset < folio_size(folio)); unlock: @@ -2989,7 +3174,7 @@ unlock: static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio) { if (xa_is_value(folio)) - return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index); + return PAGE_SIZE << xas_get_order(xas); return folio_size(folio); } @@ -3119,12 +3304,48 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff); struct file *fpin = NULL; - unsigned long vm_flags = vmf->vma->vm_flags; - unsigned int mmap_miss; + vm_flags_t vm_flags = vmf->vma->vm_flags; + bool force_thp_readahead = false; + unsigned short mmap_miss; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Use the readahead code, even if readahead is disabled */ - if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) { + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + (vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) + force_thp_readahead = true; + + if (!force_thp_readahead) { + /* + * If we don't want any read-ahead, don't bother. + * VM_EXEC case below is already intended for random access. + */ + if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ) + return fpin; + + if (!ra->ra_pages) + return fpin; + + if (vm_flags & VM_SEQ_READ) { + fpin = maybe_unlock_mmap_for_io(vmf, fpin); + page_cache_sync_ra(&ractl, ra->ra_pages); + return fpin; + } + } + + if (!(vm_flags & VM_SEQ_READ)) { + /* Avoid banging the cache line if not needed */ + mmap_miss = READ_ONCE(ra->mmap_miss); + if (mmap_miss < MMAP_LOTSAMISS * 10) + WRITE_ONCE(ra->mmap_miss, ++mmap_miss); + + /* + * Do we miss much more than hit in this file? If so, + * stop bothering with read-ahead. It will only hurt. + */ + if (mmap_miss > MMAP_LOTSAMISS) + return fpin; + } + + if (force_thp_readahead) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); ra->size = HPAGE_PMD_NR; @@ -3135,44 +3356,48 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) if (!(vm_flags & VM_RAND_READ)) ra->size *= 2; ra->async_size = HPAGE_PMD_NR; - page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER); + ra->order = HPAGE_PMD_ORDER; + page_cache_ra_order(&ractl, ra); return fpin; } -#endif - - /* If we don't want any read-ahead, don't bother */ - if (vm_flags & VM_RAND_READ) - return fpin; - if (!ra->ra_pages) - return fpin; - if (vm_flags & VM_SEQ_READ) { - fpin = maybe_unlock_mmap_for_io(vmf, fpin); - page_cache_sync_ra(&ractl, ra->ra_pages); - return fpin; + if (vm_flags & VM_EXEC) { + /* + * Allow arch to request a preferred minimum folio order for + * executable memory. This can often be beneficial to + * performance if (e.g.) arm64 can contpte-map the folio. + * Executable memory rarely benefits from readahead, due to its + * random access nature, so set async_size to 0. 
+ * + * Limit to the boundaries of the VMA to avoid reading in any + * pad that might exist between sections, which would be a waste + * of memory. + */ + struct vm_area_struct *vma = vmf->vma; + unsigned long start = vma->vm_pgoff; + unsigned long end = start + vma_pages(vma); + unsigned long ra_end; + + ra->order = exec_folio_order(); + ra->start = round_down(vmf->pgoff, 1UL << ra->order); + ra->start = max(ra->start, start); + ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order); + ra_end = min(ra_end, end); + ra->size = ra_end - ra->start; + ra->async_size = 0; + } else { + /* + * mmap read-around + */ + ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); + ra->size = ra->ra_pages; + ra->async_size = ra->ra_pages / 4; + ra->order = 0; } - /* Avoid banging the cache line if not needed */ - mmap_miss = READ_ONCE(ra->mmap_miss); - if (mmap_miss < MMAP_LOTSAMISS * 10) - WRITE_ONCE(ra->mmap_miss, ++mmap_miss); - - /* - * Do we miss much more than hit in this file? If so, - * stop bothering with read-ahead. It will only hurt. - */ - if (mmap_miss > MMAP_LOTSAMISS) - return fpin; - - /* - * mmap read-around - */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); - ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); - ra->size = ra->ra_pages; - ra->async_size = ra->ra_pages / 4; ractl._index = ra->start; - page_cache_ra_order(&ractl, ra, 0); + page_cache_ra_order(&ractl, ra); return fpin; } @@ -3188,15 +3413,23 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, struct file_ra_state *ra = &file->f_ra; DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff); struct file *fpin = NULL; - unsigned int mmap_miss; + unsigned short mmap_miss; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; - mmap_miss = READ_ONCE(ra->mmap_miss); - if (mmap_miss) - WRITE_ONCE(ra->mmap_miss, --mmap_miss); + /* + * If the folio is locked, we're likely racing against another fault. + * Don't touch the mmap_miss counter to avoid decreasing it multiple + * times for a single folio and break the balance with mmap_miss + * increase in do_sync_mmap_readahead(). + */ + if (likely(!folio_test_locked(folio))) { + mmap_miss = READ_ONCE(ra->mmap_miss); + if (mmap_miss) + WRITE_ONCE(ra->mmap_miss, --mmap_miss); + } if (folio_test_readahead(folio)) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); @@ -3231,8 +3464,8 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf) if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return 0; - ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, vmf->address, - &vmf->ptl); + ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); if (unlikely(!ptep)) return VM_FAULT_NOPAGE; @@ -3287,6 +3520,8 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) if (unlikely(index >= max_idx)) return VM_FAULT_SIGBUS; + trace_mm_filemap_fault(mapping, index); + /* * Do we have something in the page cache already? */ @@ -3444,7 +3679,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) { struct page *page = folio_file_page(folio, start); - vm_fault_t ret = do_set_pmd(vmf, page); + vm_fault_t ret = do_set_pmd(vmf, folio, page); if (!ret) { /* The page is mapped successfully, reference consumed. 
*/ folio_unlock(folio); @@ -3471,10 +3706,10 @@ static struct folio *next_uptodate_folio(struct xa_state *xas, continue; if (xa_is_value(folio)) continue; - if (folio_test_locked(folio)) - continue; if (!folio_try_get(folio)) continue; + if (folio_test_locked(folio)) + goto skip; /* Has the page moved or been split? */ if (unlikely(folio != xas_reload(xas))) goto skip; @@ -3506,12 +3741,34 @@ skip: static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, - unsigned long *rss, unsigned int *mmap_miss) + unsigned long *rss, unsigned short *mmap_miss, + pgoff_t file_end) { + struct address_space *mapping = folio->mapping; + unsigned int ref_from_caller = 1; vm_fault_t ret = 0; struct page *page = folio_page(folio, start); unsigned int count = 0; pte_t *old_ptep = vmf->pte; + unsigned long addr0; + + /* + * Map the large folio fully where possible: + * + * - The folio is fully within size of the file or belong + * to shmem/tmpfs; + * - The folio doesn't cross VMA boundary; + * - The folio doesn't cross page table boundary; + */ + addr0 = addr - start * PAGE_SIZE; + if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) && + folio_within_vma(folio, vmf->vma) && + (addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) { + vmf->pte -= start; + page -= start; + addr = addr0; + nr_pages = folio_nr_pages(folio); + } do { if (PageHWPoison(page + count)) @@ -3541,7 +3798,8 @@ skip: if (count) { set_pte_range(vmf, folio, page, count, addr); *rss += count; - folio_ref_add(folio, count); + folio_ref_add(folio, count - ref_from_caller); + ref_from_caller = 0; if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; } @@ -3556,25 +3814,29 @@ skip: if (count) { set_pte_range(vmf, folio, page, count, addr); *rss += count; - folio_ref_add(folio, count); + folio_ref_add(folio, count - ref_from_caller); + ref_from_caller = 0; if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; } vmf->pte = old_ptep; + if (ref_from_caller) + /* Locked folios cannot get truncated. */ + folio_ref_dec(folio); return ret; } static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, struct folio *folio, unsigned long addr, - unsigned long *rss, unsigned int *mmap_miss) + unsigned long *rss, unsigned short *mmap_miss) { vm_fault_t ret = 0; struct page *page = &folio->page; if (PageHWPoison(page)) - return ret; + goto out; /* See comment of filemap_map_folio_range() */ if (!folio_test_workingset(folio)) @@ -3586,15 +3848,18 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, * the fault-around logic. */ if (!pte_none(ptep_get(vmf->pte))) - return ret; + goto out; if (vmf->address == addr) ret = VM_FAULT_NOPAGE; set_pte_range(vmf, folio, page, 1, addr); (*rss)++; - folio_ref_inc(folio); + return ret; +out: + /* Locked folios cannot get truncated. 
*/ + folio_ref_dec(folio); return ret; } @@ -3604,20 +3869,32 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, struct vm_area_struct *vma = vmf->vma; struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; - pgoff_t last_pgoff = start_pgoff; + pgoff_t file_end, last_pgoff = start_pgoff; unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct folio *folio; vm_fault_t ret = 0; unsigned long rss = 0; - unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type; + unsigned int nr_pages = 0, folio_type; + unsigned short mmap_miss = 0, mmap_miss_saved; rcu_read_lock(); folio = next_uptodate_folio(&xas, mapping, end_pgoff); if (!folio) goto out; - if (filemap_map_pmd(vmf, folio, start_pgoff)) { + file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; + end_pgoff = min(end_pgoff, file_end); + + /* + * Do not allow to map with PMD across i_size to preserve + * SIGBUS semantics. + * + * Make an exception for shmem/tmpfs that for long time + * intentionally mapped with PMDs across i_size. + */ + if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) && + filemap_map_pmd(vmf, folio, start_pgoff)) { ret = VM_FAULT_NOPAGE; goto out; } @@ -3646,13 +3923,13 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, else ret |= filemap_map_folio_range(vmf, folio, xas.xa_index - folio->index, addr, - nr_pages, &rss, &mmap_miss); + nr_pages, &rss, &mmap_miss, file_end); folio_unlock(folio); - folio_put(folio); } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); add_mm_counter(vma->vm_mm, folio_type, rss); pte_unmap_unlock(vmf->pte, vmf->ptl); + trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff); out: rcu_read_unlock(); @@ -3711,6 +3988,18 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } +int generic_file_mmap_prepare(struct vm_area_desc *desc) +{ + struct file *file = desc->file; + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->read_folio) + return -ENOEXEC; + file_accessed(file); + desc->vm_ops = &generic_file_vm_ops; + return 0; +} + /* * This is for filesystems which do not implement ->writepage. 
*/ @@ -3720,6 +4009,13 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; return generic_file_mmap(file, vma); } + +int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc) +{ + if (is_shared_maywrite(desc->vm_flags)) + return -EINVAL; + return generic_file_mmap_prepare(desc); +} #else vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { @@ -3729,15 +4025,25 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } +int generic_file_mmap_prepare(struct vm_area_desc *desc) +{ + return -ENOSYS; +} int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } +int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc) +{ + return -ENOSYS; +} #endif /* CONFIG_MMU */ EXPORT_SYMBOL(filemap_page_mkwrite); EXPORT_SYMBOL(generic_file_mmap); +EXPORT_SYMBOL(generic_file_mmap_prepare); EXPORT_SYMBOL(generic_file_readonly_mmap); +EXPORT_SYMBOL(generic_file_readonly_mmap_prepare); static struct folio *do_read_cache_folio(struct address_space *mapping, pgoff_t index, filler_t filler, struct file *file, gfp_t gfp) @@ -3750,9 +4056,10 @@ static struct folio *do_read_cache_folio(struct address_space *mapping, repeat: folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) { - folio = filemap_alloc_folio(gfp, 0); + folio = filemap_alloc_folio(gfp, mapping_min_folio_order(mapping), NULL); if (!folio) return ERR_PTR(-ENOMEM); + index = mapping_align_index(mapping, index); err = filemap_add_folio(mapping, folio, index, gfp); if (unlikely(err)) { folio_put(folio); @@ -3982,50 +4289,51 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) loff_t pos = iocb->ki_pos; struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping->a_ops; + size_t chunk = mapping_max_folio_size(mapping); long status = 0; ssize_t written = 0; do { - struct page *page; - unsigned long offset; /* Offset into pagecache page */ - unsigned long bytes; /* Bytes to write to page */ + struct folio *folio; + size_t offset; /* Offset into folio */ + size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ void *fsdata = NULL; - offset = (pos & (PAGE_SIZE - 1)); - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_count(i)); - -again: - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - */ - if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { - status = -EFAULT; - break; - } + bytes = iov_iter_count(i); +retry: + offset = pos & (chunk - 1); + bytes = min(chunk - offset, bytes); + balance_dirty_pages_ratelimited(mapping); if (fatal_signal_pending(current)) { status = -EINTR; break; } - status = a_ops->write_begin(file, mapping, pos, bytes, - &page, &fsdata); + status = a_ops->write_begin(iocb, mapping, pos, bytes, + &folio, &fsdata); if (unlikely(status < 0)) break; + offset = offset_in_folio(folio, pos); + if (bytes > folio_size(folio) - offset) + bytes = folio_size(folio) - offset; + if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + flush_dcache_folio(folio); - copied = copy_page_from_iter_atomic(page, offset, bytes, i); - flush_dcache_page(page); + /* + * Faults here on mmap()s can recurse into arbitrary + * filesystem code. Lots of locks are held that can + * deadlock. 
Use an atomic copy to avoid deadlocking + * in page fault handling. + */ + copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); + flush_dcache_folio(folio); - status = a_ops->write_end(file, mapping, pos, bytes, copied, - page, fsdata); + status = a_ops->write_end(iocb, mapping, pos, bytes, copied, + folio, fsdata); if (unlikely(status != copied)) { iov_iter_revert(i, copied - max(status, 0L)); if (unlikely(status < 0)) @@ -4040,14 +4348,26 @@ again: * halfway through, might be a race with munmap, * might be severe memory pressure. */ - if (copied) + if (chunk > PAGE_SIZE) + chunk /= 2; + if (copied) { bytes = copied; - goto again; - } - pos += status; - written += status; + goto retry; + } - balance_dirty_pages_ratelimited(mapping); + /* + * 'folio' is now unlocked and faults on it can be + * handled. Ensure forward progress by trying to + * fault it in now. + */ + if (fault_in_iov_iter_readable(i, bytes) == bytes) { + status = -EFAULT; + break; + } + } else { + pos += status; + written += status; + } } while (iov_iter_count(i)); if (!written) @@ -4209,19 +4529,11 @@ int filemap_invalidate_inode(struct inode *inode, bool flush, unmap_mapping_pages(mapping, first, nr, false); /* Write back the data if we're asked to. */ - if (flush) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = start, - .range_end = end, - }; - - filemap_fdatawrite_wbc(mapping, &wbc); - } + if (flush) + filemap_fdatawrite_range(mapping, start, end); /* Wait for writeback to complete on all folios and discard. */ - truncate_inode_pages_range(mapping, start, end); + invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE); unlock: filemap_invalidate_unlock(mapping); @@ -4272,7 +4584,7 @@ static void filemap_cachestat(struct address_space *mapping, if (xas_retry(&xas, folio)) continue; - order = xa_get_order(xas.xa, xas.xa_index); + order = xas_get_order(&xas); nr_pages = 1 << order; folio_first_index = round_down(xas.xa_index, 1 << order); folio_last_index = folio_first_index + nr_pages - 1; @@ -4297,7 +4609,7 @@ static void filemap_cachestat(struct address_space *mapping, swp_entry_t swp = radix_to_swp_entry(folio); /* swapin error results in poisoned entry */ - if (non_swap_entry(swp)) + if (!softleaf_is_swap(swp)) goto resched; /* @@ -4310,7 +4622,7 @@ static void filemap_cachestat(struct address_space *mapping, * invalidation, so there might not be * a shadow in the swapcache (yet). */ - shadow = get_shadow_from_swap_cache(swp); + shadow = swap_cache_get_shadow(swp); if (!shadow) goto resched; } @@ -4340,6 +4652,20 @@ resched: } /* + * See mincore: reveal pagecache information only for files + * that the calling process has write access to, or could (if + * tried) open for writing. + */ +static inline bool can_do_cachestat(struct file *f) +{ + if (f->f_mode & FMODE_WRITE) + return true; + if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f))) + return true; + return file_permission(f, MAY_WRITE) == 0; +} + +/* * The cachestat(2) system call. 
* * cachestat() returns the page cache statistics of a file in the @@ -4377,39 +4703,35 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd, struct cachestat_range __user *, cstat_range, struct cachestat __user *, cstat, unsigned int, flags) { - struct fd f = fdget(fd); + CLASS(fd, f)(fd); struct address_space *mapping; struct cachestat_range csr; struct cachestat cs; pgoff_t first_index, last_index; - if (!f.file) + if (fd_empty(f)) return -EBADF; if (copy_from_user(&csr, cstat_range, - sizeof(struct cachestat_range))) { - fdput(f); + sizeof(struct cachestat_range))) return -EFAULT; - } /* hugetlbfs is not supported */ - if (is_file_hugepages(f.file)) { - fdput(f); + if (is_file_hugepages(fd_file(f))) return -EOPNOTSUPP; - } - if (flags != 0) { - fdput(f); + if (!can_do_cachestat(fd_file(f))) + return -EPERM; + + if (flags != 0) return -EINVAL; - } first_index = csr.off >> PAGE_SHIFT; last_index = csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT; memset(&cs, 0, sizeof(struct cachestat)); - mapping = f.file->f_mapping; + mapping = fd_file(f)->f_mapping; filemap_cachestat(mapping, first_index, last_index, &cs); - fdput(f); if (copy_to_user(cstat, &cs, sizeof(struct cachestat))) return -EFAULT; |
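Usage sketch (editorial note, not part of the patch): the hunks above fold __filemap_fdatawrite_range() and filemap_fdatawrite_wbc() into an internal filemap_writeback() helper, leaving filemap_fdatawrite_range() as the WB_SYNC_ALL entry point and adding filemap_flush_range()/filemap_flush_nr() for WB_SYNC_NONE writeback. A minimal caller built only on the exported signatures visible in this diff might look like the following; the function name example_sync_range() is made up for illustration, and filemap_fdatawait_range() predates this patch and is unchanged by it.

#include <linux/fs.h>
#include <linux/pagemap.h>

static int example_sync_range(struct address_space *mapping,
			      loff_t start, loff_t end)
{
	int err;

	/* Start data-integrity (WB_SYNC_ALL) writeback on the byte range. */
	err = filemap_fdatawrite_range(mapping, start, end);
	if (err)
		return err;

	/* Wait for the writeback started above to complete. */
	return filemap_fdatawait_range(mapping, start, end);
}

For best-effort cleaning writeback, the non-integrity counterpart is filemap_flush_range(mapping, start, end); filemap_flush_nr() additionally caps and reports the number of pages written, but the patch's own comment reserves it for the existing btrfs caller.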
