diff options
Diffstat (limited to 'mm/vmscan.c')
| -rw-r--r-- | mm/vmscan.c | 8583 |
1 files changed, 6362 insertions, 2221 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index 2cff0d491c6d..900c74b6aa62 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1,6 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/mm/vmscan.c - * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * * Swap reorganised 29.12.95, Stephen Tweedie. @@ -11,7 +10,10 @@ * Multiqueue VM started 5.8.00, Rik van Riel. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/mm.h> +#include <linux/sched/mm.h> #include <linux/module.h> #include <linux/gfp.h> #include <linux/kernel_stat.h> @@ -24,8 +26,7 @@ #include <linux/file.h> #include <linux/writeback.h> #include <linux/blkdev.h> -#include <linux/buffer_head.h> /* for try_to_release_page(), - buffer_heads_over_limit */ +#include <linux/buffer_head.h> /* for buffer_heads_over_limit */ #include <linux/mm_inline.h> #include <linux/backing-dev.h> #include <linux/rmap.h> @@ -34,1255 +35,1831 @@ #include <linux/cpuset.h> #include <linux/compaction.h> #include <linux/notifier.h> -#include <linux/rwsem.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/memcontrol.h> +#include <linux/migrate.h> #include <linux/delayacct.h> #include <linux/sysctl.h> +#include <linux/memory-tiers.h> #include <linux/oom.h> +#include <linux/pagevec.h> #include <linux/prefetch.h> +#include <linux/printk.h> +#include <linux/dax.h> +#include <linux/psi.h> +#include <linux/pagewalk.h> +#include <linux/shmem_fs.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/khugepaged.h> +#include <linux/rculist_nulls.h> +#include <linux/random.h> +#include <linux/mmu_notifier.h> +#include <linux/parser.h> #include <asm/tlbflush.h> #include <asm/div64.h> #include <linux/swapops.h> +#include <linux/balloon_compaction.h> +#include <linux/sched/sysctl.h> #include "internal.h" +#include "swap.h" #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h> struct scan_control { - /* Incremented by the number of inactive pages that were scanned 
*/ - unsigned long nr_scanned; - - /* Number of pages freed so far during a call to shrink_zones() */ - unsigned long nr_reclaimed; - /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; - unsigned long hibernation_mode; + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; - /* This context's GFP mask */ - gfp_t gfp_mask; + /* + * The memory cgroup that hit its limit and as a result is the + * primary target of this reclaim invocation. + */ + struct mem_cgroup *target_mem_cgroup; - int may_writepage; + /* + * Scan pressure balancing between anon and file LRUs + */ + unsigned long anon_cost; + unsigned long file_cost; - /* Can mapped pages be reclaimed? */ - int may_unmap; + /* Swappiness value for proactive reclaim. Always use sc_swappiness()! */ + int *proactive_swappiness; - /* Can pages be swapped as part of reclaim? */ - int may_swap; + /* Can active folios be deactivated as part of reclaim? */ +#define DEACTIVATE_ANON 1 +#define DEACTIVATE_FILE 2 + unsigned int may_deactivate:2; + unsigned int force_deactivate:1; + unsigned int skipped_deactivate:1; - int order; + /* Writepage batching in laptop mode; RECLAIM_WRITE */ + unsigned int may_writepage:1; - /* Scan (total_size >> priority) pages at once */ - int priority; + /* Can mapped folios be reclaimed? */ + unsigned int may_unmap:1; - /* - * The memory cgroup that hit its limit and as a result is the - * primary target of this reclaim invocation. - */ - struct mem_cgroup *target_mem_cgroup; + /* Can folios be swapped as part of reclaim? */ + unsigned int may_swap:1; + + /* Not allow cache_trim_mode to be turned on as part of reclaim? */ + unsigned int no_cache_trim_mode:1; + + /* Has cache_trim_mode failed at least once? */ + unsigned int cache_trim_mode_failed:1; + + /* Proactive reclaim invoked by userspace */ + unsigned int proactive:1; /* - * Nodemask of nodes allowed by the caller. If NULL, all nodes - * are scanned. 
+ * Cgroup memory below memory.low is protected as long as we + * don't threaten to OOM. If any cgroup is reclaimed at + * reduced force or passed over entirely due to its memory.low + * setting (memcg_low_skipped), and nothing is reclaimed as a + * result, then go back for one more cycle that reclaims the protected + * memory (memcg_low_reclaim) to avert OOM. */ - nodemask_t *nodemask; -}; + unsigned int memcg_low_reclaim:1; + unsigned int memcg_low_skipped:1; -#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) + /* Shared cgroup tree walk failed, rescan the whole tree */ + unsigned int memcg_full_walk:1; -#ifdef ARCH_HAS_PREFETCH -#define prefetch_prev_lru_page(_page, _base, _field) \ - do { \ - if ((_page)->lru.prev != _base) { \ - struct page *prev; \ - \ - prev = lru_to_page(&(_page->lru)); \ - prefetch(&prev->_field); \ - } \ - } while (0) -#else -#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) -#endif + unsigned int hibernation_mode:1; + + /* One of the zones is ready for compaction */ + unsigned int compaction_ready:1; + + /* There is easily reclaimable cold cache in the current node */ + unsigned int cache_trim_mode:1; + + /* The file folios on the current node are dangerously low */ + unsigned int file_is_tiny:1; + + /* Always discard instead of demoting to lower tier memory */ + unsigned int no_demotion:1; + + /* Allocation order */ + s8 order; + + /* Scan (total_size >> priority) pages at once */ + s8 priority; + + /* The highest zone to isolate folios for reclaim from */ + s8 reclaim_idx; + + /* This context's GFP mask */ + gfp_t gfp_mask; + + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Number of pages freed so far during a call to shrink_zones() */ + unsigned long nr_reclaimed; + + struct { + unsigned int dirty; + unsigned int unqueued_dirty; + unsigned int congested; + unsigned int writeback; + unsigned int immediate; + unsigned int file_taken; + 
unsigned int taken; + } nr; + + /* for recording the reclaimed slab by now */ + struct reclaim_state reclaim_state; +}; #ifdef ARCH_HAS_PREFETCHW -#define prefetchw_prev_lru_page(_page, _base, _field) \ +#define prefetchw_prev_lru_folio(_folio, _base, _field) \ do { \ - if ((_page)->lru.prev != _base) { \ - struct page *prev; \ + if ((_folio)->lru.prev != _base) { \ + struct folio *prev; \ \ - prev = lru_to_page(&(_page->lru)); \ + prev = lru_to_folio(&(_folio->lru)); \ prefetchw(&prev->_field); \ } \ } while (0) #else -#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) +#define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) #endif /* - * From 0 .. 100. Higher means more swappy. + * From 0 .. MAX_SWAPPINESS. Higher means more swappy. */ int vm_swappiness = 60; -unsigned long vm_total_pages; /* The total number of pages which the VM controls */ - -static LIST_HEAD(shrinker_list); -static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG -static bool global_reclaim(struct scan_control *sc) + +/* Returns true for reclaim through cgroup limits or cgroup interfaces. */ +static bool cgroup_reclaim(struct scan_control *sc) { - return !sc->target_mem_cgroup; + return sc->target_mem_cgroup; } -#else -static bool global_reclaim(struct scan_control *sc) + +/* + * Returns true for reclaim on the root cgroup. This is true for direct + * allocator reclaim and reclaim through cgroup interfaces on the root cgroup. + */ +static bool root_reclaim(struct scan_control *sc) { - return true; + return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); } -#endif -static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) +/** + * writeback_throttling_sane - is the usual dirty throttling mechanism available? 
+ * @sc: scan_control in question + * + * The normal page dirty throttling mechanism in balance_dirty_pages() is + * completely broken with the legacy memcg and direct stalling in + * shrink_folio_list() is used for throttling instead, which lacks all the + * niceties such as fairness, adaptive pausing, bandwidth proportional + * allocation and configurability. + * + * This function tests whether the vmscan currently in progress can assume + * that the normal dirty throttling mechanism is operational. + */ +static bool writeback_throttling_sane(struct scan_control *sc) { - if (!mem_cgroup_disabled()) - return mem_cgroup_get_lru_size(lruvec, lru); + if (!cgroup_reclaim(sc)) + return true; +#ifdef CONFIG_CGROUP_WRITEBACK + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return true; +#endif + return false; +} - return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); +static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) +{ + if (sc->proactive && sc->proactive_swappiness) + return *sc->proactive_swappiness; + return mem_cgroup_swappiness(memcg); +} +#else +static bool cgroup_reclaim(struct scan_control *sc) +{ + return false; } -/* - * Add a shrinker callback to be called from the vm - */ -void register_shrinker(struct shrinker *shrinker) +static bool root_reclaim(struct scan_control *sc) { - atomic_long_set(&shrinker->nr_in_batch, 0); - down_write(&shrinker_rwsem); - list_add_tail(&shrinker->list, &shrinker_list); - up_write(&shrinker_rwsem); + return true; } -EXPORT_SYMBOL(register_shrinker); -/* - * Remove one - */ -void unregister_shrinker(struct shrinker *shrinker) +static bool writeback_throttling_sane(struct scan_control *sc) { - down_write(&shrinker_rwsem); - list_del(&shrinker->list); - up_write(&shrinker_rwsem); + return true; } -EXPORT_SYMBOL(unregister_shrinker); -static inline int do_shrinker_shrink(struct shrinker *shrinker, - struct shrink_control *sc, - unsigned long nr_to_scan) +static int sc_swappiness(struct 
scan_control *sc, struct mem_cgroup *memcg) { - sc->nr_to_scan = nr_to_scan; - return (*shrinker->shrink)(shrinker, sc); + return READ_ONCE(vm_swappiness); } +#endif -#define SHRINK_BATCH 128 -/* - * Call the shrink functions to age shrinkable caches - * - * Here we assume it costs one seek to replace a lru page and that it also - * takes a seek to recreate a cache object. With this in mind we age equal - * percentages of the lru and ageable caches. This should balance the seeks - * generated by these structures. - * - * If the vm encountered mapped pages on the LRU it increase the pressure on - * slab to avoid swapping. - * - * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. +/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to + * and including the specified highidx + * @zone: The current zone in the iterator + * @pgdat: The pgdat which node_zones are being iterated + * @idx: The index variable + * @highidx: The index of the highest zone to return * - * `lru_pages' represents the number of on-LRU pages in all the zones which - * are eligible for the caller's allocation attempt. It is used for balancing - * slab reclaim versus page reclaim. - * - * Returns the number of slab objects which we shrunk. + * This macro iterates through all managed zones up to and including the specified highidx. + * The zone iterator enters an invalid state after macro call and must be reinitialized + * before it can be used again. 
*/ -unsigned long shrink_slab(struct shrink_control *shrink, - unsigned long nr_pages_scanned, - unsigned long lru_pages) +#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \ + for ((idx) = 0, (zone) = (pgdat)->node_zones; \ + (idx) <= (highidx); \ + (idx)++, (zone)++) \ + if (!managed_zone(zone)) \ + continue; \ + else + +static void set_task_reclaim_state(struct task_struct *task, + struct reclaim_state *rs) { - struct shrinker *shrinker; - unsigned long ret = 0; + /* Check for an overwrite */ + WARN_ON_ONCE(rs && task->reclaim_state); - if (nr_pages_scanned == 0) - nr_pages_scanned = SWAP_CLUSTER_MAX; + /* Check for the nulling of an already-nulled member */ + WARN_ON_ONCE(!rs && !task->reclaim_state); - if (!down_read_trylock(&shrinker_rwsem)) { - /* Assume we'll be able to shrink next time */ - ret = 1; - goto out; + task->reclaim_state = rs; +} + +/* + * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to + * scan_control->nr_reclaimed. + */ +static void flush_reclaim_state(struct scan_control *sc) +{ + /* + * Currently, reclaim_state->reclaimed includes three types of pages + * freed outside of vmscan: + * (1) Slab pages. + * (2) Clean file pages from pruned inodes (on highmem systems). + * (3) XFS freed buffer pages. + * + * For all of these cases, we cannot universally link the pages to a + * single memcg. For example, a memcg-aware shrinker can free one object + * charged to the target memcg, causing an entire page to be freed. + * If we count the entire page as reclaimed from the memcg, we end up + * overestimating the reclaimed amount (potentially under-reclaiming). + * + * Only count such pages for global reclaim to prevent under-reclaiming + * from the target memcg; preventing unnecessary retries during memcg + * charging and false positives from proactive reclaim. 
+ * + * For uncommon cases where the freed pages were actually mostly + * charged to the target memcg, we end up underestimating the reclaimed + * amount. This should be fine. The freed pages will be uncharged + * anyway, even if they are not counted here properly, and we will be + * able to make forward progress in charging (which is usually in a + * retry loop). + * + * We can go one step further, and report the uncharged objcg pages in + * memcg reclaim, to make reporting more accurate and reduce + * underestimation, but it's probably not worth the complexity for now. + */ + if (current->reclaim_state && root_reclaim(sc)) { + sc->nr_reclaimed += current->reclaim_state->reclaimed; + current->reclaim_state->reclaimed = 0; } +} - list_for_each_entry(shrinker, &shrinker_list, list) { - unsigned long long delta; - long total_scan; - long max_pass; - int shrink_ret = 0; - long nr; - long new_nr; - long batch_size = shrinker->batch ? shrinker->batch - : SHRINK_BATCH; +static bool can_demote(int nid, struct scan_control *sc, + struct mem_cgroup *memcg) +{ + int demotion_nid; - max_pass = do_shrinker_shrink(shrinker, shrink, 0); - if (max_pass <= 0) - continue; + if (!numa_demotion_enabled) + return false; + if (sc && sc->no_demotion) + return false; - /* - * copy the current shrinker scan count into a local variable - * and zero it so that other concurrent shrinker invocations - * don't also do this scanning work. 
- */ - nr = atomic_long_xchg(&shrinker->nr_in_batch, 0); + demotion_nid = next_demotion_node(nid); + if (demotion_nid == NUMA_NO_NODE) + return false; - total_scan = nr; - delta = (4 * nr_pages_scanned) / shrinker->seeks; - delta *= max_pass; - do_div(delta, lru_pages + 1); - total_scan += delta; - if (total_scan < 0) { - printk(KERN_ERR "shrink_slab: %pF negative objects to " - "delete nr=%ld\n", - shrinker->shrink, total_scan); - total_scan = max_pass; - } + /* If demotion node isn't in the cgroup's mems_allowed, fall back */ + return mem_cgroup_node_allowed(memcg, demotion_nid); +} +static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, + int nid, + struct scan_control *sc) +{ + if (memcg == NULL) { /* - * We need to avoid excessive windup on filesystem shrinkers - * due to large numbers of GFP_NOFS allocations causing the - * shrinkers to return -1 all the time. This results in a large - * nr being built up so when a shrink that can do some work - * comes along it empties the entire cache due to nr >>> - * max_pass. This is bad for sustaining a working set in - * memory. - * - * Hence only allow the shrinker to scan the entire cache when - * a large delta change is calculated directly. + * For non-memcg reclaim, is there + * space in any swap device? */ - if (delta < max_pass / 4) - total_scan = min(total_scan, max_pass / 2); + if (get_nr_swap_pages() > 0) + return true; + } else { + /* Is the memcg below its swap limit? */ + if (mem_cgroup_get_nr_swap_pages(memcg) > 0) + return true; + } - /* - * Avoid risking looping forever due to too large nr value: - * never try to free more than twice the estimate number of - * freeable entries. - */ - if (total_scan > max_pass * 2) - total_scan = max_pass * 2; + /* + * The page can not be swapped. + * + * Can it be reclaimed from this node via demotion? 
+ */ + return can_demote(nid, sc, memcg); +} - trace_mm_shrink_slab_start(shrinker, shrink, nr, - nr_pages_scanned, lru_pages, - max_pass, delta, total_scan); +/* + * This misses isolated folios which are not accounted for to save counters. + * As the data only determines if reclaim or compaction continues, it is + * not expected that isolated folios will be a dominating factor. + */ +unsigned long zone_reclaimable_pages(struct zone *zone) +{ + unsigned long nr; - while (total_scan >= batch_size) { - int nr_before; + nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) + + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE); + if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) + nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); - nr_before = do_shrinker_shrink(shrinker, shrink, 0); - shrink_ret = do_shrinker_shrink(shrinker, shrink, - batch_size); - if (shrink_ret == -1) - break; - if (shrink_ret < nr_before) - ret += nr_before - shrink_ret; - count_vm_events(SLABS_SCANNED, batch_size); - total_scan -= batch_size; + return nr; +} - cond_resched(); - } +/** + * lruvec_lru_size - Returns the number of pages on the given LRU list. + * @lruvec: lru vector + * @lru: lru to use + * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) + */ +static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, + int zone_idx) +{ + unsigned long size = 0; + int zid; + struct zone *zone; - /* - * move the unused scan count back into the shrinker in a - * manner that handles concurrent updates. If we exhausted the - * scan, there is no need to do an update. 
- */ - if (total_scan > 0) - new_nr = atomic_long_add_return(total_scan, - &shrinker->nr_in_batch); + for_each_managed_zone_pgdat(zone, lruvec_pgdat(lruvec), zid, zone_idx) { + if (!mem_cgroup_disabled()) + size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); else - new_nr = atomic_long_read(&shrinker->nr_in_batch); - - trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); + size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru); } - up_read(&shrinker_rwsem); -out: - cond_resched(); - return ret; + return size; } -static inline int is_page_cache_freeable(struct page *page) +static unsigned long drop_slab_node(int nid) { - /* - * A freeable page cache page is referenced only by the caller - * that isolated the page, the page cache radix tree and - * optional buffer heads at page->private. - */ - return page_count(page) - page_has_private(page) == 2; + unsigned long freed = 0; + struct mem_cgroup *memcg = NULL; + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); + + return freed; } -static int may_write_to_queue(struct backing_dev_info *bdi, - struct scan_control *sc) +void drop_slab(void) { - if (current->flags & PF_SWAPWRITE) - return 1; - if (!bdi_write_congested(bdi)) - return 1; - if (bdi == current->backing_dev_info) - return 1; - return 0; + int nid; + int shift = 0; + unsigned long freed; + + do { + freed = 0; + for_each_online_node(nid) { + if (fatal_signal_pending(current)) + return; + + freed += drop_slab_node(nid); + } + } while ((freed >> shift++) > 1); +} + +#define CHECK_RECLAIMER_OFFSET(type) \ + do { \ + BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \ + PGDEMOTE_##type - PGDEMOTE_KSWAPD); \ + BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \ + PGSCAN_##type - PGSCAN_KSWAPD); \ + } while (0) + +static int reclaimer_offset(struct scan_control *sc) +{ + CHECK_RECLAIMER_OFFSET(DIRECT); + CHECK_RECLAIMER_OFFSET(KHUGEPAGED); + 
CHECK_RECLAIMER_OFFSET(PROACTIVE); + + if (current_is_kswapd()) + return 0; + if (current_is_khugepaged()) + return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD; + if (sc->proactive) + return PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD; + return PGSTEAL_DIRECT - PGSTEAL_KSWAPD; } /* - * We detected a synchronous write error writing a page out. Probably + * We detected a synchronous write error writing a folio out. Probably * -ENOSPC. We need to propagate that into the address_space for a subsequent * fsync(), msync() or close(). * * The tricky part is that after writepage we cannot touch the mapping: nothing - * prevents it from being freed up. But we have a ref on the page and once - * that page is locked, the mapping is pinned. + * prevents it from being freed up. But we have a ref on the folio and once + * that folio is locked, the mapping is pinned. * - * We're allowed to run sleeping lock_page() here because we know the caller has + * We're allowed to run sleeping folio_lock() here because we know the caller has * __GFP_FS. */ static void handle_write_error(struct address_space *mapping, - struct page *page, int error) + struct folio *folio, int error) { - lock_page(page); - if (page_mapping(page) == mapping) + folio_lock(folio); + if (folio_mapping(folio) == mapping) mapping_set_error(mapping, error); - unlock_page(page); + folio_unlock(folio); +} + +static bool skip_throttle_noprogress(pg_data_t *pgdat) +{ + int reclaimable = 0, write_pending = 0; + int i; + struct zone *zone; + /* + * If kswapd is disabled, reschedule if necessary but do not + * throttle as the system is likely near OOM. + */ + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) + return true; + + /* + * If there are a lot of dirty/writeback folios then do not + * throttle as throttling will occur when the folios cycle + * towards the end of the LRU if still under writeback. 
+ */ + for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - 1) { + reclaimable += zone_reclaimable_pages(zone); + write_pending += zone_page_state_snapshot(zone, + NR_ZONE_WRITE_PENDING); + } + if (2 * write_pending <= reclaimable) + return true; + + return false; +} + +void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) +{ + wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason]; + long timeout, ret; + DEFINE_WAIT(wait); + + /* + * Do not throttle user workers, kthreads other than kswapd or + * workqueues. They may be required for reclaim to make + * forward progress (e.g. journalling workqueues or kthreads). + */ + if (!current_is_kswapd() && + current->flags & (PF_USER_WORKER|PF_KTHREAD)) { + cond_resched(); + return; + } + + /* + * These figures are pulled out of thin air. + * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many + * parallel reclaimers which is a short-lived event so the timeout is + * short. Failing to make progress or waiting on writeback are + * potentially long-lived events so use a longer timeout. This is shaky + * logic as a failure to make progress could be due to anything from + * writeback to a slow device to excessive referenced folios at the tail + * of the inactive LRU. 
+ */ + switch(reason) { + case VMSCAN_THROTTLE_WRITEBACK: + timeout = HZ/10; + + if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) { + WRITE_ONCE(pgdat->nr_reclaim_start, + node_page_state(pgdat, NR_THROTTLED_WRITTEN)); + } + + break; + case VMSCAN_THROTTLE_CONGESTED: + fallthrough; + case VMSCAN_THROTTLE_NOPROGRESS: + if (skip_throttle_noprogress(pgdat)) { + cond_resched(); + return; + } + + timeout = 1; + + break; + case VMSCAN_THROTTLE_ISOLATED: + timeout = HZ/50; + break; + default: + WARN_ON_ONCE(1); + timeout = HZ; + break; + } + + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + ret = schedule_timeout(timeout); + finish_wait(wqh, &wait); + + if (reason == VMSCAN_THROTTLE_WRITEBACK) + atomic_dec(&pgdat->nr_writeback_throttled); + + trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout), + jiffies_to_usecs(timeout - ret), + reason); +} + +/* + * Account for folios written if tasks are throttled waiting on dirty + * folios to clean. If enough folios have been cleaned since throttling + * started then wakeup the throttled tasks. + */ +void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, + int nr_throttled) +{ + unsigned long nr_written; + + node_stat_add_folio(folio, NR_THROTTLED_WRITTEN); + + /* + * This is an inaccurate read as the per-cpu deltas may not + * be synchronised. However, given that the system is + * writeback throttled, it is not worth taking the penalty + * of getting an accurate count. At worst, the throttle + * timeout guarantees forward progress. 
+ */ + nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) - + READ_ONCE(pgdat->nr_reclaim_start); + + if (nr_written > SWAP_CLUSTER_MAX * nr_throttled) + wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]); } /* possible outcome of pageout() */ typedef enum { - /* failed to write page out, page is locked */ + /* failed to write folio out, folio is locked */ PAGE_KEEP, - /* move page to the active list, page is locked */ + /* move folio to the active list, folio is locked */ PAGE_ACTIVATE, - /* page has been sent to the disk successfully, page is unlocked */ + /* folio has been sent to the disk successfully, folio is unlocked */ PAGE_SUCCESS, - /* page is clean and locked */ + /* folio is clean and locked */ PAGE_CLEAN, } pageout_t; +static pageout_t writeout(struct folio *folio, struct address_space *mapping, + struct swap_iocb **plug, struct list_head *folio_list) +{ + int res; + + folio_set_reclaim(folio); + + /* + * The large shmem folio can be split if CONFIG_THP_SWAP is not enabled + * or we failed to allocate contiguous swap entries, in which case + * the split out folios get added back to folio_list. + */ + if (shmem_mapping(mapping)) + res = shmem_writeout(folio, plug, folio_list); + else + res = swap_writeout(folio, plug); + + if (res < 0) + handle_write_error(mapping, folio, res); + if (res == AOP_WRITEPAGE_ACTIVATE) { + folio_clear_reclaim(folio); + return PAGE_ACTIVATE; + } + + /* synchronous write? */ + if (!folio_test_writeback(folio)) + folio_clear_reclaim(folio); + + trace_mm_vmscan_write_folio(folio); + node_stat_add_folio(folio, NR_VMSCAN_WRITE); + return PAGE_SUCCESS; +} + /* - * pageout is called by shrink_page_list() for each dirty page. - * Calls ->writepage(). + * pageout is called by shrink_folio_list() for each dirty folio. 
*/ -static pageout_t pageout(struct page *page, struct address_space *mapping, - struct scan_control *sc) +static pageout_t pageout(struct folio *folio, struct address_space *mapping, + struct swap_iocb **plug, struct list_head *folio_list) { /* - * If the page is dirty, only perform writeback if that write - * will be non-blocking. To prevent this allocation from being - * stalled by pagecache activity. But note that there may be - * stalls if we need to run get_block(). We could test - * PagePrivate for that. + * We no longer attempt to writeback filesystem folios here, other + * than tmpfs/shmem. That's taken care of in page-writeback. + * If we find a dirty filesystem folio at the end of the LRU list, + * typically that means the filesystem is saturating the storage + * with contiguous writes and telling it to write a folio here + * would only make the situation worse by injecting an element + * of random access. * - * If this process is currently in __generic_file_aio_write() against - * this page's queue, we can perform writeback even if that - * will block. - * - * If the page is swapcache, write it back even if that would + * If the folio is swapcache, write it back even if that would * block, for some throttling. This happens by accident, because * swap_backing_dev_info is bust: it doesn't reflect the * congestion state of the swapdevs. Easy to fix, if needed. + * + * A freeable shmem or swapcache folio is referenced only by the + * caller that isolated the folio and the page cache. */ - if (!is_page_cache_freeable(page)) + if (folio_ref_count(folio) != 1 + folio_nr_pages(folio) || !mapping) return PAGE_KEEP; - if (!mapping) { - /* - * Some data journaling orphaned pages can have - * page->mapping == NULL while being dirty with clean buffers. 
- */ - if (page_has_private(page)) { - if (try_to_free_buffers(page)) { - ClearPageDirty(page); - printk("%s: orphaned page\n", __func__); - return PAGE_CLEAN; - } - } - return PAGE_KEEP; - } - if (mapping->a_ops->writepage == NULL) + if (!shmem_mapping(mapping) && !folio_test_anon(folio)) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info, sc)) - return PAGE_KEEP; - - if (clear_page_dirty_for_io(page)) { - int res; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .nr_to_write = SWAP_CLUSTER_MAX, - .range_start = 0, - .range_end = LLONG_MAX, - .for_reclaim = 1, - }; - - SetPageReclaim(page); - res = mapping->a_ops->writepage(page, &wbc); - if (res < 0) - handle_write_error(mapping, page, res); - if (res == AOP_WRITEPAGE_ACTIVATE) { - ClearPageReclaim(page); - return PAGE_ACTIVATE; - } - - if (!PageWriteback(page)) { - /* synchronous write or broken a_ops? */ - ClearPageReclaim(page); - } - trace_mm_vmscan_writepage(page, trace_reclaim_flags(page)); - inc_zone_page_state(page, NR_VMSCAN_WRITE); - return PAGE_SUCCESS; - } - - return PAGE_CLEAN; + if (!folio_clear_dirty_for_io(folio)) + return PAGE_CLEAN; + return writeout(folio, mapping, plug, folio_list); } /* - * Same as remove_mapping, but if the page is removed from the mapping, it + * Same as remove_mapping, but if the folio is removed from the mapping, it * gets returned with a refcount of 0. 
*/ -static int __remove_mapping(struct address_space *mapping, struct page *page) +static int __remove_mapping(struct address_space *mapping, struct folio *folio, + bool reclaimed, struct mem_cgroup *target_memcg) { - BUG_ON(!PageLocked(page)); - BUG_ON(mapping != page_mapping(page)); + int refcount; + void *shadow = NULL; + struct swap_cluster_info *ci; + + BUG_ON(!folio_test_locked(folio)); + BUG_ON(mapping != folio_mapping(folio)); + + if (folio_test_swapcache(folio)) { + ci = swap_cluster_get_and_lock_irq(folio); + } else { + spin_lock(&mapping->host->i_lock); + xa_lock_irq(&mapping->i_pages); + } - spin_lock_irq(&mapping->tree_lock); /* - * The non racy check for a busy page. + * The non racy check for a busy folio. * * Must be careful with the order of the tests. When someone has - * a ref to the page, it may be possible that they dirty it then - * drop the reference. So if PageDirty is tested before page_count - * here, then the following race may occur: + * a ref to the folio, it may be possible that they dirty it then + * drop the reference. So if the dirty flag is tested before the + * refcount here, then the following race may occur: * * get_user_pages(&page); * [user mapping goes away] * write_to(page); - * !PageDirty(page) [good] - * SetPageDirty(page); - * put_page(page); - * !page_count(page) [good, discard it] + * !folio_test_dirty(folio) [good] + * folio_set_dirty(folio); + * folio_put(folio); + * !refcount(folio) [good, discard it] * * [oops, our write_to data is lost] * * Reversing the order of the tests ensures such a situation cannot - * escape unnoticed. The smp_rmb is needed to ensure the page->flags - * load is not satisfied before that of page->_count. + * escape unnoticed. The smp_rmb is needed to ensure the folio->flags + * load is not satisfied before that of folio->_refcount. * - * Note that if SetPageDirty is always performed via set_page_dirty, - * and thus under tree_lock, then this ordering is not required. 
+ * Note that if the dirty flag is always set via folio_mark_dirty, + * and thus under the i_pages lock, then this ordering is not required. */ - if (!page_freeze_refs(page, 2)) + refcount = 1 + folio_nr_pages(folio); + if (!folio_ref_freeze(folio, refcount)) goto cannot_free; - /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ - if (unlikely(PageDirty(page))) { - page_unfreeze_refs(page, 2); + /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */ + if (unlikely(folio_test_dirty(folio))) { + folio_ref_unfreeze(folio, refcount); goto cannot_free; } - if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page_private(page) }; - __delete_from_swap_cache(page); - spin_unlock_irq(&mapping->tree_lock); - swapcache_free(swap, page); - } else { - void (*freepage)(struct page *); - - freepage = mapping->a_ops->freepage; + if (folio_test_swapcache(folio)) { + swp_entry_t swap = folio->swap; - __delete_from_page_cache(page); - spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); + if (reclaimed && !mapping_exiting(mapping)) + shadow = workingset_eviction(folio, target_memcg); + __swap_cache_del_folio(ci, folio, swap, shadow); + memcg1_swapout(folio, swap); + swap_cluster_unlock_irq(ci); + put_swap_folio(folio, swap); + } else { + void (*free_folio)(struct folio *); - if (freepage != NULL) - freepage(page); + free_folio = mapping->a_ops->free_folio; + /* + * Remember a shadow entry for reclaimed file cache in + * order to detect refaults, thus thrashing, later on. + * + * But don't store shadows in an address space that is + * already exiting. This is not just an optimization, + * inode reclaim needs to empty out the radix tree or + * the nodes are lost. Don't plant shadows behind its + * back. 
+ * + * We also don't store shadows for DAX mappings because the + * only page cache folios found in these are zero pages + * covering holes, and because we don't want to mix DAX + * exceptional entries and shadow exceptional entries in the + * same address_space. + */ + if (reclaimed && folio_is_file_lru(folio) && + !mapping_exiting(mapping) && !dax_mapping(mapping)) + shadow = workingset_eviction(folio, target_memcg); + __filemap_remove_folio(folio, shadow); + xa_unlock_irq(&mapping->i_pages); + if (mapping_shrinkable(mapping)) + inode_lru_list_add(mapping->host); + spin_unlock(&mapping->host->i_lock); + + if (free_folio) + free_folio(folio); } return 1; cannot_free: - spin_unlock_irq(&mapping->tree_lock); + if (folio_test_swapcache(folio)) { + swap_cluster_unlock_irq(ci); + } else { + xa_unlock_irq(&mapping->i_pages); + spin_unlock(&mapping->host->i_lock); + } return 0; } -/* - * Attempt to detach a locked page from its ->mapping. If it is dirty or if - * someone else has a ref on the page, abort and return 0. If it was - * successfully detached, return 1. Assumes the caller has a single ref on - * this page. +/** + * remove_mapping() - Attempt to remove a folio from its mapping. + * @mapping: The address space. + * @folio: The folio to remove. + * + * If the folio is dirty, under writeback or if someone else has a ref + * on it, removal will fail. + * Return: The number of pages removed from the mapping. 0 if the folio + * could not be removed. + * Context: The caller should have a single refcount on the folio and + * hold its lock. */ -int remove_mapping(struct address_space *mapping, struct page *page) +long remove_mapping(struct address_space *mapping, struct folio *folio) { - if (__remove_mapping(mapping, page)) { + if (__remove_mapping(mapping, folio, false, NULL)) { /* - * Unfreezing the refcount with 1 rather than 2 effectively + * Unfreezing the refcount with 1 effectively * drops the pagecache ref for us without requiring another * atomic operation. 
*/ - page_unfreeze_refs(page, 1); - return 1; + folio_ref_unfreeze(folio, 1); + return folio_nr_pages(folio); } return 0; } /** - * putback_lru_page - put previously isolated page onto appropriate LRU list - * @page: page to be put back to appropriate lru list + * folio_putback_lru - Put previously isolated folio onto appropriate LRU list. + * @folio: Folio to be returned to an LRU list. * - * Add previously isolated @page to appropriate LRU list. - * Page may still be unevictable for other reasons. + * Add previously isolated @folio to appropriate LRU list. + * The folio may still be unevictable for other reasons. * - * lru_lock must not be held, interrupts must be enabled. + * Context: lru_lock must not be held, interrupts must be enabled. */ -void putback_lru_page(struct page *page) +void folio_putback_lru(struct folio *folio) { - int lru; - int was_unevictable = PageUnevictable(page); - - VM_BUG_ON(PageLRU(page)); - -redo: - ClearPageUnevictable(page); + folio_add_lru(folio); + folio_put(folio); /* drop ref from isolate */ +} - if (page_evictable(page)) { - /* - * For evictable pages, we can use the cache. - * In event of a race, worst case is we end up with an - * unevictable page on [in]active list. - * We know how to handle that. - */ - lru = page_lru_base_type(page); - lru_cache_add(page); - } else { - /* - * Put unevictable pages directly on zone's unevictable - * list. - */ - lru = LRU_UNEVICTABLE; - add_page_to_unevictable_list(page); - /* - * When racing with an mlock or AS_UNEVICTABLE clearing - * (page is unlocked) make sure that if the other thread - * does not observe our setting of PG_lru and fails - * isolation/check_move_unevictable_pages, - * we see PG_mlocked/AS_UNEVICTABLE cleared below and move - * the page back to the evictable list. - * - * The other side is TestClearPageMlocked() or shmem_lock(). 
- */ - smp_mb(); - } +enum folio_references { + FOLIOREF_RECLAIM, + FOLIOREF_RECLAIM_CLEAN, + FOLIOREF_KEEP, + FOLIOREF_ACTIVATE, +}; - /* - * page's status can change while we move it among lru. If an evictable - * page is on unevictable list, it never be freed. To avoid that, - * check after we added it to the list, again. - */ - if (lru == LRU_UNEVICTABLE && page_evictable(page)) { - if (!isolate_lru_page(page)) { - put_page(page); - goto redo; - } - /* This means someone else dropped this page from LRU - * So, it will be freed or putback to LRU again. There is - * nothing to do here. - */ +#ifdef CONFIG_LRU_GEN +/* + * Only used on a mapped folio in the eviction (rmap walk) path, where promotion + * needs to be done by taking the folio off the LRU list and then adding it back + * with PG_active set. In contrast, the aging (page table walk) path uses + * folio_update_gen(). + */ +static bool lru_gen_set_refs(struct folio *folio) +{ + /* see the comment on LRU_REFS_FLAGS */ + if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); + return false; } - if (was_unevictable && lru != LRU_UNEVICTABLE) - count_vm_event(UNEVICTABLE_PGRESCUED); - else if (!was_unevictable && lru == LRU_UNEVICTABLE) - count_vm_event(UNEVICTABLE_PGCULLED); - - put_page(page); /* drop ref from isolate */ + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset)); + return true; } +#else +static bool lru_gen_set_refs(struct folio *folio) +{ + return false; +} +#endif /* CONFIG_LRU_GEN */ -enum page_references { - PAGEREF_RECLAIM, - PAGEREF_RECLAIM_CLEAN, - PAGEREF_KEEP, - PAGEREF_ACTIVATE, -}; - -static enum page_references page_check_references(struct page *page, +static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { - int referenced_ptes, referenced_page; - unsigned long vm_flags; + int referenced_ptes, referenced_folio; + vm_flags_t vm_flags; - 
referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, - &vm_flags); - referenced_page = TestClearPageReferenced(page); + referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, + &vm_flags); /* - * Mlock lost the isolation race with us. Let try_to_unmap() - * move the page to the unevictable list. + * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. + * Let the folio, now marked Mlocked, be moved to the unevictable list. */ if (vm_flags & VM_LOCKED) - return PAGEREF_RECLAIM; + return FOLIOREF_ACTIVATE; + + /* + * There are two cases to consider. + * 1) Rmap lock contention: rotate. + * 2) Skip the non-shared swapbacked folio mapped solely by + * the exiting or OOM-reaped process. + */ + if (referenced_ptes == -1) + return FOLIOREF_KEEP; + + if (lru_gen_enabled()) { + if (!referenced_ptes) + return FOLIOREF_RECLAIM; + + return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP; + } + + referenced_folio = folio_test_clear_referenced(folio); if (referenced_ptes) { - if (PageSwapBacked(page)) - return PAGEREF_ACTIVATE; /* - * All mapped pages start out with page table + * All mapped folios start out with page table * references from the instantiating fault, so we need - * to look twice if a mapped file page is used more + * to look twice if a mapped file/anon folio is used more * than once. * * Mark it and spare it for another trip around the * inactive list. Another page table reference will * lead to its activation. * - * Note: the mark is set for activated pages as well - * so that recently deactivated but used pages are + * Note: the mark is set for activated folios as well + * so that recently deactivated but used folios are * quickly recovered. 
*/ - SetPageReferenced(page); + folio_set_referenced(folio); - if (referenced_page || referenced_ptes > 1) - return PAGEREF_ACTIVATE; + if (referenced_folio || referenced_ptes > 1) + return FOLIOREF_ACTIVATE; /* - * Activate file-backed executable pages after first usage. + * Activate file-backed executable folios after first usage. */ - if (vm_flags & VM_EXEC) - return PAGEREF_ACTIVATE; + if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) + return FOLIOREF_ACTIVATE; - return PAGEREF_KEEP; + return FOLIOREF_KEEP; } - /* Reclaim if clean, defer dirty pages to writeback */ - if (referenced_page && !PageSwapBacked(page)) - return PAGEREF_RECLAIM_CLEAN; + /* Reclaim if clean, defer dirty folios to writeback */ + if (referenced_folio && folio_is_file_lru(folio)) + return FOLIOREF_RECLAIM_CLEAN; - return PAGEREF_RECLAIM; + return FOLIOREF_RECLAIM; } -/* Check if a page is dirty or under writeback */ -static void page_check_dirty_writeback(struct page *page, +/* Check if a folio is dirty or under writeback */ +static void folio_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback) { struct address_space *mapping; /* - * Anonymous pages are not handled by flushers and must be written - * from reclaim context. Do not stall reclaim based on them + * Anonymous folios are not handled by flushers and must be written + * from reclaim context. Do not stall reclaim based on them. + * MADV_FREE anonymous folios are put into inactive file list too. + * They could be mistakenly treated as file lru. So further anon + * test is needed. 
*/ - if (!page_is_file_cache(page)) { + if (!folio_is_file_lru(folio) || + (folio_test_anon(folio) && !folio_test_swapbacked(folio))) { *dirty = false; *writeback = false; return; } - /* By default assume that the page flags are accurate */ - *dirty = PageDirty(page); - *writeback = PageWriteback(page); + /* By default assume that the folio flags are accurate */ + *dirty = folio_test_dirty(folio); + *writeback = folio_test_writeback(folio); /* Verify dirty/writeback state if the filesystem supports it */ - if (!page_has_private(page)) + if (!folio_test_private(folio)) return; - mapping = page_mapping(page); + mapping = folio_mapping(folio); if (mapping && mapping->a_ops->is_dirty_writeback) - mapping->a_ops->is_dirty_writeback(page, dirty, writeback); + mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); +} + +static struct folio *alloc_demote_folio(struct folio *src, + unsigned long private) +{ + struct folio *dst; + nodemask_t *allowed_mask; + struct migration_target_control *mtc; + + mtc = (struct migration_target_control *)private; + + allowed_mask = mtc->nmask; + /* + * make sure we allocate from the target node first also trying to + * demote or reclaim pages from the target node via kswapd if we are + * low on free memory on target node. If we don't do this and if + * we have free memory on the slower(lower) memtier, we would start + * allocating pages from slower(lower) memory tiers without even forcing + * a demotion of cold pages from the target memtier. This can result + * in the kernel placing hot pages in slower(lower) memory tiers. + */ + mtc->nmask = NULL; + mtc->gfp_mask |= __GFP_THISNODE; + dst = alloc_migration_target(src, (unsigned long)mtc); + if (dst) + return dst; + + mtc->gfp_mask &= ~__GFP_THISNODE; + mtc->nmask = allowed_mask; + + return alloc_migration_target(src, (unsigned long)mtc); } /* - * shrink_page_list() returns the number of reclaimed pages + * Take folios on @demote_folios and attempt to demote them to another node. 
+ * Folios which are not demoted are left on @demote_folios. */ -static unsigned long shrink_page_list(struct list_head *page_list, - struct zone *zone, - struct scan_control *sc, - enum ttu_flags ttu_flags, - unsigned long *ret_nr_dirty, - unsigned long *ret_nr_unqueued_dirty, - unsigned long *ret_nr_congested, - unsigned long *ret_nr_writeback, - unsigned long *ret_nr_immediate, - bool force_reclaim) -{ - LIST_HEAD(ret_pages); - LIST_HEAD(free_pages); - int pgactivate = 0; - unsigned long nr_unqueued_dirty = 0; - unsigned long nr_dirty = 0; - unsigned long nr_congested = 0; - unsigned long nr_reclaimed = 0; - unsigned long nr_writeback = 0; - unsigned long nr_immediate = 0; +static unsigned int demote_folio_list(struct list_head *demote_folios, + struct pglist_data *pgdat) +{ + int target_nid = next_demotion_node(pgdat->node_id); + unsigned int nr_succeeded; + nodemask_t allowed_mask; + struct migration_target_control mtc = { + /* + * Allocate from 'node', or fail quickly and quietly. + * When this happens, 'page' will likely just be discarded + * instead of migrated. + */ + .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | + __GFP_NOMEMALLOC | GFP_NOWAIT, + .nid = target_nid, + .nmask = &allowed_mask, + .reason = MR_DEMOTION, + }; + + if (list_empty(demote_folios)) + return 0; + + if (target_nid == NUMA_NO_NODE) + return 0; + + node_get_allowed_targets(pgdat, &allowed_mask); + + /* Demotion ignores all cpuset and mempolicy settings */ + migrate_pages(demote_folios, alloc_demote_folio, NULL, + (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, + &nr_succeeded); + + return nr_succeeded; +} + +static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) +{ + if (gfp_mask & __GFP_FS) + return true; + if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO)) + return false; + /* + * We can "enter_fs" for swap-cache with only __GFP_IO + * providing this isn't SWP_FS_OPS. 
+ * ->flags can be updated non-atomicially (scan_swap_map_slots), + * but that will never affect SWP_FS_OPS, so the data_race + * is safe. + */ + return !data_race(folio_swap_flags(folio) & SWP_FS_OPS); +} + +/* + * shrink_folio_list() returns the number of reclaimed pages + */ +static unsigned int shrink_folio_list(struct list_head *folio_list, + struct pglist_data *pgdat, struct scan_control *sc, + struct reclaim_stat *stat, bool ignore_references, + struct mem_cgroup *memcg) +{ + struct folio_batch free_folios; + LIST_HEAD(ret_folios); + LIST_HEAD(demote_folios); + unsigned int nr_reclaimed = 0, nr_demoted = 0; + unsigned int pgactivate = 0; + bool do_demote_pass; + struct swap_iocb *plug = NULL; + + folio_batch_init(&free_folios); + memset(stat, 0, sizeof(*stat)); cond_resched(); + do_demote_pass = can_demote(pgdat->node_id, sc, memcg); - mem_cgroup_uncharge_start(); - while (!list_empty(page_list)) { +retry: + while (!list_empty(folio_list)) { struct address_space *mapping; - struct page *page; - int may_enter_fs; - enum page_references references = PAGEREF_RECLAIM_CLEAN; + struct folio *folio; + enum folio_references references = FOLIOREF_RECLAIM; bool dirty, writeback; + unsigned int nr_pages; cond_resched(); - page = lru_to_page(page_list); - list_del(&page->lru); + folio = lru_to_folio(folio_list); + list_del(&folio->lru); - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto keep; - VM_BUG_ON(PageActive(page)); - VM_BUG_ON(page_zone(page) != zone); + if (folio_contain_hwpoisoned_page(folio)) { + /* + * unmap_poisoned_folio() can't handle large + * folio, just skip it. memory_failure() will + * handle it if the UCE is triggered again. 
+ */ + if (folio_test_large(folio)) + goto keep_locked; + + unmap_poisoned_folio(folio, folio_pfn(folio), false); + folio_unlock(folio); + folio_put(folio); + continue; + } - sc->nr_scanned++; + VM_BUG_ON_FOLIO(folio_test_active(folio), folio); - if (unlikely(!page_evictable(page))) - goto cull_mlocked; + nr_pages = folio_nr_pages(folio); - if (!sc->may_unmap && page_mapped(page)) - goto keep_locked; + /* Account the number of base pages */ + sc->nr_scanned += nr_pages; - /* Double the slab pressure for mapped and swapcache pages */ - if (page_mapped(page) || PageSwapCache(page)) - sc->nr_scanned++; + if (unlikely(!folio_evictable(folio))) + goto activate_locked; - may_enter_fs = (sc->gfp_mask & __GFP_FS) || - (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + if (!sc->may_unmap && folio_mapped(folio)) + goto keep_locked; /* - * The number of dirty pages determines if a zone is marked - * reclaim_congested which affects wait_iff_congested. kswapd - * will stall and start writing pages if the tail of the LRU - * is all dirty unqueued pages. + * The number of dirty pages determines if a node is marked + * reclaim_congested. kswapd will stall and start writing + * folios if the tail of the LRU is all dirty unqueued folios. */ - page_check_dirty_writeback(page, &dirty, &writeback); + folio_check_dirty_writeback(folio, &dirty, &writeback); if (dirty || writeback) - nr_dirty++; + stat->nr_dirty += nr_pages; if (dirty && !writeback) - nr_unqueued_dirty++; + stat->nr_unqueued_dirty += nr_pages; /* - * Treat this page as congested if the underlying BDI is or if - * pages are cycling through the LRU so quickly that the - * pages marked for immediate reclaim are making it to the - * end of the LRU a second time. + * Treat this folio as congested if folios are cycling + * through the LRU so quickly that the folios marked + * for immediate reclaim are making it to the end of + * the LRU a second time. 
*/ - mapping = page_mapping(page); - if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || - (writeback && PageReclaim(page))) - nr_congested++; + if (writeback && folio_test_reclaim(folio)) + stat->nr_congested += nr_pages; /* - * If a page at the tail of the LRU is under writeback, there + * If a folio at the tail of the LRU is under writeback, there * are three cases to consider. * - * 1) If reclaim is encountering an excessive number of pages - * under writeback and this page is both under writeback and - * PageReclaim then it indicates that pages are being queued - * for IO but are being recycled through the LRU before the - * IO can complete. Waiting on the page itself risks an - * indefinite stall if it is impossible to writeback the - * page due to IO error or disconnected storage so instead - * note that the LRU is being scanned too quickly and the - * caller can stall after page list has been processed. + * 1) If reclaim is encountering an excessive number + * of folios under writeback and this folio has both + * the writeback and reclaim flags set, then it + * indicates that folios are being queued for I/O but + * are being recycled through the LRU before the I/O + * can complete. Waiting on the folio itself risks an + * indefinite stall if it is impossible to writeback + * the folio due to I/O error or disconnected storage + * so instead note that the LRU is being scanned too + * quickly and the caller can stall after the folio + * list has been processed. * - * 2) Global reclaim encounters a page, memcg encounters a - * page that is not marked for immediate reclaim or - * the caller does not have __GFP_IO. In this case mark - * the page for immediate reclaim and continue scanning. 
+ * 2) Global or new memcg reclaim encounters a folio that is + * not marked for immediate reclaim, or the caller does not + * have __GFP_FS (or __GFP_IO if it's simply going to swap, + * not to fs), or the folio belongs to a mapping where + * waiting on writeback during reclaim may lead to a deadlock. + * In this case mark the folio for immediate reclaim and + * continue scanning. * - * __GFP_IO is checked because a loop driver thread might - * enter reclaim, and deadlock if it waits on a page for + * Require may_enter_fs() because we would wait on fs, which + * may not have submitted I/O yet. And the loop driver might + * enter reclaim, and deadlock if it waits on a folio for * which it is needed to do the write (loop masks off * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * - * Don't require __GFP_FS, since we're not going into the - * FS, just waiting on its writeback completion. Worryingly, - * ext4 gfs2 and xfs allocate pages with - * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing - * may_enter_fs here is liable to OOM on them. - * - * 3) memcg encounters a page that is not already marked - * PageReclaim. memcg does not have any dirty pages + * 3) Legacy memcg encounters a folio that already has the + * reclaim flag set. memcg does not have any dirty folio * throttling so we could easily OOM just because too many - * pages are in writeback and there is nothing else to + * folios are in writeback and there is nothing else to * reclaim. Wait for the writeback to complete. + * + * In cases 1) and 2) we activate the folios to get them out of + * the way while we continue scanning for clean folios on the + * inactive list and refilling from the active list. The + * observation here is that waiting for disk writes is more + * expensive than potentially causing reloads down the line. 
+ * Since they're marked for immediate reclaim, they won't put + * memory pressure on the cache working set any longer than it + * takes to write them to disk. */ - if (PageWriteback(page)) { + if (folio_test_writeback(folio)) { + mapping = folio_mapping(folio); + /* Case 1 above */ if (current_is_kswapd() && - PageReclaim(page) && - zone_is_reclaim_writeback(zone)) { - nr_immediate++; - goto keep_locked; + folio_test_reclaim(folio) && + test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { + stat->nr_immediate += nr_pages; + goto activate_locked; /* Case 2 above */ - } else if (global_reclaim(sc) || - !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { + } else if (writeback_throttling_sane(sc) || + !folio_test_reclaim(folio) || + !may_enter_fs(folio, sc->gfp_mask) || + (mapping && + mapping_writeback_may_deadlock_on_reclaim(mapping))) { /* - * This is slightly racy - end_page_writeback() - * might have just cleared PageReclaim, then - * setting PageReclaim here end up interpreted - * as PageReadahead - but that does not matter - * enough to care. What we do want is for this - * page to have PageReclaim set next time memcg - * reclaim reaches the tests above, so it will - * then wait_on_page_writeback() to avoid OOM; - * and it's also appropriate in global reclaim. + * This is slightly racy - + * folio_end_writeback() might have + * just cleared the reclaim flag, then + * setting the reclaim flag here ends up + * interpreted as the readahead flag - but + * that does not matter enough to care. + * What we do want is for this folio to + * have the reclaim flag set next time + * memcg reclaim reaches the tests above, + * so it will then wait for writeback to + * avoid OOM; and it's also appropriate + * in global reclaim. 
*/ - SetPageReclaim(page); - nr_writeback++; - - goto keep_locked; + folio_set_reclaim(folio); + stat->nr_writeback += nr_pages; + goto activate_locked; /* Case 3 above */ } else { - wait_on_page_writeback(page); + folio_unlock(folio); + folio_wait_writeback(folio); + /* then go back and try same folio again */ + list_add_tail(&folio->lru, folio_list); + continue; } } - if (!force_reclaim) - references = page_check_references(page, sc); + if (!ignore_references) + references = folio_check_references(folio, sc); switch (references) { - case PAGEREF_ACTIVATE: + case FOLIOREF_ACTIVATE: goto activate_locked; - case PAGEREF_KEEP: + case FOLIOREF_KEEP: + stat->nr_ref_keep += nr_pages; goto keep_locked; - case PAGEREF_RECLAIM: - case PAGEREF_RECLAIM_CLEAN: - ; /* try to reclaim the page below */ + case FOLIOREF_RECLAIM: + case FOLIOREF_RECLAIM_CLEAN: + ; /* try to reclaim the folio below */ + } + + /* + * Before reclaiming the folio, try to relocate + * its contents to another node. + */ + if (do_demote_pass && + (thp_migration_supported() || !folio_test_large(folio))) { + list_add(&folio->lru, &demote_folios); + folio_unlock(folio); + continue; } /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. + * Lazyfree folio could be freed directly */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!(sc->gfp_mask & __GFP_IO)) - goto keep_locked; - if (!add_to_swap(page, page_list)) - goto activate_locked; - may_enter_fs = 1; + if (folio_test_anon(folio) && folio_test_swapbacked(folio)) { + if (!folio_test_swapcache(folio)) { + if (!(sc->gfp_mask & __GFP_IO)) + goto keep_locked; + if (folio_maybe_dma_pinned(folio)) + goto keep_locked; + if (folio_test_large(folio)) { + /* cannot split folio, skip it */ + if (!can_split_folio(folio, 1, NULL)) + goto activate_locked; + /* + * Split partially mapped folios right away. + * We can free the unmapped pages without IO. 
+ */ + if (data_race(!list_empty(&folio->_deferred_list) && + folio_test_partially_mapped(folio)) && + split_folio_to_list(folio, folio_list)) + goto activate_locked; + } + if (folio_alloc_swap(folio)) { + int __maybe_unused order = folio_order(folio); + + if (!folio_test_large(folio)) + goto activate_locked_split; + /* Fallback to swap normal pages */ + if (split_folio_to_list(folio, folio_list)) + goto activate_locked; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (nr_pages >= HPAGE_PMD_NR) { + count_memcg_folio_events(folio, + THP_SWPOUT_FALLBACK, 1); + count_vm_event(THP_SWPOUT_FALLBACK); + } +#endif + count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); + if (folio_alloc_swap(folio)) + goto activate_locked_split; + } + /* + * Normally the folio will be dirtied in unmap because its + * pte should be dirty. A special case is MADV_FREE page. The + * page's pte could have dirty bit cleared but the folio's + * SwapBacked flag is still set because clearing the dirty bit + * and SwapBacked flag has no lock protected. For such folio, + * unmap will not set dirty bit for it, so folio reclaim will + * not write the folio out. This can cause data corruption when + * the folio is swapped in later. Always setting the dirty flag + * for the folio solves the problem. + */ + folio_mark_dirty(folio); + } + } - /* Adding to swap updated mapping */ - mapping = page_mapping(page); + /* + * If the folio was split above, the tail pages will make + * their own pass through this function and be accounted + * then. + */ + if ((nr_pages > 1) && !folio_test_large(folio)) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; } /* - * The page is mapped into the page tables of one or more + * The folio is mapped into the page tables of one or more * processes. Try to unmap it here. 
*/ - if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, ttu_flags)) { - case SWAP_FAIL: + if (folio_mapped(folio)) { + enum ttu_flags flags = TTU_BATCH_FLUSH; + bool was_swapbacked = folio_test_swapbacked(folio); + + if (folio_test_pmd_mappable(folio)) + flags |= TTU_SPLIT_HUGE_PMD; + /* + * Without TTU_SYNC, try_to_unmap will only begin to + * hold PTL from the first present PTE within a large + * folio. Some initial PTEs might be skipped due to + * races with parallel PTE writes in which PTEs can be + * cleared temporarily before being written new present + * values. This will lead to a large folio is still + * mapped while some subpages have been partially + * unmapped after try_to_unmap; TTU_SYNC helps + * try_to_unmap acquire PTL from the first PTE, + * eliminating the influence of temporary PTE values. + */ + if (folio_test_large(folio)) + flags |= TTU_SYNC; + + try_to_unmap(folio, flags); + if (folio_mapped(folio)) { + stat->nr_unmap_fail += nr_pages; + if (!was_swapbacked && + folio_test_swapbacked(folio)) + stat->nr_lazyfree_fail += nr_pages; goto activate_locked; - case SWAP_AGAIN: - goto keep_locked; - case SWAP_MLOCK: - goto cull_mlocked; - case SWAP_SUCCESS: - ; /* try to free the page below */ } } - if (PageDirty(page)) { - /* - * Only kswapd can writeback filesystem pages to - * avoid risk of stack overflow but only writeback - * if many dirty pages have been encountered. - */ - if (page_is_file_cache(page) && - (!current_is_kswapd() || - !zone_is_reclaim_dirty(zone))) { + /* + * Folio is unmapped now so it cannot be newly pinned anymore. + * No point in trying to reclaim folio if it is pinned. + * Furthermore we don't want to reclaim underlying fs metadata + * if the folio is pinned and thus potentially modified by the + * pinning process as that may upset the filesystem. 
+ */ + if (folio_maybe_dma_pinned(folio)) + goto activate_locked; + + mapping = folio_mapping(folio); + if (folio_test_dirty(folio)) { + if (folio_is_file_lru(folio)) { /* * Immediately reclaim when written back. - * Similar in principal to deactivate_page() - * except we already have the page isolated + * Similar in principle to folio_deactivate() + * except we already have the folio isolated * and know it's dirty */ - inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); - SetPageReclaim(page); + node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, + nr_pages); + if (!folio_test_reclaim(folio)) + folio_set_reclaim(folio); - goto keep_locked; + goto activate_locked; } - if (references == PAGEREF_RECLAIM_CLEAN) + if (references == FOLIOREF_RECLAIM_CLEAN) goto keep_locked; - if (!may_enter_fs) + if (!may_enter_fs(folio, sc->gfp_mask)) goto keep_locked; if (!sc->may_writepage) goto keep_locked; - /* Page is dirty, try to write it out here */ - switch (pageout(page, mapping, sc)) { + /* + * Folio is dirty. Flush the TLB if a writable entry + * potentially exists to avoid CPU writes after I/O + * starts and then write it out here. + */ + try_to_unmap_flush_dirty(); + switch (pageout(folio, mapping, &plug, folio_list)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: + /* + * If shmem folio is split when writeback to swap, + * the tail pages will make their own pass through + * this function and be accounted then. + */ + if (nr_pages > 1 && !folio_test_large(folio)) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } goto activate_locked; case PAGE_SUCCESS: - if (PageWriteback(page)) + if (nr_pages > 1 && !folio_test_large(folio)) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } + stat->nr_pageout += nr_pages; + + if (folio_test_writeback(folio)) goto keep; - if (PageDirty(page)) + if (folio_test_dirty(folio)) goto keep; /* * A synchronous write - probably a ramdisk. Go - * ahead and try to reclaim the page. + * ahead and try to reclaim the folio. 
*/ - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto keep; - if (PageDirty(page) || PageWriteback(page)) + if (folio_test_dirty(folio) || + folio_test_writeback(folio)) goto keep_locked; - mapping = page_mapping(page); + mapping = folio_mapping(folio); + fallthrough; case PAGE_CLEAN: - ; /* try to free the page below */ + ; /* try to free the folio below */ } } /* - * If the page has buffers, try to free the buffer mappings - * associated with this page. If we succeed we try to free - * the page as well. + * If the folio has buffers, try to free the buffer + * mappings associated with this folio. If we succeed + * we try to free the folio as well. * - * We do this even if the page is PageDirty(). - * try_to_release_page() does not perform I/O, but it is - * possible for a page to have PageDirty set, but it is actually - * clean (all its buffers are clean). This happens if the - * buffers were written out directly, with submit_bh(). ext3 - * will do this, as well as the blockdev mapping. - * try_to_release_page() will discover that cleanness and will - * drop the buffers and mark the page clean - it can be freed. + * We do this even if the folio is dirty. + * filemap_release_folio() does not perform I/O, but it + * is possible for a folio to have the dirty flag set, + * but it is actually clean (all its buffers are clean). + * This happens if the buffers were written out directly, + * with submit_bh(). ext3 will do this, as well as + * the blockdev mapping. filemap_release_folio() will + * discover that cleanness and will drop the buffers + * and mark the folio clean - it can be freed. * - * Rarely, pages can have buffers and no ->mapping. These are - * the pages which were not successfully invalidated in - * truncate_complete_page(). We try to drop those buffers here - * and if that worked, and the page is no longer mapped into - * process address space (page_count == 1) it can be freed. - * Otherwise, leave the page on the LRU so it is swappable. 
+ * Rarely, folios can have buffers and no ->mapping. + * These are the folios which were not successfully + * invalidated in truncate_cleanup_folio(). We try to + * drop those buffers here and if that worked, and the + * folio is no longer mapped into process address space + * (refcount == 1) it can be freed. Otherwise, leave + * the folio on the LRU so it is swappable. */ - if (page_has_private(page)) { - if (!try_to_release_page(page, sc->gfp_mask)) + if (folio_needs_release(folio)) { + if (!filemap_release_folio(folio, sc->gfp_mask)) goto activate_locked; - if (!mapping && page_count(page) == 1) { - unlock_page(page); - if (put_page_testzero(page)) + if (!mapping && folio_ref_count(folio) == 1) { + folio_unlock(folio); + if (folio_put_testzero(folio)) goto free_it; else { /* * rare race with speculative reference. * the speculative reference will free - * this page shortly, so we may + * this folio shortly, so we may * increment nr_reclaimed here (and * leave it off the LRU). */ - nr_reclaimed++; + nr_reclaimed += nr_pages; continue; } } } - if (!mapping || !__remove_mapping(mapping, page)) + if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { + /* follow __remove_mapping for reference */ + if (!folio_ref_freeze(folio, 1)) + goto keep_locked; + /* + * The folio has only one reference left, which is + * from the isolation. After the caller puts the + * folio back on the lru and drops the reference, the + * folio will be freed anyway. It doesn't matter + * which lru it goes on. So we don't bother checking + * the dirty flag here. + */ + count_vm_events(PGLAZYFREED, nr_pages); + count_memcg_folio_events(folio, PGLAZYFREED, nr_pages); + } else if (!mapping || !__remove_mapping(mapping, folio, true, + sc->target_mem_cgroup)) goto keep_locked; - /* - * At this point, we have no other references and there is - * no way to pick any more up (removed from LRU, removed - * from pagecache). 
Can use non-atomic bitops now (and - * we obviously don't have to worry about waking up a process - * waiting on the page lock, because there are no references. - */ - __clear_page_locked(page); + folio_unlock(folio); free_it: - nr_reclaimed++; - /* - * Is there need to periodically free_page_list? It would - * appear not as the counts should be low + * Folio may get swapped out as a whole, need to account + * all pages in it. */ - list_add(&page->lru, &free_pages); - continue; + nr_reclaimed += nr_pages; -cull_mlocked: - if (PageSwapCache(page)) - try_to_free_swap(page); - unlock_page(page); - putback_lru_page(page); + folio_unqueue_deferred_split(folio); + if (folio_batch_add(&free_folios, folio) == 0) { + mem_cgroup_uncharge_folios(&free_folios); + try_to_unmap_flush(); + free_unref_folios(&free_folios); + } continue; +activate_locked_split: + /* + * The tail pages that are failed to add into swap cache + * reach here. Fixup nr_scanned and nr_pages. + */ + if (nr_pages > 1) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } activate_locked: /* Not a candidate for swapping, so reclaim swap space. 
*/ - if (PageSwapCache(page) && vm_swap_full()) - try_to_free_swap(page); - VM_BUG_ON(PageActive(page)); - SetPageActive(page); - pgactivate++; + if (folio_test_swapcache(folio) && + (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio))) + folio_free_swap(folio); + VM_BUG_ON_FOLIO(folio_test_active(folio), folio); + if (!folio_test_mlocked(folio)) { + int type = folio_is_file_lru(folio); + folio_set_active(folio); + stat->nr_activate[type] += nr_pages; + count_memcg_folio_events(folio, PGACTIVATE, nr_pages); + } keep_locked: - unlock_page(page); + folio_unlock(folio); keep: - list_add(&page->lru, &ret_pages); - VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); + list_add(&folio->lru, &ret_folios); + VM_BUG_ON_FOLIO(folio_test_lru(folio) || + folio_test_unevictable(folio), folio); + } + /* 'folio_list' is always empty here */ + + /* Migrate folios selected for demotion */ + nr_demoted = demote_folio_list(&demote_folios, pgdat); + nr_reclaimed += nr_demoted; + stat->nr_demoted += nr_demoted; + /* Folios that could not be demoted are still in @demote_folios */ + if (!list_empty(&demote_folios)) { + /* Folios which weren't demoted go back on @folio_list */ + list_splice_init(&demote_folios, folio_list); + + /* + * goto retry to reclaim the undemoted folios in folio_list if + * desired. + * + * Reclaiming directly from top tier nodes is not often desired + * due to it breaking the LRU ordering: in general memory + * should be reclaimed from lower tier nodes and demoted from + * top tier nodes. + * + * However, disabling reclaim from top tier nodes entirely + * would cause ooms in edge scenarios where lower tier memory + * is unreclaimable for whatever reason, eg memory being + * mlocked or too hot to reclaim. We can disable reclaim + * from top tier nodes in proactive reclaim though as that is + * not real memory pressure. 
+ */ + if (!sc->proactive) { + do_demote_pass = false; + goto retry; + } } - free_hot_cold_page_list(&free_pages, 1); + pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; - list_splice(&ret_pages, page_list); + mem_cgroup_uncharge_folios(&free_folios); + try_to_unmap_flush(); + free_unref_folios(&free_folios); + + list_splice(&ret_folios, folio_list); count_vm_events(PGACTIVATE, pgactivate); - mem_cgroup_uncharge_end(); - *ret_nr_dirty += nr_dirty; - *ret_nr_congested += nr_congested; - *ret_nr_unqueued_dirty += nr_unqueued_dirty; - *ret_nr_writeback += nr_writeback; - *ret_nr_immediate += nr_immediate; + + if (plug) + swap_write_unplug(plug); return nr_reclaimed; } -unsigned long reclaim_clean_pages_from_list(struct zone *zone, - struct list_head *page_list) +unsigned int reclaim_clean_pages_from_list(struct zone *zone, + struct list_head *folio_list) { struct scan_control sc = { .gfp_mask = GFP_KERNEL, - .priority = DEF_PRIORITY, .may_unmap = 1, }; - unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; - struct page *page, *next; - LIST_HEAD(clean_pages); - - list_for_each_entry_safe(page, next, page_list, lru) { - if (page_is_file_cache(page) && !PageDirty(page)) { - ClearPageActive(page); - list_move(&page->lru, &clean_pages); + struct reclaim_stat stat; + unsigned int nr_reclaimed; + struct folio *folio, *next; + LIST_HEAD(clean_folios); + unsigned int noreclaim_flag; + + list_for_each_entry_safe(folio, next, folio_list, lru) { + /* TODO: these pages should not even appear in this list. 
*/ + if (page_has_movable_ops(&folio->page)) + continue; + if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) && + !folio_test_dirty(folio) && !folio_test_unevictable(folio)) { + folio_clear_active(folio); + list_move(&folio->lru, &clean_folios); } } - ret = shrink_page_list(&clean_pages, zone, &sc, - TTU_UNMAP|TTU_IGNORE_ACCESS, - &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); - list_splice(&clean_pages, page_list); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); - return ret; + /* + * We should be safe here since we are only dealing with file pages and + * we are not kswapd and therefore cannot write dirty file pages. But + * call memalloc_noreclaim_save() anyway, just in case these conditions + * change in the future. + */ + noreclaim_flag = memalloc_noreclaim_save(); + nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, + &stat, true, NULL); + memalloc_noreclaim_restore(noreclaim_flag); + + list_splice(&clean_folios, folio_list); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, + -(long)nr_reclaimed); + /* + * Since lazyfree pages are isolated from file LRU from the beginning, + * they will rotate back to anonymous LRU in the end if it failed to + * discard so isolated count will be mismatched. + * Compensate the isolated count for both LRU lists. + */ + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, + stat.nr_lazyfree_fail); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, + -(long)stat.nr_lazyfree_fail); + return nr_reclaimed; } /* - * Attempt to remove the specified page from its LRU. Only take this page - * if it is of the appropriate PageActive status. Pages which are being - * freed elsewhere are also ignored. - * - * page: page to consider - * mode: one of the LRU isolation modes defined above - * - * returns 0 on success, -ve errno on failure. + * Update LRU sizes after isolating pages. The LRU size updates must + * be complete before mem_cgroup_update_lru_size due to a sanity check. 
*/ -int __isolate_lru_page(struct page *page, isolate_mode_t mode) +static __always_inline void update_lru_sizes(struct lruvec *lruvec, + enum lru_list lru, unsigned long *nr_zone_taken) { - int ret = -EINVAL; - - /* Only take pages on the LRU. */ - if (!PageLRU(page)) - return ret; - - /* Compaction should not handle unevictable pages but CMA can do so */ - if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) - return ret; + int zid; - ret = -EBUSY; - - /* - * To minimise LRU disruption, the caller can indicate that it only - * wants to isolate pages it will be able to operate on without - * blocking - clean pages for the most part. - * - * ISOLATE_CLEAN means that only clean pages should be isolated. This - * is used by reclaim when it is cannot write to backing storage - * - * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages - * that it is possible to migrate without blocking - */ - if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { - /* All the caller can do on PageWriteback is block */ - if (PageWriteback(page)) - return ret; - - if (PageDirty(page)) { - struct address_space *mapping; - - /* ISOLATE_CLEAN means only clean pages */ - if (mode & ISOLATE_CLEAN) - return ret; - - /* - * Only pages without mappings or that have a - * ->migratepage callback are possible to migrate - * without blocking - */ - mapping = page_mapping(page); - if (mapping && !mapping->a_ops->migratepage) - return ret; - } - } - - if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) - return ret; + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + if (!nr_zone_taken[zid]) + continue; - if (likely(get_page_unless_zero(page))) { - /* - * Be careful not to clear PageLRU until after we're - * sure the page is not being freed elsewhere -- the - * page release code relies on it. - */ - ClearPageLRU(page); - ret = 0; + update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); } - return ret; } /* - * zone->lru_lock is heavily contended. 
Some of the functions that + * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. + * + * lruvec->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages * and working on them outside the LRU lock. * * For pagecache intensive workloads, this function is the hottest * spot in the kernel (apart from copy_*_user functions). * - * Appropriate locks must be held before calling this function. + * Lru_lock must be held before calling this function. * - * @nr_to_scan: The number of pages to look through on the list. + * @nr_to_scan: The number of eligible pages to look through on the list. * @lruvec: The LRU vector to pull pages from. * @dst: The temp list to put pages on to. * @nr_scanned: The number of pages that were scanned. * @sc: The scan_control struct for this reclaim session - * @mode: One of the LRU isolation modes * @lru: LRU list id for isolating * * returns how many pages were moved onto *@dst. 
*/ -static unsigned long isolate_lru_pages(unsigned long nr_to_scan, +static unsigned long isolate_lru_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, - isolate_mode_t mode, enum lru_list lru) + enum lru_list lru) { struct list_head *src = &lruvec->lists[lru]; unsigned long nr_taken = 0; - unsigned long scan; + unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; + unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; + unsigned long skipped = 0, total_scan = 0, scan = 0; + unsigned long nr_pages; + unsigned long max_nr_skipped = 0; + LIST_HEAD(folios_skipped); + + while (scan < nr_to_scan && !list_empty(src)) { + struct list_head *move_to = src; + struct folio *folio; + + folio = lru_to_folio(src); + prefetchw_prev_lru_folio(folio, src, flags); + + nr_pages = folio_nr_pages(folio); + total_scan += nr_pages; + + /* Using max_nr_skipped to prevent hard LOCKUP*/ + if (max_nr_skipped < SWAP_CLUSTER_MAX_SKIPPED && + (folio_zonenum(folio) > sc->reclaim_idx)) { + nr_skipped[folio_zonenum(folio)] += nr_pages; + move_to = &folios_skipped; + max_nr_skipped++; + goto move; + } - for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { - struct page *page; - int nr_pages; + /* + * Do not count skipped folios because that makes the function + * return with no isolated folios if the LRU mostly contains + * ineligible folios. This causes the VM to not reclaim any + * folios, triggering a premature OOM. + * Account all pages in a folio. + */ + scan += nr_pages; - page = lru_to_page(src); - prefetchw_prev_lru_page(page, src, flags); + if (!folio_test_lru(folio)) + goto move; + if (!sc->may_unmap && folio_mapped(folio)) + goto move; - VM_BUG_ON(!PageLRU(page)); + /* + * Be careful not to clear the lru flag until after we're + * sure the folio is not being freed elsewhere -- the + * folio release code relies on it. 
+ */ + if (unlikely(!folio_try_get(folio))) + goto move; - switch (__isolate_lru_page(page, mode)) { - case 0: - nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); - list_move(&page->lru, dst); - nr_taken += nr_pages; - break; + if (!folio_test_clear_lru(folio)) { + /* Another thread is already isolating this folio */ + folio_put(folio); + goto move; + } - case -EBUSY: - /* else it is being freed elsewhere */ - list_move(&page->lru, src); - continue; + nr_taken += nr_pages; + nr_zone_taken[folio_zonenum(folio)] += nr_pages; + move_to = dst; +move: + list_move(&folio->lru, move_to); + } - default: - BUG(); + /* + * Splice any skipped folios to the start of the LRU list. Note that + * this disrupts the LRU order when reclaiming for lower zones but + * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX + * scanning would soon rescan the same folios to skip and waste lots + * of cpu cycles. + */ + if (!list_empty(&folios_skipped)) { + int zid; + + list_splice(&folios_skipped, src); + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + if (!nr_skipped[zid]) + continue; + + __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); + skipped += nr_skipped[zid]; } } - - *nr_scanned = scan; - trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, - nr_taken, mode, is_file_lru(lru)); + *nr_scanned = total_scan; + trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, + total_scan, skipped, nr_taken, lru); + update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } /** - * isolate_lru_page - tries to isolate a page from its LRU list - * @page: page to isolate from its LRU list + * folio_isolate_lru() - Try to isolate a folio from its LRU list. + * @folio: Folio to isolate from its LRU list. * - * Isolates a @page from an LRU list, clears PageLRU and adjusts the - * vmstat statistic corresponding to whatever LRU list the page was on. 
+ * Isolate a @folio from an LRU list and adjust the vmstat statistic + * corresponding to whatever LRU list the folio was on. * - * Returns 0 if the page was removed from an LRU list. - * Returns -EBUSY if the page was not on an LRU list. - * - * The returned page will have PageLRU() cleared. If it was found on - * the active list, it will have PageActive set. If it was found on - * the unevictable list, it will have the PageUnevictable bit set. That flag + * The folio will have its LRU flag cleared. If it was found on the + * active list, it will have the Active flag set. If it was found on the + * unevictable list, it will have the Unevictable flag set. These flags * may need to be cleared by the caller before letting the page go. * - * The vmstat statistic corresponding to the list on which the page was - * found will be decremented. + * Context: * - * Restrictions: - * (1) Must be called with an elevated refcount on the page. This is a - * fundamentnal difference from isolate_lru_pages (which is called + * (1) Must be called with an elevated refcount on the folio. This is a + * fundamental difference from isolate_lru_folios() (which is called * without a stable reference). - * (2) the lru_lock must not be held. - * (3) interrupts must be enabled. + * (2) The lru_lock must not be held. + * (3) Interrupts must be enabled. + * + * Return: true if the folio was removed from an LRU list. + * false if the folio was not on an LRU list. 
*/ -int isolate_lru_page(struct page *page) +bool folio_isolate_lru(struct folio *folio) { - int ret = -EBUSY; + bool ret = false; - VM_BUG_ON(!page_count(page)); + VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); - if (PageLRU(page)) { - struct zone *zone = page_zone(page); + if (folio_test_clear_lru(folio)) { struct lruvec *lruvec; - spin_lock_irq(&zone->lru_lock); - lruvec = mem_cgroup_page_lruvec(page, zone); - if (PageLRU(page)) { - int lru = page_lru(page); - get_page(page); - ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, lru); - ret = 0; - } - spin_unlock_irq(&zone->lru_lock); + folio_get(folio); + lruvec = folio_lruvec_lock_irq(folio); + lruvec_del_folio(lruvec, folio); + unlock_page_lruvec_irq(lruvec); + ret = true; } + return ret; } /* * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and - * then get resheduled. When there are massive number of tasks doing page + * then get rescheduled. When there are massive number of tasks doing page * allocation, such sleeping direct reclaimers may keep piling up on each CPU, * the LRU list will go small and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM. 
*/ -static int too_many_isolated(struct zone *zone, int file, +static bool too_many_isolated(struct pglist_data *pgdat, int file, struct scan_control *sc) { unsigned long inactive, isolated; + bool too_many; if (current_is_kswapd()) - return 0; + return false; - if (!global_reclaim(sc)) - return 0; + if (!writeback_throttling_sane(sc)) + return false; if (file) { - inactive = zone_page_state(zone, NR_INACTIVE_FILE); - isolated = zone_page_state(zone, NR_ISOLATED_FILE); + inactive = node_page_state(pgdat, NR_INACTIVE_FILE); + isolated = node_page_state(pgdat, NR_ISOLATED_FILE); } else { - inactive = zone_page_state(zone, NR_INACTIVE_ANON); - isolated = zone_page_state(zone, NR_ISOLATED_ANON); + inactive = node_page_state(pgdat, NR_INACTIVE_ANON); + isolated = node_page_state(pgdat, NR_ISOLATED_ANON); } /* @@ -1290,90 +1867,126 @@ static int too_many_isolated(struct zone *zone, int file, * won't get blocked by normal direct-reclaimers, forming a circular * deadlock. */ - if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + if (gfp_has_io_fs(sc->gfp_mask)) inactive >>= 3; - return isolated > inactive; + too_many = isolated > inactive; + + /* Wake up tasks throttled due to too_many_isolated. */ + if (!too_many) + wake_throttle_isolated(pgdat); + + return too_many; } -static noinline_for_stack void -putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) +/* + * move_folios_to_lru() moves folios from private @list to appropriate LRU list. + * + * Returns the number of pages moved to the given lruvec. + */ +static unsigned int move_folios_to_lru(struct lruvec *lruvec, + struct list_head *list) { - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - struct zone *zone = lruvec_zone(lruvec); - LIST_HEAD(pages_to_free); - - /* - * Put back any unfreeable pages. 
- */ - while (!list_empty(page_list)) { - struct page *page = lru_to_page(page_list); - int lru; + int nr_pages, nr_moved = 0; + struct folio_batch free_folios; - VM_BUG_ON(PageLRU(page)); - list_del(&page->lru); - if (unlikely(!page_evictable(page))) { - spin_unlock_irq(&zone->lru_lock); - putback_lru_page(page); - spin_lock_irq(&zone->lru_lock); + folio_batch_init(&free_folios); + while (!list_empty(list)) { + struct folio *folio = lru_to_folio(list); + + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + list_del(&folio->lru); + if (unlikely(!folio_evictable(folio))) { + spin_unlock_irq(&lruvec->lru_lock); + folio_putback_lru(folio); + spin_lock_irq(&lruvec->lru_lock); continue; } - lruvec = mem_cgroup_page_lruvec(page, zone); + /* + * The folio_set_lru needs to be kept here for list integrity. + * Otherwise: + * #0 move_folios_to_lru #1 release_pages + * if (!folio_put_testzero()) + * if (folio_put_testzero()) + * !lru //skip lru_lock + * folio_set_lru() + * list_add(&folio->lru,) + * list_add(&folio->lru,) + */ + folio_set_lru(folio); + + if (unlikely(folio_put_testzero(folio))) { + __folio_clear_lru_flags(folio); - SetPageLRU(page); - lru = page_lru(page); - add_page_to_lru_list(page, lruvec, lru); + folio_unqueue_deferred_split(folio); + if (folio_batch_add(&free_folios, folio) == 0) { + spin_unlock_irq(&lruvec->lru_lock); + mem_cgroup_uncharge_folios(&free_folios); + free_unref_folios(&free_folios); + spin_lock_irq(&lruvec->lru_lock); + } - if (is_active_lru(lru)) { - int file = is_file_lru(lru); - int numpages = hpage_nr_pages(page); - reclaim_stat->recent_rotated[file] += numpages; + continue; } - if (put_page_testzero(page)) { - __ClearPageLRU(page); - __ClearPageActive(page); - del_page_from_lru_list(page, lruvec, lru); - if (unlikely(PageCompound(page))) { - spin_unlock_irq(&zone->lru_lock); - (*get_compound_page_dtor(page))(page); - spin_lock_irq(&zone->lru_lock); - } else - list_add(&page->lru, &pages_to_free); - } + /* + * All pages were isolated from 
the same lruvec (and isolation + * inhibits memcg migration). + */ + VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); + lruvec_add_folio(lruvec, folio); + nr_pages = folio_nr_pages(folio); + nr_moved += nr_pages; + if (folio_test_active(folio)) + workingset_age_nonresident(lruvec, nr_pages); } - /* - * To save our caller's stack, now use input list for pages to free. - */ - list_splice(&pages_to_free, page_list); + if (free_folios.nr) { + spin_unlock_irq(&lruvec->lru_lock); + mem_cgroup_uncharge_folios(&free_folios); + free_unref_folios(&free_folios); + spin_lock_irq(&lruvec->lru_lock); + } + + return nr_moved; } /* - * shrink_inactive_list() is a helper for shrink_zone(). It returns the number + * If a kernel thread (such as nfsd for loop-back mounts) services a backing + * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case + * we should not throttle. Otherwise it is safe to do so. + */ +static int current_may_throttle(void) +{ + return !(current->flags & PF_LOCAL_THROTTLE); +} + +/* + * shrink_inactive_list() is a helper for shrink_node(). 
It returns the number * of reclaimed pages */ -static noinline_for_stack unsigned long -shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, - struct scan_control *sc, enum lru_list lru) +static unsigned long shrink_inactive_list(unsigned long nr_to_scan, + struct lruvec *lruvec, struct scan_control *sc, + enum lru_list lru) { - LIST_HEAD(page_list); + LIST_HEAD(folio_list); unsigned long nr_scanned; - unsigned long nr_reclaimed = 0; + unsigned int nr_reclaimed = 0; unsigned long nr_taken; - unsigned long nr_dirty = 0; - unsigned long nr_congested = 0; - unsigned long nr_unqueued_dirty = 0; - unsigned long nr_writeback = 0; - unsigned long nr_immediate = 0; - isolate_mode_t isolate_mode = 0; - int file = is_file_lru(lru); - struct zone *zone = lruvec_zone(lruvec); - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - - while (unlikely(too_many_isolated(zone, file, sc))) { - congestion_wait(BLK_RW_ASYNC, HZ/10); + struct reclaim_stat stat; + bool file = is_file_lru(lru); + enum vm_event_item item; + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + bool stalled = false; + + while (unlikely(too_many_isolated(pgdat, file, sc))) { + if (stalled) + return 0; + + /* wait a bit for the reclaimer. */ + stalled = true; + reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED); /* We are about to die and free our memory. Return now. 
*/ if (fatal_signal_pending(current)) @@ -1382,180 +1995,98 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, lru_add_drain(); - if (!sc->may_unmap) - isolate_mode |= ISOLATE_UNMAPPED; - if (!sc->may_writepage) - isolate_mode |= ISOLATE_CLEAN; + spin_lock_irq(&lruvec->lru_lock); - spin_lock_irq(&zone->lru_lock); + nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list, + &nr_scanned, sc, lru); - nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, - &nr_scanned, sc, isolate_mode, lru); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); + item = PGSCAN_KSWAPD + reclaimer_offset(sc); + if (!cgroup_reclaim(sc)) + __count_vm_events(item, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); + __count_vm_events(PGSCAN_ANON + file, nr_scanned); - __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); - - if (global_reclaim(sc)) { - zone->pages_scanned += nr_scanned; - if (current_is_kswapd()) - __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); - else - __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); - } - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, - &nr_dirty, &nr_unqueued_dirty, &nr_congested, - &nr_writeback, &nr_immediate, - false); - - spin_lock_irq(&zone->lru_lock); - - reclaim_stat->recent_scanned[file] += nr_taken; - - if (global_reclaim(sc)) { - if (current_is_kswapd()) - __count_zone_vm_events(PGSTEAL_KSWAPD, zone, - nr_reclaimed); - else - __count_zone_vm_events(PGSTEAL_DIRECT, zone, - nr_reclaimed); - } + nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false, + lruvec_memcg(lruvec)); - putback_inactive_pages(lruvec, &page_list); + spin_lock_irq(&lruvec->lru_lock); + move_folios_to_lru(lruvec, &folio_list); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, 
-nr_taken); + mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), + stat.nr_demoted); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + item = PGSTEAL_KSWAPD + reclaimer_offset(sc); + if (!cgroup_reclaim(sc)) + __count_vm_events(item, nr_reclaimed); + count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); + __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); - spin_unlock_irq(&zone->lru_lock); - - free_hot_cold_page_list(&page_list, 1); + lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout, + nr_scanned - nr_reclaimed); /* - * If reclaim is isolating dirty pages under writeback, it implies - * that the long-lived page allocation rate is exceeding the page - * laundering rate. Either the global limits are not being effective - * at throttling processes due to the page distribution throughout - * zones or there is heavy usage of a slow backing device. The - * only option is to throttle from reclaim context which is not ideal - * as there is no guarantee the dirtying process is throttled in the - * same way balance_dirty_pages() manages. - * - * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number - * of pages under pages flagged for immediate reclaim and stall if any - * are encountered in the nr_immediate check below. + * If dirty folios are scanned that are not queued for IO, it + * implies that flushers are not doing their job. This can + * happen when memory pressure pushes dirty folios to the end of + * the LRU before the dirty limits are breached and the dirty + * data has expired. It can also happen when the proportion of + * dirty folios grows not through writes but through memory + * pressure reclaiming all the clean cache. And in some cases, + * the flushers simply cannot keep up with the allocation + * rate. Nudge the flusher threads in case they are asleep. 
*/ - if (nr_writeback && nr_writeback == nr_taken) - zone_set_flag(zone, ZONE_WRITEBACK); - - /* - * memcg will stall in page writeback so only consider forcibly - * stalling for global reclaim - */ - if (global_reclaim(sc)) { - /* - * Tag a zone as congested if all the dirty pages scanned were - * backed by a congested BDI and wait_iff_congested will stall. - */ - if (nr_dirty && nr_dirty == nr_congested) - zone_set_flag(zone, ZONE_CONGESTED); - + if (stat.nr_unqueued_dirty == nr_taken) { + wakeup_flusher_threads(WB_REASON_VMSCAN); /* - * If dirty pages are scanned that are not queued for IO, it - * implies that flushers are not keeping up. In this case, flag - * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing - * pages from reclaim context. It will forcibly stall in the - * next check. - */ - if (nr_unqueued_dirty == nr_taken) - zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); - - /* - * In addition, if kswapd scans pages marked marked for - * immediate reclaim and under writeback (nr_immediate), it - * implies that pages are cycling through the LRU faster than - * they are written so also forcibly stall. + * For cgroupv1 dirty throttling is achieved by waking up + * the kernel flusher here and later waiting on folios + * which are in writeback to finish (see shrink_folio_list()). + * + * Flusher may not be able to issue writeback quickly + * enough for cgroupv1 writeback throttling to work + * on a large system. */ - if (nr_unqueued_dirty == nr_taken || nr_immediate) - congestion_wait(BLK_RW_ASYNC, HZ/10); + if (!writeback_throttling_sane(sc)) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); } - /* - * Stall direct reclaim for IO completions if underlying BDIs or zone - * is congested. Allow kswapd to continue until it starts encountering - * unqueued dirty pages or cycling through the LRU too quickly. 
- */ - if (!sc->hibernation_mode && !current_is_kswapd()) - wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); - - trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, - zone_idx(zone), - nr_scanned, nr_reclaimed, - sc->priority, - trace_shrink_flags(file)); + sc->nr.dirty += stat.nr_dirty; + sc->nr.congested += stat.nr_congested; + sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; + sc->nr.writeback += stat.nr_writeback; + sc->nr.immediate += stat.nr_immediate; + sc->nr.taken += nr_taken; + if (file) + sc->nr.file_taken += nr_taken; + + trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, + nr_scanned, nr_reclaimed, &stat, sc->priority, file); return nr_reclaimed; } /* - * This moves pages from the active list to the inactive list. + * shrink_active_list() moves folios from the active LRU to the inactive LRU. * - * We move them the other way if the page is referenced by one or more - * processes, from rmap. + * We move them the other way if the folio is referenced by one or more + * processes. * - * If the pages are mostly unmapped, the processing is fast and it is - * appropriate to hold zone->lru_lock across the whole operation. But if - * the pages are mapped, the processing is slow (page_referenced()) so we - * should drop zone->lru_lock around each page. It's impossible to balance - * this, so instead we remove the pages from the LRU while processing them. - * It is safe to rely on PG_active against the non-LRU pages in here because - * nobody will play with that bit on a non-LRU page. + * If the folios are mostly unmapped, the processing is fast and it is + * appropriate to hold lru_lock across the whole operation. But if + * the folios are mapped, the processing is slow (folio_referenced()), so + * we should drop lru_lock around each folio. It's impossible to balance + * this, so instead we remove the folios from the LRU while processing them. 
+ * It is safe to rely on the active flag against the non-LRU folios in here + * because nobody will play with that bit on a non-LRU folio. * - * The downside is that we have to touch page->_count against each page. - * But we had to alter page->flags anyway. + * The downside is that we have to touch folio->_refcount against each folio. + * But we had to alter folio->flags anyway. */ - -static void move_active_pages_to_lru(struct lruvec *lruvec, - struct list_head *list, - struct list_head *pages_to_free, - enum lru_list lru) -{ - struct zone *zone = lruvec_zone(lruvec); - unsigned long pgmoved = 0; - struct page *page; - int nr_pages; - - while (!list_empty(list)) { - page = lru_to_page(list); - lruvec = mem_cgroup_page_lruvec(page, zone); - - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - - nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); - list_move(&page->lru, &lruvec->lists[lru]); - pgmoved += nr_pages; - - if (put_page_testzero(page)) { - __ClearPageLRU(page); - __ClearPageActive(page); - del_page_from_lru_list(page, lruvec, lru); - - if (unlikely(PageCompound(page))) { - spin_unlock_irq(&zone->lru_lock); - (*get_compound_page_dtor(page))(page); - spin_lock_irq(&zone->lru_lock); - } else - list_add(&page->lru, pages_to_free); - } - } - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); - if (!is_active_lru(lru)) - __count_vm_events(PGDEACTIVATE, pgmoved); -} - static void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, @@ -1563,190 +2094,209 @@ static void shrink_active_list(unsigned long nr_to_scan, { unsigned long nr_taken; unsigned long nr_scanned; - unsigned long vm_flags; - LIST_HEAD(l_hold); /* The pages which were snipped off */ + vm_flags_t vm_flags; + LIST_HEAD(l_hold); /* The folios which were snipped off */ LIST_HEAD(l_active); LIST_HEAD(l_inactive); - struct page *page; - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - unsigned long 
nr_rotated = 0; - isolate_mode_t isolate_mode = 0; - int file = is_file_lru(lru); - struct zone *zone = lruvec_zone(lruvec); + unsigned nr_deactivate, nr_activate; + unsigned nr_rotated = 0; + bool file = is_file_lru(lru); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); lru_add_drain(); - if (!sc->may_unmap) - isolate_mode |= ISOLATE_UNMAPPED; - if (!sc->may_writepage) - isolate_mode |= ISOLATE_CLEAN; + spin_lock_irq(&lruvec->lru_lock); - spin_lock_irq(&zone->lru_lock); + nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, + &nr_scanned, sc, lru); - nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, - &nr_scanned, sc, isolate_mode, lru); - if (global_reclaim(sc)) - zone->pages_scanned += nr_scanned; + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - reclaim_stat->recent_scanned[file] += nr_taken; + if (!cgroup_reclaim(sc)) + __count_vm_events(PGREFILL, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); - __count_zone_vm_events(PGREFILL, zone, nr_scanned); - __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); while (!list_empty(&l_hold)) { + struct folio *folio; + cond_resched(); - page = lru_to_page(&l_hold); - list_del(&page->lru); + folio = lru_to_folio(&l_hold); + list_del(&folio->lru); - if (unlikely(!page_evictable(page))) { - putback_lru_page(page); + if (unlikely(!folio_evictable(folio))) { + folio_putback_lru(folio); continue; } if (unlikely(buffer_heads_over_limit)) { - if (page_has_private(page) && trylock_page(page)) { - if (page_has_private(page)) - try_to_release_page(page, 0); - unlock_page(page); + if (folio_needs_release(folio) && + folio_trylock(folio)) { + filemap_release_folio(folio, 0); + folio_unlock(folio); } } - if (page_referenced(page, 0, sc->target_mem_cgroup, - &vm_flags)) { - nr_rotated += hpage_nr_pages(page); + /* Referenced or 
rmap lock contention: rotate */ + if (folio_referenced(folio, 0, sc->target_mem_cgroup, + &vm_flags) != 0) { /* - * Identify referenced, file-backed active pages and + * Identify referenced, file-backed active folios and * give them one more trip around the active list. So * that executable code get better chances to stay in - * memory under moderate memory pressure. Anon pages + * memory under moderate memory pressure. Anon folios * are not likely to be evicted by use-once streaming - * IO, plus JVM can create lots of anon VM_EXEC pages, + * IO, plus JVM can create lots of anon VM_EXEC folios, * so we ignore them here. */ - if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { - list_add(&page->lru, &l_active); + if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) { + nr_rotated += folio_nr_pages(folio); + list_add(&folio->lru, &l_active); continue; } } - ClearPageActive(page); /* we are de-activating */ - list_add(&page->lru, &l_inactive); + folio_clear_active(folio); /* we are de-activating */ + folio_set_workingset(folio); + list_add(&folio->lru, &l_inactive); } /* - * Move pages back to the lru list. - */ - spin_lock_irq(&zone->lru_lock); - /* - * Count referenced pages from currently used mappings as rotated, - * even though only some of them are actually re-activated. This - * helps balance scan pressure between file and anonymous pages in - * get_scan_ratio. + * Move folios back to the lru list. 
*/ - reclaim_stat->recent_rotated[file] += nr_rotated; + spin_lock_irq(&lruvec->lru_lock); + + nr_activate = move_folios_to_lru(lruvec, &l_active); + nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); + + __count_vm_events(PGDEACTIVATE, nr_deactivate); + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); - move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); - move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(&zone->lru_lock); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - free_hot_cold_page_list(&l_hold, 1); + lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated); + trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, + nr_deactivate, nr_rotated, sc->priority, file); } -#ifdef CONFIG_SWAP -static int inactive_anon_is_low_global(struct zone *zone) +static unsigned int reclaim_folio_list(struct list_head *folio_list, + struct pglist_data *pgdat) { - unsigned long active, inactive; - - active = zone_page_state(zone, NR_ACTIVE_ANON); - inactive = zone_page_state(zone, NR_INACTIVE_ANON); + struct reclaim_stat stat; + unsigned int nr_reclaimed; + struct folio *folio; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + .no_demotion = 1, + }; - if (inactive * zone->inactive_ratio < active) - return 1; + nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL); + while (!list_empty(folio_list)) { + folio = lru_to_folio(folio_list); + list_del(&folio->lru); + folio_putback_lru(folio); + } + trace_mm_vmscan_reclaim_pages(pgdat->node_id, sc.nr_scanned, nr_reclaimed, &stat); - return 0; + return nr_reclaimed; } -/** - * inactive_anon_is_low - check if anonymous pages need to be deactivated - * @lruvec: LRU vector to check - * - * Returns true if the zone does not have enough inactive anon pages, - * meaning some 
active anon pages need to be deactivated. - */ -static int inactive_anon_is_low(struct lruvec *lruvec) +unsigned long reclaim_pages(struct list_head *folio_list) { - /* - * If we don't have swap space, anonymous page deactivation - * is pointless. - */ - if (!total_swap_pages) - return 0; + int nid; + unsigned int nr_reclaimed = 0; + LIST_HEAD(node_folio_list); + unsigned int noreclaim_flag; - if (!mem_cgroup_disabled()) - return mem_cgroup_inactive_anon_is_low(lruvec); + if (list_empty(folio_list)) + return nr_reclaimed; - return inactive_anon_is_low_global(lruvec_zone(lruvec)); -} -#else -static inline int inactive_anon_is_low(struct lruvec *lruvec) -{ - return 0; -} -#endif + noreclaim_flag = memalloc_noreclaim_save(); -/** - * inactive_file_is_low - check if file pages need to be deactivated - * @lruvec: LRU vector to check - * - * When the system is doing streaming IO, memory pressure here - * ensures that active file pages get deactivated, until more - * than half of the file pages are on the inactive list. - * - * Once we get to that situation, protect the system's working - * set from being evicted by disabling active file page aging. - * - * This uses a different ratio than the anonymous pages, because - * the page cache uses a use-once replacement algorithm. 
- */ -static int inactive_file_is_low(struct lruvec *lruvec) -{ - unsigned long inactive; - unsigned long active; + nid = folio_nid(lru_to_folio(folio_list)); + do { + struct folio *folio = lru_to_folio(folio_list); - inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); - active = get_lru_size(lruvec, LRU_ACTIVE_FILE); + if (nid == folio_nid(folio)) { + folio_clear_active(folio); + list_move(&folio->lru, &node_folio_list); + continue; + } - return active > inactive; -} + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); + nid = folio_nid(lru_to_folio(folio_list)); + } while (!list_empty(folio_list)); -static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) -{ - if (is_file_lru(lru)) - return inactive_file_is_low(lruvec); - else - return inactive_anon_is_low(lruvec); + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); + + memalloc_noreclaim_restore(noreclaim_flag); + + return nr_reclaimed; } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, lru)) + if (sc->may_deactivate & (1 << is_file_lru(lru))) shrink_active_list(nr_to_scan, lruvec, sc, lru); + else + sc->skipped_deactivate = 1; return 0; } return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); } -static int vmscan_swappiness(struct scan_control *sc) +/* + * The inactive anon list should be small enough that the VM never has + * to do too much work. + * + * The inactive file list should be small enough to leave most memory + * to the established workingset on the scan-resistant active list, + * but large enough to avoid thrashing the aggregate readahead window. + * + * Both inactive lists should also be large enough that each inactive + * folio has a chance to be referenced again before it is reclaimed. + * + * If that fails and refaulting is observed, the inactive list grows. 
+ * + * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios + * on this LRU, maintained by the pageout code. An inactive_ratio + * of 3 means 3:1 or 25% of the folios are kept on the inactive list. + * + * total target max + * memory ratio inactive + * ------------------------------------- + * 10MB 1 5MB + * 100MB 1 50MB + * 1GB 3 250MB + * 10GB 10 0.9GB + * 100GB 31 3GB + * 1TB 101 10GB + * 10TB 320 32GB + */ +static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) { - if (global_reclaim(sc)) - return vm_swappiness; - return mem_cgroup_swappiness(sc->target_mem_cgroup); + enum lru_list active_lru = inactive_lru + LRU_ACTIVE; + unsigned long inactive, active; + unsigned long inactive_ratio; + unsigned long gb; + + inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru); + active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru); + + gb = (inactive + active) >> (30 - PAGE_SHIFT); + if (gb) + inactive_ratio = int_sqrt(10 * gb); + else + inactive_ratio = 1; + + return inactive * inactive_ratio < active; } enum scan_balance { @@ -1756,46 +2306,230 @@ enum scan_balance { SCAN_FILE, }; +static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) +{ + unsigned long file; + struct lruvec *target_lruvec; + + if (lru_gen_enabled()) + return; + + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + /* + * Flush the memory cgroup stats in rate-limited way as we don't need + * most accurate stats here. We may switch to regular stats flushing + * in the future once it is cheap enough. + */ + mem_cgroup_flush_stats_ratelimited(sc->target_mem_cgroup); + + /* + * Determine the scan balance between anon and file LRUs. + */ + spin_lock_irq(&target_lruvec->lru_lock); + sc->anon_cost = target_lruvec->anon_cost; + sc->file_cost = target_lruvec->file_cost; + spin_unlock_irq(&target_lruvec->lru_lock); + + /* + * Target desirable inactive:active list ratios for the anon + * and file LRU lists. 
+ */ + if (!sc->force_deactivate) { + unsigned long refaults; + + /* + * When refaults are being observed, it means a new + * workingset is being established. Deactivate to get + * rid of any stale active pages quickly. + */ + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_ANON); + if (refaults != target_lruvec->refaults[WORKINGSET_ANON] || + inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) + sc->may_deactivate |= DEACTIVATE_ANON; + else + sc->may_deactivate &= ~DEACTIVATE_ANON; + + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_FILE); + if (refaults != target_lruvec->refaults[WORKINGSET_FILE] || + inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) + sc->may_deactivate |= DEACTIVATE_FILE; + else + sc->may_deactivate &= ~DEACTIVATE_FILE; + } else + sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; + + /* + * If we have plenty of inactive file pages that aren't + * thrashing, try to reclaim those first before touching + * anonymous pages. + */ + file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); + if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE) && + !sc->no_cache_trim_mode) + sc->cache_trim_mode = 1; + else + sc->cache_trim_mode = 0; + + /* + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. 
+ */ + if (!cgroup_reclaim(sc)) { + unsigned long total_high_wmark = 0; + unsigned long free, anon; + int z; + struct zone *zone; + + free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + file = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); + + for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - 1) { + total_high_wmark += high_wmark_pages(zone); + } + + /* + * Consider anon: if that's low too, this isn't a + * runaway file reclaim problem, but rather just + * extreme pressure. Reclaim as per usual then. + */ + anon = node_page_state(pgdat, NR_INACTIVE_ANON); + + sc->file_is_tiny = + file + free <= total_high_wmark && + !(sc->may_deactivate & DEACTIVATE_ANON) && + anon >> sc->priority; + } +} + +static inline void calculate_pressure_balance(struct scan_control *sc, + int swappiness, u64 *fraction, u64 *denominator) +{ + unsigned long anon_cost, file_cost, total_cost; + unsigned long ap, fp; + + /* + * Calculate the pressure balance between anon and file pages. + * + * The amount of pressure we put on each LRU is inversely + * proportional to the cost of reclaiming each list, as + * determined by the share of pages that are refaulting, times + * the relative IO cost of bringing back a swapped out + * anonymous page vs reloading a filesystem page (swappiness). + * + * Although we limit that influence to ensure no list gets + * left behind completely: at least a third of the pressure is + * applied, before swappiness. + * + * With swappiness at 100, anon and file have equal IO cost. 
+ */ + total_cost = sc->anon_cost + sc->file_cost; + anon_cost = total_cost + sc->anon_cost; + file_cost = total_cost + sc->file_cost; + total_cost = anon_cost + file_cost; + + ap = swappiness * (total_cost + 1); + ap /= anon_cost + 1; + + fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1); + fp /= file_cost + 1; + + fraction[WORKINGSET_ANON] = ap; + fraction[WORKINGSET_FILE] = fp; + *denominator = ap + fp; +} + +static unsigned long apply_proportional_protection(struct mem_cgroup *memcg, + struct scan_control *sc, unsigned long scan) +{ + unsigned long min, low; + + mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low); + + if (min || low) { + /* + * Scale a cgroup's reclaim pressure by proportioning + * its current usage to its memory.low or memory.min + * setting. + * + * This is important, as otherwise scanning aggression + * becomes extremely binary -- from nothing as we + * approach the memory protection threshold, to totally + * nominal as we exceed it. This results in requiring + * setting extremely liberal protection thresholds. It + * also means we simply get no protection at all if we + * set it too low, which is not ideal. + * + * If there is any protection in place, we reduce scan + * pressure by how much of the total memory used is + * within protection thresholds. + * + * There is one special case: in the first reclaim pass, + * we skip over all groups that are within their low + * protection. If that fails to reclaim enough pages to + * satisfy the reclaim goal, we come back and override + * the best-effort low protection. However, we still + * ideally want to honor how well-behaved groups are in + * that case instead of simply punishing them all + * equally. As such, we reclaim them based on how much + * memory they are using, reducing the scan pressure + * again by how much of the total memory used is under + * hard protection. 
+ */ + unsigned long cgroup_size = mem_cgroup_size(memcg); + unsigned long protection; + + /* memory.low scaling, make sure we retry before OOM */ + if (!sc->memcg_low_reclaim && low > min) { + protection = low; + sc->memcg_low_skipped = 1; + } else { + protection = min; + } + + /* Avoid TOCTOU with earlier protection check */ + cgroup_size = max(cgroup_size, protection); + + scan -= scan * protection / (cgroup_size + 1); + + /* + * Minimally target SWAP_CLUSTER_MAX pages to keep + * reclaim moving forwards, avoiding decrementing + * sc->priority further than desirable. + */ + scan = max(scan, SWAP_CLUSTER_MAX); + } + return scan; +} + /* * Determine how aggressively the anon and file LRU lists should be - * scanned. The relative value of each set of LRU lists is determined - * by looking at the fraction of the pages scanned we did rotate back - * onto the active list instead of evict. + * scanned. * - * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan - * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan + * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan + * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan */ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) { - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - u64 fraction[2]; + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + int swappiness = sc_swappiness(sc, memcg); + u64 fraction[ANON_AND_FILE]; u64 denominator = 0; /* gcc */ - struct zone *zone = lruvec_zone(lruvec); - unsigned long anon_prio, file_prio; enum scan_balance scan_balance; - unsigned long anon, file, free; - bool force_scan = false; - unsigned long ap, fp; enum lru_list lru; - /* - * If the zone or memcg is small, nr[l] can be 0. This - * results in no scanning on this priority and a potential - * priority drop. 
Global direct reclaim can go to the next - * zone and tends to have no problems. Global kswapd is for - * zone balancing and it needs to scan a minimum amount. When - * reclaiming for a memcg, a priority drop can cause high - * latencies, so it's better to scan a minimum amount there as - * well. - */ - if (current_is_kswapd() && zone->all_unreclaimable) - force_scan = true; - if (!global_reclaim(sc)) - force_scan = true; - - /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { + /* If we have no swap space, do not bother scanning anon folios. */ + if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { scan_balance = SCAN_FILE; goto out; } @@ -1807,106 +2541,65 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * using the memory controller's swap limit feature would be * too expensive. */ - if (!global_reclaim(sc) && !vmscan_swappiness(sc)) { + if (cgroup_reclaim(sc) && !swappiness) { scan_balance = SCAN_FILE; goto out; } + /* Proactive reclaim initiated by userspace for anonymous memory only */ + if (swappiness == SWAPPINESS_ANON_ONLY) { + WARN_ON_ONCE(!sc->proactive); + scan_balance = SCAN_ANON; + goto out; + } + /* * Do not apply any pressure balancing cleverness when the * system is close to OOM, scan both anon and file equally * (unless the swappiness setting disagrees with swapping). */ - if (!sc->priority && vmscan_swappiness(sc)) { + if (!sc->priority && swappiness) { scan_balance = SCAN_EQUAL; goto out; } - anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + - get_lru_size(lruvec, LRU_INACTIVE_ANON); - file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + - get_lru_size(lruvec, LRU_INACTIVE_FILE); - /* - * If it's foreseeable that reclaiming the file cache won't be - * enough to get the zone back into a desirable shape, we have - * to swap. Better start now and leave the - probably heavily - * thrashing - remaining file pages alone. 
+ * If the system is almost out of file pages, force-scan anon. */ - if (global_reclaim(sc)) { - free = zone_page_state(zone, NR_FREE_PAGES); - if (unlikely(file + free <= high_wmark_pages(zone))) { - scan_balance = SCAN_ANON; - goto out; - } + if (sc->file_is_tiny) { + scan_balance = SCAN_ANON; + goto out; } /* - * There is enough inactive page cache, do not reclaim - * anything from the anonymous working set right now. + * If there is enough inactive page cache, we do not reclaim + * anything from the anonymous working right now to make sure + * a streaming file access pattern doesn't cause swapping. */ - if (!inactive_file_is_low(lruvec)) { + if (sc->cache_trim_mode) { scan_balance = SCAN_FILE; goto out; } scan_balance = SCAN_FRACT; + calculate_pressure_balance(sc, swappiness, fraction, &denominator); - /* - * With swappiness at 100, anonymous and file have the same priority. - * This scanning priority is essentially the inverse of IO cost. - */ - anon_prio = vmscan_swappiness(sc); - file_prio = 200 - anon_prio; - - /* - * OK, so we have swap space and a fair amount of page cache - * pages. We use the recently rotated / recently scanned - * ratios to determine how valuable each cache is. - * - * Because workloads change over time (and to avoid overflow) - * we keep these statistics as a floating average, which ends - * up weighing recent references more than old ones. - * - * anon in [0], file in [1] - */ - spin_lock_irq(&zone->lru_lock); - if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { - reclaim_stat->recent_scanned[0] /= 2; - reclaim_stat->recent_rotated[0] /= 2; - } - - if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { - reclaim_stat->recent_scanned[1] /= 2; - reclaim_stat->recent_rotated[1] /= 2; - } - - /* - * The amount of pressure on anon vs file pages is inversely - * proportional to the fraction of recently scanned pages on - * each list that were recently referenced and in active use. 
- */ - ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); - ap /= reclaim_stat->recent_rotated[0] + 1; - - fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); - fp /= reclaim_stat->recent_rotated[1] + 1; - spin_unlock_irq(&zone->lru_lock); - - fraction[0] = ap; - fraction[1] = fp; - denominator = ap + fp + 1; out: for_each_evictable_lru(lru) { - int file = is_file_lru(lru); - unsigned long size; + bool file = is_file_lru(lru); + unsigned long lruvec_size; unsigned long scan; - size = get_lru_size(lruvec, lru); - scan = size >> sc->priority; + lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + scan = apply_proportional_protection(memcg, sc, lruvec_size); + scan >>= sc->priority; - if (!scan && force_scan) - scan = min(size, SWAP_CLUSTER_MAX); + /* + * If the cgroup's already been deleted, make sure to + * scrape out the remaining cache. + */ + if (!scan && !mem_cgroup_online(memcg)) + scan = min(lruvec_size, SWAP_CLUSTER_MAX); switch (scan_balance) { case SCAN_EQUAL: @@ -1916,8 +2609,14 @@ out: /* * Scan types proportional to swappiness and * their relative recent reclaim efficiency. + * Make sure we don't miss the last page on + * the offlined memory cgroups because of a + * round-off error. */ - scan = div64_u64(scan * fraction[file], denominator); + scan = mem_cgroup_online(memcg) ? + div64_u64(scan * fraction[file], denominator) : + DIV64_U64_ROUND_UP(scan * fraction[file], + denominator); break; case SCAN_FILE: case SCAN_ANON: @@ -1929,13 +2628,3145 @@ out: /* Look ma, no brain */ BUG(); } + nr[lru] = scan; } } /* - * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. + * Anonymous LRU management is a waste if there is + * ultimately no way to reclaim the memory. 
+ */ +static bool can_age_anon_pages(struct lruvec *lruvec, + struct scan_control *sc) +{ + /* Aging the anon LRU is valuable if swap is present: */ + if (total_swap_pages > 0) + return true; + + /* Also valuable if anon pages can be demoted: */ + return can_demote(lruvec_pgdat(lruvec)->node_id, sc, + lruvec_memcg(lruvec)); +} + +#ifdef CONFIG_LRU_GEN + +#ifdef CONFIG_LRU_GEN_ENABLED +DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); +#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) +#else +DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); +#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) +#endif + +static bool should_walk_mmu(void) +{ + return arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK); +} + +static bool should_clear_pmd_young(void) +{ + return arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG); +} + +/****************************************************************************** + * shorthand helpers + ******************************************************************************/ + +#define DEFINE_MAX_SEQ(lruvec) \ + unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) + +#define DEFINE_MIN_SEQ(lruvec) \ + unsigned long min_seq[ANON_AND_FILE] = { \ + READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \ + READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ + } + +/* Get the min/max evictable type based on swappiness */ +#define min_type(swappiness) (!(swappiness)) +#define max_type(swappiness) ((swappiness) < SWAPPINESS_ANON_ONLY) + +#define evictable_min_seq(min_seq, swappiness) \ + min((min_seq)[min_type(swappiness)], (min_seq)[max_type(swappiness)]) + +#define for_each_gen_type_zone(gen, type, zone) \ + for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ + for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) + +#define for_each_evictable_type(type, swappiness) \ + for ((type) = min_type(swappiness); (type) <= max_type(swappiness); 
(type)++) + +#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) +#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) + +static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + +#ifdef CONFIG_MEMCG + if (memcg) { + struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; + + /* see the comment in mem_cgroup_lruvec() */ + if (!lruvec->pgdat) + lruvec->pgdat = pgdat; + + return lruvec; + } +#endif + VM_WARN_ON_ONCE(!mem_cgroup_disabled()); + + return &pgdat->__lruvec; +} + +static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) +{ + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + if (!sc->may_swap) + return 0; + + if (!can_demote(pgdat->node_id, sc, memcg) && + mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) + return 0; + + return sc_swappiness(sc, memcg); +} + +static int get_nr_gens(struct lruvec *lruvec, int type) +{ + return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; +} + +static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) +{ + int type; + + for (type = 0; type < ANON_AND_FILE; type++) { + int n = get_nr_gens(lruvec, type); + + if (n < MIN_NR_GENS || n > MAX_NR_GENS) + return false; + } + + return true; +} + +/****************************************************************************** + * Bloom filters + ******************************************************************************/ + +/* + * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when + * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of + * bits in a bitmap, k is the number of hash functions and n is the number of + * inserted items. + * + * Page table walkers use one of the two filters to reduce their search space. 
+ * To get rid of non-leaf entries that no longer have enough leaf entries, the + * aging uses the double-buffering technique to flip to the other filter each + * time it produces a new generation. For non-leaf entries that have enough + * leaf entries, the aging carries them over to the next generation in + * walk_pmd_range(); the eviction also report them when walking the rmap + * in lru_gen_look_around(). + * + * For future optimizations: + * 1. It's not necessary to keep both filters all the time. The spare one can be + * freed after the RCU grace period and reallocated if needed again. + * 2. And when reallocating, it's worth scaling its size according to the number + * of inserted entries in the other filter, to reduce the memory overhead on + * small systems and false positives on large systems. + * 3. Jenkins' hash function is an alternative to Knuth's. + */ +#define BLOOM_FILTER_SHIFT 15 + +static inline int filter_gen_from_seq(unsigned long seq) +{ + return seq % NR_BLOOM_FILTERS; +} + +static void get_item_key(void *item, int *key) +{ + u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); + + BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); + + key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); + key[1] = hash >> BLOOM_FILTER_SHIFT; +} + +static bool test_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq, + void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(mm_state->filters[gen]); + if (!filter) + return true; + + get_item_key(item, key); + + return test_bit(key[0], filter) && test_bit(key[1], filter); +} + +static void update_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq, + void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(mm_state->filters[gen]); + if (!filter) + return; + + get_item_key(item, key); + + if (!test_bit(key[0], filter)) + set_bit(key[0], filter); + if (!test_bit(key[1], 
filter)) + set_bit(key[1], filter); +} + +static void reset_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq) +{ + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = mm_state->filters[gen]; + if (filter) { + bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); + return; + } + + filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + WRITE_ONCE(mm_state->filters[gen], filter); +} + +/****************************************************************************** + * mm_struct list + ******************************************************************************/ + +#ifdef CONFIG_LRU_GEN_WALKS_MMU + +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) +{ + static struct lru_gen_mm_list mm_list = { + .fifo = LIST_HEAD_INIT(mm_list.fifo), + .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), + }; + +#ifdef CONFIG_MEMCG + if (memcg) + return &memcg->mm_list; +#endif + VM_WARN_ON_ONCE(!mem_cgroup_disabled()); + + return &mm_list; +} + +static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec) +{ + return &lruvec->mm_state; +} + +static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) +{ + int key; + struct mm_struct *mm; + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); + + mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); + key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); + + if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) + return NULL; + + clear_bit(key, &mm->lru_gen.bitmap); + + return mmget_not_zero(mm) ? 
mm : NULL; +} + +void lru_gen_add_mm(struct mm_struct *mm) +{ + int nid; + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + + VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list)); +#ifdef CONFIG_MEMCG + VM_WARN_ON_ONCE(mm->lru_gen.memcg); + mm->lru_gen.memcg = memcg; +#endif + spin_lock(&mm_list->lock); + + for_each_node_state(nid, N_MEMORY) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); + + /* the first addition since the last iteration */ + if (mm_state->tail == &mm_list->fifo) + mm_state->tail = &mm->lru_gen.list; + } + + list_add_tail(&mm->lru_gen.list, &mm_list->fifo); + + spin_unlock(&mm_list->lock); +} + +void lru_gen_del_mm(struct mm_struct *mm) +{ + int nid; + struct lru_gen_mm_list *mm_list; + struct mem_cgroup *memcg = NULL; + + if (list_empty(&mm->lru_gen.list)) + return; + +#ifdef CONFIG_MEMCG + memcg = mm->lru_gen.memcg; +#endif + mm_list = get_mm_list(memcg); + + spin_lock(&mm_list->lock); + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); + + /* where the current iteration continues after */ + if (mm_state->head == &mm->lru_gen.list) + mm_state->head = mm_state->head->prev; + + /* where the last iteration ended before */ + if (mm_state->tail == &mm->lru_gen.list) + mm_state->tail = mm_state->tail->next; + } + + list_del_init(&mm->lru_gen.list); + + spin_unlock(&mm_list->lock); + +#ifdef CONFIG_MEMCG + mem_cgroup_put(mm->lru_gen.memcg); + mm->lru_gen.memcg = NULL; +#endif +} + +#ifdef CONFIG_MEMCG +void lru_gen_migrate_mm(struct mm_struct *mm) +{ + struct mem_cgroup *memcg; + struct task_struct *task = rcu_dereference_protected(mm->owner, true); + + VM_WARN_ON_ONCE(task->mm != mm); + lockdep_assert_held(&task->alloc_lock); + + /* for mm_update_next_owner() */ + if (mem_cgroup_disabled()) + return; + + /* migration can happen before addition */ + 
if (!mm->lru_gen.memcg) + return; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(task); + rcu_read_unlock(); + if (memcg == mm->lru_gen.memcg) + return; + + VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); + + lru_gen_del_mm(mm); + lru_gen_add_mm(mm); +} +#endif + +#else /* !CONFIG_LRU_GEN_WALKS_MMU */ + +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) +{ + return NULL; +} + +static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec) +{ + return NULL; +} + +static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) +{ + return NULL; +} + +#endif + +static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last) +{ + int i; + int hist; + struct lruvec *lruvec = walk->lruvec; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); + + lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); + + hist = lru_hist_from_seq(walk->seq); + + for (i = 0; i < NR_MM_STATS; i++) { + WRITE_ONCE(mm_state->stats[hist][i], + mm_state->stats[hist][i] + walk->mm_stats[i]); + walk->mm_stats[i] = 0; + } + + if (NR_HIST_GENS > 1 && last) { + hist = lru_hist_from_seq(walk->seq + 1); + + for (i = 0; i < NR_MM_STATS; i++) + WRITE_ONCE(mm_state->stats[hist][i], 0); + } +} + +static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **iter) +{ + bool first = false; + bool last = false; + struct mm_struct *mm = NULL; + struct lruvec *lruvec = walk->lruvec; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); + + /* + * mm_state->seq is incremented after each iteration of mm_list. There + * are three interesting cases for this page table walker: + * 1. It tries to start a new iteration with a stale max_seq: there is + * nothing left to do. + * 2. It started the next iteration: it needs to reset the Bloom filter + * so that a fresh set of PTE tables can be recorded. + * 3. 
It ended the current iteration: it needs to reset the mm stats + * counters and tell its caller to increment max_seq. + */ + spin_lock(&mm_list->lock); + + VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->seq); + + if (walk->seq <= mm_state->seq) + goto done; + + if (!mm_state->head) + mm_state->head = &mm_list->fifo; + + if (mm_state->head == &mm_list->fifo) + first = true; + + do { + mm_state->head = mm_state->head->next; + if (mm_state->head == &mm_list->fifo) { + WRITE_ONCE(mm_state->seq, mm_state->seq + 1); + last = true; + break; + } + + /* force scan for those added after the last iteration */ + if (!mm_state->tail || mm_state->tail == mm_state->head) { + mm_state->tail = mm_state->head->next; + walk->force_scan = true; + } + } while (!(mm = get_next_mm(walk))); +done: + if (*iter || last) + reset_mm_stats(walk, last); + + spin_unlock(&mm_list->lock); + + if (mm && first) + reset_bloom_filter(mm_state, walk->seq + 1); + + if (*iter) + mmput_async(*iter); + + *iter = mm; + + return last; +} +/* + * Counterpart of iterate_mm_list() that visits no mm: if @seq is ahead of + * mm_state->seq, head and tail are reset to NULL and mm_state->seq is + * incremented, all under mm_list->lock. Returns true if the sequence + * number was incremented. + */ +static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long seq) +{ + bool success = false; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); + + spin_lock(&mm_list->lock); + + VM_WARN_ON_ONCE(mm_state->seq + 1 < seq); + + if (seq > mm_state->seq) { + mm_state->head = NULL; + mm_state->tail = NULL; + WRITE_ONCE(mm_state->seq, mm_state->seq + 1); + success = true; + } + + spin_unlock(&mm_list->lock); + + return success; +} + +/****************************************************************************** + * PID controller + ******************************************************************************/ + +/* + * A feedback loop based on Proportional-Integral-Derivative (PID) controller. 
+ * + * The P term is refaulted/(evicted+protected) from a tier in the generation + * currently being evicted; the I term is the exponential moving average of the + * P term over the generations previously evicted, using the smoothing factor + * 1/2; the D term isn't supported. + * + * The setpoint (SP) is always the first tier of one type; the process variable + * (PV) is either any tier of the other type or any other tier of the same + * type. + * + * The error is the difference between the SP and the PV; the correction is to + * turn off protection when SP>PV or turn on protection when SP<PV. + * + * For future optimizations: + * 1. The D term may discount the other two terms over time so that long-lived + * generations can resist stale information. + */ +struct ctrl_pos { + unsigned long refaulted; /* summed by read_ctrl_pos() */ + unsigned long total; /* averaged total + protected + evicted */ + int gain; /* weight applied in positive_ctrl_err() */ +}; +/* + * Fill @pos for @type: store @gain and accumulate, for tiers + * tier % MAX_NR_TIERS through min(tier, MAX_NR_TIERS - 1), the averaged plus + * current refault counts and the averaged total plus the current protected + * and evicted counts. "Current" means the history slot of min_seq[type]. + */ +static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, + struct ctrl_pos *pos) +{ + int i; + struct lru_gen_folio *lrugen = &lruvec->lrugen; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + pos->gain = gain; + pos->refaulted = pos->total = 0; + + for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - 1); i++) { + pos->refaulted += lrugen->avg_refaulted[type][i] + + atomic_long_read(&lrugen->refaulted[hist][type][i]); + pos->total += lrugen->avg_total[type][i] + + lrugen->protected[hist][type][i] + + atomic_long_read(&lrugen->evicted[hist][type][i]); + } +} +/* + * With @carryover, fold the counters of the min_seq[type] history slot into + * avg_refaulted[] and avg_total[] as (avg + current) / 2; without it, operate + * on the slot of max_seq + 1. The slot's counters are zeroed when @carryover + * is set and NR_HIST_GENS == 1, or when it is clear and NR_HIST_GENS > 1. + * Asserts that lruvec->lru_lock is held. + */ +static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) +{ + int hist, tier; + struct lru_gen_folio *lrugen = &lruvec->lrugen; + bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; + unsigned long seq = carryover ? 
lrugen->min_seq[type] : lrugen->max_seq + 1; + + lockdep_assert_held(&lruvec->lru_lock); + + if (!carryover && !clear) + return; + + hist = lru_hist_from_seq(seq); + + for (tier = 0; tier < MAX_NR_TIERS; tier++) { + if (carryover) { + unsigned long sum; + + sum = lrugen->avg_refaulted[type][tier] + + atomic_long_read(&lrugen->refaulted[hist][type][tier]); + WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); + + sum = lrugen->avg_total[type][tier] + + lrugen->protected[hist][type][tier] + + atomic_long_read(&lrugen->evicted[hist][type][tier]); + WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); + } + + if (clear) { + atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); + atomic_long_set(&lrugen->evicted[hist][type][tier], 0); + WRITE_ONCE(lrugen->protected[hist][type][tier], 0); + } + } +} +/* + * Compare the refault ratios of @pv and @sp, each weighted by its gain. The + * ratios are cross-multiplied so that no division is performed. + */ +static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) +{ + /* + * Return true if the PV has a limited number of refaults or a lower + * refaulted/total than the SP. + */ + return pv->refaulted < MIN_LRU_BATCH || + pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= + (sp->refaulted + 1) * pv->total * pv->gain; +} + +/****************************************************************************** + * the aging + ******************************************************************************/ + +/* + * promote pages accessed through page tables + * + * Returns the generation the folio was in before the update, or -1 if the + * folio was only marked PG_referenced or carries no generation. + */ +static int folio_update_gen(struct folio *folio, int gen) +{ + unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); + + VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); + + /* see the comment on LRU_REFS_FLAGS */ + if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); + return -1; + } + + do { + /* lru_gen_del_folio() has isolated this page? 
 */ + if (!(old_flags & LRU_GEN_MASK)) + return -1; + + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); + new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); + + return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; +} + +/* + * protect pages accessed multiple times through file descriptors + * + * Moves the folio from the oldest generation of its type to the next one, + * unless its flags already name a different generation. Returns the + * generation the folio ends up in. + */ +static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +{ + int type = folio_is_file_lru(folio); + struct lru_gen_folio *lrugen = &lruvec->lrugen; + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); + + VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); + + do { + new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + /* folio_update_gen() has promoted this page? */ + if (new_gen >= 0 && new_gen != old_gen) + return new_gen; + + new_gen = (old_gen + 1) % MAX_NR_GENS; + + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); + new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; + /* for folio_end_writeback() */ + if (reclaiming) + new_flags |= BIT(PG_reclaim); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); + + lru_gen_update_size(lruvec, folio, old_gen, new_gen); + + return new_gen; +} +/* + * Record in walk->nr_pages[] that @folio's pages moved from @old_gen to + * @new_gen and count the move in walk->batched. The deltas are applied + * later by reset_batch_size(). + */ +static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, + int old_gen, int new_gen) +{ + int type = folio_is_file_lru(folio); + int zone = folio_zonenum(folio); + int delta = folio_nr_pages(folio); + + VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS); + + walk->batched++; + + walk->nr_pages[old_gen][type][zone] -= delta; + walk->nr_pages[new_gen][type][zone] += delta; +} +/* + * Apply every non-zero delta in walk->nr_pages[] to lrugen->nr_pages[] and + * to the LRU list sizes, zeroing the walker's copy, and reset walk->batched. + */ +static void reset_batch_size(struct lru_gen_mm_walk *walk) +{ + int gen, type, zone; + struct lruvec *lruvec = walk->lruvec; + struct lru_gen_folio *lrugen = &lruvec->lrugen; + + walk->batched = 0; + + for_each_gen_type_zone(gen, 
type, zone) { + enum lru_list lru = type * LRU_INACTIVE_FILE; + int delta = walk->nr_pages[gen][type][zone]; + + if (!delta) + continue; + + walk->nr_pages[gen][type][zone] = 0; + WRITE_ONCE(lrugen->nr_pages[gen][type][zone], + lrugen->nr_pages[gen][type][zone] + delta); + + if (lru_gen_is_active(lruvec, gen)) + lru += LRU_ACTIVE; + __update_lru_size(lruvec, lru, zone, delta); + } +} +/* + * Decide whether the walker should skip args->vma; non-zero means skip. + * Skipped are VMAs that are inaccessible, hugetlb, without recency, + * VM_LOCKED or VM_SPECIAL, or the gate VMA; anonymous and shmem VMAs when + * walk->swappiness is zero; unevictable mappings; other file mappings when + * walk->swappiness exceeds MAX_SWAPPINESS or the mapping has no + * ->read_folio. @start and @end are not used. + */ +static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args) +{ + struct address_space *mapping; + struct vm_area_struct *vma = args->vma; + struct lru_gen_mm_walk *walk = args->private; + + if (!vma_is_accessible(vma)) + return true; + + if (is_vm_hugetlb_page(vma)) + return true; + + if (!vma_has_recency(vma)) + return true; + + if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) + return true; + + if (vma == get_gate_vma(vma->vm_mm)) + return true; + + if (vma_is_anonymous(vma)) + return !walk->swappiness; + + if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) + return true; + + mapping = vma->vm_file->f_mapping; + if (mapping_unevictable(mapping)) + return true; + + if (shmem_mapping(mapping)) + return !walk->swappiness; + + if (walk->swappiness > MAX_SWAPPINESS) + return true; + + /* to exclude special mappings like dax, etc. */ + return !mapping->a_ops->read_folio; +} + +/* + * Some userspace memory allocators map many single-page VMAs. Instead of + * returning back to the PGD table for each of such VMAs, finish an entire PMD + * table to reduce zigzags and improve cache performance. 
 */ +static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args, + unsigned long *vm_start, unsigned long *vm_end) +{ + unsigned long start = round_up(*vm_end, size); + unsigned long end = (start | ~mask) + 1; + VMA_ITERATOR(vmi, args->mm, start); + + VM_WARN_ON_ONCE(mask & size); + VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask)); + + for_each_vma(vmi, args->vma) { + /* the next VMA starts beyond the table covered by @mask */ + if (end && end <= args->vma->vm_start) + return false; + + if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) + continue; + + *vm_start = max(start, args->vma->vm_start); + *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; + + return true; + } + + return false; +} +/* + * Return the PFN mapped by @pte, or -1 if the PTE is not present, maps the + * zero page, is special, is not young in an mm without MMU notifiers, has an + * invalid PFN, or points outside @pgdat. + */ +static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr, + struct pglist_data *pgdat) +{ + unsigned long pfn = pte_pfn(pte); + + VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); + + if (!pte_present(pte) || is_zero_pfn(pfn)) + return -1; + + if (WARN_ON_ONCE(pte_special(pte))) + return -1; + + if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm)) + return -1; + + if (WARN_ON_ONCE(!pfn_valid(pfn))) + return -1; + + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + return -1; + + return pfn; +} +/* + * PMD counterpart of get_pte_pfn(): returns -1 if the entry is not present, + * is the huge zero PMD, is not young in an mm without MMU notifiers, has an + * invalid PFN, or points outside @pgdat. + */ +static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr, + struct pglist_data *pgdat) +{ + unsigned long pfn = pmd_pfn(pmd); + + VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); + + if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) + return -1; + + if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm)) + return -1; + + if (WARN_ON_ONCE(!pfn_valid(pfn))) + return -1; + + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + return -1; + + return pfn; +} +/* + * Return the folio of @pfn, or NULL unless it has a generation, sits on + * @pgdat's node and is charged to @memcg. + */ +static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, + struct pglist_data *pgdat) +{ + struct folio *folio = pfn_folio(pfn); + + if (folio_lru_gen(folio) < 0) + return NULL; + + if (folio_nid(folio) != 
pgdat->node_id) + return NULL; + + if (folio_memcg(folio) != memcg) + return NULL; + + return folio; +} +/* + * Tell whether enough of the @total scanned entries were @young. The + * threshold n is the number of PTEs per cache line, clamped to [2, 8]. + */ +static bool suitable_to_scan(int total, int young) +{ + int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); + + /* suitable if the average number of young PTEs per cacheline is >=1 */ + return young * n >= total; +} +/* + * Apply the outcome of a scan to @folio; a NULL @folio is ignored. The folio + * is marked dirty if @dirty is set and it is neither dirty already nor an + * anon swapbacked folio outside the swap cache. With a @walk, its generation + * is updated to @new_gen and the size change is batched; without one, + * lru_gen_set_refs() decides whether the folio is activated. + */ +static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio, + int new_gen, bool dirty) +{ + int old_gen; + + if (!folio) + return; + + if (dirty && !folio_test_dirty(folio) && + !(folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio))) + folio_mark_dirty(folio); + + if (walk) { + old_gen = folio_update_gen(folio, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, folio, old_gen, new_gen); + } else if (lru_gen_set_refs(folio)) { + old_gen = folio_lru_gen(folio); + if (old_gen >= 0 && old_gen != new_gen) + folio_activate(folio); + } +} +/* + * Scan the PTEs of [start, end) and, via get_next_vma(), of further VMAs in + * the same PTE table, clearing the accessed bit and updating the folios + * found young. Returns false if the table cannot be mapped or the PMD + * changed, true if the PTE lock is contended, and otherwise the verdict of + * suitable_to_scan(). + */ +static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, + struct mm_walk *args) +{ + int i; + bool dirty; + pte_t *pte; + spinlock_t *ptl; + unsigned long addr; + int total = 0; + int young = 0; + struct folio *last = NULL; + struct lru_gen_mm_walk *walk = args->private; + struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + DEFINE_MAX_SEQ(walk->lruvec); + int gen = lru_gen_from_seq(max_seq); + pmd_t pmdval; + + pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl); + if (!pte) + return false; + + if (!spin_trylock(ptl)) { + pte_unmap(pte); + return true; + } + + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) { + pte_unmap_unlock(pte, ptl); + return false; + } + + arch_enter_lazy_mmu_mode(); +restart: + for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { + unsigned long pfn; + struct folio *folio; + pte_t ptent = ptep_get(pte + i); + + total++; + 
walk->mm_stats[MM_LEAF_TOTAL]++; + + pfn = get_pte_pfn(ptent, args->vma, addr, pgdat); + if (pfn == -1) + continue; + + folio = get_pfn_folio(pfn, memcg, pgdat); + if (!folio) + continue; + + if (!ptep_clear_young_notify(args->vma, addr, pte + i)) + continue; + + /* consecutive PTEs of one folio are coalesced into a single update */ + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); + + last = folio; + dirty = false; + } + + if (pte_dirty(ptent)) + dirty = true; + + young++; + walk->mm_stats[MM_LEAF_YOUNG]++; + } + + walk_update_folio(walk, last, gen, dirty); + last = NULL; + + if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) + goto restart; + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte, ptl); + + return suitable_to_scan(total, young); +} +/* + * Batch PMD entries and process them under the PMD lock. The first call + * after a reset records @addr in *@first. Later calls that fall within + * MIN_LRU_BATCH entries of it only set a bit in @bitmap. Any other call, + * including the flush request @addr == -1, processes *@first plus the + * recorded entries if the lock can be taken without waiting, and then + * resets *@first to -1. + */ +static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, + struct mm_walk *args, unsigned long *bitmap, unsigned long *first) +{ + int i; + bool dirty; + pmd_t *pmd; + spinlock_t *ptl; + struct folio *last = NULL; + struct lru_gen_mm_walk *walk = args->private; + struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + DEFINE_MAX_SEQ(walk->lruvec); + int gen = lru_gen_from_seq(max_seq); + + VM_WARN_ON_ONCE(pud_leaf(*pud)); + + /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ + if (*first == -1) { + *first = addr; + bitmap_zero(bitmap, MIN_LRU_BATCH); + return; + } + + i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first); + if (i && i <= MIN_LRU_BATCH) { + __set_bit(i - 1, bitmap); + return; + } + + pmd = pmd_offset(pud, *first); + + ptl = pmd_lockptr(args->mm, pmd); + if (!spin_trylock(ptl)) + goto done; + + arch_enter_lazy_mmu_mode(); + + do { + unsigned long pfn; + struct folio *folio; + + /* don't round down the first address */ + addr = i ? 
(*first & PMD_MASK) + i * PMD_SIZE : *first; + + if (!pmd_present(pmd[i])) + goto next; + + if (!pmd_trans_huge(pmd[i])) { + if (!walk->force_scan && should_clear_pmd_young() && + !mm_has_notifiers(args->mm)) + pmdp_test_and_clear_young(vma, addr, pmd + i); + goto next; + } + + pfn = get_pmd_pfn(pmd[i], vma, addr, pgdat); + if (pfn == -1) + goto next; + + folio = get_pfn_folio(pfn, memcg, pgdat); + if (!folio) + goto next; + + if (!pmdp_clear_young_notify(vma, addr, pmd + i)) + goto next; + + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); + + last = folio; + dirty = false; + } + + if (pmd_dirty(pmd[i])) + dirty = true; + + walk->mm_stats[MM_LEAF_YOUNG]++; +next: + i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; + } while (i <= MIN_LRU_BATCH); + + walk_update_folio(walk, last, gen, dirty); + + arch_leave_lazy_mmu_mode(); + spin_unlock(ptl); +done: + *first = -1; +} +/* + * Walk the PMD entries of [start, end) and, via get_next_vma(), of further + * VMAs under the same PUD. Huge entries, and young non-leaf entries when + * PMD accessed bits are being cleared, are handed to + * walk_pmd_range_locked(). PTE tables are scanned with walk_pte_range() if + * walk->force_scan is set or the Bloom filter of walk->seq contains them; + * tables for which it returns true are added to the filter of walk->seq + 1. + */ +static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, + struct mm_walk *args) +{ + int i; + pmd_t *pmd; + unsigned long next; + unsigned long addr; + struct vm_area_struct *vma; + DECLARE_BITMAP(bitmap, MIN_LRU_BATCH); + unsigned long first = -1; + struct lru_gen_mm_walk *walk = args->private; + struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); + + VM_WARN_ON_ONCE(pud_leaf(*pud)); + + /* + * Finish an entire PMD in two passes: the first only reaches to PTE + * tables to avoid taking the PMD lock; the second, if necessary, takes + * the PMD lock to clear the accessed bit in PMD entries. 
+ */ + pmd = pmd_offset(pud, start & PUD_MASK); +restart: + /* walk_pte_range() may call get_next_vma() */ + vma = args->vma; + for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { + pmd_t val = pmdp_get_lockless(pmd + i); + + next = pmd_addr_end(addr, end); + + if (!pmd_present(val) || is_huge_zero_pmd(val)) { + walk->mm_stats[MM_LEAF_TOTAL]++; + continue; + } + + if (pmd_trans_huge(val)) { + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + unsigned long pfn = get_pmd_pfn(val, vma, addr, pgdat); + + walk->mm_stats[MM_LEAF_TOTAL]++; + + if (pfn != -1) + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); + continue; + } + + if (!walk->force_scan && should_clear_pmd_young() && + !mm_has_notifiers(args->mm)) { + if (!pmd_young(val)) + continue; + + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); + } + + if (!walk->force_scan && !test_bloom_filter(mm_state, walk->seq, pmd + i)) + continue; + + walk->mm_stats[MM_NONLEAF_FOUND]++; + + if (!walk_pte_range(&val, addr, next, args)) + continue; + + walk->mm_stats[MM_NONLEAF_ADDED]++; + + /* carry over to the next generation */ + update_bloom_filter(mm_state, walk->seq + 1, pmd + i); + } + + /* flush whatever is still batched */ + walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); + + if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) + goto restart; +} +/* + * Walk the PUD entries of [start, end) and, via get_next_vma(), of further + * VMAs under the same P4D, calling walk_pmd_range() on each present + * non-leaf entry. The walk stops early when a reschedule is needed or + * walk->batched reaches MAX_LRU_BATCH. Returns 1 if the end address wrapped + * to zero or no VMA is left; otherwise stores the resume address in + * walk->next_addr and returns -EAGAIN. + */ +static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, + struct mm_walk *args) +{ + int i; + pud_t *pud; + unsigned long addr; + unsigned long next; + struct lru_gen_mm_walk *walk = args->private; + + VM_WARN_ON_ONCE(p4d_leaf(*p4d)); + + pud = pud_offset(p4d, start & P4D_MASK); +restart: + for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { + pud_t val = pudp_get(pud + i); + + next = pud_addr_end(addr, end); + + if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) + continue; + + walk_pmd_range(&val, addr, next, args); + + if (need_resched() || walk->batched >= MAX_LRU_BATCH) { + 
end = (addr | ~PUD_MASK) + 1; + goto done; + } + } + + if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end)) + goto restart; + + end = round_up(end, P4D_SIZE); +done: + if (!end || !args->vma) + return 1; + + walk->next_addr = max(end, args->vma->vm_start); + + return -EAGAIN; +} +/* + * Walk the page tables of @mm from FIRST_USER_ADDRESS, resuming at + * walk->next_addr for as long as walk_page_range() returns -EAGAIN. The + * loop ends early if max_seq no longer equals walk->seq or if the mmap lock + * cannot be taken for read without waiting. Batched size updates are + * flushed under lruvec->lru_lock after every pass. + */ +static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) +{ + static const struct mm_walk_ops mm_walk_ops = { + .test_walk = should_skip_vma, + .p4d_entry = walk_pud_range, + .walk_lock = PGWALK_RDLOCK, + }; + int err; + struct lruvec *lruvec = walk->lruvec; + + walk->next_addr = FIRST_USER_ADDRESS; + + do { + DEFINE_MAX_SEQ(lruvec); + + err = -EBUSY; + + /* another thread might have called inc_max_seq() */ + if (walk->seq != max_seq) + break; + + /* the caller might be holding the lock for write */ + if (mmap_read_trylock(mm)) { + err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); + + mmap_read_unlock(mm); + } + + if (walk->batched) { + spin_lock_irq(&lruvec->lru_lock); + reset_batch_size(walk); + spin_unlock_irq(&lruvec->lru_lock); + } + + cond_resched(); + } while (err == -EAGAIN); +} +/* + * Select the walk state for the current task and store it in + * current->reclaim_state->mm_walk. kswapd uses the one embedded in @pgdat; + * other callers get a kzalloc()ed one when none is set and @force_alloc is + * true. Returns the walk state, which is NULL if none was set up or the + * allocation failed. + */ +static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc) +{ + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; + + if (pgdat && current_is_kswapd()) { + VM_WARN_ON_ONCE(walk); + + walk = &pgdat->mm_walk; + } else if (!walk && force_alloc) { + VM_WARN_ON_ONCE(current_is_kswapd()); + + walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + } + + current->reclaim_state->mm_walk = walk; + + return walk; +} +/* + * Detach the walk state from current->reclaim_state and free it unless the + * caller is kswapd. Warns if batched page counts or mm stats are still + * pending. + */ +static void clear_mm_walk(void) +{ + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; + + VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages))); + VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats))); + + current->reclaim_state->mm_walk = NULL; + + if (!current_is_kswapd()) + kfree(walk); +} +/* + * inc_min_seq() moves the folios of the oldest generation of @type into the + * next generation, at most MAX_LRU_BATCH per call, and then advances + * min_seq[type]. It returns false if the batch budget ran out before the + * generation was emptied, true once min_seq[type] has been incremented. + */ +static 
bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness) +{ + int zone; + int remaining = MAX_LRU_BATCH; + struct lru_gen_folio *lrugen = &lruvec->lrugen; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + + /* For file type, skip the check if swappiness is anon only */ + if (type && (swappiness == SWAPPINESS_ANON_ONLY)) + goto done; + + /* For anon type, skip the check if swappiness is zero (file only) */ + if (!type && !swappiness) + goto done; + + /* prevent cold/hot inversion if the type is evictable */ + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + struct list_head *head = &lrugen->folios[old_gen][type][zone]; + + while (!list_empty(head)) { + struct folio *folio = lru_to_folio(head); + int refs = folio_lru_refs(folio); + bool workingset = folio_test_workingset(folio); + + VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); + VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); + + new_gen = folio_inc_gen(lruvec, folio, false); + list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); + + /* don't count the workingset being lazily promoted */ + if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { + int tier = lru_tier_from_refs(refs, workingset); + int delta = folio_nr_pages(folio); + + WRITE_ONCE(lrugen->protected[hist][type][tier], + lrugen->protected[hist][type][tier] + delta); + } + + /* out of budget: the caller has to call again */ + if (!--remaining) + return false; + } + } +done: + reset_ctrl_pos(lruvec, type, true); + WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); + + return true; +} +/* + * Advance min_seq[] of every evictable type past generations whose folio + * lists are empty, keeping at least MIN_NR_GENS generations. Returns true + * if lrugen->min_seq[] was raised for at least one type. + */ +static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) +{ + int gen, type, zone; + bool success = false; + bool seq_inc_flag = false; + struct lru_gen_folio *lrugen = &lruvec->lrugen; + DEFINE_MIN_SEQ(lruvec); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + + /* 
find the oldest populated generation */ + for_each_evictable_type(type, swappiness) { + while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { + gen = lru_gen_from_seq(min_seq[type]); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + if (!list_empty(&lrugen->folios[gen][type][zone])) + goto next; + } + + min_seq[type]++; + seq_inc_flag = true; + } +next: + ; + } + + /* + * If min_seq[type] of both anonymous and file is not increased, + * we can directly return false to avoid unnecessary checking + * overhead later. + */ + if (!seq_inc_flag) + return success; + + /* see the comment on lru_gen_folio */ + if (swappiness && swappiness <= MAX_SWAPPINESS) { + unsigned long seq = lrugen->max_seq - MIN_NR_GENS; + + if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq) + min_seq[LRU_GEN_ANON] = seq; + else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq) + min_seq[LRU_GEN_FILE] = seq; + } + + for_each_evictable_type(type, swappiness) { + if (min_seq[type] <= lrugen->min_seq[type]) + continue; + + reset_ctrl_pos(lruvec, type, true); + WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); + success = true; + } + + return success; +} +/* + * Increment lrugen->max_seq if it still equals @seq. Types that already + * span MAX_NR_GENS generations first have their oldest generation retired + * with inc_min_seq(); when that runs out of budget, the LRU lock is dropped + * and the whole attempt restarts. Returns true if max_seq was incremented + * by this call. + */ +static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness) +{ + bool success; + int prev, next; + int type, zone; + struct lru_gen_folio *lrugen = &lruvec->lrugen; +restart: + if (seq < READ_ONCE(lrugen->max_seq)) + return false; + + spin_lock_irq(&lruvec->lru_lock); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + + success = seq == lrugen->max_seq; + if (!success) + goto unlock; + + for (type = 0; type < ANON_AND_FILE; type++) { + if (get_nr_gens(lruvec, type) != MAX_NR_GENS) + continue; + + if (inc_min_seq(lruvec, type, swappiness)) + continue; + + spin_unlock_irq(&lruvec->lru_lock); + cond_resched(); + goto restart; + } + + /* + * Update the active/inactive LRU sizes for compatibility. 
Both sides of + * the current max_seq need to be covered, since max_seq+1 can overlap + * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do + * overlap, cold/hot inversion happens. + */ + prev = lru_gen_from_seq(lrugen->max_seq - 1); + next = lru_gen_from_seq(lrugen->max_seq + 1); + + for (type = 0; type < ANON_AND_FILE; type++) { + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + enum lru_list lru = type * LRU_INACTIVE_FILE; + long delta = lrugen->nr_pages[prev][type][zone] - + lrugen->nr_pages[next][type][zone]; + + if (!delta) + continue; + + __update_lru_size(lruvec, lru, zone, delta); + __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); + } + } + + for (type = 0; type < ANON_AND_FILE; type++) + reset_ctrl_pos(lruvec, type, false); + + WRITE_ONCE(lrugen->timestamps[next], jiffies); + /* make sure preceding modifications appear */ + smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); +unlock: + spin_unlock_irq(&lruvec->lru_lock); + + return success; +} +/* + * Try to create a new youngest generation for @lruvec. Without an mm_state + * this is just inc_max_seq(). Otherwise the mm_list is iterated first, + * with page table walks if should_walk_mmu() holds and a walk state is + * available, and max_seq is incremented only when that iteration reports + * completion. Returns true if max_seq was incremented. + */ +static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, + int swappiness, bool force_scan) +{ + bool success; + struct lru_gen_mm_walk *walk; + struct mm_struct *mm = NULL; + struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); + + VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq)); + + if (!mm_state) + return inc_max_seq(lruvec, seq, swappiness); + + /* see the comment in iterate_mm_list() */ + if (seq <= READ_ONCE(mm_state->seq)) + return false; + + /* + * If the hardware doesn't automatically set the accessed bit, fall back + * to lru_gen_look_around(), which only clears the accessed bit in a + * handful of PTEs. Spreading the work out over a period of time usually + * is less efficient, but it avoids bursty page faults. 
+ */ + if (!should_walk_mmu()) { + success = iterate_mm_list_nowalk(lruvec, seq); + goto done; + } + + walk = set_mm_walk(NULL, true); + /* no walk state available: advance mm_state->seq without walking */ + if (!walk) { + success = iterate_mm_list_nowalk(lruvec, seq); + goto done; + } + + walk->lruvec = lruvec; + walk->seq = seq; + walk->swappiness = swappiness; + walk->force_scan = force_scan; + + do { + success = iterate_mm_list(walk, &mm); + if (mm) + walk_mm(mm, walk); + } while (mm); +done: + if (success) { + success = inc_max_seq(lruvec, seq, swappiness); + WARN_ON_ONCE(!success); + } + + return success; +} + +/****************************************************************************** + * working set protection + ******************************************************************************/ +/* + * Choose a starting sc->priority from the node's inactive LRU sizes and + * sc->nr_to_reclaim. Nothing is done unless sc->priority is still + * DEF_PRIORITY and sc->nr_to_reclaim is at least MIN_LRU_BATCH. + */ +static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) +{ + int priority; + unsigned long reclaimable; + + if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) + return; + /* + * Determine the initial priority based on + * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, + * where reclaimed_to_scanned_ratio = inactive / total. + */ + reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); + if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) + reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); + + /* round down reclaimable and round up sc->nr_to_reclaim */ + priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); + + /* + * The estimation is based on LRU pages only, so cap it to prevent + * overshoots of shrinker objects by large margins. 
+ */ + sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY); +} +/* + * Sum the pages of all generations of the evictable types. For an online + * memcg the result is whether that total, shifted right by sc->priority, is + * non-zero; otherwise any non-zero total counts. + */ +static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) +{ + int gen, type, zone; + unsigned long total = 0; + int swappiness = get_swappiness(lruvec, sc); + struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + for_each_evictable_type(type, swappiness) { + unsigned long seq; + + for (seq = min_seq[type]; seq <= max_seq; seq++) { + gen = lru_gen_from_seq(seq); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) + total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + } + } + + /* whether the size is big enough to be helpful */ + return mem_cgroup_online(memcg) ? (total >> sc->priority) : total; +} +/* + * Returns false if the memcg is below its min protection or the lruvec is + * not sizable. Otherwise returns whether the birth timestamp of the oldest + * evictable generation plus @min_ttl lies in the past. + */ +static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc, + unsigned long min_ttl) +{ + int gen; + unsigned long birth; + int swappiness = get_swappiness(lruvec, sc); + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (mem_cgroup_below_min(NULL, memcg)) + return false; + + if (!lruvec_is_sizable(lruvec, sc)) + return false; + + gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness)); + birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); + + return time_is_before_jiffies(birth + min_ttl); +} + +/* to protect the working set of the last N jiffies */ +static unsigned long lru_gen_min_ttl __read_mostly; +/* + * Called by kswapd: sets the initial priority, recalculates the protection + * of every memcg, and invokes the OOM killer if lru_gen_min_ttl is non-zero + * and no lruvec on @pgdat is reclaimable. + */ +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) +{ + struct mem_cgroup *memcg; + unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); + bool reclaimable = !min_ttl; + + VM_WARN_ON_ONCE(!current_is_kswapd()); + + set_initial_priority(pgdat, sc); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + + mem_cgroup_calculate_protection(NULL, memcg); + + if (!reclaimable) + reclaimable = lruvec_is_reclaimable(lruvec, sc, 
min_ttl); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + + /* + * The main goal is to OOM kill if every generation from all memcgs is + * younger than min_ttl. However, another possibility is all memcgs are + * either too small or below min. + */ + if (!reclaimable && mutex_trylock(&oom_lock)) { + struct oom_control oc = { + .gfp_mask = sc->gfp_mask, + }; + + out_of_memory(&oc); + + mutex_unlock(&oom_lock); + } +} + +/****************************************************************************** + * rmap/PT walk feedback + ******************************************************************************/ + +/* + * This function exploits spatial locality when shrink_folio_list() walks the + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If + * the scan was done cacheline efficiently, it adds the PMD entry pointing to + * the PTE table to the Bloom filter. This forms a feedback loop between the + * eviction and the aging. + * + * The scan covers at most MIN_LRU_BATCH PTEs around pvmw->address, within + * one PMD and the VMA. Returns false if the PTE at pvmw->address was not + * young and true in every other case. Asserts that pvmw->ptl is held. + */ +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +{ + int i; + bool dirty; + unsigned long start; + unsigned long end; + struct lru_gen_mm_walk *walk; + struct folio *last = NULL; + int young = 1; + pte_t *pte = pvmw->pte; + unsigned long addr = pvmw->address; + struct vm_area_struct *vma = pvmw->vma; + struct folio *folio = pfn_folio(pvmw->pfn); + struct mem_cgroup *memcg = folio_memcg(folio); + struct pglist_data *pgdat = folio_pgdat(folio); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); + DEFINE_MAX_SEQ(lruvec); + int gen = lru_gen_from_seq(max_seq); + + lockdep_assert_held(pvmw->ptl); + VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); + + if (!ptep_clear_young_notify(vma, addr, pte)) + return false; + + if (spin_is_contended(pvmw->ptl)) + return true; + + /* exclude special VMAs containing anon pages from COW */ + if (vma->vm_flags & VM_SPECIAL) + return true; + + /* avoid taking the LRU lock under the PTL when 
possible */ + walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; + + start = max(addr & PMD_MASK, vma->vm_start); + end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1; + + if (end - start == PAGE_SIZE) + return true; + + /* shrink the window to MIN_LRU_BATCH pages around addr */ + if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { + if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) + end = start + MIN_LRU_BATCH * PAGE_SIZE; + else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2) + start = end - MIN_LRU_BATCH * PAGE_SIZE; + else { + start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2; + end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2; + } + } + + arch_enter_lazy_mmu_mode(); + + pte -= (addr - start) / PAGE_SIZE; + + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { + unsigned long pfn; + pte_t ptent = ptep_get(pte + i); + + pfn = get_pte_pfn(ptent, vma, addr, pgdat); + if (pfn == -1) + continue; + + folio = get_pfn_folio(pfn, memcg, pgdat); + if (!folio) + continue; + + if (!ptep_clear_young_notify(vma, addr, pte + i)) + continue; + + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); + + last = folio; + dirty = false; + } + + if (pte_dirty(ptent)) + dirty = true; + + young++; + } + + walk_update_folio(walk, last, gen, dirty); + + arch_leave_lazy_mmu_mode(); + + /* feedback from rmap walkers to page table walkers */ + if (mm_state && suitable_to_scan(i, young)) + update_bloom_filter(mm_state, max_seq, pvmw->pmd); + + return true; +} + +/****************************************************************************** + * memcg LRU + ******************************************************************************/ + +/* see the comment on MEMCG_NR_GENS */ +enum { + MEMCG_LRU_NOP, + MEMCG_LRU_HEAD, + MEMCG_LRU_TAIL, + MEMCG_LRU_OLD, + MEMCG_LRU_YOUNG, +}; +/* + * Move @lruvec within its node's memcg LRU according to @op, under + * memcg_lru.lock. HEAD and TAIL keep the generation and record the segment; + * OLD and YOUNG move it to the generation of memcg_lru.seq or + * memcg_lru.seq + 1. The lruvec is relinked at the head of a randomly + * chosen bin for HEAD and OLD and at the tail otherwise. + */ +static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) +{ + int seg; + int old, new; + unsigned long flags; + int bin = get_random_u32_below(MEMCG_NR_BINS); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + 
spin_lock_irqsave(&pgdat->memcg_lru.lock, flags); + + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + + seg = 0; + new = old = lruvec->lrugen.gen; + + /* see the comment on MEMCG_NR_GENS */ + if (op == MEMCG_LRU_HEAD) + seg = MEMCG_LRU_HEAD; + else if (op == MEMCG_LRU_TAIL) + seg = MEMCG_LRU_TAIL; + else if (op == MEMCG_LRU_OLD) + new = get_memcg_gen(pgdat->memcg_lru.seq); + else if (op == MEMCG_LRU_YOUNG) + new = get_memcg_gen(pgdat->memcg_lru.seq + 1); + else + VM_WARN_ON_ONCE(true); + + WRITE_ONCE(lruvec->lrugen.seg, seg); + WRITE_ONCE(lruvec->lrugen.gen, new); + + hlist_nulls_del_rcu(&lruvec->lrugen.list); + + if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) + hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); + else + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); + + pgdat->memcg_lru.nr_memcgs[old]--; + pgdat->memcg_lru.nr_memcgs[new]++; + + if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); + + spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags); +} + +#ifdef CONFIG_MEMCG + +void lru_gen_online_memcg(struct mem_cgroup *memcg) +{ + int gen; + int nid; + int bin = get_random_u32_below(MEMCG_NR_BINS); + + for_each_node(nid) { + struct pglist_data *pgdat = NODE_DATA(nid); + struct lruvec *lruvec = get_lruvec(memcg, nid); + + spin_lock_irq(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); + + gen = get_memcg_gen(pgdat->memcg_lru.seq); + + lruvec->lrugen.gen = gen; + + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); + pgdat->memcg_lru.nr_memcgs[gen]++; + + spin_unlock_irq(&pgdat->memcg_lru.lock); + } +} + +void lru_gen_offline_memcg(struct mem_cgroup *memcg) +{ + int nid; + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); + } +} + +void 
lru_gen_release_memcg(struct mem_cgroup *memcg) +{ + int gen; + int nid; + + for_each_node(nid) { + struct pglist_data *pgdat = NODE_DATA(nid); + struct lruvec *lruvec = get_lruvec(memcg, nid); + + spin_lock_irq(&pgdat->memcg_lru.lock); + + if (hlist_nulls_unhashed(&lruvec->lrugen.list)) + goto unlock; + + gen = lruvec->lrugen.gen; + + hlist_nulls_del_init_rcu(&lruvec->lrugen.list); + pgdat->memcg_lru.nr_memcgs[gen]--; + + if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); +unlock: + spin_unlock_irq(&pgdat->memcg_lru.lock); + } +} + +void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) +{ + struct lruvec *lruvec = get_lruvec(memcg, nid); + + /* see the comment on MEMCG_NR_GENS */ + if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_HEAD) + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); +} + +#endif /* CONFIG_MEMCG */ + +/****************************************************************************** + * the eviction + ******************************************************************************/ + +static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc, + int tier_idx) +{ + bool success; + bool dirty, writeback; + int gen = folio_lru_gen(folio); + int type = folio_is_file_lru(folio); + int zone = folio_zonenum(folio); + int delta = folio_nr_pages(folio); + int refs = folio_lru_refs(folio); + bool workingset = folio_test_workingset(folio); + int tier = lru_tier_from_refs(refs, workingset); + struct lru_gen_folio *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); + + /* unevictable */ + if (!folio_evictable(folio)) { + success = lru_gen_del_folio(lruvec, folio, true); + VM_WARN_ON_ONCE_FOLIO(!success, folio); + folio_set_unevictable(folio); + lruvec_add_folio(lruvec, folio); + __count_vm_events(UNEVICTABLE_PGCULLED, delta); + return true; + } + + /* promoted */ + if (gen != 
lru_gen_from_seq(lrugen->min_seq[type])) { + list_move(&folio->lru, &lrugen->folios[gen][type][zone]); + return true; + } + + /* protected */ + if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) { + gen = folio_inc_gen(lruvec, folio, false); + list_move(&folio->lru, &lrugen->folios[gen][type][zone]); + + /* don't count the workingset being lazily promoted */ + if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + WRITE_ONCE(lrugen->protected[hist][type][tier], + lrugen->protected[hist][type][tier] + delta); + } + return true; + } + + /* ineligible */ + if (zone > sc->reclaim_idx) { + gen = folio_inc_gen(lruvec, folio, false); + list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); + return true; + } + + dirty = folio_test_dirty(folio); + writeback = folio_test_writeback(folio); + if (type == LRU_GEN_FILE && dirty) { + sc->nr.file_taken += delta; + if (!writeback) + sc->nr.unqueued_dirty += delta; + } + + /* waiting for writeback */ + if (writeback || (type == LRU_GEN_FILE && dirty)) { + gen = folio_inc_gen(lruvec, folio, true); + list_move(&folio->lru, &lrugen->folios[gen][type][zone]); + return true; + } + + return false; +} + +static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc) +{ + bool success; + + /* swap constrained */ + if (!(sc->gfp_mask & __GFP_IO) && + (folio_test_dirty(folio) || + (folio_test_anon(folio) && !folio_test_swapcache(folio)))) + return false; + + /* raced with release_pages() */ + if (!folio_try_get(folio)) + return false; + + /* raced with another isolation */ + if (!folio_test_clear_lru(folio)) { + folio_put(folio); + return false; + } + + /* see the comment on LRU_REFS_FLAGS */ + if (!folio_test_referenced(folio)) + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, 0); + + /* for shrink_folio_list() */ + folio_clear_reclaim(folio); + + success = lru_gen_del_folio(lruvec, folio, true); + VM_WARN_ON_ONCE_FOLIO(!success, 
folio); + + return true; +} + +static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, int type, int tier, + struct list_head *list) +{ + int i; + int gen; + enum vm_event_item item; + int sorted = 0; + int scanned = 0; + int isolated = 0; + int skipped = 0; + int remaining = min(nr_to_scan, MAX_LRU_BATCH); + struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + VM_WARN_ON_ONCE(!list_empty(list)); + + if (get_nr_gens(lruvec, type) == MIN_NR_GENS) + return 0; + + gen = lru_gen_from_seq(lrugen->min_seq[type]); + + for (i = MAX_NR_ZONES; i > 0; i--) { + LIST_HEAD(moved); + int skipped_zone = 0; + int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES; + struct list_head *head = &lrugen->folios[gen][type][zone]; + + while (!list_empty(head)) { + struct folio *folio = lru_to_folio(head); + int delta = folio_nr_pages(folio); + + VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); + VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); + + scanned += delta; + + if (sort_folio(lruvec, folio, sc, tier)) + sorted += delta; + else if (isolate_folio(lruvec, folio, sc)) { + list_add(&folio->lru, list); + isolated += delta; + } else { + list_move(&folio->lru, &moved); + skipped_zone += delta; + } + + if (!--remaining || max(isolated, skipped_zone) >= MIN_LRU_BATCH) + break; + } + + if (skipped_zone) { + list_splice(&moved, head); + __count_zid_vm_events(PGSCAN_SKIP, zone, skipped_zone); + skipped += skipped_zone; + } + + if (!remaining || isolated >= MIN_LRU_BATCH) + break; + } + + item = PGSCAN_KSWAPD + reclaimer_offset(sc); + if (!cgroup_reclaim(sc)) { + __count_vm_events(item, isolated); + __count_vm_events(PGREFILL, sorted); + } + count_memcg_events(memcg, item, isolated); + count_memcg_events(memcg, PGREFILL, sorted); + __count_vm_events(PGSCAN_ANON + 
type, isolated); + trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH, + scanned, skipped, isolated, + type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); + if (type == LRU_GEN_FILE) + sc->nr.file_taken += isolated; + /* + * There might not be eligible folios due to reclaim_idx. Check the + * remaining to prevent livelock if it's not making progress. + */ + return isolated || !remaining ? scanned : 0; +} + +static int get_tier_idx(struct lruvec *lruvec, int type) +{ + int tier; + struct ctrl_pos sp, pv; + + /* + * To leave a margin for fluctuations, use a larger gain factor (2:3). + * This value is chosen because any other tier would have at least twice + * as many refaults as the first tier. + */ + read_ctrl_pos(lruvec, type, 0, 2, &sp); + for (tier = 1; tier < MAX_NR_TIERS; tier++) { + read_ctrl_pos(lruvec, type, tier, 3, &pv); + if (!positive_ctrl_err(&sp, &pv)) + break; + } + + return tier - 1; +} + +static int get_type_to_scan(struct lruvec *lruvec, int swappiness) +{ + struct ctrl_pos sp, pv; + + if (swappiness <= MIN_SWAPPINESS + 1) + return LRU_GEN_FILE; + + if (swappiness >= MAX_SWAPPINESS) + return LRU_GEN_ANON; + /* + * Compare the sum of all tiers of anon with that of file to determine + * which type to scan. 
+ */ + read_ctrl_pos(lruvec, LRU_GEN_ANON, MAX_NR_TIERS, swappiness, &sp); + read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv); + + return positive_ctrl_err(&sp, &pv); +} + +static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, int swappiness, + int *type_scanned, struct list_head *list) +{ + int i; + int type = get_type_to_scan(lruvec, swappiness); + + for_each_evictable_type(i, swappiness) { + int scanned; + int tier = get_tier_idx(lruvec, type); + + *type_scanned = type; + + scanned = scan_folios(nr_to_scan, lruvec, sc, type, tier, list); + if (scanned) + return scanned; + + type = !type; + } + + return 0; +} + +static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, int swappiness) +{ + int type; + int scanned; + int reclaimed; + LIST_HEAD(list); + LIST_HEAD(clean); + struct folio *folio; + struct folio *next; + enum vm_event_item item; + struct reclaim_stat stat; + struct lru_gen_mm_walk *walk; + bool skip_retry = false; + struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + spin_lock_irq(&lruvec->lru_lock); + + scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list); + + scanned += try_to_inc_min_seq(lruvec, swappiness); + + if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq) + scanned = 0; + + spin_unlock_irq(&lruvec->lru_lock); + + if (list_empty(&list)) + return scanned; +retry: + reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg); + sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; + sc->nr_reclaimed += reclaimed; + trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, + scanned, reclaimed, &stat, sc->priority, + type ? 
LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); + + list_for_each_entry_safe_reverse(folio, next, &list, lru) { + DEFINE_MIN_SEQ(lruvec); + + if (!folio_evictable(folio)) { + list_del(&folio->lru); + folio_putback_lru(folio); + continue; + } + + /* retry folios that may have missed folio_rotate_reclaimable() */ + if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) && + !folio_test_dirty(folio) && !folio_test_writeback(folio)) { + list_move(&folio->lru, &clean); + continue; + } + + /* don't add rejected folios to the oldest generation */ + if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type]) + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active)); + } + + spin_lock_irq(&lruvec->lru_lock); + + move_folios_to_lru(lruvec, &list); + + walk = current->reclaim_state->mm_walk; + if (walk && walk->batched) { + walk->lruvec = lruvec; + reset_batch_size(walk); + } + + mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), + stat.nr_demoted); + + item = PGSTEAL_KSWAPD + reclaimer_offset(sc); + if (!cgroup_reclaim(sc)) + __count_vm_events(item, reclaimed); + count_memcg_events(memcg, item, reclaimed); + __count_vm_events(PGSTEAL_ANON + type, reclaimed); + + spin_unlock_irq(&lruvec->lru_lock); + + list_splice_init(&clean, &list); + + if (!list_empty(&list)) { + skip_retry = true; + goto retry; + } + + return scanned; +} + +static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, + int swappiness, unsigned long *nr_to_scan) +{ + int gen, type, zone; + unsigned long size = 0; + struct lru_gen_folio *lrugen = &lruvec->lrugen; + DEFINE_MIN_SEQ(lruvec); + + *nr_to_scan = 0; + /* have to run aging, since eviction is not possible anymore */ + if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq) + return true; + + for_each_evictable_type(type, swappiness) { + unsigned long seq; + + for (seq = min_seq[type]; seq <= max_seq; seq++) { + gen = lru_gen_from_seq(seq); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) + size 
+= max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + } + } + + *nr_to_scan = size; + /* better to run aging even though eviction is still possible */ + return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq; +} + +/* + * For future optimizations: + * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg + * reclaim. + */ +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) +{ + bool success; + unsigned long nr_to_scan; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + + if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) + return -1; + + success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan); + + /* try to scrape all its memory if this memcg was deleted */ + if (nr_to_scan && !mem_cgroup_online(memcg)) + return nr_to_scan; + + nr_to_scan = apply_proportional_protection(memcg, sc, nr_to_scan); + + /* try to get away with not aging at the default priority */ + if (!success || sc->priority == DEF_PRIORITY) + return nr_to_scan >> sc->priority; + + /* stop scanning this lruvec as it's low on cold folios */ + return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -1 : 0; +} + +static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) +{ + int i; + enum zone_watermarks mark; + + /* don't abort memcg reclaim to ensure fairness */ + if (!root_reclaim(sc)) + return false; + + if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order))) + return true; + + /* check the order to exclude compaction-induced reclaim */ + if (!current_is_kswapd() || sc->order) + return false; + + mark = sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ? 
+ WMARK_PROMO : WMARK_HIGH; + + for (i = 0; i <= sc->reclaim_idx; i++) { + struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; + unsigned long size = wmark_pages(zone, mark) + MIN_LRU_BATCH; + + if (managed_zone(zone) && !zone_watermark_ok(zone, 0, size, sc->reclaim_idx, 0)) + return false; + } + + /* kswapd should abort if all eligible zones are safe */ + return true; +} + +static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + long nr_to_scan; + unsigned long scanned = 0; + int swappiness = get_swappiness(lruvec, sc); + + while (true) { + int delta; + + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); + if (nr_to_scan <= 0) + break; + + delta = evict_folios(nr_to_scan, lruvec, sc, swappiness); + if (!delta) + break; + + scanned += delta; + if (scanned >= nr_to_scan) + break; + + if (should_abort_scan(lruvec, sc)) + break; + + cond_resched(); + } + + /* + * If too many file cache in the coldest generation can't be evicted + * due to being dirty, wake up the flusher. 
+ */ + if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken) + wakeup_flusher_threads(WB_REASON_VMSCAN); + + /* whether this lruvec should be rotated */ + return nr_to_scan < 0; +} + +static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) +{ + bool success; + unsigned long scanned = sc->nr_scanned; + unsigned long reclaimed = sc->nr_reclaimed; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + /* lru_gen_age_node() called mem_cgroup_calculate_protection() */ + if (mem_cgroup_below_min(NULL, memcg)) + return MEMCG_LRU_YOUNG; + + if (mem_cgroup_below_low(NULL, memcg)) { + /* see the comment on MEMCG_NR_GENS */ + if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL) + return MEMCG_LRU_TAIL; + + memcg_memory_event(memcg, MEMCG_LOW); + } + + success = try_to_shrink_lruvec(lruvec, sc); + + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); + + if (!sc->proactive) + vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); + + flush_reclaim_state(sc); + + if (success && mem_cgroup_online(memcg)) + return MEMCG_LRU_YOUNG; + + if (!success && lruvec_is_sizable(lruvec, sc)) + return 0; + + /* one retry if offlined or too small */ + return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ? 
+ MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; +} + +static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) +{ + int op; + int gen; + int bin; + int first_bin; + struct lruvec *lruvec; + struct lru_gen_folio *lrugen; + struct mem_cgroup *memcg; + struct hlist_nulls_node *pos; + + gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); + bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); +restart: + op = 0; + memcg = NULL; + + rcu_read_lock(); + + hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { + if (op) { + lru_gen_rotate_memcg(lruvec, op); + op = 0; + } + + mem_cgroup_put(memcg); + memcg = NULL; + + if (gen != READ_ONCE(lrugen->gen)) + continue; + + lruvec = container_of(lrugen, struct lruvec, lrugen); + memcg = lruvec_memcg(lruvec); + + if (!mem_cgroup_tryget(memcg)) { + lru_gen_release_memcg(memcg); + memcg = NULL; + continue; + } + + rcu_read_unlock(); + + op = shrink_one(lruvec, sc); + + rcu_read_lock(); + + if (should_abort_scan(lruvec, sc)) + break; + } + + rcu_read_unlock(); + + if (op) + lru_gen_rotate_memcg(lruvec, op); + + mem_cgroup_put(memcg); + + if (!is_a_nulls(pos)) + return; + + /* restart if raced with lru_gen_rotate_memcg() */ + if (gen != get_nulls_value(pos)) + goto restart; + + /* try the rest of the bins of the current generation */ + bin = get_memcg_bin(bin + 1); + if (bin != first_bin) + goto restart; +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + struct blk_plug plug; + + VM_WARN_ON_ONCE(root_reclaim(sc)); + VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap); + + lru_add_drain(); + + blk_start_plug(&plug); + + set_mm_walk(NULL, sc->proactive); + + if (try_to_shrink_lruvec(lruvec, sc)) + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); + + clear_mm_walk(); + + blk_finish_plug(&plug); +} + +static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) +{ + struct blk_plug plug; + unsigned long reclaimed = sc->nr_reclaimed; + 
+ VM_WARN_ON_ONCE(!root_reclaim(sc)); + + /* + * Unmapped clean folios are already prioritized. Scanning for more of + * them is likely futile and can cause high reclaim latency when there + * is a large number of memcgs. + */ + if (!sc->may_writepage || !sc->may_unmap) + goto done; + + lru_add_drain(); + + blk_start_plug(&plug); + + set_mm_walk(pgdat, sc->proactive); + + set_initial_priority(pgdat, sc); + + if (current_is_kswapd()) + sc->nr_reclaimed = 0; + + if (mem_cgroup_disabled()) + shrink_one(&pgdat->__lruvec, sc); + else + shrink_many(pgdat, sc); + + if (current_is_kswapd()) + sc->nr_reclaimed += reclaimed; + + clear_mm_walk(); + + blk_finish_plug(&plug); +done: + if (sc->nr_reclaimed > reclaimed) + atomic_set(&pgdat->kswapd_failures, 0); +} + +/****************************************************************************** + * state change + ******************************************************************************/ + +static bool __maybe_unused state_is_valid(struct lruvec *lruvec) +{ + struct lru_gen_folio *lrugen = &lruvec->lrugen; + + if (lrugen->enabled) { + enum lru_list lru; + + for_each_evictable_lru(lru) { + if (!list_empty(&lruvec->lists[lru])) + return false; + } + } else { + int gen, type, zone; + + for_each_gen_type_zone(gen, type, zone) { + if (!list_empty(&lrugen->folios[gen][type][zone])) + return false; + } + } + + return true; +} + +static bool fill_evictable(struct lruvec *lruvec) +{ + enum lru_list lru; + int remaining = MAX_LRU_BATCH; + + for_each_evictable_lru(lru) { + int type = is_file_lru(lru); + bool active = is_active_lru(lru); + struct list_head *head = &lruvec->lists[lru]; + + while (!list_empty(head)) { + bool success; + struct folio *folio = lru_to_folio(head); + + VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio); + VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); + VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio); + + 
lruvec_del_folio(lruvec, folio); + success = lru_gen_add_folio(lruvec, folio, false); + VM_WARN_ON_ONCE(!success); + + if (!--remaining) + return false; + } + } + + return true; +} + +static bool drain_evictable(struct lruvec *lruvec) +{ + int gen, type, zone; + int remaining = MAX_LRU_BATCH; + + for_each_gen_type_zone(gen, type, zone) { + struct list_head *head = &lruvec->lrugen.folios[gen][type][zone]; + + while (!list_empty(head)) { + bool success; + struct folio *folio = lru_to_folio(head); + + VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); + VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); + + success = lru_gen_del_folio(lruvec, folio, false); + VM_WARN_ON_ONCE(!success); + lruvec_add_folio(lruvec, folio); + + if (!--remaining) + return false; + } + } + + return true; +} + +static void lru_gen_change_state(bool enabled) +{ + static DEFINE_MUTEX(state_mutex); + + struct mem_cgroup *memcg; + + cgroup_lock(); + cpus_read_lock(); + get_online_mems(); + mutex_lock(&state_mutex); + + if (enabled == lru_gen_enabled()) + goto unlock; + + if (enabled) + static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); + else + static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + int nid; + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + spin_lock_irq(&lruvec->lru_lock); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + VM_WARN_ON_ONCE(!state_is_valid(lruvec)); + + lruvec->lrugen.enabled = enabled; + + while (!(enabled ? 
fill_evictable(lruvec) : drain_evictable(lruvec))) { + spin_unlock_irq(&lruvec->lru_lock); + cond_resched(); + spin_lock_irq(&lruvec->lru_lock); + } + + spin_unlock_irq(&lruvec->lru_lock); + } + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); +unlock: + mutex_unlock(&state_mutex); + put_online_mems(); + cpus_read_unlock(); + cgroup_unlock(); +} + +/****************************************************************************** + * sysfs interface + ******************************************************************************/ + +static ssize_t min_ttl_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static ssize_t min_ttl_ms_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) +{ + unsigned int msecs; + + if (kstrtouint(buf, 0, &msecs)) + return -EINVAL; + + WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); + + return len; +} + +static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR_RW(min_ttl_ms); + +static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + unsigned int caps = 0; + + if (get_cap(LRU_GEN_CORE)) + caps |= BIT(LRU_GEN_CORE); + + if (should_walk_mmu()) + caps |= BIT(LRU_GEN_MM_WALK); + + if (should_clear_pmd_young()) + caps |= BIT(LRU_GEN_NONLEAF_YOUNG); + + return sysfs_emit(buf, "0x%04x\n", caps); +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) +{ + int i; + unsigned int caps; + + if (tolower(*buf) == 'n') + caps = 0; + else if (tolower(*buf) == 'y') + caps = -1; + else if (kstrtouint(buf, 0, &caps)) + return -EINVAL; + + for (i = 0; i < NR_LRU_GEN_CAPS; i++) { + bool enabled = caps & BIT(i); + + if (i == LRU_GEN_CORE) + 
lru_gen_change_state(enabled); + else if (enabled) + static_branch_enable(&lru_gen_caps[i]); + else + static_branch_disable(&lru_gen_caps[i]); + } + + return len; +} + +static struct kobj_attribute lru_gen_enabled_attr = __ATTR_RW(enabled); + +static struct attribute *lru_gen_attrs[] = { + &lru_gen_min_ttl_attr.attr, + &lru_gen_enabled_attr.attr, + NULL +}; + +static const struct attribute_group lru_gen_attr_group = { + .name = "lru_gen", + .attrs = lru_gen_attrs, +}; + +/****************************************************************************** + * debugfs interface + ******************************************************************************/ + +static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) +{ + struct mem_cgroup *memcg; + loff_t nr_to_skip = *pos; + + m->private = kvmalloc(PATH_MAX, GFP_KERNEL); + if (!m->private) + return ERR_PTR(-ENOMEM); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + int nid; + + for_each_node_state(nid, N_MEMORY) { + if (!nr_to_skip--) + return get_lruvec(memcg, nid); + } + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + + return NULL; +} + +static void lru_gen_seq_stop(struct seq_file *m, void *v) +{ + if (!IS_ERR_OR_NULL(v)) + mem_cgroup_iter_break(NULL, lruvec_memcg(v)); + + kvfree(m->private); + m->private = NULL; +} + +static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + int nid = lruvec_pgdat(v)->node_id; + struct mem_cgroup *memcg = lruvec_memcg(v); + + ++*pos; + + nid = next_memory_node(nid); + if (nid == MAX_NUMNODES) { + memcg = mem_cgroup_iter(NULL, memcg, NULL); + if (!memcg) + return NULL; + + nid = first_memory_node; + } + + return get_lruvec(memcg, nid); +} + +static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, + unsigned long max_seq, unsigned long *min_seq, + unsigned long seq) +{ + int i; + int type, tier; + int hist = lru_hist_from_seq(seq); + struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct lru_gen_mm_state *mm_state = 
get_mm_state(lruvec); + + for (tier = 0; tier < MAX_NR_TIERS; tier++) { + seq_printf(m, " %10d", tier); + for (type = 0; type < ANON_AND_FILE; type++) { + const char *s = "xxx"; + unsigned long n[3] = {}; + + if (seq == max_seq) { + s = "RTx"; + n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); + n[1] = READ_ONCE(lrugen->avg_total[type][tier]); + } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { + s = "rep"; + n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); + n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); + n[2] = READ_ONCE(lrugen->protected[hist][type][tier]); + } + + for (i = 0; i < 3; i++) + seq_printf(m, " %10lu%c", n[i], s[i]); + } + seq_putc(m, '\n'); + } + + if (!mm_state) + return; + + seq_puts(m, " "); + for (i = 0; i < NR_MM_STATS; i++) { + const char *s = "xxxx"; + unsigned long n = 0; + + if (seq == max_seq && NR_HIST_GENS == 1) { + s = "TYFA"; + n = READ_ONCE(mm_state->stats[hist][i]); + } else if (seq != max_seq && NR_HIST_GENS > 1) { + s = "tyfa"; + n = READ_ONCE(mm_state->stats[hist][i]); + } + + seq_printf(m, " %10lu%c", n, s[i]); + } + seq_putc(m, '\n'); +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static int lru_gen_seq_show(struct seq_file *m, void *v) +{ + unsigned long seq; + bool full = debugfs_get_aux_num(m->file); + struct lruvec *lruvec = v; + struct lru_gen_folio *lrugen = &lruvec->lrugen; + int nid = lruvec_pgdat(lruvec)->node_id; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (nid == first_memory_node) { + const char *path = memcg ? 
m->private : ""; + +#ifdef CONFIG_MEMCG + if (memcg) + cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); +#endif + seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); + } + + seq_printf(m, " node %5d\n", nid); + + if (!full) + seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / 2); + else if (max_seq >= MAX_NR_GENS) + seq = max_seq - MAX_NR_GENS + 1; + else + seq = 0; + + for (; seq <= max_seq; seq++) { + int type, zone; + int gen = lru_gen_from_seq(seq); + unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); + + seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); + + for (type = 0; type < ANON_AND_FILE; type++) { + unsigned long size = 0; + char mark = full && seq < min_seq[type] ? 'x' : ' '; + + for (zone = 0; zone < MAX_NR_ZONES; zone++) + size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + + seq_printf(m, " %10lu%c", size, mark); + } + + seq_putc(m, '\n'); + + if (full) + lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); + } + + return 0; +} + +static const struct seq_operations lru_gen_seq_ops = { + .start = lru_gen_seq_start, + .stop = lru_gen_seq_stop, + .next = lru_gen_seq_next, + .show = lru_gen_seq_show, +}; + +static int run_aging(struct lruvec *lruvec, unsigned long seq, + int swappiness, bool force_scan) +{ + DEFINE_MAX_SEQ(lruvec); + + if (seq > max_seq) + return -EINVAL; + + return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? 
0 : -EEXIST; +} + +static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, + int swappiness, unsigned long nr_to_reclaim) +{ + DEFINE_MAX_SEQ(lruvec); + + if (seq + MIN_NR_GENS > max_seq) + return -EINVAL; + + sc->nr_reclaimed = 0; + + while (!signal_pending(current)) { + DEFINE_MIN_SEQ(lruvec); + + if (seq < evictable_min_seq(min_seq, swappiness)) + return 0; + + if (sc->nr_reclaimed >= nr_to_reclaim) + return 0; + + if (!evict_folios(nr_to_reclaim - sc->nr_reclaimed, lruvec, sc, + swappiness)) + return 0; + + cond_resched(); + } + + return -EINTR; +} + +static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, + struct scan_control *sc, int swappiness, unsigned long opt) +{ + struct lruvec *lruvec; + int err = -EINVAL; + struct mem_cgroup *memcg = NULL; + + if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) + return -EINVAL; + + if (!mem_cgroup_disabled()) { + rcu_read_lock(); + + memcg = mem_cgroup_from_id(memcg_id); + if (!mem_cgroup_tryget(memcg)) + memcg = NULL; + + rcu_read_unlock(); + + if (!memcg) + return -EINVAL; + } + + if (memcg_id != mem_cgroup_id(memcg)) + goto done; + + sc->target_mem_cgroup = memcg; + lruvec = get_lruvec(memcg, nid); + + if (swappiness < MIN_SWAPPINESS) + swappiness = get_swappiness(lruvec, sc); + else if (swappiness > SWAPPINESS_ANON_ONLY) + goto done; + + switch (cmd) { + case '+': + err = run_aging(lruvec, seq, swappiness, opt); + break; + case '-': + err = run_eviction(lruvec, seq, sc, swappiness, opt); + break; + } +done: + mem_cgroup_put(memcg); + + return err; +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, + size_t len, loff_t *pos) +{ + void *buf; + char *cur, *next; + unsigned int flags; + struct blk_plug plug; + int err = -EINVAL; + struct scan_control sc = { + .may_writepage = true, + .may_unmap = true, + .may_swap = true, + .reclaim_idx = MAX_NR_ZONES - 1, 
+ .gfp_mask = GFP_KERNEL, + .proactive = true, + }; + + buf = kvmalloc(len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, src, len)) { + kvfree(buf); + return -EFAULT; + } + + set_task_reclaim_state(current, &sc.reclaim_state); + flags = memalloc_noreclaim_save(); + blk_start_plug(&plug); + if (!set_mm_walk(NULL, true)) { + err = -ENOMEM; + goto done; + } + + next = buf; + next[len] = '\0'; + + while ((cur = strsep(&next, ",;\n"))) { + int n; + int end; + char cmd, swap_string[5]; + unsigned int memcg_id; + unsigned int nid; + unsigned long seq; + unsigned int swappiness; + unsigned long opt = -1; + + cur = skip_spaces(cur); + if (!*cur) + continue; + + n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid, + &seq, &end, swap_string, &end, &opt, &end); + if (n < 4 || cur[end]) { + err = -EINVAL; + break; + } + + if (n == 4) { + swappiness = -1; + } else if (!strcmp("max", swap_string)) { + /* set by userspace for anonymous memory only */ + swappiness = SWAPPINESS_ANON_ONLY; + } else { + err = kstrtouint(swap_string, 0, &swappiness); + if (err) + break; + } + + err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); + if (err) + break; + } +done: + clear_mm_walk(); + blk_finish_plug(&plug); + memalloc_noreclaim_restore(flags); + set_task_reclaim_state(current, NULL); + + kvfree(buf); + + return err ? 
: len; +} + +static int lru_gen_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &lru_gen_seq_ops); +} + +static const struct file_operations lru_gen_rw_fops = { + .open = lru_gen_seq_open, + .read = seq_read, + .write = lru_gen_seq_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations lru_gen_ro_fops = { + .open = lru_gen_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/****************************************************************************** + * initialization + ******************************************************************************/ + +void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ + int i, j; + + spin_lock_init(&pgdat->memcg_lru.lock); + + for (i = 0; i < MEMCG_NR_GENS; i++) { + for (j = 0; j < MEMCG_NR_BINS; j++) + INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); + } +} + +void lru_gen_init_lruvec(struct lruvec *lruvec) +{ + int i; + int gen, type, zone; + struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); + + lrugen->max_seq = MIN_NR_GENS + 1; + lrugen->enabled = lru_gen_enabled(); + + for (i = 0; i <= MIN_NR_GENS + 1; i++) + lrugen->timestamps[i] = jiffies; + + for_each_gen_type_zone(gen, type, zone) + INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); + + if (mm_state) + mm_state->seq = MIN_NR_GENS; +} + +#ifdef CONFIG_MEMCG + +void lru_gen_init_memcg(struct mem_cgroup *memcg) +{ + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + + if (!mm_list) + return; + + INIT_LIST_HEAD(&mm_list->fifo); + spin_lock_init(&mm_list->lock); +} + +void lru_gen_exit_memcg(struct mem_cgroup *memcg) +{ + int i; + int nid; + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + + VM_WARN_ON_ONCE(mm_list && !list_empty(&mm_list->fifo)); + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); + + 
VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, + sizeof(lruvec->lrugen.nr_pages))); + + lruvec->lrugen.list.next = LIST_POISON1; + + if (!mm_state) + continue; + + for (i = 0; i < NR_BLOOM_FILTERS; i++) { + bitmap_free(mm_state->filters[i]); + mm_state->filters[i] = NULL; + } + } +} + +#endif /* CONFIG_MEMCG */ + +static int __init init_lru_gen(void) +{ + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); + + if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) + pr_err("lru_gen: failed to create sysfs group\n"); + + debugfs_create_file_aux_num("lru_gen", 0644, NULL, NULL, false, + &lru_gen_rw_fops); + debugfs_create_file_aux_num("lru_gen_full", 0444, NULL, NULL, true, + &lru_gen_ro_fops); + + return 0; +}; +late_initcall(init_lru_gen); + +#else /* !CONFIG_LRU_GEN */ + +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) +{ + BUILD_BUG(); +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + BUILD_BUG(); +} + +static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) +{ + BUILD_BUG(); +} + +#endif /* CONFIG_LRU_GEN */ + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; @@ -1944,14 +5775,33 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) enum lru_list lru; unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; + bool proportional_reclaim; struct blk_plug plug; - bool scan_adjusted = false; + + if (lru_gen_enabled() && !root_reclaim(sc)) { + lru_gen_shrink_lruvec(lruvec, sc); + return; + } get_scan_count(lruvec, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); + /* + * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal + * event that can occur when there is little memory pressure e.g. + * multiple streaming readers/writers. 
Hence, we do not abort scanning + * when the requested number of pages are reclaimed when scanning at + * DEF_PRIORITY on the assumption that the fact we are direct + * reclaiming implies that kswapd is not keeping up and it is best to + * do a batch of work at once. For memcg reclaim one check is made to + * abort proportional reclaim if either the file or anon lru has already + * dropped to zero at the first pass. + */ + proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() && + sc->priority == DEF_PRIORITY); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { @@ -1968,21 +5818,14 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) } } - if (nr_reclaimed < nr_to_reclaim || scan_adjusted) - continue; + cond_resched(); - /* - * For global direct reclaim, reclaim only the number of pages - * requested. Less care is taken to scan proportionally as it - * is more important to minimise direct reclaim stall latency - * than it is to properly age the LRU lists. - */ - if (global_reclaim(sc) && !current_is_kswapd()) - break; + if (nr_reclaimed < nr_to_reclaim || proportional_reclaim) + continue; /* * For kswapd and memcg, reclaim at least the number of pages - * requested. Ensure that the anon and file LRUs shrink + * requested. Ensure that the anon and file LRUs are scanned * proportionally what was requested by get_scan_count(). We * stop reclaiming one LRU and reduce the amount scanning * proportional to the original scan target. @@ -1990,6 +5833,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + /* + * It's just vindictive to attack the larger once the smaller + * has gone to zero. And given the way we stop scanning the + * smaller below, this makes sure that we only make one nudge + * towards proportionality once we've got nr_to_reclaim. 
+ */ + if (!nr_file || !nr_anon) + break; + if (nr_file > nr_anon) { unsigned long scan_target = targets[LRU_INACTIVE_ANON] + targets[LRU_ACTIVE_ANON] + 1; @@ -2019,8 +5871,6 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) nr_scanned = targets[lru] - nr[lru]; nr[lru] = targets[lru] * (100 - percentage) / 100; nr[lru] -= min(nr[lru], nr_scanned); - - scan_adjusted = true; } blk_finish_plug(&plug); sc->nr_reclaimed += nr_reclaimed; @@ -2029,17 +5879,16 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_anon_is_low(lruvec)) + if (can_age_anon_pages(lruvec, sc) && + inactive_is_low(lruvec, LRU_INACTIVE_ANON)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); - - throttle_vm_writeout(sc->gfp_mask); } /* Use reclaim/compaction for costly allocs or under memory pressure */ static bool in_reclaim_compaction(struct scan_control *sc) { - if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + if (gfp_compaction_allowed(sc->gfp_mask) && sc->order && (sc->order > PAGE_ALLOC_COSTLY_ORDER || sc->priority < DEF_PRIORITY - 2)) return true; @@ -2051,41 +5900,46 @@ static bool in_reclaim_compaction(struct scan_control *sc) * Reclaim/compaction is used for high-order allocation requests. It reclaims * order-0 pages before compacting the zone. should_continue_reclaim() returns * true if more pages should be reclaimed such that when the page allocator - * calls try_to_compact_zone() that it will have enough free pages to succeed. + * calls try_to_compact_pages() that it will have enough free pages to succeed. * It will give up earlier than that if there is difficulty reclaiming pages. 
*/ -static inline bool should_continue_reclaim(struct zone *zone, +static inline bool should_continue_reclaim(struct pglist_data *pgdat, unsigned long nr_reclaimed, - unsigned long nr_scanned, struct scan_control *sc) { unsigned long pages_for_compaction; unsigned long inactive_lru_pages; + int z; + struct zone *zone; /* If not in reclaim/compaction mode, stop */ if (!in_reclaim_compaction(sc)) return false; - /* Consider stopping depending on scan and reclaim activity */ - if (sc->gfp_mask & __GFP_REPEAT) { - /* - * For __GFP_REPEAT allocations, stop reclaiming if the - * full LRU list has been scanned and we are still failing - * to reclaim pages. This full LRU scan is potentially - * expensive but a __GFP_REPEAT caller really wants to succeed - */ - if (!nr_reclaimed && !nr_scanned) + /* + * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX + * number of pages that were scanned. This will return to the caller + * with the risk reclaim/compaction and the resulting allocation attempt + * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL + * allocations through requiring that the full LRU list has been scanned + * first, by assuming that zero delta of sc->nr_scanned means full LRU + * scan, but that approximation was wrong, and there were corner cases + * where always a non-zero amount of pages were scanned. + */ + if (!nr_reclaimed) + return false; + + /* If compaction would go ahead or the allocation would succeed, stop */ + for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) { + unsigned long watermark = min_wmark_pages(zone); + + /* Allocation can already succeed, nothing to do */ + if (zone_watermark_ok(zone, sc->order, watermark, + sc->reclaim_idx, 0)) return false; - } else { - /* - * For non-__GFP_REPEAT allocations which can presumably - * fail without consequence, stop if we failed to reclaim - * any pages from the last SWAP_CLUSTER_MAX number of - * pages that were scanned. 
This will return to the - * caller faster at the risk reclaim/compaction and - * the resulting allocation attempt fails - */ - if (!nr_reclaimed) + + if (compaction_suitable(zone, sc->order, watermark, + sc->reclaim_idx)) return false; } @@ -2093,107 +5947,265 @@ static inline bool should_continue_reclaim(struct zone *zone, * If we have not reclaimed enough pages for compaction and the * inactive lists are large enough, continue reclaiming */ - pages_for_compaction = (2UL << sc->order); - inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); - if (get_nr_swap_pages() > 0) - inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); - if (sc->nr_reclaimed < pages_for_compaction && - inactive_lru_pages > pages_for_compaction) - return true; + pages_for_compaction = compact_gap(sc->order); + inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); + if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) + inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); - /* If compaction would go ahead or the allocation would succeed, stop */ - switch (compaction_suitable(zone, sc->order)) { - case COMPACT_PARTIAL: - case COMPACT_CONTINUE: - return false; - default: - return true; - } + return inactive_lru_pages > pages_for_compaction; } -static void shrink_zone(struct zone *zone, struct scan_control *sc) +static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) { - unsigned long nr_reclaimed, nr_scanned; - - do { - struct mem_cgroup *root = sc->target_mem_cgroup; - struct mem_cgroup_reclaim_cookie reclaim = { - .zone = zone, - .priority = sc->priority, - }; - struct mem_cgroup *memcg; + struct mem_cgroup *target_memcg = sc->target_mem_cgroup; + struct mem_cgroup_reclaim_cookie reclaim = { + .pgdat = pgdat, + }; + struct mem_cgroup_reclaim_cookie *partial = &reclaim; + struct mem_cgroup *memcg; - nr_reclaimed = sc->nr_reclaimed; - nr_scanned = sc->nr_scanned; + /* + * In most cases, direct reclaimers can do partial walks + * through 
the cgroup tree, using an iterator state that + * persists across invocations. This strikes a balance between + * fairness and allocation latency. + * + * For kswapd, reliable forward progress is more important + * than a quick return to idle. Always do full walks. + */ + if (current_is_kswapd() || sc->memcg_full_walk) + partial = NULL; - memcg = mem_cgroup_iter(root, NULL, &reclaim); - do { - struct lruvec *lruvec; + memcg = mem_cgroup_iter(target_memcg, NULL, partial); + do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + unsigned long reclaimed; + unsigned long scanned; - lruvec = mem_cgroup_zone_lruvec(zone, memcg); + /* + * This loop can become CPU-bound when target memcgs + * aren't eligible for reclaim - either because they + * don't have any reclaimable pages, or because their + * memory is explicitly protected. Avoid soft lockups. + */ + cond_resched(); - shrink_lruvec(lruvec, sc); + mem_cgroup_calculate_protection(target_memcg, memcg); + if (mem_cgroup_below_min(target_memcg, memcg)) { /* - * Direct reclaim and kswapd have to scan all memory - * cgroups to fulfill the overall scan target for the - * zone. - * - * Limit reclaim, on the other hand, only cares about - * nr_to_reclaim pages to be reclaimed and it will - * retry with decreasing priority if one round over the - * whole hierarchy is not sufficient. + * Hard protection. + * If there is no reclaimable memory, OOM. */ - if (!global_reclaim(sc) && - sc->nr_reclaimed >= sc->nr_to_reclaim) { - mem_cgroup_iter_break(root, memcg); - break; + continue; + } else if (mem_cgroup_below_low(target_memcg, memcg)) { + /* + * Soft protection. + * Respect the protection only as long as + * there is an unprotected supply + * of reclaimable memory from other cgroups. 
+ */ + if (!sc->memcg_low_reclaim) { + sc->memcg_low_skipped = 1; + continue; } - memcg = mem_cgroup_iter(root, memcg, &reclaim); - } while (memcg); + memcg_memory_event(memcg, MEMCG_LOW); + } + + reclaimed = sc->nr_reclaimed; + scanned = sc->nr_scanned; + + shrink_lruvec(lruvec, sc); - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, - sc->nr_scanned - nr_scanned, - sc->nr_reclaimed - nr_reclaimed); + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, + sc->priority); - } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, - sc->nr_scanned - nr_scanned, sc)); + /* Record the group's reclaim efficiency */ + if (!sc->proactive) + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); + + /* If partial walks are allowed, bail once goal is reached */ + if (partial && sc->nr_reclaimed >= sc->nr_to_reclaim) { + mem_cgroup_iter_break(target_memcg, memcg); + break; + } + } while ((memcg = mem_cgroup_iter(target_memcg, memcg, partial))); } -/* Returns true if compaction should go ahead for a high-order request */ +static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) +{ + unsigned long nr_reclaimed, nr_scanned, nr_node_reclaimed; + struct lruvec *target_lruvec; + bool reclaimable = false; + + if (lru_gen_enabled() && root_reclaim(sc)) { + memset(&sc->nr, 0, sizeof(sc->nr)); + lru_gen_shrink_node(pgdat, sc); + return; + } + + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + +again: + memset(&sc->nr, 0, sizeof(sc->nr)); + + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; + + prepare_scan_control(pgdat, sc); + + shrink_node_memcgs(pgdat, sc); + + flush_reclaim_state(sc); + + nr_node_reclaimed = sc->nr_reclaimed - nr_reclaimed; + + /* Record the subtree's reclaim efficiency */ + if (!sc->proactive) + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + sc->nr_scanned - nr_scanned, nr_node_reclaimed); + + if (nr_node_reclaimed) + reclaimable = true; + + if 
(current_is_kswapd()) { + /* + * If reclaim is isolating dirty pages under writeback, + * it implies that the long-lived page allocation rate + * is exceeding the page laundering rate. Either the + * global limits are not being effective at throttling + * processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing + * device. The only option is to throttle from reclaim + * context which is not ideal as there is no guarantee + * the dirtying process is throttled in the same way + * balance_dirty_pages() manages. + * + * Once a node is flagged PGDAT_WRITEBACK, kswapd will + * count the number of pages under writeback flagged for + * immediate reclaim and stall if any are encountered + * in the nr_immediate check below. + */ + if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) + set_bit(PGDAT_WRITEBACK, &pgdat->flags); + + /* + * If kswapd scans pages marked for immediate + * reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU + * faster than they are written so forcibly stall + * until some pages complete writeback. + */ + if (sc->nr.immediate) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); + } + + /* + * Tag a node/memcg as congested if all the dirty pages were marked + * for writeback and immediate reclaim (counted in nr.congested). + * + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling in reclaim_throttle(). + */ + if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) { + if (cgroup_reclaim(sc) && writeback_throttling_sane(sc)) + set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags); + + if (current_is_kswapd()) + set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags); + } + + /* + * Stall direct reclaim for IO completions if the lruvec or + * node is congested. Allow kswapd to continue until it + * starts encountering unqueued dirty pages or cycling through + * the LRU too quickly. 
+ */ + if (!current_is_kswapd() && current_may_throttle() && + !sc->hibernation_mode && + (test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) || + test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags))) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED); + + if (should_continue_reclaim(pgdat, nr_node_reclaimed, sc)) + goto again; + + /* + * Kswapd gives up on balancing particular nodes after too + * many failures to reclaim anything from them and goes to + * sleep. On reclaim progress, reset the failure counter. A + * successful direct reclaim run will revive a dormant kswapd. + */ + if (reclaimable) + atomic_set(&pgdat->kswapd_failures, 0); + else if (sc->cache_trim_mode) + sc->cache_trim_mode_failed = 1; +} + +/* + * Returns true if compaction should go ahead for a costly-order request, or + * the allocation would already succeed without compaction. Return false if we + * should reclaim first. + */ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { - unsigned long balance_gap, watermark; - bool watermark_ok; + unsigned long watermark; - /* Do not consider compaction for orders reclaim is meant to satisfy */ - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) + if (!gfp_compaction_allowed(sc->gfp_mask)) return false; + /* Allocation can already succeed, nothing to do */ + if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), + sc->reclaim_idx, 0)) + return true; + /* - * Compaction takes time to run and there are potentially other - * callers using the pages just freed. Continue reclaiming until - * there is a buffer of free pages available to give compaction - * a reasonable chance of completing and allocating the page + * Direct reclaim usually targets the min watermark, but compaction + * takes time to run and there are potentially other callers using the + * pages just freed. So target a higher buffer to give compaction a + * reasonable chance of completing and allocating the pages. 
+ * + * Note that we won't actually reclaim the whole buffer in one attempt + * as the target watermark in should_continue_reclaim() is lower. But if + * we are already above the high+gap watermark, don't reclaim at all. */ - balance_gap = min(low_wmark_pages(zone), - (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); - watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); - watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); + watermark = high_wmark_pages(zone); + if (compaction_suitable(zone, sc->order, watermark, sc->reclaim_idx)) + return true; + + return false; +} +static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) +{ /* - * If compaction is deferred, reclaim up to a point where - * compaction will have a chance of success when re-enabled + * If reclaim is making progress greater than 12% efficiency then + * wake all the NOPROGRESS throttled tasks. */ - if (compaction_deferred(zone, sc->order)) - return watermark_ok; + if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) { + wait_queue_head_t *wqh; - /* If compaction is not ready to start, keep reclaiming */ - if (!compaction_suitable(zone, sc->order)) - return false; + wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS]; + if (waitqueue_active(wqh)) + wake_up(wqh); + + return; + } - return watermark_ok; + /* + * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will + * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages + * under writeback and marked for immediate reclaim at the tail of the + * LRU. + */ + if (current_is_kswapd() || cgroup_reclaim(sc)) + return; + + /* Throttle if making no progress at high priorities. 
*/ + if (sc->priority == 1 && !sc->nr_reclaimed) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS); } /* @@ -2201,67 +6213,66 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * try to reclaim pages from zones which will satisfy the caller's allocation * request. * - * We reclaim from a zone even if that zone is over high_wmark_pages(zone). - * Because: - * a) The caller may be trying to free *extra* pages to satisfy a higher-order - * allocation or - * b) The target zone may be at high_wmark_pages(zone) but the lower zones - * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' - * zone defense algorithm. - * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. - * - * This function returns true if a zone is being reclaimed for a costly - * high-order allocation and compaction is ready to begin. This indicates to - * the caller that it should consider retrying the allocation instead of - * further reclaim. 
*/ -static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) +static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; - bool aborted_reclaim = false; + gfp_t orig_mask; + pg_data_t *last_pgdat = NULL; + pg_data_t *first_pgdat = NULL; /* * If the number of buffer_heads in the machine exceeds the maximum * allowed level, force direct reclaim to scan the highmem zone as * highmem pages could be pinning lowmem pages storing buffer_heads */ - if (buffer_heads_over_limit) + orig_mask = sc->gfp_mask; + if (buffer_heads_over_limit) { sc->gfp_mask |= __GFP_HIGHMEM; + sc->reclaim_idx = gfp_zone(sc->gfp_mask); + } for_each_zone_zonelist_nodemask(zone, z, zonelist, - gfp_zone(sc->gfp_mask), sc->nodemask) { - if (!populated_zone(zone)) - continue; + sc->reclaim_idx, sc->nodemask) { /* * Take care memory controller reclaiming has small influence * to global LRU. */ - if (global_reclaim(sc)) { - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + if (!cgroup_reclaim(sc)) { + if (!cpuset_zone_allowed(zone, + GFP_KERNEL | __GFP_HARDWALL)) + continue; + + /* + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. + * Even though compaction is invoked for any + * non-zero order, only frequent costly order + * reclamation is disruptive enough to become a + * noticeable problem, like transparent huge + * page allocations. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && + sc->order > PAGE_ALLOC_COSTLY_ORDER && + compaction_ready(zone, sc)) { + sc->compaction_ready = true; continue; - if (zone->all_unreclaimable && - sc->priority != DEF_PRIORITY) - continue; /* Let kswapd poll it */ - if (IS_ENABLED(CONFIG_COMPACTION)) { - /* - * If we already have plenty of memory free for - * compaction in this zone, don't free any more. 
- * Even though compaction is invoked for any - * non-zero order, only frequent costly order - * reclamation is disruptive enough to become a - * noticeable problem, like transparent huge - * page allocations. - */ - if (compaction_ready(zone, sc)) { - aborted_reclaim = true; - continue; - } } + + /* + * Shrink each node in the zonelist once. If the + * zonelist is ordered by zone (not the default) then a + * node may be shrunk multiple times but in that case + * the user prefers lower zones being preserved. + */ + if (zone->zone_pgdat == last_pgdat) + continue; + /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and @@ -2269,43 +6280,47 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * and balancing, not for a memcg's limit. */ nr_soft_scanned = 0; - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, - sc->order, sc->gfp_mask, - &nr_soft_scanned); + nr_soft_reclaimed = memcg1_soft_limit_reclaim(zone->zone_pgdat, + sc->order, sc->gfp_mask, + &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; /* need some check for avoid more shrink_zone() */ } - shrink_zone(zone, sc); + if (!first_pgdat) + first_pgdat = zone->zone_pgdat; + + /* See comment about same check for global reclaim above */ + if (zone->zone_pgdat == last_pgdat) + continue; + last_pgdat = zone->zone_pgdat; + shrink_node(zone->zone_pgdat, sc); } - return aborted_reclaim; -} + if (first_pgdat) + consider_reclaim_throttle(first_pgdat, sc); -static bool zone_reclaimable(struct zone *zone) -{ - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; + /* + * Restore to original mask to avoid the impact on the caller if we + * promoted it to __GFP_HIGHMEM. + */ + sc->gfp_mask = orig_mask; } -/* All zones in zonelist are unreclaimable? 
*/ -static bool all_unreclaimable(struct zonelist *zonelist, - struct scan_control *sc) +static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) { - struct zoneref *z; - struct zone *zone; + struct lruvec *target_lruvec; + unsigned long refaults; - for_each_zone_zonelist_nodemask(zone, z, zonelist, - gfp_zone(sc->gfp_mask), sc->nodemask) { - if (!populated_zone(zone)) - continue; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - if (!zone->all_unreclaimable) - return false; - } + if (lru_gen_enabled()) + return; - return true; + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); + target_lruvec->refaults[WORKINGSET_ANON] = refaults; + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); + target_lruvec->refaults[WORKINGSET_FILE] = refaults; } /* @@ -2325,52 +6340,30 @@ static bool all_unreclaimable(struct zonelist *zonelist, * else, the number of pages reclaimed */ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, - struct scan_control *sc, - struct shrink_control *shrink) + struct scan_control *sc) { - unsigned long total_scanned = 0; - struct reclaim_state *reclaim_state = current->reclaim_state; + int initial_priority = sc->priority; + pg_data_t *last_pgdat; struct zoneref *z; struct zone *zone; - unsigned long writeback_threshold; - bool aborted_reclaim; - +retry: delayacct_freepages_start(); - if (global_reclaim(sc)) - count_vm_event(ALLOCSTALL); + if (!cgroup_reclaim(sc)) + __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); do { - vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, - sc->priority); + if (!sc->proactive) + vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, + sc->priority); sc->nr_scanned = 0; - aborted_reclaim = shrink_zones(zonelist, sc); - - /* - * Don't shrink slabs when reclaiming memory from over limit - * cgroups but do shrink slab at least once when aborting - * reclaim for 
compaction to avoid unevenly scanning file/anon - * LRU pages over slab pages. - */ - if (global_reclaim(sc)) { - unsigned long lru_pages = 0; - for_each_zone_zonelist(zone, z, zonelist, - gfp_zone(sc->gfp_mask)) { - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; + shrink_zones(zonelist, sc); - lru_pages += zone_reclaimable_pages(zone); - } - - shrink_slab(shrink, sc->nr_scanned, lru_pages); - if (reclaim_state) { - sc->nr_reclaimed += reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } - } - total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) - goto out; + break; + + if (sc->compaction_ready) + break; /* * If we're getting trouble reclaiming, start doing @@ -2378,48 +6371,79 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, */ if (sc->priority < DEF_PRIORITY - 2) sc->may_writepage = 1; + } while (--sc->priority >= 0); - /* - * Try to write back as many pages as we just scanned. This - * tends to cause slow streaming writers to write data to the - * disk smoothly, at the dirtying rate, which is nice. But - * that's undesirable in laptop mode, where we *want* lumpy - * writeout. So in laptop mode, write out the whole world. - */ - writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; - if (total_scanned > writeback_threshold) { - wakeup_flusher_threads(laptop_mode ? 
0 : total_scanned, - WB_REASON_TRY_TO_FREE_PAGES); - sc->may_writepage = 1; + last_pgdat = NULL; + for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, + sc->nodemask) { + if (zone->zone_pgdat == last_pgdat) + continue; + last_pgdat = zone->zone_pgdat; + + snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); + + if (cgroup_reclaim(sc)) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, + zone->zone_pgdat); + clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); } - } while (--sc->priority >= 0 && !aborted_reclaim); + } -out: delayacct_freepages_end(); if (sc->nr_reclaimed) return sc->nr_reclaimed; + /* Aborted reclaim to try compaction? don't OOM, then */ + if (sc->compaction_ready) + return 1; + /* - * As hibernation is going on, kswapd is freezed so that it can't mark - * the zone into all_unreclaimable. Thus bypassing all_unreclaimable - * check. + * In most cases, direct reclaimers can do partial walks + * through the cgroup tree to meet the reclaim goal while + * keeping latency low. Since the iterator state is shared + * among all direct reclaim invocations (to retain fairness + * among cgroups), though, high concurrency can result in + * individual threads not seeing enough cgroups to make + * meaningful forward progress. Avoid false OOMs in this case. */ - if (oom_killer_disabled) - return 0; + if (!sc->memcg_full_walk) { + sc->priority = initial_priority; + sc->memcg_full_walk = 1; + goto retry; + } - /* Aborted reclaim to try compaction? don't OOM, then */ - if (aborted_reclaim) - return 1; + /* + * We make inactive:active ratio decisions based on the node's + * composition of memory, but a restrictive reclaim_idx or a + * memory.low cgroup setting can exempt large amounts of + * memory from reclaim. 
Neither of which are very common, so + * instead of doing costly eligibility calculations of the + * entire cgroup subtree up front, we assume the estimates are + * good, and retry with forcible deactivation if that fails. + */ + if (sc->skipped_deactivate) { + sc->priority = initial_priority; + sc->force_deactivate = 1; + sc->skipped_deactivate = 0; + goto retry; + } - /* top priority shrink_zones still had more to do? don't OOM, then */ - if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) - return 1; + /* Untapped cgroup reserves? Don't OOM, retry. */ + if (sc->memcg_low_skipped) { + sc->priority = initial_priority; + sc->force_deactivate = 0; + sc->memcg_low_reclaim = 1; + sc->memcg_low_skipped = 0; + goto retry; + } return 0; } -static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) +static bool allow_direct_reclaim(pg_data_t *pgdat) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; @@ -2427,18 +6451,28 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) int i; bool wmark_ok; - for (i = 0; i <= ZONE_NORMAL; i++) { - zone = &pgdat->node_zones[i]; + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) + return true; + + for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) { + if (!zone_reclaimable_pages(zone) && zone_page_state_snapshot(zone, NR_FREE_PAGES)) + continue; + pfmemalloc_reserve += min_wmark_pages(zone); - free_pages += zone_page_state(zone, NR_FREE_PAGES); + free_pages += zone_page_state_snapshot(zone, NR_FREE_PAGES); } + /* If there are no reserves (unexpected config) then do not throttle */ + if (!pfmemalloc_reserve) + return true; + wmark_ok = free_pages > pfmemalloc_reserve / 2; /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { - pgdat->classzone_idx = min(pgdat->classzone_idx, - (enum zone_type)ZONE_NORMAL); + if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL); + 
wake_up_interruptible(&pgdat->kswapd_wait); } @@ -2457,9 +6491,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, nodemask_t *nodemask) { + struct zoneref *z; struct zone *zone; - int high_zoneidx = gfp_zone(gfp_mask); - pg_data_t *pgdat; + pg_data_t *pgdat = NULL; /* * Kernel threads should not be throttled as they may be indirectly @@ -2478,10 +6512,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, if (fatal_signal_pending(current)) goto out; - /* Check if the pfmemalloc reserves are ok */ - first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); - pgdat = zone->zone_pgdat; - if (pfmemalloc_watermark_ok(pgdat)) + /* + * Check if the pfmemalloc reserves are ok by finding the first node + * with a usable ZONE_NORMAL or lower zone. The expectation is that + * GFP_KERNEL will be required for allocating network buffers when + * swapping over the network so ZONE_HIGHMEM is unusable. + * + * Throttling is based on the first usable node and throttled processes + * wait on a queue until kswapd makes progress and wakes them. There + * is an affinity then between processes waking up and where reclaim + * progress has been made assuming the process wakes on the same node. + * More importantly, processes running on remote nodes will not compete + * for remote pfmemalloc reserves and processes on different nodes + * should make reasonable progress. 
+ */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(gfp_mask), nodemask) { + if (zone_idx(zone) > ZONE_NORMAL) + continue; + + /* Throttle based on the first usable node */ + pgdat = zone->zone_pgdat; + if (allow_direct_reclaim(pgdat)) + goto out; + break; + } + + /* If no zone was usable by the allocation flags then do not throttle */ + if (!pgdat) goto out; /* Account for the throttling */ @@ -2495,18 +6553,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, * blocked waiting on the same lock. Instead, throttle for up to a * second before continuing. */ - if (!(gfp_mask & __GFP_FS)) { + if (!(gfp_mask & __GFP_FS)) wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat), HZ); - - goto check_pending; - } - - /* Throttle until kswapd wakes the process */ - wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat)); + allow_direct_reclaim(pgdat), HZ); + else + /* Throttle until kswapd wakes the process */ + wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, + allow_direct_reclaim(pgdat)); -check_pending: if (fatal_signal_pending(current)) return true; @@ -2519,69 +6573,74 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, { unsigned long nr_reclaimed; struct scan_control sc = { - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), - .may_writepage = !laptop_mode, .nr_to_reclaim = SWAP_CLUSTER_MAX, - .may_unmap = 1, - .may_swap = 1, + .gfp_mask = current_gfp_context(gfp_mask), + .reclaim_idx = gfp_zone(gfp_mask), .order = order, - .priority = DEF_PRIORITY, - .target_mem_cgroup = NULL, .nodemask = nodemask, + .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = 1, }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; + + /* + * scan_control uses s8 fields for order, priority, and reclaim_idx. + * Confirm they are large enough for max values. 
+ */ + BUILD_BUG_ON(MAX_PAGE_ORDER >= S8_MAX); + BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); + BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX); /* * Do not enter reclaim if fatal signal was delivered while throttled. * 1 is returned so that the page allocator does not OOM kill at this * point. */ - if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask)) + if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1; - trace_mm_vmscan_direct_reclaim_begin(order, - sc.may_writepage, - gfp_mask); + set_task_reclaim_state(current, &sc.reclaim_state); + trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); - nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); + set_task_reclaim_state(current, NULL); return nr_reclaimed; } #ifdef CONFIG_MEMCG -unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, +/* Only used by soft limit reclaim. Do not reuse for anything else. */ +unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap, - struct zone *zone, + pg_data_t *pgdat, unsigned long *nr_scanned) { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); struct scan_control sc = { - .nr_scanned = 0, .nr_to_reclaim = SWAP_CLUSTER_MAX, + .target_mem_cgroup = memcg, .may_writepage = !laptop_mode, .may_unmap = 1, + .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, - .order = 0, - .priority = 0, - .target_mem_cgroup = memcg, }; - struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + + WARN_ON_ONCE(!current->reclaim_state); sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, - sc.may_writepage, sc.gfp_mask); /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. 
- * if we don't reclaim here, the shrink_zone from balance_pgdat + * if we don't reclaim here, the shrink_node from balance_pgdat * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ @@ -2590,143 +6649,189 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); *nr_scanned = sc.nr_scanned; + return sc.nr_reclaimed; } unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + unsigned long nr_pages, gfp_t gfp_mask, - bool noswap) + unsigned int reclaim_options, + int *swappiness) { - struct zonelist *zonelist; unsigned long nr_reclaimed; - int nid; + unsigned int noreclaim_flag; struct scan_control sc = { + .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), + .proactive_swappiness = swappiness, + .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), + .reclaim_idx = MAX_NR_ZONES - 1, + .target_mem_cgroup = memcg, + .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, .may_unmap = 1, - .may_swap = !noswap, - .nr_to_reclaim = SWAP_CLUSTER_MAX, - .order = 0, - .priority = DEF_PRIORITY, - .target_mem_cgroup = memcg, - .nodemask = NULL, /* we don't care the placement */ - .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | - (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), + .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), + .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; - /* - * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't - * take care of from where we get pages. So the node where we start the - * scan does not need to be the current node. + * Traverse the ZONELIST_FALLBACK zonelist of the current node to put + * equal pressure on all the nodes. This is based on the assumption that + * the reclaim does not bail out early. 
*/ - nid = mem_cgroup_select_victim_node(memcg); - - zonelist = NODE_DATA(nid)->node_zonelists; + struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); - trace_mm_vmscan_memcg_reclaim_begin(0, - sc.may_writepage, - sc.gfp_mask); + set_task_reclaim_state(current, &sc.reclaim_state); + trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); + noreclaim_flag = memalloc_noreclaim_save(); - nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + memalloc_noreclaim_restore(noreclaim_flag); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); + set_task_reclaim_state(current, NULL); return nr_reclaimed; } +#else +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + unsigned long nr_pages, + gfp_t gfp_mask, + unsigned int reclaim_options, + int *swappiness) +{ + return 0; +} #endif -static void age_active_anon(struct zone *zone, struct scan_control *sc) +static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; + struct lruvec *lruvec; - if (!total_swap_pages) + if (lru_gen_enabled()) { + lru_gen_age_node(pgdat, sc); return; + } - memcg = mem_cgroup_iter(NULL, NULL, NULL); - do { - struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + lruvec = mem_cgroup_lruvec(NULL, pgdat); + if (!can_age_anon_pages(lruvec, sc)) + return; - if (inactive_anon_is_low(lruvec)) - shrink_active_list(SWAP_CLUSTER_MAX, lruvec, - sc, LRU_ACTIVE_ANON); + if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) + return; + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + lruvec = mem_cgroup_lruvec(memcg, pgdat); + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); memcg = mem_cgroup_iter(NULL, memcg, NULL); } while (memcg); } -static bool zone_balanced(struct zone *zone, int order, - unsigned long balance_gap, int classzone_idx) +static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) { - if 
(!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + - balance_gap, classzone_idx, 0)) - return false; + int i; + struct zone *zone; - if (IS_ENABLED(CONFIG_COMPACTION) && order && - !compaction_suitable(zone, order)) - return false; + /* + * Check for watermark boosts top-down as the higher zones + * are more likely to be boosted. Both watermarks and boosts + * should not be checked at the same time as reclaim would + * start prematurely when there is no boosting and a lower + * zone is balanced. + */ + for (i = highest_zoneidx; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!managed_zone(zone)) + continue; - return true; + if (zone->watermark_boost) + return true; + } + + return false; } /* - * pgdat_balanced() is used when checking if a node is balanced. - * - * For order-0, all zones must be balanced! - * - * For high-order allocations only zones that meet watermarks and are in a - * zone allowed by the callers classzone_idx are added to balanced_pages. The - * total of balanced pages must be at least 25% of the zones allowed by - * classzone_idx for the node to be considered balanced. Forcing all zones to - * be balanced for high orders can cause excessive reclaim when there are - * imbalanced zones. - * The choice of 25% is due to - * o a 16M DMA zone that is balanced will not balance a zone on any - * reasonable sized machine - * o On all other machines, the top zone must be at least a reasonable - * percentage of the middle zones. For example, on 32-bit x86, highmem - * would need to be at least 256M for it to be balance a whole node. - * Similarly, on x86-64 the Normal zone would need to be at least 1G - * to balance a node on its own. These seemed like reasonable ratios. 
+ * Returns true if there is an eligible zone balanced for the request order + * and highest_zoneidx */ -static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) +static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) { - unsigned long managed_pages = 0; - unsigned long balanced_pages = 0; int i; + unsigned long mark = -1; + struct zone *zone; - /* Check the watermark levels */ - for (i = 0; i <= classzone_idx; i++) { - struct zone *zone = pgdat->node_zones + i; + /* + * Check watermarks bottom-up as lower zones are more likely to + * meet watermarks. + */ + for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { + enum zone_stat_item item; + unsigned long free_pages; - if (!populated_zone(zone)) - continue; + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) + mark = promo_wmark_pages(zone); + else + mark = high_wmark_pages(zone); - managed_pages += zone->managed_pages; + /* + * In defrag_mode, watermarks must be met in whole + * blocks to avoid polluting allocator fallbacks. + * + * However, kswapd usually cannot accomplish this on + * its own and needs kcompactd support. Once it's + * reclaimed a compaction gap, and kswapd_shrink_node + * has dropped order, simply ensure there are enough + * base pages for compaction, wake kcompactd & sleep. + */ + if (defrag_mode && order) + item = NR_FREE_PAGES_BLOCKS; + else + item = NR_FREE_PAGES; /* - * A special case here: + * When there is a high number of CPUs in the system, + * the cumulative error from the vmstat per-cpu cache + * can blur the line between the watermarks. In that + * case, be safe and get an accurate snapshot. * - * balance_pgdat() skips over all_unreclaimable after - * DEF_PRIORITY. Effectively, it considers them balanced so - * they must be considered balanced here as well! + * TODO: NR_FREE_PAGES_BLOCKS moves in steps of + * pageblock_nr_pages, while the vmstat pcp threshold + * is limited to 125. 
On many configurations that + * counter won't actually be per-cpu cached. But keep + * things simple for now; revisit when somebody cares. */ - if (zone->all_unreclaimable) { - balanced_pages += zone->managed_pages; - continue; - } + free_pages = zone_page_state(zone, item); + if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark) + free_pages = zone_page_state_snapshot(zone, item); - if (zone_balanced(zone, order, 0, i)) - balanced_pages += zone->managed_pages; - else if (!order) - return false; + if (__zone_watermark_ok(zone, order, mark, highest_zoneidx, + 0, free_pages)) + return true; } - if (order) - return balanced_pages >= (managed_pages >> 2); - else + /* + * If a node has no managed zone within highest_zoneidx, it does not + * need balancing by definition. This can happen if a zone-restricted + * allocation tries to wake a remote kswapd. + */ + if (mark == -1) return true; + + return false; +} + +/* Clear pgdat state for congested, dirty or under writeback. */ +static void clear_pgdat_congested(pg_data_t *pgdat) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); + + clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags); + clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); + clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } /* @@ -2735,228 +6840,225 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) * * Returns true if kswapd is ready to sleep */ -static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, - int classzone_idx) +static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, + int highest_zoneidx) { - /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ - if (remaining) - return false; - /* - * There is a potential race between when kswapd checks its watermarks - * and a process gets throttled. There is also a potential race if - * processes get throttled, kswapd wakes, a large process exits therby - * balancing the zones that causes kswapd to miss a wakeup. 
If kswapd - * is going to sleep, no process should be sleeping on pfmemalloc_wait - * so wake them now if necessary. If necessary, processes will wake - * kswapd and get throttled again + * The throttled processes are normally woken up in balance_pgdat() as + * soon as allow_direct_reclaim() is true. But there is a potential + * race between when kswapd checks the watermarks and a process gets + * throttled. There is also a potential race if processes get + * throttled, kswapd wakes, a large process exits thereby balancing the + * zones, which causes kswapd to exit balance_pgdat() before reaching + * the wake up checks. If kswapd is going to sleep, no process should + * be sleeping on pfmemalloc_wait, so wake them now if necessary. If + * the wake up is premature, processes will wake kswapd and get + * throttled again. The difference from wake ups in balance_pgdat() is + * that here we are under prepare_to_wait(). */ - if (waitqueue_active(&pgdat->pfmemalloc_wait)) { - wake_up(&pgdat->pfmemalloc_wait); - return false; + if (waitqueue_active(&pgdat->pfmemalloc_wait)) + wake_up_all(&pgdat->pfmemalloc_wait); + + /* Hopeless node, leave it to direct reclaim */ + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) + return true; + + if (pgdat_balanced(pgdat, order, highest_zoneidx)) { + clear_pgdat_congested(pgdat); + return true; } - return pgdat_balanced(pgdat, order, classzone_idx); + return false; } /* - * kswapd shrinks the zone by the number of pages required to reach - * the high watermark. + * kswapd shrinks a node of pages that are at or below the highest usable + * zone that is currently unbalanced. * * Returns true if kswapd scanned at least the requested number of pages to * reclaim or if the lack of progress was due to pages under writeback. * This is used to determine if the scanning priority needs to be raised. 
*/ -static bool kswapd_shrink_zone(struct zone *zone, - int classzone_idx, - struct scan_control *sc, - unsigned long lru_pages, - unsigned long *nr_attempted) -{ - unsigned long nr_slab; - int testorder = sc->order; - unsigned long balance_gap; - struct reclaim_state *reclaim_state = current->reclaim_state; - struct shrink_control shrink = { - .gfp_mask = sc->gfp_mask, - }; - bool lowmem_pressure; - - /* Reclaim above the high watermark. */ - sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); +static bool kswapd_shrink_node(pg_data_t *pgdat, + struct scan_control *sc) +{ + struct zone *zone; + int z; + unsigned long nr_reclaimed = sc->nr_reclaimed; - /* - * Kswapd reclaims only single pages with compaction enabled. Trying - * too hard to reclaim until contiguous free pages have become - * available can hurt performance by evicting too much useful data - * from memory. Do not reclaim more than needed for compaction. - */ - if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && - compaction_suitable(zone, sc->order) != - COMPACT_SKIPPED) - testorder = 0; + /* Reclaim a number of pages proportional to the number of zones */ + sc->nr_to_reclaim = 0; + for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) { + sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); + } /* - * We put equal pressure on every zone, unless one zone has way too - * many pages free already. The "too many pages" is defined as the - * high wmark plus a "gap" where the gap is either the low - * watermark or 1% of the zone, whichever is smaller. + * Historically care was taken to put equal pressure on all zones but + * now pressure is applied based on node LRU order. 
*/ - balance_gap = min(low_wmark_pages(zone), - (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); + shrink_node(pgdat, sc); /* - * If there is no low memory pressure or the zone is balanced then no - * reclaim is necessary + * Fragmentation may mean that the system cannot be rebalanced for + * high-order allocations. If twice the allocation size has been + * reclaimed then recheck watermarks only at order-0 to prevent + * excessive reclaim. Assume that a process requested a high-order + * can direct reclaim/compact. */ - lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); - if (!lowmem_pressure && zone_balanced(zone, testorder, - balance_gap, classzone_idx)) - return true; - - shrink_zone(zone, sc); - - reclaim_state->reclaimed_slab = 0; - nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); - sc->nr_reclaimed += reclaim_state->reclaimed_slab; - - /* Account for the number of pages attempted to reclaim */ - *nr_attempted += sc->nr_to_reclaim; + if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) + sc->order = 0; - if (nr_slab == 0 && !zone_reclaimable(zone)) - zone->all_unreclaimable = 1; + /* account for progress from mm_account_reclaimed_pages() */ + return max(sc->nr_scanned, sc->nr_reclaimed - nr_reclaimed) >= sc->nr_to_reclaim; +} - zone_clear_flag(zone, ZONE_WRITEBACK); +/* Page allocator PCP high watermark is lowered if reclaim is active. */ +static inline void +update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active) +{ + int i; + struct zone *zone; - /* - * If a zone reaches its high watermark, consider it to be no longer - * congested. It's possible there are dirty pages backed by congested - * BDIs but as pressure is relieved, speculatively avoid congestion - * waits. 
- */ - if (!zone->all_unreclaimable && - zone_balanced(zone, testorder, 0, classzone_idx)) { - zone_clear_flag(zone, ZONE_CONGESTED); - zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); + for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { + if (active) + set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); + else + clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); } +} + +static inline void +set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) +{ + update_reclaim_active(pgdat, highest_zoneidx, true); +} - return sc->nr_scanned >= sc->nr_to_reclaim; +static inline void +clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) +{ + update_reclaim_active(pgdat, highest_zoneidx, false); } /* - * For kswapd, balance_pgdat() will work across all this node's zones until - * they are all at high_wmark_pages(zone). - * - * Returns the final order kswapd was reclaiming at + * For kswapd, balance_pgdat() will reclaim pages across a node from zones + * that are eligible for use by the caller until at least one zone is + * balanced. * - * There is special handling here for zones which are full of pinned pages. - * This can happen if the pages are all mlocked, or if they are all used by - * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. - * What we do is to detect the case where all pages in the zone have been - * scanned twice and there has been zero successful reclaim. Mark the zone as - * dead and from now on, only perform a short scan. Basically we're polling - * the zone for when the problem goes away. + * Returns the order kswapd finished reclaiming at. * * kswapd scans the zones in the highmem->normal->dma direction. It skips * zones which have free_pages > high_wmark_pages(zone), but once a zone is - * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the - * lower zones regardless of the number of free pages in the lower zones. 
This - * interoperates with the page allocator fallback scheme to ensure that aging - * of pages is balanced across the zones. + * found to have free_pages <= high_wmark_pages(zone), any page in that zone + * or lower is eligible for reclaim until at least one usable zone is + * balanced. */ -static unsigned long balance_pgdat(pg_data_t *pgdat, int order, - int *classzone_idx) +static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; - int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + unsigned long pflags; + unsigned long nr_boost_reclaim; + unsigned long zone_boosts[MAX_NR_ZONES] = { 0, }; + bool boosted; + struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, - .priority = DEF_PRIORITY, - .may_unmap = 1, - .may_swap = 1, - .may_writepage = !laptop_mode, .order = order, - .target_mem_cgroup = NULL, + .may_unmap = 1, }; + + set_task_reclaim_state(current, &sc.reclaim_state); + psi_memstall_enter(&pflags); + __fs_reclaim_acquire(_THIS_IP_); + count_vm_event(PAGEOUTRUN); + /* + * Account for the reclaim boost. Note that the zone boost is left in + * place so that parallel allocations that are near the watermark will + * stall or direct reclaim until kswapd is finished. 
+ */ + nr_boost_reclaim = 0; + for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { + nr_boost_reclaim += zone->watermark_boost; + zone_boosts[i] = zone->watermark_boost; + } + boosted = nr_boost_reclaim; + +restart: + set_reclaim_active(pgdat, highest_zoneidx); + sc.priority = DEF_PRIORITY; do { - unsigned long lru_pages = 0; - unsigned long nr_attempted = 0; + unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; - bool pgdat_needs_compaction = (order > 0); + bool balanced; + bool ret; + bool was_frozen; - sc.nr_reclaimed = 0; + sc.reclaim_idx = highest_zoneidx; /* - * Scan in the highmem->dma direction for the highest - * zone which needs scanning + * If the number of buffer_heads exceeds the maximum allowed + * then consider reclaiming from all zones. This has a dual + * purpose -- on 64-bit systems it is expected that + * buffer_heads are stripped during active rotation. On 32-bit + * systems, highmem pages can pin lowmem memory and shrinking + * buffers can relieve lowmem pressure. Reclaim may still not + * go ahead if all eligible zones for the original allocation + * request are balanced to avoid excessive reclaim from kswapd. */ - for (i = pgdat->nr_zones - 1; i >= 0; i--) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - if (zone->all_unreclaimable && - sc.priority != DEF_PRIORITY) - continue; - - /* - * Do some background aging of the anon list, to give - * pages a chance to be referenced before reclaiming. - */ - age_active_anon(zone, &sc); + if (buffer_heads_over_limit) { + for (i = MAX_NR_ZONES - 1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!managed_zone(zone)) + continue; - /* - * If the number of buffer_heads in the machine - * exceeds the maximum allowed level and this node - * has a highmem zone, force kswapd to reclaim from - * it to relieve lowmem pressure. 
- */ - if (buffer_heads_over_limit && is_highmem_idx(i)) { - end_zone = i; + sc.reclaim_idx = i; break; } + } - if (!zone_balanced(zone, order, 0, 0)) { - end_zone = i; - break; - } else { - /* - * If balanced, clear the dirty and congested - * flags - */ - zone_clear_flag(zone, ZONE_CONGESTED); - zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); - } + /* + * If the pgdat is imbalanced then ignore boosting and preserve + * the watermarks for a later time and restart. Note that the + * zone watermarks will be still reset at the end of balancing + * on the grounds that the normal reclaim should be enough to + * re-evaluate if boosting is required when kswapd next wakes. + */ + balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx); + if (!balanced && nr_boost_reclaim) { + nr_boost_reclaim = 0; + goto restart; } - if (i < 0) + /* + * If boosting is not active then only reclaim if there are no + * eligible zones. Note that sc.reclaim_idx is not used as + * buffer_heads_over_limit may have adjusted it. + */ + if (!nr_boost_reclaim && balanced) goto out; - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; + /* Limit the priority of boosting to avoid reclaim writeback */ + if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2) + raise_priority = false; - lru_pages += zone_reclaimable_pages(zone); + /* + * Do not writeback or swap pages for boosted reclaim. The + * intent is to relieve pressure not issue sub-optimal IO + * from reclaim context. If no pages are reclaimed, the + * reclaim will be aborted. + */ + sc.may_writepage = !laptop_mode && !nr_boost_reclaim; + sc.may_swap = !nr_boost_reclaim; - /* - * If any zone is currently balanced then kswapd will - * not call compaction as it is expected that the - * necessary pages are already available. 
- */ - if (pgdat_needs_compaction && - zone_watermark_ok(zone, order, - low_wmark_pages(zone), - *classzone_idx, 0)) - pgdat_needs_compaction = false; - } + /* + * Do some background aging, to give pages a chance to be + * referenced before reclaiming. All pages are rotated + * regardless of classzone as this is about consistent aging. + */ + kswapd_age_node(pgdat, &sc); /* * If we're getting trouble reclaiming, start doing writepage @@ -2965,46 +7067,20 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (sc.priority < DEF_PRIORITY - 2) sc.may_writepage = 1; + /* Call soft limit reclaim before calling shrink_node. */ + sc.nr_scanned = 0; + nr_soft_scanned = 0; + nr_soft_reclaimed = memcg1_soft_limit_reclaim(pgdat, sc.order, + sc.gfp_mask, &nr_soft_scanned); + sc.nr_reclaimed += nr_soft_reclaimed; + /* - * Now scan the zone in the dma->highmem direction, stopping - * at the last zone which needs scanning. - * - * We do this because the page allocator works in the opposite - * direction. This prevents the page allocator from allocating - * pages behind kswapd's direction of progress, which would - * cause too much scanning of the lower zones. + * There should be no need to raise the scanning priority if + * enough pages are already being scanned that that high + * watermark would be met at 100% efficiency. */ - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - if (zone->all_unreclaimable && - sc.priority != DEF_PRIORITY) - continue; - - sc.nr_scanned = 0; - - nr_soft_scanned = 0; - /* - * Call soft limit reclaim before calling shrink_zone. - */ - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, - order, sc.gfp_mask, - &nr_soft_scanned); - sc.nr_reclaimed += nr_soft_reclaimed; - - /* - * There should be no need to raise the scanning - * priority if enough pages are already being scanned - * that that high watermark would be met at 100% - * efficiency. 
- */ - if (kswapd_shrink_zone(zone, end_zone, &sc, - lru_pages, &nr_attempted)) - raise_priority = false; - } + if (kswapd_shrink_node(pgdat, &sc)) + raise_priority = false; /* * If the low watermark is met there is no need for processes @@ -3012,52 +7088,109 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && - pfmemalloc_watermark_ok(pgdat)) - wake_up(&pgdat->pfmemalloc_wait); - - /* - * Fragmentation may mean that the system cannot be rebalanced - * for high-order allocations in all zones. If twice the - * allocation size has been reclaimed and the zones are still - * not balanced then recheck the watermarks at order-0 to - * prevent kswapd reclaiming excessively. Assume that a - * process requested a high-order can direct reclaim/compact. - */ - if (order && sc.nr_reclaimed >= 2UL << order) - order = sc.order = 0; + allow_direct_reclaim(pgdat)) + wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ - if (try_to_freeze() || kthread_should_stop()) + __fs_reclaim_release(_THIS_IP_); + ret = kthread_freezable_should_stop(&was_frozen); + __fs_reclaim_acquire(_THIS_IP_); + if (was_frozen || ret) break; /* - * Compact if necessary and kswapd is reclaiming at least the - * high watermark number of pages as requsted + * Raise priority if scanning rate is too low or there was no + * progress in reclaiming pages */ - if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) - compact_pgdat(pgdat, order); + nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed); /* - * Raise priority if scanning rate is too low or there was no - * progress in reclaiming pages + * If reclaim made no progress for a boost, stop reclaim as + * IO cannot be queued and it could be an infinite loop in + * extreme circumstances. 
*/ - if (raise_priority || !sc.nr_reclaimed) + if (nr_boost_reclaim && !nr_reclaimed) + break; + + if (raise_priority || !nr_reclaimed) sc.priority--; - } while (sc.priority >= 1 && - !pgdat_balanced(pgdat, order, *classzone_idx)); + } while (sc.priority >= 1); + + /* + * Restart only if it went through the priority loop all the way, + * but cache_trim_mode didn't work. + */ + if (!sc.nr_reclaimed && sc.priority < 1 && + !sc.no_cache_trim_mode && sc.cache_trim_mode_failed) { + sc.no_cache_trim_mode = 1; + goto restart; + } + + /* + * If the reclaim was boosted, we might still be far from the + * watermark_high at this point. We need to avoid increasing the + * failure count to prevent the kswapd thread from stopping. + */ + if (!sc.nr_reclaimed && !boosted) + atomic_inc(&pgdat->kswapd_failures); out: + clear_reclaim_active(pgdat, highest_zoneidx); + + /* If reclaim was boosted, account for the reclaim done in this pass */ + if (boosted) { + unsigned long flags; + + for (i = 0; i <= highest_zoneidx; i++) { + if (!zone_boosts[i]) + continue; + + /* Increments are under the zone lock */ + zone = pgdat->node_zones + i; + spin_lock_irqsave(&zone->lock, flags); + zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]); + spin_unlock_irqrestore(&zone->lock, flags); + } + + /* + * As there is now likely space, wakeup kcompact to defragment + * pageblocks. + */ + wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx); + } + + snapshot_refaults(NULL, pgdat); + __fs_reclaim_release(_THIS_IP_); + psi_memstall_leave(&pflags); + set_task_reclaim_state(current, NULL); + /* - * Return the order we were reclaiming at so prepare_kswapd_sleep() - * makes a decision on the order we were last reclaiming at. However, - * if another caller entered the allocator slow path while kswapd - * was awake, order will remain at the higher level + * Return the order kswapd stopped reclaiming at as + * prepare_kswapd_sleep() takes it into account. 
If another caller + * entered the allocator slow path while kswapd was awake, order will + * remain at the higher level. */ - *classzone_idx = end_zone; - return order; + return sc.order; +} + +/* + * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to + * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is + * not a valid index then either kswapd runs for first time or kswapd couldn't + * sleep after previous reclaim attempt (node is still unbalanced). In that + * case return the zone index of the previous kswapd reclaim cycle. + */ +static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, + enum zone_type prev_highest_zoneidx) +{ + enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); + + return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx; } -static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) +static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, + unsigned int highest_zoneidx) { long remaining = 0; DEFINE_WAIT(wait); @@ -3067,9 +7200,44 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); - /* Try to sleep for a short interval */ - if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { + /* + * Try to sleep for a short interval. Note that kcompactd will only be + * woken if it is possible to sleep for a short interval. This is + * deliberate on the assumption that if reclaim cannot keep an + * eligible zone balanced that it's also unlikely that compaction will + * succeed. + */ + if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { + /* + * Compaction records what page blocks it recently failed to + * isolate pages from and skips them in the future scanning. + * When kswapd is going to sleep, it is reasonable to assume + * that pages and compaction may succeed so reset the cache. 
+ */ + reset_isolation_suitable(pgdat); + + /* + * We have freed the memory, now we should compact it to make + * allocation of the requested order possible. + */ + wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx); + remaining = schedule_timeout(HZ/10); + + /* + * If woken prematurely then reset kswapd_highest_zoneidx and + * order. The values will either be from a wakeup request or + * the previous request that slept prematurely. + */ + if (remaining) { + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, + kswapd_highest_zoneidx(pgdat, + highest_zoneidx)); + + if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) + WRITE_ONCE(pgdat->kswapd_order, reclaim_order); + } + finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); } @@ -3078,7 +7246,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. */ - if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { + if (!remaining && + prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -3091,14 +7260,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) */ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); - /* - * Compaction records what page blocks it recently failed to - * isolate pages from and skips them in the future scanning. - * When kswapd is going to sleep, it is reasonable to assume - * that pages and compaction may succeed so reset the cache. 
- */ - reset_isolation_suitable(pgdat); - if (!kthread_should_stop()) schedule(); @@ -3127,24 +7288,11 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) */ static int kswapd(void *p) { - unsigned long order, new_order; - unsigned balanced_order; - int classzone_idx, new_classzone_idx; - int balanced_classzone_idx; - pg_data_t *pgdat = (pg_data_t*)p; + unsigned int alloc_order, reclaim_order; + unsigned int highest_zoneidx = MAX_NR_ZONES - 1; + pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; - struct reclaim_state reclaim_state = { - .reclaimed_slab = 0, - }; - const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - - lockdep_set_current_reclaim_state(GFP_KERNEL); - - if (!cpumask_empty(cpumask)) - set_cpus_allowed_ptr(tsk, cpumask); - current->reclaim_state = &reclaim_state; - /* * Tell the memory management that we're a "memory allocator", * and that if we need more memory we should get access to it @@ -3157,126 +7305,111 @@ static int kswapd(void *p) * us from recursively trying to free more memory as we're * trying to free the first piece of memory in the first place). 
*/ - tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + tsk->flags |= PF_MEMALLOC | PF_KSWAPD; set_freezable(); - order = new_order = 0; - balanced_order = 0; - classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; - balanced_classzone_idx = classzone_idx; + WRITE_ONCE(pgdat->kswapd_order, 0); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); + atomic_set(&pgdat->nr_writeback_throttled, 0); for ( ; ; ) { - bool ret; + bool was_frozen; - /* - * If the last balance_pgdat was unsuccessful it's unlikely a - * new request of a similar or harder type will succeed soon - * so consider going to sleep on the basis we reclaimed at - */ - if (balanced_classzone_idx >= new_classzone_idx && - balanced_order == new_order) { - new_order = pgdat->kswapd_max_order; - new_classzone_idx = pgdat->classzone_idx; - pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = pgdat->nr_zones - 1; - } + alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); + highest_zoneidx = kswapd_highest_zoneidx(pgdat, + highest_zoneidx); - if (order < new_order || classzone_idx > new_classzone_idx) { - /* - * Don't sleep if someone wants a larger 'order' - * allocation or has tigher zone constraints - */ - order = new_order; - classzone_idx = new_classzone_idx; - } else { - kswapd_try_to_sleep(pgdat, balanced_order, - balanced_classzone_idx); - order = pgdat->kswapd_max_order; - classzone_idx = pgdat->classzone_idx; - new_order = order; - new_classzone_idx = classzone_idx; - pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = pgdat->nr_zones - 1; - } - - ret = try_to_freeze(); - if (kthread_should_stop()) +kswapd_try_sleep: + kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, + highest_zoneidx); + + /* Read the new order and highest_zoneidx */ + alloc_order = READ_ONCE(pgdat->kswapd_order); + highest_zoneidx = kswapd_highest_zoneidx(pgdat, + highest_zoneidx); + WRITE_ONCE(pgdat->kswapd_order, 0); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); + + if 
(kthread_freezable_should_stop(&was_frozen)) break; /* * We can speed up thawing tasks if we don't call balance_pgdat * after returning from the refrigerator */ - if (!ret) { - trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balanced_classzone_idx = classzone_idx; - balanced_order = balance_pgdat(pgdat, order, - &balanced_classzone_idx); - } + if (was_frozen) + continue; + + /* + * Reclaim begins at the requested order but if a high-order + * reclaim fails then kswapd falls back to reclaiming for + * order-0. If that happens, kswapd will consider sleeping + * for the order it finished reclaiming at (reclaim_order) + * but kcompactd is woken to compact for the original + * request (alloc_order). + */ + trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, + alloc_order); + reclaim_order = balance_pgdat(pgdat, alloc_order, + highest_zoneidx); + if (reclaim_order < alloc_order) + goto kswapd_try_sleep; } - current->reclaim_state = NULL; + tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD); + return 0; } /* - * A zone is low on free memory, so wake its kswapd task to service it. + * A zone is low on free memory or too fragmented for high-order memory. If + * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's + * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim + * has failed or is not needed, still wake up kcompactd if only compaction is + * needed. 
*/ -void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) +void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, + enum zone_type highest_zoneidx) { pg_data_t *pgdat; + enum zone_type curr_idx; - if (!populated_zone(zone)) + if (!managed_zone(zone)) return; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - return; - pgdat = zone->zone_pgdat; - if (pgdat->kswapd_max_order < order) { - pgdat->kswapd_max_order = order; - pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); - } - if (!waitqueue_active(&pgdat->kswapd_wait)) + if (!cpuset_zone_allowed(zone, gfp_flags)) return; - if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) - return; - - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); - wake_up_interruptible(&pgdat->kswapd_wait); -} - -/* - * The reclaimable count would be mostly accurate. - * The less reclaimable pages may be - * - mlocked pages, which will be moved to unevictable list when encountered - * - mapped pages, which may require several travels to be reclaimed - * - dirty pages, which is not "instantly" reclaimable - */ -unsigned long global_reclaimable_pages(void) -{ - int nr; - nr = global_page_state(NR_ACTIVE_FILE) + - global_page_state(NR_INACTIVE_FILE); - - if (get_nr_swap_pages() > 0) - nr += global_page_state(NR_ACTIVE_ANON) + - global_page_state(NR_INACTIVE_ANON); + pgdat = zone->zone_pgdat; + curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); - return nr; -} + if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx) + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx); -unsigned long zone_reclaimable_pages(struct zone *zone) -{ - int nr; + if (READ_ONCE(pgdat->kswapd_order) < order) + WRITE_ONCE(pgdat->kswapd_order, order); - nr = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); + if (!waitqueue_active(&pgdat->kswapd_wait)) + return; - if (get_nr_swap_pages() > 0) - nr += zone_page_state(zone, 
NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON); + /* Hopeless node, leave it to direct reclaim if possible */ + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES || + (pgdat_balanced(pgdat, order, highest_zoneidx) && + !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { + /* + * There may be plenty of free memory available, but it's too + * fragmented for high-order allocations. Wake up kcompactd + * and rely on compaction_suitable() to determine if it's + * needed. If it fails, it will defer subsequent attempts to + * ratelimit its work. + */ + if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) + wakeup_kcompactd(pgdat, order, highest_zoneidx); + return; + } - return nr; + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order, + gfp_flags); + wake_up_interruptible(&pgdat->kswapd_wait); } #ifdef CONFIG_HIBERNATION @@ -3290,100 +7423,97 @@ unsigned long zone_reclaimable_pages(struct zone *zone) */ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { - struct reclaim_state reclaim_state; struct scan_control sc = { + .nr_to_reclaim = nr_to_reclaim, .gfp_mask = GFP_HIGHUSER_MOVABLE, - .may_swap = 1, - .may_unmap = 1, + .reclaim_idx = MAX_NR_ZONES - 1, + .priority = DEF_PRIORITY, .may_writepage = 1, - .nr_to_reclaim = nr_to_reclaim, + .may_unmap = 1, + .may_swap = 1, .hibernation_mode = 1, - .order = 0, - .priority = DEF_PRIORITY, - }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, }; struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); - struct task_struct *p = current; unsigned long nr_reclaimed; + unsigned int noreclaim_flag; - p->flags |= PF_MEMALLOC; - lockdep_set_current_reclaim_state(sc.gfp_mask); - reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; + fs_reclaim_acquire(sc.gfp_mask); + noreclaim_flag = memalloc_noreclaim_save(); + set_task_reclaim_state(current, &sc.reclaim_state); - nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + nr_reclaimed = 
do_try_to_free_pages(zonelist, &sc); - p->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); - p->flags &= ~PF_MEMALLOC; + set_task_reclaim_state(current, NULL); + memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(sc.gfp_mask); return nr_reclaimed; } #endif /* CONFIG_HIBERNATION */ -/* It's optimal to keep kswapds on the same CPUs as their memory, but - not required for correctness. So if the last cpu in a node goes - away, we get changed to run anywhere: as the first one comes back, - restore their cpu bindings. */ -static int cpu_callback(struct notifier_block *nfb, unsigned long action, - void *hcpu) -{ - int nid; - - if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { - for_each_node_state(nid, N_MEMORY) { - pg_data_t *pgdat = NODE_DATA(nid); - const struct cpumask *mask; - - mask = cpumask_of_node(pgdat->node_id); - - if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) - /* One of our CPUs online: restore mask */ - set_cpus_allowed_ptr(pgdat->kswapd, mask); - } - } - return NOTIFY_OK; -} - /* * This kswapd start function will be called by init and node-hot-add. - * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. 
*/ -int kswapd_run(int nid) +void __meminit kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); - int ret = 0; - - if (pgdat->kswapd) - return 0; - pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); - if (IS_ERR(pgdat->kswapd)) { - /* failure at boot is fatal */ - BUG_ON(system_state == SYSTEM_BOOTING); - pr_err("Failed to start kswapd on node %d\n", nid); - ret = PTR_ERR(pgdat->kswapd); - pgdat->kswapd = NULL; + pgdat_kswapd_lock(pgdat); + if (!pgdat->kswapd) { + pgdat->kswapd = kthread_create_on_node(kswapd, pgdat, nid, "kswapd%d", nid); + if (IS_ERR(pgdat->kswapd)) { + /* failure at boot is fatal */ + pr_err("Failed to start kswapd on node %d,ret=%ld\n", + nid, PTR_ERR(pgdat->kswapd)); + BUG_ON(system_state < SYSTEM_RUNNING); + pgdat->kswapd = NULL; + } else { + wake_up_process(pgdat->kswapd); + } } - return ret; + pgdat_kswapd_unlock(pgdat); } /* * Called by memory hotplug when all memory in a node is offlined. Caller must - * hold lock_memory_hotplug(). + * be holding mem_hotplug_begin/done(). 
*/ -void kswapd_stop(int nid) +void __meminit kswapd_stop(int nid) { - struct task_struct *kswapd = NODE_DATA(nid)->kswapd; + pg_data_t *pgdat = NODE_DATA(nid); + struct task_struct *kswapd; + pgdat_kswapd_lock(pgdat); + kswapd = pgdat->kswapd; if (kswapd) { kthread_stop(kswapd); - NODE_DATA(nid)->kswapd = NULL; + pgdat->kswapd = NULL; } + pgdat_kswapd_unlock(pgdat); } +static const struct ctl_table vmscan_sysctl_table[] = { + { + .procname = "swappiness", + .data = &vm_swappiness, + .maxlen = sizeof(vm_swappiness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO_HUNDRED, + }, +#ifdef CONFIG_NUMA + { + .procname = "zone_reclaim_mode", + .data = &node_reclaim_mode, + .maxlen = sizeof(node_reclaim_mode), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + } +#endif +}; + static int __init kswapd_init(void) { int nid; @@ -3391,7 +7521,7 @@ static int __init kswapd_init(void) swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); - hotcpu_notifier(cpu_callback, 0); + register_sysctl_init("vm", vmscan_sysctl_table); return 0; } @@ -3399,27 +7529,22 @@ module_init(kswapd_init) #ifdef CONFIG_NUMA /* - * Zone reclaim mode + * Node reclaim mode * - * If non-zero call zone_reclaim when the number of free pages falls below + * If non-zero call node_reclaim when the number of free pages falls below * the watermarks. */ -int zone_reclaim_mode __read_mostly; - -#define RECLAIM_OFF 0 -#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ -#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ -#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ +int node_reclaim_mode __read_mostly; /* - * Priority for ZONE_RECLAIM. This determines the fraction of pages + * Priority for NODE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of * a zone. 
*/ -#define ZONE_RECLAIM_PRIORITY 4 +#define NODE_RECLAIM_PRIORITY 4 /* - * Percentage of pages in a zone that must be unmapped for zone_reclaim to + * Percentage of pages in a zone that must be unmapped for node_reclaim to * occur. */ int sysctl_min_unmapped_ratio = 1; @@ -3430,11 +7555,11 @@ int sysctl_min_unmapped_ratio = 1; */ int sysctl_min_slab_ratio = 5; -static inline unsigned long zone_unmapped_file_pages(struct zone *zone) +static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat) { - unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); - unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + - zone_page_state(zone, NR_ACTIVE_FILE); + unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED); + unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) + + node_page_state(pgdat, NR_ACTIVE_FILE); /* * It's possible for there to be more file mapped pages than @@ -3445,25 +7570,27 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone) } /* Work out how many page cache pages we can reclaim in this reclaim_mode */ -static long zone_pagecache_reclaimable(struct zone *zone) +static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) { - long nr_pagecache_reclaimable; - long delta = 0; + unsigned long nr_pagecache_reclaimable; + unsigned long delta = 0; /* - * If RECLAIM_SWAP is set, then all file pages are considered + * If RECLAIM_UNMAP is set, then all file pages are considered * potentially reclaimable. 
Otherwise, we have to worry about - * pages like swapcache and zone_unmapped_file_pages() provides + * pages like swapcache and node_unmapped_file_pages() provides * a better estimate */ - if (zone_reclaim_mode & RECLAIM_SWAP) - nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); + if (node_reclaim_mode & RECLAIM_UNMAP) + nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES); else - nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); + nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat); - /* If we can't clean pages, remove dirty pages from consideration */ - if (!(zone_reclaim_mode & RECLAIM_WRITE)) - delta += zone_page_state(zone, NR_FILE_DIRTY); + /* + * Since we can't clean folios through reclaim, remove dirty file + * folios from consideration. + */ + delta += node_page_state(pgdat, NR_FILE_DIRTY); /* Watch for any possible underflows due to delta */ if (unlikely(delta > nr_pagecache_reclaimable)) @@ -3473,271 +7600,285 @@ static long zone_pagecache_reclaimable(struct zone *zone) } /* - * Try to free up some pages from this zone through reclaim. + * Try to free up some pages from this node through reclaim. 
*/ -static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, + unsigned long nr_pages, + struct scan_control *sc) { - /* Minimum pages needed in order to stay on node */ - const unsigned long nr_pages = 1 << order; struct task_struct *p = current; - struct reclaim_state reclaim_state; - struct scan_control sc = { - .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), - .may_swap = 1, - .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), - .order = order, - .priority = ZONE_RECLAIM_PRIORITY, - }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; - unsigned long nr_slab_pages0, nr_slab_pages1; + unsigned int noreclaim_flag; + unsigned long pflags; + + trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, sc->order, + sc->gfp_mask); cond_resched(); + psi_memstall_enter(&pflags); + delayacct_freepages_start(); + fs_reclaim_acquire(sc->gfp_mask); /* - * We need to be able to allocate from the reserves for RECLAIM_SWAP - * and we also need to be able to write out pages for RECLAIM_WRITE - * and RECLAIM_SWAP. + * We need to be able to allocate from the reserves for RECLAIM_UNMAP */ - p->flags |= PF_MEMALLOC | PF_SWAPWRITE; - lockdep_set_current_reclaim_state(gfp_mask); - reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; + noreclaim_flag = memalloc_noreclaim_save(); + set_task_reclaim_state(p, &sc->reclaim_state); - if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { + if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages || + node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) { /* - * Free memory by calling shrink zone with increasing + * Free memory by calling shrink node with increasing * priorities until we have enough memory freed. 
*/ do { - shrink_zone(zone, &sc); - } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); + shrink_node(pgdat, sc); + } while (sc->nr_reclaimed < nr_pages && --sc->priority >= 0); } - nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); - if (nr_slab_pages0 > zone->min_slab_pages) { - /* - * shrink_slab() does not currently allow us to determine how - * many pages were freed in this zone. So we take the current - * number of slab pages and shake the slab until it is reduced - * by the same nr_pages that we used for reclaiming unmapped - * pages. - * - * Note that shrink_slab will free memory on all zones and may - * take a long time. - */ - for (;;) { - unsigned long lru_pages = zone_reclaimable_pages(zone); - - /* No reclaimable slab or very low memory pressure */ - if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) - break; - - /* Freed enough memory */ - nr_slab_pages1 = zone_page_state(zone, - NR_SLAB_RECLAIMABLE); - if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) - break; - } + set_task_reclaim_state(p, NULL); + memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(sc->gfp_mask); + delayacct_freepages_end(); + psi_memstall_leave(&pflags); - /* - * Update nr_reclaimed by the number of slab pages we - * reclaimed from this zone. 
- */ - nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); - if (nr_slab_pages1 < nr_slab_pages0) - sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; - } + trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed); - p->reclaim_state = NULL; - current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); - lockdep_clear_current_reclaim_state(); - return sc.nr_reclaimed >= nr_pages; + return sc->nr_reclaimed; } -int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { - int node_id; int ret; + /* Minimum pages needed in order to stay on node */ + const unsigned long nr_pages = 1 << order; + struct scan_control sc = { + .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), + .gfp_mask = current_gfp_context(gfp_mask), + .order = order, + .priority = NODE_RECLAIM_PRIORITY, + .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), + .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), + .may_swap = 1, + .reclaim_idx = gfp_zone(gfp_mask), + }; /* - * Zone reclaim reclaims unmapped file backed pages and + * Node reclaim reclaims unmapped file backed pages and * slab pages if we are over the defined limits. * * A small portion of unmapped file backed pages is needed for * file I/O otherwise pages read by file I/O will be immediately - * thrown out if the zone is overallocated. So we do not reclaim - * if less than a specified percentage of the zone is used by + * thrown out if the node is overallocated. So we do not reclaim + * if less than a specified percentage of the node is used by * unmapped file backed pages. 
*/ - if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && - zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) - return ZONE_RECLAIM_FULL; - - if (zone->all_unreclaimable) - return ZONE_RECLAIM_FULL; + if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && + node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <= + pgdat->min_slab_pages) + return NODE_RECLAIM_FULL; /* * Do not scan if the allocation should not be delayed. */ - if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) - return ZONE_RECLAIM_NOSCAN; + if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) + return NODE_RECLAIM_NOSCAN; /* - * Only run zone reclaim on the local zone or on zones that do not + * Only run node reclaim on the local node or on nodes that do not * have associated processors. This will favor the local processor * over remote processors and spread off node memory allocations * as wide as possible. */ - node_id = zone_to_nid(zone); - if (node_state(node_id, N_CPU) && node_id != numa_node_id()) - return ZONE_RECLAIM_NOSCAN; + if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) + return NODE_RECLAIM_NOSCAN; - if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) - return ZONE_RECLAIM_NOSCAN; + if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) + return NODE_RECLAIM_NOSCAN; - ret = __zone_reclaim(zone, gfp_mask, order); - zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); + ret = __node_reclaim(pgdat, gfp_mask, nr_pages, &sc) >= nr_pages; + clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags); - if (!ret) + if (ret) + count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS); + else count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); return ret; } -#endif -/* - * page_evictable - test whether a page is evictable - * @page: the page to test - * - * Test whether page is evictable--i.e., should be placed on active/inactive - * lists vs unevictable list. 
- * - * Reasons page might not be evictable: - * (1) page's mapping marked unevictable - * (2) page is part of an mlocked VMA - * - */ -int page_evictable(struct page *page) +enum { + MEMORY_RECLAIM_SWAPPINESS = 0, + MEMORY_RECLAIM_SWAPPINESS_MAX, + MEMORY_RECLAIM_NULL, +}; +static const match_table_t tokens = { + { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"}, + { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"}, + { MEMORY_RECLAIM_NULL, NULL }, +}; + +int user_proactive_reclaim(char *buf, + struct mem_cgroup *memcg, pg_data_t *pgdat) { - return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); + unsigned int nr_retries = MAX_RECLAIM_RETRIES; + unsigned long nr_to_reclaim, nr_reclaimed = 0; + int swappiness = -1; + char *old_buf, *start; + substring_t args[MAX_OPT_ARGS]; + gfp_t gfp_mask = GFP_KERNEL; + + if (!buf || (!memcg && !pgdat) || (memcg && pgdat)) + return -EINVAL; + + buf = strstrip(buf); + + old_buf = buf; + nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; + if (buf == old_buf) + return -EINVAL; + + buf = strstrip(buf); + + while ((start = strsep(&buf, " ")) != NULL) { + if (!strlen(start)) + continue; + switch (match_token(start, tokens, args)) { + case MEMORY_RECLAIM_SWAPPINESS: + if (match_int(&args[0], &swappiness)) + return -EINVAL; + if (swappiness < MIN_SWAPPINESS || + swappiness > MAX_SWAPPINESS) + return -EINVAL; + break; + case MEMORY_RECLAIM_SWAPPINESS_MAX: + swappiness = SWAPPINESS_ANON_ONLY; + break; + default: + return -EINVAL; + } + } + + while (nr_reclaimed < nr_to_reclaim) { + /* Will converge on zero, but reclaim enforces a minimum */ + unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; + unsigned long reclaimed; + + if (signal_pending(current)) + return -EINTR; + + /* + * This is the final attempt, drain percpu lru caches in the + * hope of introducing more evictable pages. 
+ */ + if (!nr_retries) + lru_add_drain_all(); + + if (memcg) { + unsigned int reclaim_options; + + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | + MEMCG_RECLAIM_PROACTIVE; + reclaimed = try_to_free_mem_cgroup_pages(memcg, + batch_size, gfp_mask, + reclaim_options, + swappiness == -1 ? NULL : &swappiness); + } else { + struct scan_control sc = { + .gfp_mask = current_gfp_context(gfp_mask), + .reclaim_idx = gfp_zone(gfp_mask), + .proactive_swappiness = swappiness == -1 ? NULL : &swappiness, + .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, + .nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX), + .may_unmap = 1, + .may_swap = 1, + .proactive = 1, + }; + + if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, + &pgdat->flags)) + return -EBUSY; + + reclaimed = __node_reclaim(pgdat, gfp_mask, + batch_size, &sc); + clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags); + } + + if (!reclaimed && !nr_retries--) + return -EAGAIN; + + nr_reclaimed += reclaimed; + } + + return 0; } -#ifdef CONFIG_SHMEM +#endif + /** - * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list - * @pages: array of pages to check - * @nr_pages: number of pages to check - * - * Checks pages for evictability and moves them to the appropriate lru list. + * check_move_unevictable_folios - Move evictable folios to appropriate zone + * lru list + * @fbatch: Batch of lru folios to check. * - * This function is only used for SysV IPC SHM_UNLOCK. + * Checks folios for evictability, if an evictable folio is in the unevictable + * lru list, moves it to the appropriate evictable lru list. This function + * should be only used for lru folios. 
*/ -void check_move_unevictable_pages(struct page **pages, int nr_pages) +void check_move_unevictable_folios(struct folio_batch *fbatch) { - struct lruvec *lruvec; - struct zone *zone = NULL; + struct lruvec *lruvec = NULL; int pgscanned = 0; int pgrescued = 0; int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pages[i]; - struct zone *pagezone; + for (i = 0; i < fbatch->nr; i++) { + struct folio *folio = fbatch->folios[i]; + int nr_pages = folio_nr_pages(folio); - pgscanned++; - pagezone = page_zone(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } - lruvec = mem_cgroup_page_lruvec(page, zone); + pgscanned += nr_pages; - if (!PageLRU(page) || !PageUnevictable(page)) + /* block memcg migration while the folio moves between lrus */ + if (!folio_test_clear_lru(folio)) continue; - if (page_evictable(page)) { - enum lru_list lru = page_lru_base_type(page); - - VM_BUG_ON(PageActive(page)); - ClearPageUnevictable(page); - del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); - add_page_to_lru_list(page, lruvec, lru); - pgrescued++; + lruvec = folio_lruvec_relock_irq(folio, lruvec); + if (folio_evictable(folio) && folio_test_unevictable(folio)) { + lruvec_del_folio(lruvec, folio); + folio_clear_unevictable(folio); + lruvec_add_folio(lruvec, folio); + pgrescued += nr_pages; } + folio_set_lru(folio); } - if (zone) { + if (lruvec) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); - spin_unlock_irq(&zone->lru_lock); + unlock_page_lruvec_irq(lruvec); + } else if (pgscanned) { + count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); } } -#endif /* CONFIG_SHMEM */ +EXPORT_SYMBOL_GPL(check_move_unevictable_folios); -static void warn_scan_unevictable_pages(void) +#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) +static ssize_t reclaim_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { 
- printk_once(KERN_WARNING - "%s: The scan_unevictable_pages sysctl/node-interface has been " - "disabled for lack of a legitimate use case. If you have " - "one, please send an email to linux-mm@kvack.org.\n", - current->comm); -} + int ret, nid = dev->id; -/* - * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of - * all nodes' unevictable lists for evictable pages - */ -unsigned long scan_unevictable_pages; - -int scan_unevictable_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) -{ - warn_scan_unevictable_pages(); - proc_doulongvec_minmax(table, write, buffer, length, ppos); - scan_unevictable_pages = 0; - return 0; + ret = user_proactive_reclaim((char *)buf, NULL, NODE_DATA(nid)); + return ret ? -EAGAIN : count; } -#ifdef CONFIG_NUMA -/* - * per node 'scan_unevictable_pages' attribute. On demand re-scan of - * a specified node's per zone unevictable lists for evictable pages. - */ - -static ssize_t read_scan_unevictable_node(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - warn_scan_unevictable_pages(); - return sprintf(buf, "0\n"); /* always zero; should fit... 
*/ -} - -static ssize_t write_scan_unevictable_node(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - warn_scan_unevictable_pages(); - return 1; -} - - -static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, - read_scan_unevictable_node, - write_scan_unevictable_node); - -int scan_unevictable_register_node(struct node *node) +static DEVICE_ATTR_WO(reclaim); +int reclaim_register_node(struct node *node) { - return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages); + return device_create_file(&node->dev, &dev_attr_reclaim); } -void scan_unevictable_unregister_node(struct node *node) +void reclaim_unregister_node(struct node *node) { - device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages); + return device_remove_file(&node->dev, &dev_attr_reclaim); } #endif |
