Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--   mm/vmscan.c   3236
1 file changed, 1494 insertions(+), 1742 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c index 1080209a568b..900c74b6aa62 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -35,7 +35,6 @@ #include <linux/cpuset.h> #include <linux/compaction.h> #include <linux/notifier.h> -#include <linux/rwsem.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> @@ -57,6 +56,8 @@ #include <linux/khugepaged.h> #include <linux/rculist_nulls.h> #include <linux/random.h> +#include <linux/mmu_notifier.h> +#include <linux/parser.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -93,6 +94,9 @@ struct scan_control { unsigned long anon_cost; unsigned long file_cost; + /* Swappiness value for proactive reclaim. Always use sc_swappiness()! */ + int *proactive_swappiness; + /* Can active folios be deactivated as part of reclaim? */ #define DEACTIVATE_ANON 1 #define DEACTIVATE_FILE 2 @@ -109,7 +113,13 @@ struct scan_control { /* Can folios be swapped as part of reclaim? */ unsigned int may_swap:1; - /* Proactive reclaim invoked by userspace through memory.reclaim */ + /* Not allow cache_trim_mode to be turned on as part of reclaim? */ + unsigned int no_cache_trim_mode:1; + + /* Has cache_trim_mode failed at least once? */ + unsigned int cache_trim_mode_failed:1; + + /* Proactive reclaim invoked by userspace */ unsigned int proactive:1; /* @@ -123,6 +133,9 @@ struct scan_control { unsigned int memcg_low_reclaim:1; unsigned int memcg_low_skipped:1; + /* Shared cgroup tree walk failed, rescan the whole tree */ + unsigned int memcg_full_walk:1; + unsigned int hibernation_mode:1; /* One of the zones is ready for compaction */ @@ -184,250 +197,11 @@ struct scan_control { #endif /* - * From 0 .. 200. Higher means more swappy. + * From 0 .. MAX_SWAPPINESS. Higher means more swappy. */ int vm_swappiness = 60; -LIST_HEAD(shrinker_list); -DECLARE_RWSEM(shrinker_rwsem); - #ifdef CONFIG_MEMCG -static int shrinker_nr_max; - -/* The shrinker_info is expanded in a batch of BITS_PER_LONG */ -static inline int shrinker_map_size(int nr_items) -{ - return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); -} - -static inline int shrinker_defer_size(int nr_items) -{ - return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); -} - -static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, - int nid) -{ - return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, - lockdep_is_held(&shrinker_rwsem)); -} - -static int expand_one_shrinker_info(struct mem_cgroup *memcg, - int map_size, int defer_size, - int old_map_size, int old_defer_size, - int new_nr_max) -{ - struct shrinker_info *new, *old; - struct mem_cgroup_per_node *pn; - int nid; - int size = map_size + defer_size; - - for_each_node(nid) { - pn = memcg->nodeinfo[nid]; - old = shrinker_info_protected(memcg, nid); - /* Not yet online memcg */ - if (!old) - return 0; - - /* Already expanded this shrinker_info */ - if (new_nr_max <= old->map_nr_max) - continue; - - new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); - if (!new) - return -ENOMEM; - - new->nr_deferred = (atomic_long_t *)(new + 1); - new->map = (void *)new->nr_deferred + defer_size; - new->map_nr_max = new_nr_max; - - /* map: set all old bits, clear all new bits */ - memset(new->map, (int)0xff, old_map_size); - memset((void *)new->map + old_map_size, 0, map_size - old_map_size); - /* nr_deferred: copy old values, clear all new values */ - memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); - memset((void *)new->nr_deferred + old_defer_size, 0, - defer_size - old_defer_size); - 
- rcu_assign_pointer(pn->shrinker_info, new); - kvfree_rcu(old, rcu); - } - - return 0; -} - -void free_shrinker_info(struct mem_cgroup *memcg) -{ - struct mem_cgroup_per_node *pn; - struct shrinker_info *info; - int nid; - - for_each_node(nid) { - pn = memcg->nodeinfo[nid]; - info = rcu_dereference_protected(pn->shrinker_info, true); - kvfree(info); - rcu_assign_pointer(pn->shrinker_info, NULL); - } -} - -int alloc_shrinker_info(struct mem_cgroup *memcg) -{ - struct shrinker_info *info; - int nid, size, ret = 0; - int map_size, defer_size = 0; - - down_write(&shrinker_rwsem); - map_size = shrinker_map_size(shrinker_nr_max); - defer_size = shrinker_defer_size(shrinker_nr_max); - size = map_size + defer_size; - for_each_node(nid) { - info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); - if (!info) { - free_shrinker_info(memcg); - ret = -ENOMEM; - break; - } - info->nr_deferred = (atomic_long_t *)(info + 1); - info->map = (void *)info->nr_deferred + defer_size; - info->map_nr_max = shrinker_nr_max; - rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); - } - up_write(&shrinker_rwsem); - - return ret; -} - -static int expand_shrinker_info(int new_id) -{ - int ret = 0; - int new_nr_max = round_up(new_id + 1, BITS_PER_LONG); - int map_size, defer_size = 0; - int old_map_size, old_defer_size = 0; - struct mem_cgroup *memcg; - - if (!root_mem_cgroup) - goto out; - - lockdep_assert_held(&shrinker_rwsem); - - map_size = shrinker_map_size(new_nr_max); - defer_size = shrinker_defer_size(new_nr_max); - old_map_size = shrinker_map_size(shrinker_nr_max); - old_defer_size = shrinker_defer_size(shrinker_nr_max); - - memcg = mem_cgroup_iter(NULL, NULL, NULL); - do { - ret = expand_one_shrinker_info(memcg, map_size, defer_size, - old_map_size, old_defer_size, - new_nr_max); - if (ret) { - mem_cgroup_iter_break(NULL, memcg); - goto out; - } - } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); -out: - if (!ret) - shrinker_nr_max = new_nr_max; - - return ret; -} - -void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) -{ - if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { - struct shrinker_info *info; - - rcu_read_lock(); - info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); - if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { - /* Pairs with smp mb in shrink_slab() */ - smp_mb__before_atomic(); - set_bit(shrinker_id, info->map); - } - rcu_read_unlock(); - } -} - -static DEFINE_IDR(shrinker_idr); - -static int prealloc_memcg_shrinker(struct shrinker *shrinker) -{ - int id, ret = -ENOMEM; - - if (mem_cgroup_disabled()) - return -ENOSYS; - - down_write(&shrinker_rwsem); - /* This may call shrinker, so it must use down_read_trylock() */ - id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); - if (id < 0) - goto unlock; - - if (id >= shrinker_nr_max) { - if (expand_shrinker_info(id)) { - idr_remove(&shrinker_idr, id); - goto unlock; - } - } - shrinker->id = id; - ret = 0; -unlock: - up_write(&shrinker_rwsem); - return ret; -} - -static void unregister_memcg_shrinker(struct shrinker *shrinker) -{ - int id = shrinker->id; - - BUG_ON(id < 0); - - lockdep_assert_held(&shrinker_rwsem); - - idr_remove(&shrinker_idr, id); -} - -static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, - struct mem_cgroup *memcg) -{ - struct shrinker_info *info; - - info = shrinker_info_protected(memcg, nid); - return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); -} - -static long add_nr_deferred_memcg(long nr, int nid, struct shrinker 
*shrinker, - struct mem_cgroup *memcg) -{ - struct shrinker_info *info; - - info = shrinker_info_protected(memcg, nid); - return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); -} - -void reparent_shrinker_deferred(struct mem_cgroup *memcg) -{ - int i, nid; - long nr; - struct mem_cgroup *parent; - struct shrinker_info *child_info, *parent_info; - - parent = parent_mem_cgroup(memcg); - if (!parent) - parent = root_mem_cgroup; - - /* Prevent from concurrent shrinker_info expand */ - down_read(&shrinker_rwsem); - for_each_node(nid) { - child_info = shrinker_info_protected(memcg, nid); - parent_info = shrinker_info_protected(parent, nid); - for (i = 0; i < child_info->map_nr_max; i++) { - nr = atomic_long_read(&child_info->nr_deferred[i]); - atomic_long_add(nr, &parent_info->nr_deferred[i]); - } - } - up_read(&shrinker_rwsem); -} /* Returns true for reclaim through cgroup limits or cgroup interfaces. */ static bool cgroup_reclaim(struct scan_control *sc) @@ -467,28 +241,14 @@ static bool writeback_throttling_sane(struct scan_control *sc) #endif return false; } -#else -static int prealloc_memcg_shrinker(struct shrinker *shrinker) -{ - return -ENOSYS; -} - -static void unregister_memcg_shrinker(struct shrinker *shrinker) -{ -} - -static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, - struct mem_cgroup *memcg) -{ - return 0; -} -static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, - struct mem_cgroup *memcg) +static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) { - return 0; + if (sc->proactive && sc->proactive_swappiness) + return *sc->proactive_swappiness; + return mem_cgroup_swappiness(memcg); } - +#else static bool cgroup_reclaim(struct scan_control *sc) { return false; @@ -503,8 +263,32 @@ static bool writeback_throttling_sane(struct scan_control *sc) { return true; } + +static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) +{ + return READ_ONCE(vm_swappiness); +} #endif +/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to + * and including the specified highidx + * @zone: The current zone in the iterator + * @pgdat: The pgdat which node_zones are being iterated + * @idx: The index variable + * @highidx: The index of the highest zone to return + * + * This macro iterates through all managed zones up to and including the specified highidx. + * The zone iterator enters an invalid state after macro call and must be reinitialized + * before it can be used again. 
+ */ +#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \ + for ((idx) = 0, (zone) = (pgdat)->node_zones; \ + (idx) <= (highidx); \ + (idx)++, (zone)++) \ + if (!managed_zone(zone)) \ + continue; \ + else + static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) { @@ -557,49 +341,22 @@ static void flush_reclaim_state(struct scan_control *sc) } } -static long xchg_nr_deferred(struct shrinker *shrinker, - struct shrink_control *sc) -{ - int nid = sc->nid; - - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) - nid = 0; - - if (sc->memcg && - (shrinker->flags & SHRINKER_MEMCG_AWARE)) - return xchg_nr_deferred_memcg(nid, shrinker, - sc->memcg); - - return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); -} - - -static long add_nr_deferred(long nr, struct shrinker *shrinker, - struct shrink_control *sc) +static bool can_demote(int nid, struct scan_control *sc, + struct mem_cgroup *memcg) { - int nid = sc->nid; - - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) - nid = 0; - - if (sc->memcg && - (shrinker->flags & SHRINKER_MEMCG_AWARE)) - return add_nr_deferred_memcg(nr, nid, shrinker, - sc->memcg); - - return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); -} + int demotion_nid; -static bool can_demote(int nid, struct scan_control *sc) -{ if (!numa_demotion_enabled) return false; if (sc && sc->no_demotion) return false; - if (next_demotion_node(nid) == NUMA_NO_NODE) + + demotion_nid = next_demotion_node(nid); + if (demotion_nid == NUMA_NO_NODE) return false; - return true; + /* If demotion node isn't in the cgroup's mems_allowed, fall back */ + return mem_cgroup_node_allowed(memcg, demotion_nid); } static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, @@ -624,7 +381,7 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, * * Can it be reclaimed from this node via demotion? */ - return can_demote(nid, sc); + return can_demote(nid, sc, memcg); } /* @@ -656,13 +413,9 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, { unsigned long size = 0; int zid; + struct zone *zone; - for (zid = 0; zid <= zone_idx; zid++) { - struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; - - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, lruvec_pgdat(lruvec), zid, zone_idx) { if (!mem_cgroup_disabled()) size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); else @@ -671,413 +424,6 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, return size; } -/* - * Add a shrinker callback to be called from the vm. - */ -static int __prealloc_shrinker(struct shrinker *shrinker) -{ - unsigned int size; - int err; - - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - err = prealloc_memcg_shrinker(shrinker); - if (err != -ENOSYS) - return err; - - shrinker->flags &= ~SHRINKER_MEMCG_AWARE; - } - - size = sizeof(*shrinker->nr_deferred); - if (shrinker->flags & SHRINKER_NUMA_AWARE) - size *= nr_node_ids; - - shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); - if (!shrinker->nr_deferred) - return -ENOMEM; - - return 0; -} - -#ifdef CONFIG_SHRINKER_DEBUG -int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) 
-{ - va_list ap; - int err; - - va_start(ap, fmt); - shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); - va_end(ap); - if (!shrinker->name) - return -ENOMEM; - - err = __prealloc_shrinker(shrinker); - if (err) { - kfree_const(shrinker->name); - shrinker->name = NULL; - } - - return err; -} -#else -int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - return __prealloc_shrinker(shrinker); -} -#endif - -void free_prealloced_shrinker(struct shrinker *shrinker) -{ -#ifdef CONFIG_SHRINKER_DEBUG - kfree_const(shrinker->name); - shrinker->name = NULL; -#endif - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - down_write(&shrinker_rwsem); - unregister_memcg_shrinker(shrinker); - up_write(&shrinker_rwsem); - return; - } - - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; -} - -void register_shrinker_prepared(struct shrinker *shrinker) -{ - down_write(&shrinker_rwsem); - list_add_tail(&shrinker->list, &shrinker_list); - shrinker->flags |= SHRINKER_REGISTERED; - shrinker_debugfs_add(shrinker); - up_write(&shrinker_rwsem); -} - -static int __register_shrinker(struct shrinker *shrinker) -{ - int err = __prealloc_shrinker(shrinker); - - if (err) - return err; - register_shrinker_prepared(shrinker); - return 0; -} - -#ifdef CONFIG_SHRINKER_DEBUG -int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - va_list ap; - int err; - - va_start(ap, fmt); - shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); - va_end(ap); - if (!shrinker->name) - return -ENOMEM; - - err = __register_shrinker(shrinker); - if (err) { - kfree_const(shrinker->name); - shrinker->name = NULL; - } - return err; -} -#else -int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - return __register_shrinker(shrinker); -} -#endif -EXPORT_SYMBOL(register_shrinker); - -/* - * Remove one - */ -void unregister_shrinker(struct shrinker *shrinker) -{ - struct dentry *debugfs_entry; - int debugfs_id; - - if (!(shrinker->flags & SHRINKER_REGISTERED)) - return; - - down_write(&shrinker_rwsem); - list_del(&shrinker->list); - shrinker->flags &= ~SHRINKER_REGISTERED; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - unregister_memcg_shrinker(shrinker); - debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); - up_write(&shrinker_rwsem); - - shrinker_debugfs_remove(debugfs_entry, debugfs_id); - - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; -} -EXPORT_SYMBOL(unregister_shrinker); - -/** - * synchronize_shrinkers - Wait for all running shrinkers to complete. - * - * This is equivalent to calling unregister_shrink() and register_shrinker(), - * but atomically and with less overhead. This is useful to guarantee that all - * shrinker invocations have seen an update, before freeing memory, similar to - * rcu. - */ -void synchronize_shrinkers(void) -{ - down_write(&shrinker_rwsem); - up_write(&shrinker_rwsem); -} -EXPORT_SYMBOL(synchronize_shrinkers); - -#define SHRINK_BATCH 128 - -static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, - struct shrinker *shrinker, int priority) -{ - unsigned long freed = 0; - unsigned long long delta; - long total_scan; - long freeable; - long nr; - long new_nr; - long batch_size = shrinker->batch ? 
shrinker->batch - : SHRINK_BATCH; - long scanned = 0, next_deferred; - - freeable = shrinker->count_objects(shrinker, shrinkctl); - if (freeable == 0 || freeable == SHRINK_EMPTY) - return freeable; - - /* - * copy the current shrinker scan count into a local variable - * and zero it so that other concurrent shrinker invocations - * don't also do this scanning work. - */ - nr = xchg_nr_deferred(shrinker, shrinkctl); - - if (shrinker->seeks) { - delta = freeable >> priority; - delta *= 4; - do_div(delta, shrinker->seeks); - } else { - /* - * These objects don't require any IO to create. Trim - * them aggressively under memory pressure to keep - * them from causing refetches in the IO caches. - */ - delta = freeable / 2; - } - - total_scan = nr >> priority; - total_scan += delta; - total_scan = min(total_scan, (2 * freeable)); - - trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, - freeable, delta, total_scan, priority); - - /* - * Normally, we should not scan less than batch_size objects in one - * pass to avoid too frequent shrinker calls, but if the slab has less - * than batch_size objects in total and we are really tight on memory, - * we will try to reclaim all available objects, otherwise we can end - * up failing allocations although there are plenty of reclaimable - * objects spread over several slabs with usage less than the - * batch_size. - * - * We detect the "tight on memory" situations by looking at the total - * number of objects we want to scan (total_scan). If it is greater - * than the total number of objects on slab (freeable), we must be - * scanning at high prio and therefore should try to reclaim as much as - * possible. - */ - while (total_scan >= batch_size || - total_scan >= freeable) { - unsigned long ret; - unsigned long nr_to_scan = min(batch_size, total_scan); - - shrinkctl->nr_to_scan = nr_to_scan; - shrinkctl->nr_scanned = nr_to_scan; - ret = shrinker->scan_objects(shrinker, shrinkctl); - if (ret == SHRINK_STOP) - break; - freed += ret; - - count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); - total_scan -= shrinkctl->nr_scanned; - scanned += shrinkctl->nr_scanned; - - cond_resched(); - } - - /* - * The deferred work is increased by any new work (delta) that wasn't - * done, decreased by old deferred work that was done now. - * - * And it is capped to two times of the freeable items. - */ - next_deferred = max_t(long, (nr + delta - scanned), 0); - next_deferred = min(next_deferred, (2 * freeable)); - - /* - * move the unused scan count back into the shrinker in a - * manner that handles concurrent updates. 
- */ - new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); - - trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); - return freed; -} - -#ifdef CONFIG_MEMCG -static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, - struct mem_cgroup *memcg, int priority) -{ - struct shrinker_info *info; - unsigned long ret, freed = 0; - int i; - - if (!mem_cgroup_online(memcg)) - return 0; - - if (!down_read_trylock(&shrinker_rwsem)) - return 0; - - info = shrinker_info_protected(memcg, nid); - if (unlikely(!info)) - goto unlock; - - for_each_set_bit(i, info->map, info->map_nr_max) { - struct shrink_control sc = { - .gfp_mask = gfp_mask, - .nid = nid, - .memcg = memcg, - }; - struct shrinker *shrinker; - - shrinker = idr_find(&shrinker_idr, i); - if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { - if (!shrinker) - clear_bit(i, info->map); - continue; - } - - /* Call non-slab shrinkers even though kmem is disabled */ - if (!memcg_kmem_online() && - !(shrinker->flags & SHRINKER_NONSLAB)) - continue; - - ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) { - clear_bit(i, info->map); - /* - * After the shrinker reported that it had no objects to - * free, but before we cleared the corresponding bit in - * the memcg shrinker map, a new object might have been - * added. To make sure, we have the bit set in this - * case, we invoke the shrinker one more time and reset - * the bit if it reports that it is not empty anymore. - * The memory barrier here pairs with the barrier in - * set_shrinker_bit(): - * - * list_lru_add() shrink_slab_memcg() - * list_add_tail() clear_bit() - * <MB> <MB> - * set_bit() do_shrink_slab() - */ - smp_mb__after_atomic(); - ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) - ret = 0; - else - set_shrinker_bit(memcg, nid, i); - } - freed += ret; - - if (rwsem_is_contended(&shrinker_rwsem)) { - freed = freed ? : 1; - break; - } - } -unlock: - up_read(&shrinker_rwsem); - return freed; -} -#else /* CONFIG_MEMCG */ -static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, - struct mem_cgroup *memcg, int priority) -{ - return 0; -} -#endif /* CONFIG_MEMCG */ - -/** - * shrink_slab - shrink slab caches - * @gfp_mask: allocation context - * @nid: node whose slab caches to target - * @memcg: memory cgroup whose slab caches to target - * @priority: the reclaim priority - * - * Call the shrink functions to age shrinkable caches. - * - * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, - * unaware shrinkers will receive a node id of 0 instead. - * - * @memcg specifies the memory cgroup to target. Unaware shrinkers - * are called only if it is the root cgroup. - * - * @priority is sc->priority, we take the number of objects and >> by priority - * in order to get the scan target. - * - * Returns the number of reclaimed slab objects. - */ -static unsigned long shrink_slab(gfp_t gfp_mask, int nid, - struct mem_cgroup *memcg, - int priority) -{ - unsigned long ret, freed = 0; - struct shrinker *shrinker; - - /* - * The root memcg might be allocated even though memcg is disabled - * via "cgroup_disable=memory" boot parameter. This could make - * mem_cgroup_is_root() return false, then just run memcg slab - * shrink, but skip global shrink. This may result in premature - * oom. 
- */ - if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) - return shrink_slab_memcg(gfp_mask, nid, memcg, priority); - - if (!down_read_trylock(&shrinker_rwsem)) - goto out; - - list_for_each_entry(shrinker, &shrinker_list, list) { - struct shrink_control sc = { - .gfp_mask = gfp_mask, - .nid = nid, - .memcg = memcg, - }; - - ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) - ret = 0; - freed += ret; - /* - * Bail out if someone want to register a new shrinker to - * prevent the registration from being stalled for long periods - * by parallel ongoing shrinking. - */ - if (rwsem_is_contended(&shrinker_rwsem)) { - freed = freed ? : 1; - break; - } - } - - up_read(&shrinker_rwsem); -out: - cond_resched(); - return freed; -} - static unsigned long drop_slab_node(int nid) { unsigned long freed = 0; @@ -1108,35 +454,29 @@ void drop_slab(void) } while ((freed >> shift++) > 1); } -static int reclaimer_offset(void) +#define CHECK_RECLAIMER_OFFSET(type) \ + do { \ + BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \ + PGDEMOTE_##type - PGDEMOTE_KSWAPD); \ + BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \ + PGSCAN_##type - PGSCAN_KSWAPD); \ + } while (0) + +static int reclaimer_offset(struct scan_control *sc) { - BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != - PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD); - BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != - PGSCAN_DIRECT - PGSCAN_KSWAPD); - BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != - PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD); - BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != - PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD); + CHECK_RECLAIMER_OFFSET(DIRECT); + CHECK_RECLAIMER_OFFSET(KHUGEPAGED); + CHECK_RECLAIMER_OFFSET(PROACTIVE); if (current_is_kswapd()) return 0; if (current_is_khugepaged()) return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD; + if (sc->proactive) + return PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD; return PGSTEAL_DIRECT - PGSTEAL_KSWAPD; } -static inline int is_page_cache_freeable(struct folio *folio) -{ - /* - * A freeable page cache folio is referenced only by the caller - * that isolated the folio, the page cache and optional filesystem - * private data at folio->private. - */ - return folio_ref_count(folio) - folio_test_private(folio) == - 1 + folio_nr_pages(folio); -} - /* * We detected a synchronous write error writing a folio out. Probably * -ENOSPC. We need to propagate that into the address_space for a subsequent @@ -1162,12 +502,12 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) { int reclaimable = 0, write_pending = 0; int i; - + struct zone *zone; /* * If kswapd is disabled, reschedule if necessary but do not * throttle as the system is likely near OOM. */ - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) return true; /* @@ -1175,12 +515,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) * throttle as throttling will occur when the folios cycle * towards the end of the LRU if still under writeback. 
*/ - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - 1) { reclaimable += zone_reclaimable_pages(zone); write_pending += zone_page_state_snapshot(zone, NR_ZONE_WRITE_PENDING); @@ -1298,78 +633,69 @@ typedef enum { PAGE_CLEAN, } pageout_t; +static pageout_t writeout(struct folio *folio, struct address_space *mapping, + struct swap_iocb **plug, struct list_head *folio_list) +{ + int res; + + folio_set_reclaim(folio); + + /* + * The large shmem folio can be split if CONFIG_THP_SWAP is not enabled + * or we failed to allocate contiguous swap entries, in which case + * the split out folios get added back to folio_list. + */ + if (shmem_mapping(mapping)) + res = shmem_writeout(folio, plug, folio_list); + else + res = swap_writeout(folio, plug); + + if (res < 0) + handle_write_error(mapping, folio, res); + if (res == AOP_WRITEPAGE_ACTIVATE) { + folio_clear_reclaim(folio); + return PAGE_ACTIVATE; + } + + /* synchronous write? */ + if (!folio_test_writeback(folio)) + folio_clear_reclaim(folio); + + trace_mm_vmscan_write_folio(folio); + node_stat_add_folio(folio, NR_VMSCAN_WRITE); + return PAGE_SUCCESS; +} + /* * pageout is called by shrink_folio_list() for each dirty folio. - * Calls ->writepage(). */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, - struct swap_iocb **plug) + struct swap_iocb **plug, struct list_head *folio_list) { /* - * If the folio is dirty, only perform writeback if that write - * will be non-blocking. To prevent this allocation from being - * stalled by pagecache activity. But note that there may be - * stalls if we need to run get_block(). We could test - * PagePrivate for that. - * - * If this process is currently in __generic_file_write_iter() against - * this folio's queue, we can perform writeback even if that - * will block. + * We no longer attempt to writeback filesystem folios here, other + * than tmpfs/shmem. That's taken care of in page-writeback. + * If we find a dirty filesystem folio at the end of the LRU list, + * typically that means the filesystem is saturating the storage + * with contiguous writes and telling it to write a folio here + * would only make the situation worse by injecting an element + * of random access. * * If the folio is swapcache, write it back even if that would * block, for some throttling. This happens by accident, because * swap_backing_dev_info is bust: it doesn't reflect the * congestion state of the swapdevs. Easy to fix, if needed. + * + * A freeable shmem or swapcache folio is referenced only by the + * caller that isolated the folio and the page cache. */ - if (!is_page_cache_freeable(folio)) + if (folio_ref_count(folio) != 1 + folio_nr_pages(folio) || !mapping) return PAGE_KEEP; - if (!mapping) { - /* - * Some data journaling orphaned folios can have - * folio->mapping == NULL while being dirty with clean buffers. 
- */ - if (folio_test_private(folio)) { - if (try_to_free_buffers(folio)) { - folio_clear_dirty(folio); - pr_info("%s: orphaned folio\n", __func__); - return PAGE_CLEAN; - } - } - return PAGE_KEEP; - } - if (mapping->a_ops->writepage == NULL) + if (!shmem_mapping(mapping) && !folio_test_anon(folio)) return PAGE_ACTIVATE; - - if (folio_clear_dirty_for_io(folio)) { - int res; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .nr_to_write = SWAP_CLUSTER_MAX, - .range_start = 0, - .range_end = LLONG_MAX, - .for_reclaim = 1, - .swap_plug = plug, - }; - - folio_set_reclaim(folio); - res = mapping->a_ops->writepage(&folio->page, &wbc); - if (res < 0) - handle_write_error(mapping, folio, res); - if (res == AOP_WRITEPAGE_ACTIVATE) { - folio_clear_reclaim(folio); - return PAGE_ACTIVATE; - } - - if (!folio_test_writeback(folio)) { - /* synchronous write or broken a_ops? */ - folio_clear_reclaim(folio); - } - trace_mm_vmscan_write_folio(folio); - node_stat_add_folio(folio, NR_VMSCAN_WRITE); - return PAGE_SUCCESS; - } - - return PAGE_CLEAN; + if (!folio_clear_dirty_for_io(folio)) + return PAGE_CLEAN; + return writeout(folio, mapping, plug, folio_list); } /* @@ -1381,13 +707,18 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, { int refcount; void *shadow = NULL; + struct swap_cluster_info *ci; BUG_ON(!folio_test_locked(folio)); BUG_ON(mapping != folio_mapping(folio)); - if (!folio_test_swapcache(folio)) + if (folio_test_swapcache(folio)) { + ci = swap_cluster_get_and_lock_irq(folio); + } else { spin_lock(&mapping->host->i_lock); - xa_lock_irq(&mapping->i_pages); + xa_lock_irq(&mapping->i_pages); + } + /* * The non racy check for a busy folio. * @@ -1423,13 +754,13 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, } if (folio_test_swapcache(folio)) { - swp_entry_t swap = folio_swap_entry(folio); + swp_entry_t swap = folio->swap; if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); - __delete_from_swap_cache(folio, swap, shadow); - mem_cgroup_swapout(folio, swap); - xa_unlock_irq(&mapping->i_pages); + __swap_cache_del_folio(ci, folio, swap, shadow); + memcg1_swapout(folio, swap); + swap_cluster_unlock_irq(ci); put_swap_folio(folio, swap); } else { void (*free_folio)(struct folio *); @@ -1457,7 +788,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, __filemap_remove_folio(folio, shadow); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); if (free_folio) @@ -1467,9 +798,12 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, return 1; cannot_free: - xa_unlock_irq(&mapping->i_pages); - if (!folio_test_swapcache(folio)) + if (folio_test_swapcache(folio)) { + swap_cluster_unlock_irq(ci); + } else { + xa_unlock_irq(&mapping->i_pages); spin_unlock(&mapping->host->i_lock); + } return 0; } @@ -1521,15 +855,39 @@ enum folio_references { FOLIOREF_ACTIVATE, }; +#ifdef CONFIG_LRU_GEN +/* + * Only used on a mapped folio in the eviction (rmap walk) path, where promotion + * needs to be done by taking the folio off the LRU list and then adding it back + * with PG_active set. In contrast, the aging (page table walk) path uses + * folio_update_gen(). 
+ */ +static bool lru_gen_set_refs(struct folio *folio) +{ + /* see the comment on LRU_REFS_FLAGS */ + if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); + return false; + } + + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset)); + return true; +} +#else +static bool lru_gen_set_refs(struct folio *folio) +{ + return false; +} +#endif /* CONFIG_LRU_GEN */ + static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { int referenced_ptes, referenced_folio; - unsigned long vm_flags; + vm_flags_t vm_flags; referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, &vm_flags); - referenced_folio = folio_test_clear_referenced(folio); /* * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. @@ -1538,10 +896,24 @@ static enum folio_references folio_check_references(struct folio *folio, if (vm_flags & VM_LOCKED) return FOLIOREF_ACTIVATE; - /* rmap lock contention: rotate */ + /* + * There are two cases to consider. + * 1) Rmap lock contention: rotate. + * 2) Skip the non-shared swapbacked folio mapped solely by + * the exiting or OOM-reaped process. + */ if (referenced_ptes == -1) return FOLIOREF_KEEP; + if (lru_gen_enabled()) { + if (!referenced_ptes) + return FOLIOREF_RECLAIM; + + return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP; + } + + referenced_folio = folio_test_clear_referenced(folio); + if (referenced_ptes) { /* * All mapped folios start out with page table @@ -1659,10 +1031,11 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, * When this happens, 'page' will likely just be discarded * instead of migrated. */ - .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN | + .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOMEMALLOC | GFP_NOWAIT, .nid = target_nid, - .nmask = &allowed_mask + .nmask = &allowed_mask, + .reason = MR_DEMOTION, }; if (list_empty(demote_folios)) @@ -1678,8 +1051,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); - __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded); - return nr_succeeded; } @@ -1704,19 +1075,21 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) */ static unsigned int shrink_folio_list(struct list_head *folio_list, struct pglist_data *pgdat, struct scan_control *sc, - struct reclaim_stat *stat, bool ignore_references) + struct reclaim_stat *stat, bool ignore_references, + struct mem_cgroup *memcg) { + struct folio_batch free_folios; LIST_HEAD(ret_folios); - LIST_HEAD(free_folios); LIST_HEAD(demote_folios); - unsigned int nr_reclaimed = 0; + unsigned int nr_reclaimed = 0, nr_demoted = 0; unsigned int pgactivate = 0; bool do_demote_pass; struct swap_iocb *plug = NULL; + folio_batch_init(&free_folios); memset(stat, 0, sizeof(*stat)); cond_resched(); - do_demote_pass = can_demote(pgdat->node_id, sc); + do_demote_pass = can_demote(pgdat->node_id, sc, memcg); retry: while (!list_empty(folio_list)) { @@ -1734,6 +1107,21 @@ retry: if (!folio_trylock(folio)) goto keep; + if (folio_contain_hwpoisoned_page(folio)) { + /* + * unmap_poisoned_folio() can't handle large + * folio, just skip it. memory_failure() will + * handle it if the UCE is triggered again. 
+ */ + if (folio_test_large(folio)) + goto keep_locked; + + unmap_poisoned_folio(folio, folio_pfn(folio), false); + folio_unlock(folio); + folio_put(folio); + continue; + } + VM_BUG_ON_FOLIO(folio_test_active(folio), folio); nr_pages = folio_nr_pages(folio); @@ -1747,11 +1135,6 @@ retry: if (!sc->may_unmap && folio_mapped(folio)) goto keep_locked; - /* folio_update_gen() tried to promote this page? */ - if (lru_gen_enabled() && !ignore_references && - folio_mapped(folio) && folio_test_referenced(folio)) - goto keep_locked; - /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing @@ -1792,8 +1175,10 @@ retry: * 2) Global or new memcg reclaim encounters a folio that is * not marked for immediate reclaim, or the caller does not * have __GFP_FS (or __GFP_IO if it's simply going to swap, - * not to fs). In this case mark the folio for immediate - * reclaim and continue scanning. + * not to fs), or the folio belongs to a mapping where + * waiting on writeback during reclaim may lead to a deadlock. + * In this case mark the folio for immediate reclaim and + * continue scanning. * * Require may_enter_fs() because we would wait on fs, which * may not have submitted I/O yet. And the loop driver might @@ -1818,6 +1203,8 @@ retry: * takes to write them to disk. */ if (folio_test_writeback(folio)) { + mapping = folio_mapping(folio); + /* Case 1 above */ if (current_is_kswapd() && folio_test_reclaim(folio) && @@ -1828,7 +1215,9 @@ retry: /* Case 2 above */ } else if (writeback_throttling_sane(sc) || !folio_test_reclaim(folio) || - !may_enter_fs(folio, sc->gfp_mask)) { + !may_enter_fs(folio, sc->gfp_mask) || + (mapping && + mapping_writeback_may_deadlock_on_reclaim(mapping))) { /* * This is slightly racy - * folio_end_writeback() might have @@ -1895,37 +1284,49 @@ retry: goto keep_locked; if (folio_test_large(folio)) { /* cannot split folio, skip it */ - if (!can_split_folio(folio, NULL)) + if (!can_split_folio(folio, 1, NULL)) goto activate_locked; /* - * Split folios without a PMD map right - * away. Chances are some or all of the - * tail pages can be freed without IO. + * Split partially mapped folios right away. + * We can free the unmapped pages without IO. */ - if (!folio_entire_mapcount(folio) && - split_folio_to_list(folio, - folio_list)) + if (data_race(!list_empty(&folio->_deferred_list) && + folio_test_partially_mapped(folio)) && + split_folio_to_list(folio, folio_list)) goto activate_locked; } - if (!add_to_swap(folio)) { + if (folio_alloc_swap(folio)) { + int __maybe_unused order = folio_order(folio); + if (!folio_test_large(folio)) goto activate_locked_split; /* Fallback to swap normal pages */ - if (split_folio_to_list(folio, - folio_list)) + if (split_folio_to_list(folio, folio_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - count_vm_event(THP_SWPOUT_FALLBACK); + if (nr_pages >= HPAGE_PMD_NR) { + count_memcg_folio_events(folio, + THP_SWPOUT_FALLBACK, 1); + count_vm_event(THP_SWPOUT_FALLBACK); + } #endif - if (!add_to_swap(folio)) + count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); + if (folio_alloc_swap(folio)) goto activate_locked_split; } + /* + * Normally the folio will be dirtied in unmap because its + * pte should be dirty. A special case is MADV_FREE page. The + * page's pte could have dirty bit cleared but the folio's + * SwapBacked flag is still set because clearing the dirty bit + * and SwapBacked flag has no lock protected. 
For such folio, + * unmap will not set dirty bit for it, so folio reclaim will + * not write the folio out. This can cause data corruption when + * the folio is swapped in later. Always setting the dirty flag + * for the folio solves the problem. + */ + folio_mark_dirty(folio); } - } else if (folio_test_swapbacked(folio) && - folio_test_large(folio)) { - /* Split shmem folio */ - if (split_folio_to_list(folio, folio_list)) - goto keep_locked; } /* @@ -1948,6 +1349,20 @@ retry: if (folio_test_pmd_mappable(folio)) flags |= TTU_SPLIT_HUGE_PMD; + /* + * Without TTU_SYNC, try_to_unmap will only begin to + * hold PTL from the first present PTE within a large + * folio. Some initial PTEs might be skipped due to + * races with parallel PTE writes in which PTEs can be + * cleared temporarily before being written new present + * values. This will lead to a large folio is still + * mapped while some subpages have been partially + * unmapped after try_to_unmap; TTU_SYNC helps + * try_to_unmap acquire PTL from the first PTE, + * eliminating the influence of temporary PTE values. + */ + if (folio_test_large(folio)) + flags |= TTU_SYNC; try_to_unmap(folio, flags); if (folio_mapped(folio)) { @@ -1971,21 +1386,7 @@ retry: mapping = folio_mapping(folio); if (folio_test_dirty(folio)) { - /* - * Only kswapd can writeback filesystem folios - * to avoid risk of stack overflow. But avoid - * injecting inefficient single-folio I/O into - * flusher writeback as much as possible: only - * write folios when we've encountered many - * dirty folios, and when we've already scanned - * the rest of the LRU for clean folios and see - * the same dirty folios again (with the reclaim - * flag set). - */ - if (folio_is_file_lru(folio) && - (!current_is_kswapd() || - !folio_test_reclaim(folio) || - !test_bit(PGDAT_DIRTY, &pgdat->flags))) { + if (folio_is_file_lru(folio)) { /* * Immediately reclaim when written back. * Similar in principle to folio_deactivate() @@ -1994,7 +1395,8 @@ retry: */ node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, nr_pages); - folio_set_reclaim(folio); + if (!folio_test_reclaim(folio)) + folio_set_reclaim(folio); goto activate_locked; } @@ -2012,12 +1414,25 @@ retry: * starts and then write it out here. */ try_to_unmap_flush_dirty(); - switch (pageout(folio, mapping, &plug)) { + switch (pageout(folio, mapping, &plug, folio_list)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: + /* + * If shmem folio is split when writeback to swap, + * the tail pages will make their own pass through + * this function and be accounted then. + */ + if (nr_pages > 1 && !folio_test_large(folio)) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } goto activate_locked; case PAGE_SUCCESS: + if (nr_pages > 1 && !folio_test_large(folio)) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } stat->nr_pageout += nr_pages; if (folio_test_writeback(folio)) @@ -2064,7 +1479,7 @@ retry: * (refcount == 1) it can be freed. Otherwise, leave * the folio on the LRU so it is swappable. */ - if (folio_has_private(folio)) { + if (folio_needs_release(folio)) { if (!filemap_release_folio(folio, sc->gfp_mask)) goto activate_locked; if (!mapping && folio_ref_count(folio) == 1) { @@ -2111,14 +1526,12 @@ free_it: */ nr_reclaimed += nr_pages; - /* - * Is there need to periodically free_folio_list? 
It would - * appear not as the counts should be low - */ - if (unlikely(folio_test_large(folio))) - destroy_large_folio(folio); - else - list_add(&folio->lru, &free_folios); + folio_unqueue_deferred_split(folio); + if (folio_batch_add(&free_folios, folio) == 0) { + mem_cgroup_uncharge_folios(&free_folios); + try_to_unmap_flush(); + free_unref_folios(&free_folios); + } continue; activate_locked_split: @@ -2152,7 +1565,9 @@ keep: /* 'folio_list' is always empty here */ /* Migrate folios selected for demotion */ - nr_reclaimed += demote_folio_list(&demote_folios, pgdat); + nr_demoted = demote_folio_list(&demote_folios, pgdat); + nr_reclaimed += nr_demoted; + stat->nr_demoted += nr_demoted; /* Folios that could not be demoted are still in @demote_folios */ if (!list_empty(&demote_folios)) { /* Folios which weren't demoted go back on @folio_list */ @@ -2182,9 +1597,9 @@ keep: pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; - mem_cgroup_uncharge_list(&free_folios); + mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); - free_unref_page_list(&free_folios); + free_unref_folios(&free_folios); list_splice(&ret_folios, folio_list); count_vm_events(PGACTIVATE, pgactivate); @@ -2208,9 +1623,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, unsigned int noreclaim_flag; list_for_each_entry_safe(folio, next, folio_list, lru) { + /* TODO: these pages should not even appear in this list. */ + if (page_has_movable_ops(&folio->page)) + continue; if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) && - !folio_test_dirty(folio) && !__folio_test_movable(folio) && - !folio_test_unevictable(folio)) { + !folio_test_dirty(folio) && !folio_test_unevictable(folio)) { folio_clear_active(folio); list_move(&folio->lru, &clean_folios); } @@ -2224,7 +1641,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, */ noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, - &stat, true); + &stat, true, NULL); memalloc_noreclaim_restore(noreclaim_flag); list_splice(&clean_folios, folio_list); @@ -2261,25 +1678,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, } -#ifdef CONFIG_CMA -/* - * It is waste of effort to scan and reclaim CMA pages if it is not available - * for current allocation context. Kswapd can not be enrolled as it can not - * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL - */ -static bool skip_cma(struct folio *folio, struct scan_control *sc) -{ - return !current_is_kswapd() && - gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE && - get_pageblock_migratetype(&folio->page) == MIGRATE_CMA; -} -#else -static bool skip_cma(struct folio *folio, struct scan_control *sc) -{ - return false; -} -#endif - /* * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. 
* @@ -2310,12 +1708,11 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, unsigned long nr_taken = 0; unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; - unsigned long skipped = 0; - unsigned long scan, total_scan, nr_pages; + unsigned long skipped = 0, total_scan = 0, scan = 0; + unsigned long nr_pages; + unsigned long max_nr_skipped = 0; LIST_HEAD(folios_skipped); - total_scan = 0; - scan = 0; while (scan < nr_to_scan && !list_empty(src)) { struct list_head *move_to = src; struct folio *folio; @@ -2326,10 +1723,12 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, nr_pages = folio_nr_pages(folio); total_scan += nr_pages; - if (folio_zonenum(folio) > sc->reclaim_idx || - skip_cma(folio, sc)) { + /* Using max_nr_skipped to prevent hard LOCKUP*/ + if (max_nr_skipped < SWAP_CLUSTER_MAX_SKIPPED && + (folio_zonenum(folio) > sc->reclaim_idx)) { nr_skipped[folio_zonenum(folio)] += nr_pages; move_to = &folios_skipped; + max_nr_skipped++; goto move; } @@ -2389,8 +1788,7 @@ move: } *nr_scanned = total_scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, - total_scan, skipped, nr_taken, - sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru); + total_scan, skipped, nr_taken, lru); update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } @@ -2444,17 +1842,17 @@ bool folio_isolate_lru(struct folio *folio) * the LRU list will go small and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM. */ -static int too_many_isolated(struct pglist_data *pgdat, int file, +static bool too_many_isolated(struct pglist_data *pgdat, int file, struct scan_control *sc) { unsigned long inactive, isolated; bool too_many; if (current_is_kswapd()) - return 0; + return false; if (!writeback_throttling_sane(sc)) - return 0; + return false; if (file) { inactive = node_page_state(pgdat, NR_INACTIVE_FILE); @@ -2483,7 +1881,6 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, /* * move_folios_to_lru() moves folios from private @list to appropriate LRU list. - * On return, @list is reused as a list of folios to be freed by the caller. * * Returns the number of pages moved to the given lruvec. */ @@ -2491,8 +1888,9 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, struct list_head *list) { int nr_pages, nr_moved = 0; - LIST_HEAD(folios_to_free); + struct folio_batch free_folios; + folio_batch_init(&free_folios); while (!list_empty(list)) { struct folio *folio = lru_to_folio(list); @@ -2521,12 +1919,13 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, if (unlikely(folio_put_testzero(folio))) { __folio_clear_lru_flags(folio); - if (unlikely(folio_test_large(folio))) { + folio_unqueue_deferred_split(folio); + if (folio_batch_add(&free_folios, folio) == 0) { spin_unlock_irq(&lruvec->lru_lock); - destroy_large_folio(folio); + mem_cgroup_uncharge_folios(&free_folios); + free_unref_folios(&free_folios); spin_lock_irq(&lruvec->lru_lock); - } else - list_add(&folio->lru, &folios_to_free); + } continue; } @@ -2543,10 +1942,12 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, workingset_age_nonresident(lruvec, nr_pages); } - /* - * To save our caller's stack, now use input list for pages to free. 
- */ - list_splice(&folios_to_free, list); + if (free_folios.nr) { + spin_unlock_irq(&lruvec->lru_lock); + mem_cgroup_uncharge_folios(&free_folios); + free_unref_folios(&free_folios); + spin_lock_irq(&lruvec->lru_lock); + } return nr_moved; } @@ -2600,10 +2001,10 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - item = PGSCAN_KSWAPD + reclaimer_offset(); + item = PGSCAN_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); - __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); __count_vm_events(PGSCAN_ANON + file, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); @@ -2611,22 +2012,23 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); + nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false, + lruvec_memcg(lruvec)); spin_lock_irq(&lruvec->lru_lock); move_folios_to_lru(lruvec, &folio_list); + mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), + stat.nr_demoted); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - item = PGSTEAL_KSWAPD + reclaimer_offset(); + item = PGSTEAL_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); - __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); + count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); - spin_unlock_irq(&lruvec->lru_lock); - lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); - mem_cgroup_uncharge_list(&folio_list); - free_unref_page_list(&folio_list); + lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout, + nr_scanned - nr_reclaimed); /* * If dirty folios are scanned that are not queued for IO, it @@ -2692,13 +2094,13 @@ static void shrink_active_list(unsigned long nr_to_scan, { unsigned long nr_taken; unsigned long nr_scanned; - unsigned long vm_flags; + vm_flags_t vm_flags; LIST_HEAD(l_hold); /* The folios which were snipped off */ LIST_HEAD(l_active); LIST_HEAD(l_inactive); unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; - int file = is_file_lru(lru); + bool file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); lru_add_drain(); @@ -2712,7 +2114,7 @@ static void shrink_active_list(unsigned long nr_to_scan, if (!cgroup_reclaim(sc)) __count_vm_events(PGREFILL, nr_scanned); - __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); @@ -2729,9 +2131,9 @@ static void shrink_active_list(unsigned long nr_to_scan, } if (unlikely(buffer_heads_over_limit)) { - if (folio_test_private(folio) && folio_trylock(folio)) { - if (folio_test_private(folio)) - filemap_release_folio(folio, 0); + if (folio_needs_release(folio) && + folio_trylock(folio)) { + filemap_release_folio(folio, 0); folio_unlock(folio); } } @@ -2767,19 +2169,13 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_activate = move_folios_to_lru(lruvec, &l_active); nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); - /* Keep all free folios in l_active list */ - list_splice(&l_inactive, &l_active); __count_vm_events(PGDEACTIVATE, nr_deactivate); - __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, 
nr_deactivate); + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(&lruvec->lru_lock); - if (nr_rotated) - lru_note_cost(lruvec, file, 0, nr_rotated); - mem_cgroup_uncharge_list(&l_active); - free_unref_page_list(&l_active); + lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } @@ -2787,7 +2183,7 @@ static void shrink_active_list(unsigned long nr_to_scan, static unsigned int reclaim_folio_list(struct list_head *folio_list, struct pglist_data *pgdat) { - struct reclaim_stat dummy_stat; + struct reclaim_stat stat; unsigned int nr_reclaimed; struct folio *folio; struct scan_control sc = { @@ -2798,12 +2194,13 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list, .no_demotion = 1, }; - nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false); + nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL); while (!list_empty(folio_list)) { folio = lru_to_folio(folio_list); list_del(&folio->lru); folio_putback_lru(folio); } + trace_mm_vmscan_reclaim_pages(pgdat->node_id, sc.nr_scanned, nr_reclaimed, &stat); return nr_reclaimed; } @@ -2909,7 +2306,7 @@ enum scan_balance { SCAN_FILE, }; -static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) +static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) { unsigned long file; struct lruvec *target_lruvec; @@ -2920,10 +2317,11 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); /* - * Flush the memory cgroup stats, so that we read accurate per-memcg - * lruvec stats for heuristics. + * Flush the memory cgroup stats in rate-limited way as we don't need + * most accurate stats here. We may switch to regular stats flushing + * in the future once it is cheap enough. */ - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats_ratelimited(sc->target_mem_cgroup); /* * Determine the scan balance between anon and file LRUs. @@ -2969,7 +2367,8 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) * anonymous pages. */ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); - if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) + if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE) && + !sc->no_cache_trim_mode) sc->cache_trim_mode = 1; else sc->cache_trim_mode = 0; @@ -2987,17 +2386,13 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) unsigned long total_high_wmark = 0; unsigned long free, anon; int z; + struct zone *zone; free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); file = node_page_state(pgdat, NR_ACTIVE_FILE) + node_page_state(pgdat, NR_INACTIVE_FILE); - for (z = 0; z < MAX_NR_ZONES; z++) { - struct zone *zone = &pgdat->node_zones[z]; - - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - 1) { total_high_wmark += high_wmark_pages(zone); } @@ -3015,6 +2410,106 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) } } +static inline void calculate_pressure_balance(struct scan_control *sc, + int swappiness, u64 *fraction, u64 *denominator) +{ + unsigned long anon_cost, file_cost, total_cost; + unsigned long ap, fp; + + /* + * Calculate the pressure balance between anon and file pages. 
+ * + * The amount of pressure we put on each LRU is inversely + * proportional to the cost of reclaiming each list, as + * determined by the share of pages that are refaulting, times + * the relative IO cost of bringing back a swapped out + * anonymous page vs reloading a filesystem page (swappiness). + * + * Although we limit that influence to ensure no list gets + * left behind completely: at least a third of the pressure is + * applied, before swappiness. + * + * With swappiness at 100, anon and file have equal IO cost. + */ + total_cost = sc->anon_cost + sc->file_cost; + anon_cost = total_cost + sc->anon_cost; + file_cost = total_cost + sc->file_cost; + total_cost = anon_cost + file_cost; + + ap = swappiness * (total_cost + 1); + ap /= anon_cost + 1; + + fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1); + fp /= file_cost + 1; + + fraction[WORKINGSET_ANON] = ap; + fraction[WORKINGSET_FILE] = fp; + *denominator = ap + fp; +} + +static unsigned long apply_proportional_protection(struct mem_cgroup *memcg, + struct scan_control *sc, unsigned long scan) +{ + unsigned long min, low; + + mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low); + + if (min || low) { + /* + * Scale a cgroup's reclaim pressure by proportioning + * its current usage to its memory.low or memory.min + * setting. + * + * This is important, as otherwise scanning aggression + * becomes extremely binary -- from nothing as we + * approach the memory protection threshold, to totally + * nominal as we exceed it. This results in requiring + * setting extremely liberal protection thresholds. It + * also means we simply get no protection at all if we + * set it too low, which is not ideal. + * + * If there is any protection in place, we reduce scan + * pressure by how much of the total memory used is + * within protection thresholds. + * + * There is one special case: in the first reclaim pass, + * we skip over all groups that are within their low + * protection. If that fails to reclaim enough pages to + * satisfy the reclaim goal, we come back and override + * the best-effort low protection. However, we still + * ideally want to honor how well-behaved groups are in + * that case instead of simply punishing them all + * equally. As such, we reclaim them based on how much + * memory they are using, reducing the scan pressure + * again by how much of the total memory used is under + * hard protection. + */ + unsigned long cgroup_size = mem_cgroup_size(memcg); + unsigned long protection; + + /* memory.low scaling, make sure we retry before OOM */ + if (!sc->memcg_low_reclaim && low > min) { + protection = low; + sc->memcg_low_skipped = 1; + } else { + protection = min; + } + + /* Avoid TOCTOU with earlier protection check */ + cgroup_size = max(cgroup_size, protection); + + scan -= scan * protection / (cgroup_size + 1); + + /* + * Minimally target SWAP_CLUSTER_MAX pages to keep + * reclaim moving forwards, avoiding decrementing + * sc->priority further than desirable. + */ + scan = max(scan, SWAP_CLUSTER_MAX); + } + return scan; +} + /* * Determine how aggressively the anon and file LRU lists should be * scanned. 
@@ -3027,12 +2522,10 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, { struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct mem_cgroup *memcg = lruvec_memcg(lruvec); - unsigned long anon_cost, file_cost, total_cost; - int swappiness = mem_cgroup_swappiness(memcg); + int swappiness = sc_swappiness(sc, memcg); u64 fraction[ANON_AND_FILE]; u64 denominator = 0; /* gcc */ enum scan_balance scan_balance; - unsigned long ap, fp; enum lru_list lru; /* If we have no swap space, do not bother scanning anon folios. */ @@ -3053,6 +2546,13 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, goto out; } + /* Proactive reclaim initiated by userspace for anonymous memory only */ + if (swappiness == SWAPPINESS_ANON_ONLY) { + WARN_ON_ONCE(!sc->proactive); + scan_balance = SCAN_ANON; + goto out; + } + /* * Do not apply any pressure balancing cleverness when the * system is close to OOM, scan both anon and file equally @@ -3073,7 +2573,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, /* * If there is enough inactive page cache, we do not reclaim - * anything from the anonymous working right now. + * anything from the anonymous working right now to make sure + * a streaming file access pattern doesn't cause swapping. */ if (sc->cache_trim_mode) { scan_balance = SCAN_FILE; @@ -3081,103 +2582,16 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, } scan_balance = SCAN_FRACT; - /* - * Calculate the pressure balance between anon and file pages. - * - * The amount of pressure we put on each LRU is inversely - * proportional to the cost of reclaiming each list, as - * determined by the share of pages that are refaulting, times - * the relative IO cost of bringing back a swapped out - * anonymous page vs reloading a filesystem page (swappiness). - * - * Although we limit that influence to ensure no list gets - * left behind completely: at least a third of the pressure is - * applied, before swappiness. - * - * With swappiness at 100, anon and file have equal IO cost. - */ - total_cost = sc->anon_cost + sc->file_cost; - anon_cost = total_cost + sc->anon_cost; - file_cost = total_cost + sc->file_cost; - total_cost = anon_cost + file_cost; - - ap = swappiness * (total_cost + 1); - ap /= anon_cost + 1; + calculate_pressure_balance(sc, swappiness, fraction, &denominator); - fp = (200 - swappiness) * (total_cost + 1); - fp /= file_cost + 1; - - fraction[0] = ap; - fraction[1] = fp; - denominator = ap + fp; out: for_each_evictable_lru(lru) { - int file = is_file_lru(lru); + bool file = is_file_lru(lru); unsigned long lruvec_size; - unsigned long low, min; unsigned long scan; lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - mem_cgroup_protection(sc->target_mem_cgroup, memcg, - &min, &low); - - if (min || low) { - /* - * Scale a cgroup's reclaim pressure by proportioning - * its current usage to its memory.low or memory.min - * setting. - * - * This is important, as otherwise scanning aggression - * becomes extremely binary -- from nothing as we - * approach the memory protection threshold, to totally - * nominal as we exceed it. This results in requiring - * setting extremely liberal protection thresholds. It - * also means we simply get no protection at all if we - * set it too low, which is not ideal. - * - * If there is any protection in place, we reduce scan - * pressure by how much of the total memory used is - * within protection thresholds. 
- * - * There is one special case: in the first reclaim pass, - * we skip over all groups that are within their low - * protection. If that fails to reclaim enough pages to - * satisfy the reclaim goal, we come back and override - * the best-effort low protection. However, we still - * ideally want to honor how well-behaved groups are in - * that case instead of simply punishing them all - * equally. As such, we reclaim them based on how much - * memory they are using, reducing the scan pressure - * again by how much of the total memory used is under - * hard protection. - */ - unsigned long cgroup_size = mem_cgroup_size(memcg); - unsigned long protection; - - /* memory.low scaling, make sure we retry before OOM */ - if (!sc->memcg_low_reclaim && low > min) { - protection = low; - sc->memcg_low_skipped = 1; - } else { - protection = min; - } - - /* Avoid TOCTOU with earlier protection check */ - cgroup_size = max(cgroup_size, protection); - - scan = lruvec_size - lruvec_size * protection / - (cgroup_size + 1); - - /* - * Minimally target SWAP_CLUSTER_MAX pages to keep - * reclaim moving forwards, avoiding decrementing - * sc->priority further than desirable. - */ - scan = max(scan, SWAP_CLUSTER_MAX); - } else { - scan = lruvec_size; - } - + scan = apply_proportional_protection(memcg, sc, lruvec_size); scan >>= sc->priority; /* @@ -3223,7 +2637,7 @@ out: * Anonymous LRU management is a waste if there is * ultimately no way to reclaim the memory. */ -static bool can_age_anon_pages(struct pglist_data *pgdat, +static bool can_age_anon_pages(struct lruvec *lruvec, struct scan_control *sc) { /* Aging the anon LRU is valuable if swap is present: */ @@ -3231,7 +2645,8 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, return true; /* Also valuable if anon pages can be demoted: */ - return can_demote(pgdat->node_id, sc); + return can_demote(lruvec_pgdat(lruvec)->node_id, sc, + lruvec_memcg(lruvec)); } #ifdef CONFIG_LRU_GEN @@ -3258,8 +2673,6 @@ static bool should_clear_pmd_young(void) * shorthand helpers ******************************************************************************/ -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) - #define DEFINE_MAX_SEQ(lruvec) \ unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) @@ -3269,11 +2682,21 @@ static bool should_clear_pmd_young(void) READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ } +/* Get the min/max evictable type based on swappiness */ +#define min_type(swappiness) (!(swappiness)) +#define max_type(swappiness) ((swappiness) < SWAPPINESS_ANON_ONLY) + +#define evictable_min_seq(min_seq, swappiness) \ + min((min_seq)[min_type(swappiness)], (min_seq)[max_type(swappiness)]) + #define for_each_gen_type_zone(gen, type, zone) \ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) +#define for_each_evictable_type(type, swappiness) \ + for ((type) = min_type(swappiness); (type) <= max_type(swappiness); (type)++) + #define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) #define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) @@ -3305,11 +2728,11 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) if (!sc->may_swap) return 0; - if (!can_demote(pgdat->node_id, sc) && + if (!can_demote(pgdat->node_id, sc, memcg) && mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) return 0; - return mem_cgroup_swappiness(memcg); + return sc_swappiness(sc, memcg); } static int get_nr_gens(struct lruvec *lruvec, int type) @@ 
-3319,10 +2742,16 @@ static int get_nr_gens(struct lruvec *lruvec, int type) static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) { - /* see the comment on lru_gen_folio */ - return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && - get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && - get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; + int type; + + for (type = 0; type < ANON_AND_FILE; type++) { + int n = get_nr_gens(lruvec, type); + + if (n < MIN_NR_GENS || n > MAX_NR_GENS) + return false; + } + + return true; } /****************************************************************************** @@ -3368,13 +2797,14 @@ static void get_item_key(void *item, int *key) key[1] = hash >> BLOOM_FILTER_SHIFT; } -static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +static bool test_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq, + void *item) { int key[2]; unsigned long *filter; int gen = filter_gen_from_seq(seq); - filter = READ_ONCE(lruvec->mm_state.filters[gen]); + filter = READ_ONCE(mm_state->filters[gen]); if (!filter) return true; @@ -3383,13 +2813,14 @@ static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *it return test_bit(key[0], filter) && test_bit(key[1], filter); } -static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +static void update_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq, + void *item) { int key[2]; unsigned long *filter; int gen = filter_gen_from_seq(seq); - filter = READ_ONCE(lruvec->mm_state.filters[gen]); + filter = READ_ONCE(mm_state->filters[gen]); if (!filter) return; @@ -3401,12 +2832,12 @@ static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void * set_bit(key[1], filter); } -static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) +static void reset_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq) { unsigned long *filter; int gen = filter_gen_from_seq(seq); - filter = lruvec->mm_state.filters[gen]; + filter = mm_state->filters[gen]; if (filter) { bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); return; @@ -3414,13 +2845,15 @@ static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); - WRITE_ONCE(lruvec->mm_state.filters[gen], filter); + WRITE_ONCE(mm_state->filters[gen], filter); } /****************************************************************************** * mm_struct list ******************************************************************************/ +#ifdef CONFIG_LRU_GEN_WALKS_MMU + static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) { static struct lru_gen_mm_list mm_list = { @@ -3437,6 +2870,29 @@ static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) return &mm_list; } +static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec) +{ + return &lruvec->mm_state; +} + +static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) +{ + int key; + struct mm_struct *mm; + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); + + mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); + key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); + + if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) + return NULL; + + clear_bit(key, &mm->lru_gen.bitmap); + + return mmget_not_zero(mm) ? 
mm : NULL; +} + void lru_gen_add_mm(struct mm_struct *mm) { int nid; @@ -3452,10 +2908,11 @@ void lru_gen_add_mm(struct mm_struct *mm) for_each_node_state(nid, N_MEMORY) { struct lruvec *lruvec = get_lruvec(memcg, nid); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* the first addition since the last iteration */ - if (lruvec->mm_state.tail == &mm_list->fifo) - lruvec->mm_state.tail = &mm->lru_gen.list; + if (mm_state->tail == &mm_list->fifo) + mm_state->tail = &mm->lru_gen.list; } list_add_tail(&mm->lru_gen.list, &mm_list->fifo); @@ -3481,14 +2938,15 @@ void lru_gen_del_mm(struct mm_struct *mm) for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* where the current iteration continues after */ - if (lruvec->mm_state.head == &mm->lru_gen.list) - lruvec->mm_state.head = lruvec->mm_state.head->prev; + if (mm_state->head == &mm->lru_gen.list) + mm_state->head = mm_state->head->prev; /* where the last iteration ended before */ - if (lruvec->mm_state.tail == &mm->lru_gen.list) - lruvec->mm_state.tail = lruvec->mm_state.tail->next; + if (mm_state->tail == &mm->lru_gen.list) + mm_state->tail = mm_state->tail->next; } list_del_init(&mm->lru_gen.list); @@ -3531,64 +2989,59 @@ void lru_gen_migrate_mm(struct mm_struct *mm) } #endif -static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) +#else /* !CONFIG_LRU_GEN_WALKS_MMU */ + +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) +{ + return NULL; +} + +static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec) +{ + return NULL; +} + +static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) +{ + return NULL; +} + +#endif + +static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last) { int i; int hist; + struct lruvec *lruvec = walk->lruvec; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); - if (walk) { - hist = lru_hist_from_seq(walk->max_seq); + hist = lru_hist_from_seq(walk->seq); - for (i = 0; i < NR_MM_STATS; i++) { - WRITE_ONCE(lruvec->mm_state.stats[hist][i], - lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]); - walk->mm_stats[i] = 0; - } + for (i = 0; i < NR_MM_STATS; i++) { + WRITE_ONCE(mm_state->stats[hist][i], + mm_state->stats[hist][i] + walk->mm_stats[i]); + walk->mm_stats[i] = 0; } if (NR_HIST_GENS > 1 && last) { - hist = lru_hist_from_seq(lruvec->mm_state.seq + 1); + hist = lru_hist_from_seq(walk->seq + 1); for (i = 0; i < NR_MM_STATS; i++) - WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0); - } -} - -static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) -{ - int type; - unsigned long size = 0; - struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); - int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); - - if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) - return true; - - clear_bit(key, &mm->lru_gen.bitmap); - - for (type = !walk->can_swap; type < ANON_AND_FILE; type++) { - size += type ? 
get_mm_counter(mm, MM_FILEPAGES) : - get_mm_counter(mm, MM_ANONPAGES) + - get_mm_counter(mm, MM_SHMEMPAGES); + WRITE_ONCE(mm_state->stats[hist][i], 0); } - - if (size < MIN_LRU_BATCH) - return true; - - return !mmget_not_zero(mm); } -static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, - struct mm_struct **iter) +static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **iter) { bool first = false; bool last = false; struct mm_struct *mm = NULL; + struct lruvec *lruvec = walk->lruvec; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); - struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* * mm_state->seq is incremented after each iteration of mm_list. There @@ -3602,9 +3055,9 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, */ spin_lock(&mm_list->lock); - VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); + VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->seq); - if (walk->max_seq <= mm_state->seq) + if (walk->seq <= mm_state->seq) goto done; if (!mm_state->head) @@ -3626,19 +3079,15 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, mm_state->tail = mm_state->head->next; walk->force_scan = true; } - - mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); - if (should_skip_mm(mm, walk)) - mm = NULL; - } while (!mm); + } while (!(mm = get_next_mm(walk))); done: if (*iter || last) - reset_mm_stats(lruvec, walk, last); + reset_mm_stats(walk, last); spin_unlock(&mm_list->lock); if (mm && first) - reset_bloom_filter(lruvec, walk->max_seq + 1); + reset_bloom_filter(mm_state, walk->seq + 1); if (*iter) mmput_async(*iter); @@ -3648,22 +3097,21 @@ done: return last; } -static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) +static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long seq) { bool success = false; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); - struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); spin_lock(&mm_list->lock); - VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); + VM_WARN_ON_ONCE(mm_state->seq + 1 < seq); - if (max_seq > mm_state->seq) { + if (seq > mm_state->seq) { mm_state->head = NULL; mm_state->tail = NULL; WRITE_ONCE(mm_state->seq, mm_state->seq + 1); - reset_mm_stats(lruvec, NULL, true); success = true; } @@ -3704,16 +3152,20 @@ struct ctrl_pos { static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, struct ctrl_pos *pos) { + int i; struct lru_gen_folio *lrugen = &lruvec->lrugen; int hist = lru_hist_from_seq(lrugen->min_seq[type]); - pos->refaulted = lrugen->avg_refaulted[type][tier] + - atomic_long_read(&lrugen->refaulted[hist][type][tier]); - pos->total = lrugen->avg_total[type][tier] + - atomic_long_read(&lrugen->evicted[hist][type][tier]); - if (tier) - pos->total += lrugen->protected[hist][type][tier - 1]; pos->gain = gain; + pos->refaulted = pos->total = 0; + + for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - 1); i++) { + pos->refaulted += lrugen->avg_refaulted[type][i] + + atomic_long_read(&lrugen->refaulted[hist][type][i]); + pos->total += lrugen->avg_total[type][i] + + lrugen->protected[hist][type][i] + + atomic_long_read(&lrugen->evicted[hist][type][i]); + } } static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) @@ 
-3739,17 +3191,15 @@ static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); sum = lrugen->avg_total[type][tier] + + lrugen->protected[hist][type][tier] + atomic_long_read(&lrugen->evicted[hist][type][tier]); - if (tier) - sum += lrugen->protected[hist][type][tier - 1]; WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); } if (clear) { atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); atomic_long_set(&lrugen->evicted[hist][type][tier], 0); - if (tier) - WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); + WRITE_ONCE(lrugen->protected[hist][type][tier], 0); } } } @@ -3772,22 +3222,24 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) /* promote pages accessed through page tables */ static int folio_update_gen(struct folio *folio, int gen) { - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); - VM_WARN_ON_ONCE(!rcu_read_lock_held()); + + /* see the comment on LRU_REFS_FLAGS */ + if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); + return -1; + } do { /* lru_gen_del_folio() has isolated this page? */ - if (!(old_flags & LRU_GEN_MASK)) { - /* for shrink_folio_list() */ - new_flags = old_flags | BIT(PG_referenced); - continue; - } + if (!(old_flags & LRU_GEN_MASK)) + return -1; - new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); - new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; - } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); + new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } @@ -3798,7 +3250,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai int type = folio_is_file_lru(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); @@ -3810,12 +3262,12 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai new_gen = (old_gen + 1) % MAX_NR_GENS; - new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; /* for folio_end_writeback() */ if (reclaiming) new_flags |= BIT(PG_reclaim); - } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); lru_gen_update_size(lruvec, folio, old_gen, new_gen); @@ -3838,9 +3290,10 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, walk->nr_pages[new_gen][type][zone] += delta; } -static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) +static void reset_batch_size(struct lru_gen_mm_walk *walk) { int gen, type, zone; + struct lruvec *lruvec = walk->lruvec; struct lru_gen_folio *lrugen = &lruvec->lrugen; walk->batched = 0; @@ -3884,7 +3337,7 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal return true; if (vma_is_anonymous(vma)) - return 
!walk->can_swap; + return !walk->swappiness; if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) return true; @@ -3894,7 +3347,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal return true; if (shmem_mapping(mapping)) - return !walk->can_swap; + return !walk->swappiness; + + if (walk->swappiness > MAX_SWAPPINESS) + return true; /* to exclude special mappings like dax, etc. */ return !mapping->a_ops->read_folio; @@ -3931,7 +3387,8 @@ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk return false; } -static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) +static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr, + struct pglist_data *pgdat) { unsigned long pfn = pte_pfn(pte); @@ -3940,17 +3397,23 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned if (!pte_present(pte) || is_zero_pfn(pfn)) return -1; - if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) + if (WARN_ON_ONCE(pte_special(pte))) + return -1; + + if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm)) return -1; if (WARN_ON_ONCE(!pfn_valid(pfn))) return -1; + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + return -1; + return pfn; } -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) -static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) +static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr, + struct pglist_data *pgdat) { unsigned long pfn = pmd_pfn(pmd); @@ -3959,34 +3422,30 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) return -1; - if (WARN_ON_ONCE(pmd_devmap(pmd))) + if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm)) return -1; if (WARN_ON_ONCE(!pfn_valid(pfn))) return -1; + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + return -1; + return pfn; } -#endif static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, - struct pglist_data *pgdat, bool can_swap) + struct pglist_data *pgdat) { - struct folio *folio; + struct folio *folio = pfn_folio(pfn); - /* try to avoid unnecessary memory loads */ - if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + if (folio_lru_gen(folio) < 0) return NULL; - folio = pfn_folio(pfn); if (folio_nid(folio) != pgdat->node_id) return NULL; - if (folio_memcg_rcu(folio) != memcg) - return NULL; - - /* file VMAs can contain anon pages from COW */ - if (!folio_is_file_lru(folio) && !can_swap) + if (folio_memcg(folio) != memcg) return NULL; return folio; @@ -4000,25 +3459,59 @@ static bool suitable_to_scan(int total, int young) return young * n >= total; } +static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio, + int new_gen, bool dirty) +{ + int old_gen; + + if (!folio) + return; + + if (dirty && !folio_test_dirty(folio) && + !(folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio))) + folio_mark_dirty(folio); + + if (walk) { + old_gen = folio_update_gen(folio, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, folio, old_gen, new_gen); + } else if (lru_gen_set_refs(folio)) { + old_gen = folio_lru_gen(folio); + if (old_gen >= 0 && old_gen != new_gen) + folio_activate(folio); + } +} + static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk 
*args) { int i; + bool dirty; pte_t *pte; spinlock_t *ptl; unsigned long addr; int total = 0; int young = 0; + struct folio *last = NULL; struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); - int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); + DEFINE_MAX_SEQ(walk->lruvec); + int gen = lru_gen_from_seq(max_seq); + pmd_t pmdval; - pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl); + pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl); if (!pte) return false; + if (!spin_trylock(ptl)) { pte_unmap(pte); + return true; + } + + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) { + pte_unmap_unlock(pte, ptl); return false; } @@ -4032,35 +3525,34 @@ restart: total++; walk->mm_stats[MM_LEAF_TOTAL]++; - pfn = get_pte_pfn(ptent, args->vma, addr); + pfn = get_pte_pfn(ptent, args->vma, addr, pgdat); if (pfn == -1) continue; - if (!pte_young(ptent)) { - walk->mm_stats[MM_LEAF_OLD]++; + folio = get_pfn_folio(pfn, memcg, pgdat); + if (!folio) continue; - } - folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); - if (!folio) + if (!ptep_clear_young_notify(args->vma, addr, pte + i)) continue; - if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) - VM_WARN_ON_ONCE(true); + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); - young++; - walk->mm_stats[MM_LEAF_YOUNG]++; + last = folio; + dirty = false; + } - if (pte_dirty(ptent) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); + if (pte_dirty(ptent)) + dirty = true; - old_gen = folio_update_gen(folio, new_gen); - if (old_gen >= 0 && old_gen != new_gen) - update_batch_size(walk, folio, old_gen, new_gen); + young++; + walk->mm_stats[MM_LEAF_YOUNG]++; } + walk_update_folio(walk, last, gen, dirty); + last = NULL; + if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) goto restart; @@ -4070,17 +3562,19 @@ restart: return suitable_to_scan(total, young); } -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, struct mm_walk *args, unsigned long *bitmap, unsigned long *first) { int i; + bool dirty; pmd_t *pmd; spinlock_t *ptl; + struct folio *last = NULL; struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); - int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); + DEFINE_MAX_SEQ(walk->lruvec); + int gen = lru_gen_from_seq(max_seq); VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -4112,48 +3606,49 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area /* don't round down the first address */ addr = i ? 
(*first & PMD_MASK) + i * PMD_SIZE : *first; - pfn = get_pmd_pfn(pmd[i], vma, addr); - if (pfn == -1) + if (!pmd_present(pmd[i])) goto next; if (!pmd_trans_huge(pmd[i])) { - if (should_clear_pmd_young()) + if (!walk->force_scan && should_clear_pmd_young() && + !mm_has_notifiers(args->mm)) pmdp_test_and_clear_young(vma, addr, pmd + i); goto next; } - folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); + pfn = get_pmd_pfn(pmd[i], vma, addr, pgdat); + if (pfn == -1) + goto next; + + folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) goto next; - if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) + if (!pmdp_clear_young_notify(vma, addr, pmd + i)) goto next; - walk->mm_stats[MM_LEAF_YOUNG]++; + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); + + last = folio; + dirty = false; + } - if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); + if (pmd_dirty(pmd[i])) + dirty = true; - old_gen = folio_update_gen(folio, new_gen); - if (old_gen >= 0 && old_gen != new_gen) - update_batch_size(walk, folio, old_gen, new_gen); + walk->mm_stats[MM_LEAF_YOUNG]++; next: i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; } while (i <= MIN_LRU_BATCH); + walk_update_folio(walk, last, gen, dirty); + arch_leave_lazy_mmu_mode(); spin_unlock(ptl); done: *first = -1; } -#else -static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, - struct mm_walk *args, unsigned long *bitmap, unsigned long *first) -{ -} -#endif static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, struct mm_walk *args) @@ -4166,6 +3661,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, DECLARE_BITMAP(bitmap, MIN_LRU_BATCH); unsigned long first = -1; struct lru_gen_mm_walk *walk = args->private; + struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -4188,36 +3684,26 @@ restart: continue; } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(val)) { - unsigned long pfn = pmd_pfn(val); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + unsigned long pfn = get_pmd_pfn(val, vma, addr, pgdat); walk->mm_stats[MM_LEAF_TOTAL]++; - if (!pmd_young(val)) { - walk->mm_stats[MM_LEAF_OLD]++; - continue; - } - - /* try to avoid unnecessary memory loads */ - if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) - continue; - - walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); + if (pfn != -1) + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); continue; } -#endif - walk->mm_stats[MM_NONLEAF_TOTAL]++; - if (should_clear_pmd_young()) { + if (!walk->force_scan && should_clear_pmd_young() && + !mm_has_notifiers(args->mm)) { if (!pmd_young(val)) continue; walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); } - if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) + if (!walk->force_scan && !test_bloom_filter(mm_state, walk->seq, pmd + i)) continue; walk->mm_stats[MM_NONLEAF_FOUND]++; @@ -4228,7 +3714,7 @@ restart: walk->mm_stats[MM_NONLEAF_ADDED]++; /* carry over to the next generation */ - update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); + update_bloom_filter(mm_state, walk->seq + 1, pmd + i); } walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); @@ -4251,7 +3737,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, pud = 
pud_offset(p4d, start & P4D_MASK); restart: for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { - pud_t val = READ_ONCE(pud[i]); + pud_t val = pudp_get(pud + i); next = pud_addr_end(addr, end); @@ -4279,15 +3765,15 @@ done: return -EAGAIN; } -static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk) +static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) { static const struct mm_walk_ops mm_walk_ops = { .test_walk = should_skip_vma, .p4d_entry = walk_pud_range, + .walk_lock = PGWALK_RDLOCK, }; - int err; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct lruvec *lruvec = walk->lruvec; walk->next_addr = FIRST_USER_ADDRESS; @@ -4297,11 +3783,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ err = -EBUSY; /* another thread might have called inc_max_seq() */ - if (walk->max_seq != max_seq) - break; - - /* folio_update_gen() requires stable folio_memcg() */ - if (!mem_cgroup_trylock_pages(memcg)) + if (walk->seq != max_seq) break; /* the caller might be holding the lock for write */ @@ -4311,11 +3793,9 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ mmap_read_unlock(mm); } - mem_cgroup_unlock_pages(); - if (walk->batched) { spin_lock_irq(&lruvec->lru_lock); - reset_batch_size(lruvec, walk); + reset_batch_size(walk); spin_unlock_irq(&lruvec->lru_lock); } @@ -4355,22 +3835,30 @@ static void clear_mm_walk(void) kfree(walk); } -static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) +static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness) { int zone; int remaining = MAX_LRU_BATCH; struct lru_gen_folio *lrugen = &lruvec->lrugen; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - if (type == LRU_GEN_ANON && !can_swap) + /* For file type, skip the check if swappiness is anon only */ + if (type && (swappiness == SWAPPINESS_ANON_ONLY)) + goto done; + + /* For anon type, skip the check if swappiness is zero (file only) */ + if (!type && !swappiness) goto done; - /* prevent cold/hot inversion if force_scan is true */ + /* prevent cold/hot inversion if the type is evictable */ for (zone = 0; zone < MAX_NR_ZONES; zone++) { struct list_head *head = &lrugen->folios[old_gen][type][zone]; while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); + int refs = folio_lru_refs(folio); + bool workingset = folio_test_workingset(folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); @@ -4380,6 +3868,15 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) new_gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); + /* don't count the workingset being lazily promoted */ + if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { + int tier = lru_tier_from_refs(refs, workingset); + int delta = folio_nr_pages(folio); + + WRITE_ONCE(lrugen->protected[hist][type][tier], + lrugen->protected[hist][type][tier] + delta); + } + if (!--remaining) return false; } @@ -4391,17 +3888,18 @@ done: return true; } -static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) +static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) { int gen, type, zone; bool success = false; + bool seq_inc_flag = false; struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); 
VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); /* find the oldest populated generation */ - for (type = !can_swap; type < ANON_AND_FILE; type++) { + for_each_evictable_type(type, swappiness) { while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { gen = lru_gen_from_seq(min_seq[type]); @@ -4411,19 +3909,32 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) } min_seq[type]++; + seq_inc_flag = true; } next: ; } + /* + * If min_seq[type] of both anonymous and file is not increased, + * we can directly return false to avoid unnecessary checking + * overhead later. + */ + if (!seq_inc_flag) + return success; + /* see the comment on lru_gen_folio */ - if (can_swap) { - min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); - min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); + if (swappiness && swappiness <= MAX_SWAPPINESS) { + unsigned long seq = lrugen->max_seq - MIN_NR_GENS; + + if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq) + min_seq[LRU_GEN_ANON] = seq; + else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq) + min_seq[LRU_GEN_FILE] = seq; } - for (type = !can_swap; type < ANON_AND_FILE; type++) { - if (min_seq[type] == lrugen->min_seq[type]) + for_each_evictable_type(type, swappiness) { + if (min_seq[type] <= lrugen->min_seq[type]) continue; reset_ctrl_pos(lruvec, type, true); @@ -4434,27 +3945,34 @@ next: return success; } -static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) +static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness) { + bool success; int prev, next; int type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; +restart: + if (seq < READ_ONCE(lrugen->max_seq)) + return false; spin_lock_irq(&lruvec->lru_lock); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); - for (type = ANON_AND_FILE - 1; type >= 0; type--) { + success = seq == lrugen->max_seq; + if (!success) + goto unlock; + + for (type = 0; type < ANON_AND_FILE; type++) { if (get_nr_gens(lruvec, type) != MAX_NR_GENS) continue; - VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); + if (inc_min_seq(lruvec, type, swappiness)) + continue; - while (!inc_min_seq(lruvec, type, can_swap)) { - spin_unlock_irq(&lruvec->lru_lock); - cond_resched(); - spin_lock_irq(&lruvec->lru_lock); - } + spin_unlock_irq(&lruvec->lru_lock); + cond_resched(); + goto restart; } /* @@ -4486,25 +4004,29 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) WRITE_ONCE(lrugen->timestamps[next], jiffies); /* make sure preceding modifications appear */ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); - +unlock: spin_unlock_irq(&lruvec->lru_lock); + + return success; } -static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, - struct scan_control *sc, bool can_swap, bool force_scan) +static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, + int swappiness, bool force_scan) { bool success; struct lru_gen_mm_walk *walk; struct mm_struct *mm = NULL; struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); - VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); + VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq)); + + if (!mm_state) + return inc_max_seq(lruvec, seq, swappiness); /* see the comment in iterate_mm_list() */ - if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) { - success = false; - goto done; - } + if (seq <= READ_ONCE(mm_state->seq)) + return false; /* * If the 
hardware doesn't automatically set the accessed bit, fallback @@ -4513,29 +4035,31 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, * is less efficient, but it avoids bursty page faults. */ if (!should_walk_mmu()) { - success = iterate_mm_list_nowalk(lruvec, max_seq); + success = iterate_mm_list_nowalk(lruvec, seq); goto done; } walk = set_mm_walk(NULL, true); if (!walk) { - success = iterate_mm_list_nowalk(lruvec, max_seq); + success = iterate_mm_list_nowalk(lruvec, seq); goto done; } walk->lruvec = lruvec; - walk->max_seq = max_seq; - walk->can_swap = can_swap; + walk->seq = seq; + walk->swappiness = swappiness; walk->force_scan = force_scan; do { - success = iterate_mm_list(lruvec, walk, &mm); + success = iterate_mm_list(walk, &mm); if (mm) - walk_mm(lruvec, mm, walk); + walk_mm(mm, walk); } while (mm); done: - if (success) - inc_max_seq(lruvec, can_swap, force_scan); + if (success) { + success = inc_max_seq(lruvec, seq, swappiness); + WARN_ON_ONCE(!success); + } return success; } @@ -4544,17 +4068,43 @@ done: * working set protection ******************************************************************************/ +static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) +{ + int priority; + unsigned long reclaimable; + + if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) + return; + /* + * Determine the initial priority based on + * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, + * where reclaimed_to_scanned_ratio = inactive / total. + */ + reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); + if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) + reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); + + /* round down reclaimable and round up sc->nr_to_reclaim */ + priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); + + /* + * The estimation is based on LRU pages only, so cap it to prevent + * overshoots of shrinker objects by large margins. 
+ */ + sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY); +} + static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { int gen, type, zone; unsigned long total = 0; - bool can_swap = get_swappiness(lruvec, sc); + int swappiness = get_swappiness(lruvec, sc); struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); - for (type = !can_swap; type < ANON_AND_FILE; type++) { + for_each_evictable_type(type, swappiness) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { @@ -4574,22 +4124,20 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc { int gen; unsigned long birth; + int swappiness = get_swappiness(lruvec, sc); struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MIN_SEQ(lruvec); - /* see the comment on lru_gen_folio */ - gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); - birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); - - if (time_is_after_jiffies(birth + min_ttl)) + if (mem_cgroup_below_min(NULL, memcg)) return false; if (!lruvec_is_sizable(lruvec, sc)) return false; - mem_cgroup_calculate_protection(NULL, memcg); + gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness)); + birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); - return !mem_cgroup_below_min(NULL, memcg); + return time_is_before_jiffies(birth + min_ttl); } /* to protect the working set of the last N jiffies */ @@ -4599,23 +4147,20 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); + bool reclaimable = !min_ttl; VM_WARN_ON_ONCE(!current_is_kswapd()); - /* check the order to exclude compaction-induced reclaim */ - if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY) - return; + set_initial_priority(pgdat, sc); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); - if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) { - mem_cgroup_iter_break(NULL, memcg); - return; - } + mem_cgroup_calculate_protection(NULL, memcg); - cond_resched(); + if (!reclaimable) + reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); /* @@ -4623,7 +4168,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) * younger than min_ttl. However, another possibility is all memcgs are * either too small or below min. */ - if (mutex_trylock(&oom_lock)) { + if (!reclaimable && mutex_trylock(&oom_lock)) { struct oom_control oc = { .gfp_mask = sc->gfp_mask, }; @@ -4645,33 +4190,47 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) * the PTE table to the Bloom filter. This forms a feedback loop between the * eviction and the aging. 
*/ -void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { int i; + bool dirty; unsigned long start; unsigned long end; struct lru_gen_mm_walk *walk; - int young = 0; + struct folio *last = NULL; + int young = 1; pte_t *pte = pvmw->pte; unsigned long addr = pvmw->address; + struct vm_area_struct *vma = pvmw->vma; struct folio *folio = pfn_folio(pvmw->pfn); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); DEFINE_MAX_SEQ(lruvec); - int old_gen, new_gen = lru_gen_from_seq(max_seq); + int gen = lru_gen_from_seq(max_seq); lockdep_assert_held(pvmw->ptl); VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); + if (!ptep_clear_young_notify(vma, addr, pte)) + return false; + if (spin_is_contended(pvmw->ptl)) - return; + return true; + + /* exclude special VMAs containing anon pages from COW */ + if (vma->vm_flags & VM_SPECIAL) + return true; /* avoid taking the LRU lock under the PTL when possible */ walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; - start = max(addr & PMD_MASK, pvmw->vma->vm_start); - end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; + start = max(addr & PMD_MASK, vma->vm_start); + end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1; + + if (end - start == PAGE_SIZE) + return true; if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) @@ -4684,10 +4243,6 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) } } - /* folio_update_gen() requires stable folio_memcg() */ - if (!mem_cgroup_trylock_pages(memcg)) - return; - arch_enter_lazy_mmu_mode(); pte -= (addr - start) / PAGE_SIZE; @@ -4696,48 +4251,39 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) unsigned long pfn; pte_t ptent = ptep_get(pte + i); - pfn = get_pte_pfn(ptent, pvmw->vma, addr); + pfn = get_pte_pfn(ptent, vma, addr, pgdat); if (pfn == -1) continue; - if (!pte_young(ptent)) - continue; - - folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); + folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) continue; - if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) - VM_WARN_ON_ONCE(true); - - young++; - - if (pte_dirty(ptent) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); + if (!ptep_clear_young_notify(vma, addr, pte + i)) + continue; - if (walk) { - old_gen = folio_update_gen(folio, new_gen); - if (old_gen >= 0 && old_gen != new_gen) - update_batch_size(walk, folio, old_gen, new_gen); + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); - continue; + last = folio; + dirty = false; } - old_gen = folio_lru_gen(folio); - if (old_gen < 0) - folio_set_referenced(folio); - else if (old_gen != new_gen) - folio_activate(folio); + if (pte_dirty(ptent)) + dirty = true; + + young++; } + walk_update_folio(walk, last, gen, dirty); + arch_leave_lazy_mmu_mode(); - mem_cgroup_unlock_pages(); /* feedback from rmap walkers to page table walkers */ - if (suitable_to_scan(i, young)) - update_bloom_filter(lruvec, max_seq, pvmw->pmd); + if (mm_state && suitable_to_scan(i, young)) + update_bloom_filter(mm_state, max_seq, pvmw->pmd); + + return true; } /****************************************************************************** @@ -4753,13 +4299,6 @@ enum { MEMCG_LRU_YOUNG, }; -#ifdef 
CONFIG_MEMCG - -static int lru_gen_memcg_seg(struct lruvec *lruvec) -{ - return READ_ONCE(lruvec->lrugen.seg); -} - static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) { int seg; @@ -4787,6 +4326,9 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) else VM_WARN_ON_ONCE(true); + WRITE_ONCE(lruvec->lrugen.seg, seg); + WRITE_ONCE(lruvec->lrugen.gen, new); + hlist_nulls_del_rcu(&lruvec->lrugen.list); if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) @@ -4797,15 +4339,14 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) pgdat->memcg_lru.nr_memcgs[old]--; pgdat->memcg_lru.nr_memcgs[new]++; - lruvec->lrugen.gen = new; - WRITE_ONCE(lruvec->lrugen.seg, seg); - if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags); } +#ifdef CONFIG_MEMCG + void lru_gen_online_memcg(struct mem_cgroup *memcg) { int gen; @@ -4822,11 +4363,11 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg) gen = get_memcg_gen(pgdat->memcg_lru.seq); + lruvec->lrugen.gen = gen; + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); pgdat->memcg_lru.nr_memcgs[gen]++; - lruvec->lrugen.gen = gen; - spin_unlock_irq(&pgdat->memcg_lru.lock); } } @@ -4853,16 +4394,17 @@ void lru_gen_release_memcg(struct mem_cgroup *memcg) spin_lock_irq(&pgdat->memcg_lru.lock); - VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + if (hlist_nulls_unhashed(&lruvec->lrugen.list)) + goto unlock; gen = lruvec->lrugen.gen; - hlist_nulls_del_rcu(&lruvec->lrugen.list); + hlist_nulls_del_init_rcu(&lruvec->lrugen.list); pgdat->memcg_lru.nr_memcgs[gen]--; if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); - +unlock: spin_unlock_irq(&pgdat->memcg_lru.lock); } } @@ -4872,32 +4414,28 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) struct lruvec *lruvec = get_lruvec(memcg, nid); /* see the comment on MEMCG_NR_GENS */ - if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) + if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_HEAD) lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); } -#else /* !CONFIG_MEMCG */ - -static int lru_gen_memcg_seg(struct lruvec *lruvec) -{ - return 0; -} - -#endif +#endif /* CONFIG_MEMCG */ /****************************************************************************** * the eviction ******************************************************************************/ -static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) +static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc, + int tier_idx) { bool success; + bool dirty, writeback; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); - int tier = lru_tier_from_refs(refs); + bool workingset = folio_test_workingset(folio); + int tier = lru_tier_from_refs(refs, workingset); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); @@ -4912,15 +4450,6 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) return true; } - /* dirty lazyfree */ - if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) { - success = lru_gen_del_folio(lruvec, folio, true); - VM_WARN_ON_ONCE_FOLIO(!success, folio); - 
folio_set_swapbacked(folio); - lruvec_add_folio_tail(lruvec, folio); - return true; - } - /* promoted */ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { list_move(&folio->lru, &lrugen->folios[gen][type][zone]); @@ -4928,20 +4457,37 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) } /* protected */ - if (tier > tier_idx) { - int hist = lru_hist_from_seq(lrugen->min_seq[type]); + if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) { + gen = folio_inc_gen(lruvec, folio, false); + list_move(&folio->lru, &lrugen->folios[gen][type][zone]); + /* don't count the workingset being lazily promoted */ + if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + WRITE_ONCE(lrugen->protected[hist][type][tier], + lrugen->protected[hist][type][tier] + delta); + } + return true; + } + + /* ineligible */ + if (zone > sc->reclaim_idx) { gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); - - WRITE_ONCE(lrugen->protected[hist][type][tier - 1], - lrugen->protected[hist][type][tier - 1] + delta); return true; } + dirty = folio_test_dirty(folio); + writeback = folio_test_writeback(folio); + if (type == LRU_GEN_FILE && dirty) { + sc->nr.file_taken += delta; + if (!writeback) + sc->nr.unqueued_dirty += delta; + } + /* waiting for writeback */ - if (folio_test_locked(folio) || folio_test_writeback(folio) || - (type == LRU_GEN_FILE && folio_test_dirty(folio))) { + if (writeback || (type == LRU_GEN_FILE && dirty)) { gen = folio_inc_gen(lruvec, folio, true); list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; @@ -4954,7 +4500,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca { bool success; - /* swapping inhibited */ + /* swap constrained */ if (!(sc->gfp_mask & __GFP_IO) && (folio_test_dirty(folio) || (folio_test_anon(folio) && !folio_test_swapcache(folio)))) @@ -4970,13 +4516,12 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca return false; } - /* see the comment on MAX_NR_TIERS */ + /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, 0); /* for shrink_folio_list() */ folio_clear_reclaim(folio); - folio_clear_referenced(folio); success = lru_gen_del_folio(lruvec, folio, true); VM_WARN_ON_ONCE_FOLIO(!success, folio); @@ -4984,15 +4529,18 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca return true; } -static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, - int type, int tier, struct list_head *list) +static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, int type, int tier, + struct list_head *list) { - int gen, zone; + int i; + int gen; enum vm_event_item item; int sorted = 0; int scanned = 0; int isolated = 0; - int remaining = MAX_LRU_BATCH; + int skipped = 0; + int remaining = min(nr_to_scan, MAX_LRU_BATCH); struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); @@ -5003,9 +4551,10 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, gen = lru_gen_from_seq(lrugen->min_seq[type]); - for (zone = sc->reclaim_idx; zone >= 0; zone--) { + for (i = MAX_NR_ZONES; i > 0; i--) { LIST_HEAD(moved); - int skipped = 0; + int skipped_zone = 0; + int zone = (sc->reclaim_idx + i) % 
MAX_NR_ZONES; struct list_head *head = &lrugen->folios[gen][type][zone]; while (!list_empty(head)) { @@ -5019,38 +4568,43 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, scanned += delta; - if (sort_folio(lruvec, folio, tier)) + if (sort_folio(lruvec, folio, sc, tier)) sorted += delta; else if (isolate_folio(lruvec, folio, sc)) { list_add(&folio->lru, list); isolated += delta; } else { list_move(&folio->lru, &moved); - skipped += delta; + skipped_zone += delta; } - if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH) + if (!--remaining || max(isolated, skipped_zone) >= MIN_LRU_BATCH) break; } - if (skipped) { + if (skipped_zone) { list_splice(&moved, head); - __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); + __count_zid_vm_events(PGSCAN_SKIP, zone, skipped_zone); + skipped += skipped_zone; } if (!remaining || isolated >= MIN_LRU_BATCH) break; } - item = PGSCAN_KSWAPD + reclaimer_offset(); + item = PGSCAN_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) { __count_vm_events(item, isolated); __count_vm_events(PGREFILL, sorted); } - __count_memcg_events(memcg, item, isolated); - __count_memcg_events(memcg, PGREFILL, sorted); + count_memcg_events(memcg, item, isolated); + count_memcg_events(memcg, PGREFILL, sorted); __count_vm_events(PGSCAN_ANON + type, isolated); - + trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH, + scanned, skipped, isolated, + type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); + if (type == LRU_GEN_FILE) + sc->nr.file_taken += isolated; /* * There might not be eligible folios due to reclaim_idx. Check the * remaining to prevent livelock if it's not making progress. @@ -5064,13 +4618,13 @@ static int get_tier_idx(struct lruvec *lruvec, int type) struct ctrl_pos sp, pv; /* - * To leave a margin for fluctuations, use a larger gain factor (1:2). + * To leave a margin for fluctuations, use a larger gain factor (2:3). * This value is chosen because any other tier would have at least twice * as many refaults as the first tier. */ - read_ctrl_pos(lruvec, type, 0, 1, &sp); + read_ctrl_pos(lruvec, type, 0, 2, &sp); for (tier = 1; tier < MAX_NR_TIERS; tier++) { - read_ctrl_pos(lruvec, type, tier, 2, &pv); + read_ctrl_pos(lruvec, type, tier, 3, &pv); if (!positive_ctrl_err(&sp, &pv)) break; } @@ -5078,77 +4632,50 @@ static int get_tier_idx(struct lruvec *lruvec, int type) return tier - 1; } -static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) +static int get_type_to_scan(struct lruvec *lruvec, int swappiness) { - int type, tier; struct ctrl_pos sp, pv; - int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; + if (swappiness <= MIN_SWAPPINESS + 1) + return LRU_GEN_FILE; + + if (swappiness >= MAX_SWAPPINESS) + return LRU_GEN_ANON; /* - * Compare the first tier of anon with that of file to determine which - * type to scan. Also need to compare other tiers of the selected type - * with the first tier of the other type to determine the last tier (of - * the selected type) to evict. + * Compare the sum of all tiers of anon with that of file to determine + * which type to scan. 
*/ - read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); - read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); - type = positive_ctrl_err(&sp, &pv); - - read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); - for (tier = 1; tier < MAX_NR_TIERS; tier++) { - read_ctrl_pos(lruvec, type, tier, gain[type], &pv); - if (!positive_ctrl_err(&sp, &pv)) - break; - } - - *tier_idx = tier - 1; + read_ctrl_pos(lruvec, LRU_GEN_ANON, MAX_NR_TIERS, swappiness, &sp); + read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv); - return type; + return positive_ctrl_err(&sp, &pv); } -static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, +static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, int swappiness, int *type_scanned, struct list_head *list) { int i; - int type; - int scanned; - int tier = -1; - DEFINE_MIN_SEQ(lruvec); + int type = get_type_to_scan(lruvec, swappiness); - /* - * Try to make the obvious choice first. When anon and file are both - * available from the same generation, interpret swappiness 1 as file - * first and 200 as anon first. - */ - if (!swappiness) - type = LRU_GEN_FILE; - else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) - type = LRU_GEN_ANON; - else if (swappiness == 1) - type = LRU_GEN_FILE; - else if (swappiness == 200) - type = LRU_GEN_ANON; - else - type = get_type_to_scan(lruvec, swappiness, &tier); + for_each_evictable_type(i, swappiness) { + int scanned; + int tier = get_tier_idx(lruvec, type); - for (i = !swappiness; i < ANON_AND_FILE; i++) { - if (tier < 0) - tier = get_tier_idx(lruvec, type); + *type_scanned = type; - scanned = scan_folios(lruvec, sc, type, tier, list); + scanned = scan_folios(nr_to_scan, lruvec, sc, type, tier, list); if (scanned) - break; + return scanned; type = !type; - tier = -1; } - *type_scanned = type; - - return scanned; + return 0; } -static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) +static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, int swappiness) { int type; int scanned; @@ -5161,16 +4688,17 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap struct reclaim_stat stat; struct lru_gen_mm_walk *walk; bool skip_retry = false; + struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); spin_lock_irq(&lruvec->lru_lock); - scanned = isolate_folios(lruvec, sc, swappiness, &type, &list); + scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list); scanned += try_to_inc_min_seq(lruvec, swappiness); - if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) + if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq) scanned = 0; spin_unlock_irq(&lruvec->lru_lock); @@ -5178,36 +4706,32 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap if (list_empty(&list)) return scanned; retry: - reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); + reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg); + sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; sc->nr_reclaimed += reclaimed; + trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, + scanned, reclaimed, &stat, sc->priority, + type ? 
LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); list_for_each_entry_safe_reverse(folio, next, &list, lru) { + DEFINE_MIN_SEQ(lruvec); + if (!folio_evictable(folio)) { list_del(&folio->lru); folio_putback_lru(folio); continue; } - if (folio_test_reclaim(folio) && - (folio_test_dirty(folio) || folio_test_writeback(folio))) { - /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ - if (folio_test_workingset(folio)) - folio_set_referenced(folio); - continue; - } - - if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || - folio_mapped(folio) || folio_test_locked(folio) || - folio_test_dirty(folio) || folio_test_writeback(folio)) { - /* don't add rejected folios to the oldest generation */ - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, - BIT(PG_active)); + /* retry folios that may have missed folio_rotate_reclaimable() */ + if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) && + !folio_test_dirty(folio) && !folio_test_writeback(folio)) { + list_move(&folio->lru, &clean); continue; } - /* retry folios that may have missed folio_rotate_reclaimable() */ - list_move(&folio->lru, &clean); - sc->nr_scanned -= folio_nr_pages(folio); + /* don't add rejected folios to the oldest generation */ + if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type]) + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active)); } spin_lock_irq(&lruvec->lru_lock); @@ -5215,21 +4739,22 @@ retry: move_folios_to_lru(lruvec, &list); walk = current->reclaim_state->mm_walk; - if (walk && walk->batched) - reset_batch_size(lruvec, walk); + if (walk && walk->batched) { + walk->lruvec = lruvec; + reset_batch_size(walk); + } + + mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc), + stat.nr_demoted); - item = PGSTEAL_KSWAPD + reclaimer_offset(); + item = PGSTEAL_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) __count_vm_events(item, reclaimed); - __count_memcg_events(memcg, item, reclaimed); + count_memcg_events(memcg, item, reclaimed); __count_vm_events(PGSTEAL_ANON + type, reclaimed); spin_unlock_irq(&lruvec->lru_lock); - mem_cgroup_uncharge_list(&list); - free_unref_page_list(&list); - - INIT_LIST_HEAD(&list); list_splice_init(&clean, &list); if (!list_empty(&list)) { @@ -5241,65 +4766,32 @@ retry: } static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, - struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) + int swappiness, unsigned long *nr_to_scan) { int gen, type, zone; - unsigned long old = 0; - unsigned long young = 0; - unsigned long total = 0; + unsigned long size = 0; struct lru_gen_folio *lrugen = &lruvec->lrugen; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MIN_SEQ(lruvec); - /* whether this lruvec is completely out of cold folios */ - if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { - *nr_to_scan = 0; + *nr_to_scan = 0; + /* have to run aging, since eviction is not possible anymore */ + if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq) return true; - } - for (type = !can_swap; type < ANON_AND_FILE; type++) { + for_each_evictable_type(type, swappiness) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { - unsigned long size = 0; - gen = lru_gen_from_seq(seq); for (zone = 0; zone < MAX_NR_ZONES; zone++) size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); - - total += size; - if (seq == max_seq) - young += size; - else if (seq + MIN_NR_GENS == max_seq) - old += size; } } - /* try to scrape all its memory if this memcg was deleted */ - 
*nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; - - /* - * The aging tries to be lazy to reduce the overhead, while the eviction - * stalls when the number of generations reaches MIN_NR_GENS. Hence, the - * ideal number of generations is MIN_NR_GENS+1. - */ - if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) - return false; - - /* - * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) - * of the total number of pages for each generation. A reasonable range - * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The - * aging cares about the upper bound of hot pages, while the eviction - * cares about the lower bound of cold pages. - */ - if (young * MIN_NR_GENS > total) - return true; - if (old * (MIN_NR_GENS + 2) < total) - return true; - - return false; + *nr_to_scan = size; + /* better to run aging even though eviction is still possible */ + return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq; } /* @@ -5307,46 +4799,69 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg * reclaim. */ -static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) { + bool success; unsigned long nr_to_scan; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) - return 0; + return -1; - if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) - return nr_to_scan; + success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan); - /* skip the aging path at the default priority */ - if (sc->priority == DEF_PRIORITY) + /* try to scrape all its memory if this memcg was deleted */ + if (nr_to_scan && !mem_cgroup_online(memcg)) return nr_to_scan; - /* skip this lruvec as it's low on cold folios */ - return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0; + nr_to_scan = apply_proportional_protection(memcg, sc, nr_to_scan); + + /* try to get away with not aging at the default priority */ + if (!success || sc->priority == DEF_PRIORITY) + return nr_to_scan >> sc->priority; + + /* stop scanning this lruvec as it's low on cold folios */ + return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -1 : 0; } -static unsigned long get_nr_to_reclaim(struct scan_control *sc) +static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) { + int i; + enum zone_watermarks mark; + /* don't abort memcg reclaim to ensure fairness */ if (!root_reclaim(sc)) - return -1; + return false; + + if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order))) + return true; + + /* check the order to exclude compaction-induced reclaim */ + if (!current_is_kswapd() || sc->order) + return false; - return max(sc->nr_to_reclaim, compact_gap(sc->order)); + mark = sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ? 
+ WMARK_PROMO : WMARK_HIGH; + + for (i = 0; i <= sc->reclaim_idx; i++) { + struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; + unsigned long size = wmark_pages(zone, mark) + MIN_LRU_BATCH; + + if (managed_zone(zone) && !zone_watermark_ok(zone, 0, size, sc->reclaim_idx, 0)) + return false; + } + + /* kswapd should abort if all eligible zones are safe */ + return true; } static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { long nr_to_scan; unsigned long scanned = 0; - unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); int swappiness = get_swappiness(lruvec, sc); - /* clean file folios are more likely to exist */ - if (swappiness && !(sc->gfp_mask & __GFP_IO)) - swappiness = 1; - while (true) { int delta; @@ -5354,7 +4869,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) if (nr_to_scan <= 0) break; - delta = evict_folios(lruvec, sc, swappiness); + delta = evict_folios(nr_to_scan, lruvec, sc, swappiness); if (!delta) break; @@ -5362,13 +4877,20 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) if (scanned >= nr_to_scan) break; - if (sc->nr_reclaimed >= nr_to_reclaim) + if (should_abort_scan(lruvec, sc)) break; cond_resched(); } - /* whether try_to_inc_max_seq() was successful */ + /* + * If too many file cache in the coldest generation can't be evicted + * due to being dirty, wake up the flusher. + */ + if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken) + wakeup_flusher_threads(WB_REASON_VMSCAN); + + /* whether this lruvec should be rotated */ return nr_to_scan < 0; } @@ -5377,22 +4899,16 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) bool success; unsigned long scanned = sc->nr_scanned; unsigned long reclaimed = sc->nr_reclaimed; - int seg = lru_gen_memcg_seg(lruvec); struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); - /* see the comment on MEMCG_NR_GENS */ - if (!lruvec_is_sizable(lruvec, sc)) - return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; - - mem_cgroup_calculate_protection(NULL, memcg); - + /* lru_gen_age_node() called mem_cgroup_calculate_protection() */ if (mem_cgroup_below_min(NULL, memcg)) return MEMCG_LRU_YOUNG; if (mem_cgroup_below_low(NULL, memcg)) { /* see the comment on MEMCG_NR_GENS */ - if (seg != MEMCG_LRU_TAIL) + if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL) return MEMCG_LRU_TAIL; memcg_memory_event(memcg, MEMCG_LOW); @@ -5408,10 +4924,16 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) flush_reclaim_state(sc); - return success ? MEMCG_LRU_YOUNG : 0; -} + if (success && mem_cgroup_online(memcg)) + return MEMCG_LRU_YOUNG; -#ifdef CONFIG_MEMCG + if (!success && lruvec_is_sizable(lruvec, sc)) + return 0; + + /* one retry if offlined or too small */ + return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ? 
+ MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; +} static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) { @@ -5422,28 +4944,33 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) struct lruvec *lruvec; struct lru_gen_folio *lrugen; struct mem_cgroup *memcg; - const struct hlist_nulls_node *pos; - unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); + struct hlist_nulls_node *pos; + gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); restart: op = 0; memcg = NULL; - gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); rcu_read_lock(); hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { - if (op) + if (op) { lru_gen_rotate_memcg(lruvec, op); + op = 0; + } mem_cgroup_put(memcg); + memcg = NULL; + + if (gen != READ_ONCE(lrugen->gen)) + continue; lruvec = container_of(lrugen, struct lruvec, lrugen); memcg = lruvec_memcg(lruvec); if (!mem_cgroup_tryget(memcg)) { - op = 0; + lru_gen_release_memcg(memcg); memcg = NULL; continue; } @@ -5454,7 +4981,7 @@ restart: rcu_read_lock(); - if (sc->nr_reclaimed >= nr_to_reclaim) + if (should_abort_scan(lruvec, sc)) break; } @@ -5465,7 +4992,7 @@ restart: mem_cgroup_put(memcg); - if (sc->nr_reclaimed >= nr_to_reclaim) + if (!is_a_nulls(pos)) return; /* restart if raced with lru_gen_rotate_memcg() */ @@ -5499,45 +5026,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc blk_finish_plug(&plug); } -#else /* !CONFIG_MEMCG */ - -static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) -{ - BUILD_BUG(); -} - -static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -{ - BUILD_BUG(); -} - -#endif - -static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) -{ - int priority; - unsigned long reclaimable; - struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); - - if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) - return; - /* - * Determine the initial priority based on ((total / MEMCG_NR_GENS) >> - * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the - * estimated reclaimed_to_scanned_ratio = inactive / total. 
- */ - reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); - if (get_swappiness(lruvec, sc)) - reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); - - reclaimable /= MEMCG_NR_GENS; - - /* round down reclaimable and round up sc->nr_to_reclaim */ - priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); - - sc->priority = clamp(priority, 0, DEF_PRIORITY); -} - static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) { struct blk_plug plug; @@ -5576,8 +5064,8 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * blk_finish_plug(&plug); done: - /* kswapd should never fail */ - pgdat->kswapd_failures = 0; + if (sc->nr_reclaimed > reclaimed) + atomic_set(&pgdat->kswapd_failures, 0); } /****************************************************************************** @@ -5862,23 +5350,23 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, int type, tier; int hist = lru_hist_from_seq(seq); struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); for (tier = 0; tier < MAX_NR_TIERS; tier++) { seq_printf(m, " %10d", tier); for (type = 0; type < ANON_AND_FILE; type++) { - const char *s = " "; + const char *s = "xxx"; unsigned long n[3] = {}; if (seq == max_seq) { - s = "RT "; + s = "RTx"; n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); n[1] = READ_ONCE(lrugen->avg_total[type][tier]); } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { s = "rep"; n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); - if (tier) - n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); + n[2] = READ_ONCE(lrugen->protected[hist][type][tier]); } for (i = 0; i < 3; i++) @@ -5887,17 +5375,20 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, seq_putc(m, '\n'); } + if (!mm_state) + return; + seq_puts(m, " "); for (i = 0; i < NR_MM_STATS; i++) { - const char *s = " "; + const char *s = "xxxx"; unsigned long n = 0; if (seq == max_seq && NR_HIST_GENS == 1) { - s = "LOYNFA"; - n = READ_ONCE(lruvec->mm_state.stats[hist][i]); + s = "TYFA"; + n = READ_ONCE(mm_state->stats[hist][i]); } else if (seq != max_seq && NR_HIST_GENS > 1) { - s = "loynfa"; - n = READ_ONCE(lruvec->mm_state.stats[hist][i]); + s = "tyfa"; + n = READ_ONCE(mm_state->stats[hist][i]); } seq_printf(m, " %10lu%c", n, s[i]); @@ -5909,7 +5400,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, static int lru_gen_seq_show(struct seq_file *m, void *v) { unsigned long seq; - bool full = !debugfs_real_fops(m->file)->write; + bool full = debugfs_get_aux_num(m->file); struct lruvec *lruvec = v; struct lru_gen_folio *lrugen = &lruvec->lrugen; int nid = lruvec_pgdat(lruvec)->node_id; @@ -5930,7 +5421,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) seq_printf(m, " node %5d\n", nid); if (!full) - seq = min_seq[LRU_GEN_ANON]; + seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / 2); else if (max_seq >= MAX_NR_GENS) seq = max_seq - MAX_NR_GENS + 1; else @@ -5969,24 +5460,15 @@ static const struct seq_operations lru_gen_seq_ops = { .show = lru_gen_seq_show, }; -static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, - bool can_swap, bool force_scan) +static int run_aging(struct lruvec *lruvec, unsigned long seq, + int swappiness, bool force_scan) { DEFINE_MAX_SEQ(lruvec); - DEFINE_MIN_SEQ(lruvec); - - if (seq < max_seq) - return 0; if (seq > 
max_seq) return -EINVAL; - if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) - return -ERANGE; - - try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan); - - return 0; + return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? 0 : -EEXIST; } static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, @@ -6002,13 +5484,14 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co while (!signal_pending(current)) { DEFINE_MIN_SEQ(lruvec); - if (seq < min_seq[!swappiness]) + if (seq < evictable_min_seq(min_seq, swappiness)) return 0; if (sc->nr_reclaimed >= nr_to_reclaim) return 0; - if (!evict_folios(lruvec, sc, swappiness)) + if (!evict_folios(nr_to_reclaim - sc->nr_reclaimed, lruvec, sc, + swappiness)) return 0; cond_resched(); @@ -6043,16 +5526,17 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, if (memcg_id != mem_cgroup_id(memcg)) goto done; + sc->target_mem_cgroup = memcg; lruvec = get_lruvec(memcg, nid); - if (swappiness < 0) + if (swappiness < MIN_SWAPPINESS) swappiness = get_swappiness(lruvec, sc); - else if (swappiness > 200) + else if (swappiness > SWAPPINESS_ANON_ONLY) goto done; switch (cmd) { case '+': - err = run_aging(lruvec, seq, sc, swappiness, opt); + err = run_aging(lruvec, seq, swappiness, opt); break; case '-': err = run_eviction(lruvec, seq, sc, swappiness, opt); @@ -6079,6 +5563,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, .may_swap = true, .reclaim_idx = MAX_NR_ZONES - 1, .gfp_mask = GFP_KERNEL, + .proactive = true, }; buf = kvmalloc(len + 1, GFP_KERNEL); @@ -6104,24 +5589,35 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, while ((cur = strsep(&next, ",;\n"))) { int n; int end; - char cmd; + char cmd, swap_string[5]; unsigned int memcg_id; unsigned int nid; unsigned long seq; - unsigned int swappiness = -1; + unsigned int swappiness; unsigned long opt = -1; cur = skip_spaces(cur); if (!*cur) continue; - n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, - &seq, &end, &swappiness, &end, &opt, &end); + n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid, + &seq, &end, swap_string, &end, &opt, &end); if (n < 4 || cur[end]) { err = -EINVAL; break; } + if (n == 4) { + swappiness = -1; + } else if (!strcmp("max", swap_string)) { + /* set by userspace for anonymous memory only */ + swappiness = SWAPPINESS_ANON_ONLY; + } else { + err = kstrtouint(swap_string, 0, &swappiness); + if (err) + break; + } + err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); if (err) break; @@ -6161,11 +5657,24 @@ static const struct file_operations lru_gen_ro_fops = { * initialization ******************************************************************************/ +void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ + int i, j; + + spin_lock_init(&pgdat->memcg_lru.lock); + + for (i = 0; i < MEMCG_NR_GENS; i++) { + for (j = 0; j < MEMCG_NR_BINS; j++) + INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); + } +} + void lru_gen_init_lruvec(struct lruvec *lruvec) { int i; int gen, type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); lrugen->max_seq = MIN_NR_GENS + 1; lrugen->enabled = lru_gen_enabled(); @@ -6176,47 +5685,46 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) for_each_gen_type_zone(gen, type, zone) INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); - lruvec->mm_state.seq = MIN_NR_GENS; 
+ if (mm_state) + mm_state->seq = MIN_NR_GENS; } #ifdef CONFIG_MEMCG -void lru_gen_init_pgdat(struct pglist_data *pgdat) +void lru_gen_init_memcg(struct mem_cgroup *memcg) { - int i, j; - - spin_lock_init(&pgdat->memcg_lru.lock); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); - for (i = 0; i < MEMCG_NR_GENS; i++) { - for (j = 0; j < MEMCG_NR_BINS; j++) - INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); - } -} + if (!mm_list) + return; -void lru_gen_init_memcg(struct mem_cgroup *memcg) -{ - INIT_LIST_HEAD(&memcg->mm_list.fifo); - spin_lock_init(&memcg->mm_list.lock); + INIT_LIST_HEAD(&mm_list->fifo); + spin_lock_init(&mm_list->lock); } void lru_gen_exit_memcg(struct mem_cgroup *memcg) { int i; int nid; + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); - VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo)); + VM_WARN_ON_ONCE(mm_list && !list_empty(&mm_list->fifo)); for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, sizeof(lruvec->lrugen.nr_pages))); lruvec->lrugen.list.next = LIST_POISON1; + if (!mm_state) + continue; + for (i = 0; i < NR_BLOOM_FILTERS; i++) { - bitmap_free(lruvec->mm_state.filters[i]); - lruvec->mm_state.filters[i] = NULL; + bitmap_free(mm_state->filters[i]); + mm_state->filters[i] = NULL; } } } @@ -6231,8 +5739,10 @@ static int __init init_lru_gen(void) if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) pr_err("lru_gen: failed to create sysfs group\n"); - debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); - debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); + debugfs_create_file_aux_num("lru_gen", 0644, NULL, NULL, false, + &lru_gen_rw_fops); + debugfs_create_file_aux_num("lru_gen_full", 0444, NULL, NULL, true, + &lru_gen_ro_fops); return 0; }; @@ -6242,14 +5752,17 @@ late_initcall(init_lru_gen); static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { + BUILD_BUG(); } static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { + BUILD_BUG(); } static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) { + BUILD_BUG(); } #endif /* CONFIG_LRU_GEN */ @@ -6366,7 +5879,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. 
*/ - if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) && + if (can_age_anon_pages(lruvec, sc) && inactive_is_low(lruvec, LRU_INACTIVE_ANON)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -6375,7 +5888,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) /* Use reclaim/compaction for costly allocs or under memory pressure */ static bool in_reclaim_compaction(struct scan_control *sc) { - if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + if (gfp_compaction_allowed(sc->gfp_mask) && sc->order && (sc->order > PAGE_ALLOC_COSTLY_ORDER || sc->priority < DEF_PRIORITY - 2)) return true; @@ -6397,6 +5910,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, unsigned long pages_for_compaction; unsigned long inactive_lru_pages; int z; + struct zone *zone; /* If not in reclaim/compaction mode, stop */ if (!in_reclaim_compaction(sc)) @@ -6416,17 +5930,16 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return false; /* If compaction would go ahead or the allocation would succeed, stop */ - for (z = 0; z <= sc->reclaim_idx; z++) { - struct zone *zone = &pgdat->node_zones[z]; - if (!managed_zone(zone)) - continue; + for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) { + unsigned long watermark = min_wmark_pages(zone); /* Allocation can already succeed, nothing to do */ - if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), + if (zone_watermark_ok(zone, sc->order, watermark, sc->reclaim_idx, 0)) return false; - if (compaction_suitable(zone, sc->order, sc->reclaim_idx)) + if (compaction_suitable(zone, sc->order, watermark, + sc->reclaim_idx)) return false; } @@ -6445,9 +5958,25 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) { struct mem_cgroup *target_memcg = sc->target_mem_cgroup; + struct mem_cgroup_reclaim_cookie reclaim = { + .pgdat = pgdat, + }; + struct mem_cgroup_reclaim_cookie *partial = &reclaim; struct mem_cgroup *memcg; - memcg = mem_cgroup_iter(target_memcg, NULL, NULL); + /* + * In most cases, direct reclaimers can do partial walks + * through the cgroup tree, using an iterator state that + * persists across invocations. This strikes a balance between + * fairness and allocation latency. + * + * For kswapd, reliable forward progress is more important + * than a quick return to idle. Always do full walks. 
+ */ + if (current_is_kswapd() || sc->memcg_full_walk) + partial = NULL; + + memcg = mem_cgroup_iter(target_memcg, NULL, partial); do { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); unsigned long reclaimed; @@ -6497,7 +6026,12 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); - } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); + /* If partial walks are allowed, bail once goal is reached */ + if (partial && sc->nr_reclaimed >= sc->nr_to_reclaim) { + mem_cgroup_iter_break(target_memcg, memcg); + break; + } + } while ((memcg = mem_cgroup_iter(target_memcg, memcg, partial))); } static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) @@ -6507,6 +6041,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) bool reclaimable = false; if (lru_gen_enabled() && root_reclaim(sc)) { + memset(&sc->nr, 0, sizeof(sc->nr)); lru_gen_shrink_node(pgdat, sc); return; } @@ -6519,7 +6054,7 @@ again: nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; - prepare_scan_count(pgdat, sc); + prepare_scan_control(pgdat, sc); shrink_node_memcgs(pgdat, sc); @@ -6556,10 +6091,6 @@ again: if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags); - /* Allow kswapd to start writing pages during reclaim.*/ - if (sc->nr.unqueued_dirty == sc->nr.file_taken) - set_bit(PGDAT_DIRTY, &pgdat->flags); - /* * If kswapd scans pages marked for immediate * reclaim and under writeback (nr_immediate), it @@ -6608,7 +6139,9 @@ again: * successful direct reclaim run will revive a dormant kswapd. */ if (reclaimable) - pgdat->kswapd_failures = 0; + atomic_set(&pgdat->kswapd_failures, 0); + else if (sc->cache_trim_mode) + sc->cache_trim_mode_failed = 1; } /* @@ -6620,27 +6153,29 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; + if (!gfp_compaction_allowed(sc->gfp_mask)) + return false; + /* Allocation can already succeed, nothing to do */ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), sc->reclaim_idx, 0)) return true; - /* Compaction cannot yet proceed. Do reclaim. */ - if (!compaction_suitable(zone, sc->order, sc->reclaim_idx)) - return false; - /* - * Compaction is already possible, but it takes time to run and there - * are potentially other callers using the pages just freed. So proceed - * with reclaim to make a buffer of free pages available to give - * compaction a reasonable chance of completing and allocating the page. + * Direct reclaim usually targets the min watermark, but compaction + * takes time to run and there are potentially other callers using the + * pages just freed. So target a higher buffer to give compaction a + * reasonable chance of completing and allocating the pages. + * * Note that we won't actually reclaim the whole buffer in one attempt * as the target watermark in should_continue_reclaim() is lower. But if * we are already above the high+gap watermark, don't reclaim at all. 
*/ - watermark = high_wmark_pages(zone) + compact_gap(sc->order); + watermark = high_wmark_pages(zone); + if (compaction_suitable(zone, sc->order, watermark, sc->reclaim_idx)) + return true; - return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); + return false; } static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) @@ -6745,9 +6280,9 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * and balancing, not for a memcg's limit. */ nr_soft_scanned = 0; - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat, - sc->order, sc->gfp_mask, - &nr_soft_scanned); + nr_soft_reclaimed = memcg1_soft_limit_reclaim(zone->zone_pgdat, + sc->order, sc->gfp_mask, + &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; /* need some check for avoid more shrink_zone() */ @@ -6866,6 +6401,21 @@ retry: return 1; /* + * In most cases, direct reclaimers can do partial walks + * through the cgroup tree to meet the reclaim goal while + * keeping latency low. Since the iterator state is shared + * among all direct reclaim invocations (to retain fairness + * among cgroups), though, high concurrency can result in + * individual threads not seeing enough cgroups to make + * meaningful forward progress. Avoid false OOMs in this case. + */ + if (!sc->memcg_full_walk) { + sc->priority = initial_priority; + sc->memcg_full_walk = 1; + goto retry; + } + + /* * We make inactive:active ratio decisions based on the node's * composition of memory, but a restrictive reclaim_idx or a * memory.low cgroup setting can exempt large amounts of @@ -6901,15 +6451,11 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) int i; bool wmark_ok; - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) return true; - for (i = 0; i <= ZONE_NORMAL; i++) { - zone = &pgdat->node_zones[i]; - if (!managed_zone(zone)) - continue; - - if (!zone_reclaimable_pages(zone)) + for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) { + if (!zone_reclaimable_pages(zone) && zone_page_state_snapshot(zone, NR_FREE_PAGES)) continue; pfmemalloc_reserve += min_wmark_pages(zone); @@ -7042,7 +6588,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, * scan_control uses s8 fields for order, priority, and reclaim_idx. * Confirm they are large enough for max values. 
*/ - BUILD_BUG_ON(MAX_ORDER >= S8_MAX); + BUILD_BUG_ON(MAX_PAGE_ORDER >= S8_MAX); BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX); @@ -7110,12 +6656,14 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options) + unsigned int reclaim_options, + int *swappiness) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), + .proactive_swappiness = swappiness, .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), .reclaim_idx = MAX_NR_ZONES - 1, @@ -7145,6 +6693,15 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, return nr_reclaimed; } +#else +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + unsigned long nr_pages, + gfp_t gfp_mask, + unsigned int reclaim_options, + int *swappiness) +{ + return 0; +} #endif static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) @@ -7157,10 +6714,10 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) return; } - if (!can_age_anon_pages(pgdat, sc)) + lruvec = mem_cgroup_lruvec(NULL, pgdat); + if (!can_age_anon_pages(lruvec, sc)) return; - lruvec = mem_cgroup_lruvec(NULL, pgdat); if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) return; @@ -7211,17 +6768,48 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) * Check watermarks bottom-up as lower zones are more likely to * meet watermarks. */ - for (i = 0; i <= highest_zoneidx; i++) { - zone = pgdat->node_zones + i; - - if (!managed_zone(zone)) - continue; + for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { + enum zone_stat_item item; + unsigned long free_pages; if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) - mark = wmark_pages(zone, WMARK_PROMO); + mark = promo_wmark_pages(zone); else mark = high_wmark_pages(zone); - if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) + + /* + * In defrag_mode, watermarks must be met in whole + * blocks to avoid polluting allocator fallbacks. + * + * However, kswapd usually cannot accomplish this on + * its own and needs kcompactd support. Once it's + * reclaimed a compaction gap, and kswapd_shrink_node + * has dropped order, simply ensure there are enough + * base pages for compaction, wake kcompactd & sleep. + */ + if (defrag_mode && order) + item = NR_FREE_PAGES_BLOCKS; + else + item = NR_FREE_PAGES; + + /* + * When there is a high number of CPUs in the system, + * the cumulative error from the vmstat per-cpu cache + * can blur the line between the watermarks. In that + * case, be safe and get an accurate snapshot. + * + * TODO: NR_FREE_PAGES_BLOCKS moves in steps of + * pageblock_nr_pages, while the vmstat pcp threshold + * is limited to 125. On many configurations that + * counter won't actually be per-cpu cached. But keep + * things simple for now; revisit when somebody cares. 
+ */ + free_pages = zone_page_state(zone, item); + if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark) + free_pages = zone_page_state_snapshot(zone, item); + + if (__zone_watermark_ok(zone, order, mark, highest_zoneidx, + 0, free_pages)) return true; } @@ -7243,7 +6831,6 @@ static void clear_pgdat_congested(pg_data_t *pgdat) clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags); clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); - clear_bit(PGDAT_DIRTY, &pgdat->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } @@ -7273,7 +6860,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, wake_up_all(&pgdat->pfmemalloc_wait); /* Hopeless node, leave it to direct reclaim */ - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) return true; if (pgdat_balanced(pgdat, order, highest_zoneidx)) { @@ -7297,14 +6884,11 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, { struct zone *zone; int z; + unsigned long nr_reclaimed = sc->nr_reclaimed; /* Reclaim a number of pages proportional to the number of zones */ sc->nr_to_reclaim = 0; - for (z = 0; z <= sc->reclaim_idx; z++) { - zone = pgdat->node_zones + z; - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) { sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); } @@ -7324,7 +6908,8 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) sc->order = 0; - return sc->nr_scanned >= sc->nr_to_reclaim; + /* account for progress from mm_account_reclaimed_pages() */ + return max(sc->nr_scanned, sc->nr_reclaimed - nr_reclaimed) >= sc->nr_to_reclaim; } /* Page allocator PCP high watermark is lowered if reclaim is active. */ @@ -7334,12 +6919,7 @@ update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active) int i; struct zone *zone; - for (i = 0; i <= highest_zoneidx; i++) { - zone = pgdat->node_zones + i; - - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { if (active) set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); else @@ -7400,11 +6980,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) * stall or direct reclaim until kswapd is finished. */ nr_boost_reclaim = 0; - for (i = 0; i <= highest_zoneidx; i++) { - zone = pgdat->node_zones + i; - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { nr_boost_reclaim += zone->watermark_boost; zone_boosts[i] = zone->watermark_boost; } @@ -7418,6 +6994,7 @@ restart: bool raise_priority = true; bool balanced; bool ret; + bool was_frozen; sc.reclaim_idx = highest_zoneidx; @@ -7493,8 +7070,8 @@ restart: /* Call soft limit reclaim before calling shrink_node. 
*/ sc.nr_scanned = 0; nr_soft_scanned = 0; - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order, - sc.gfp_mask, &nr_soft_scanned); + nr_soft_reclaimed = memcg1_soft_limit_reclaim(pgdat, sc.order, + sc.gfp_mask, &nr_soft_scanned); sc.nr_reclaimed += nr_soft_reclaimed; /* @@ -7516,9 +7093,9 @@ restart: /* Check if kswapd should be suspending */ __fs_reclaim_release(_THIS_IP_); - ret = try_to_freeze(); + ret = kthread_freezable_should_stop(&was_frozen); __fs_reclaim_acquire(_THIS_IP_); - if (ret || kthread_should_stop()) + if (was_frozen || ret) break; /* @@ -7540,8 +7117,23 @@ restart: sc.priority--; } while (sc.priority >= 1); - if (!sc.nr_reclaimed) - pgdat->kswapd_failures++; + /* + * Restart only if it went through the priority loop all the way, + * but cache_trim_mode didn't work. + */ + if (!sc.nr_reclaimed && sc.priority < 1 && + !sc.no_cache_trim_mode && sc.cache_trim_mode_failed) { + sc.no_cache_trim_mode = 1; + goto restart; + } + + /* + * If the reclaim was boosted, we might still be far from the + * watermark_high at this point. We need to avoid increasing the + * failure count to prevent the kswapd thread from stopping. + */ + if (!sc.nr_reclaimed && !boosted) + atomic_inc(&pgdat->kswapd_failures); out: clear_reclaim_active(pgdat, highest_zoneidx); @@ -7700,10 +7292,6 @@ static int kswapd(void *p) unsigned int highest_zoneidx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; - const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - - if (!cpumask_empty(cpumask)) - set_cpus_allowed_ptr(tsk, cpumask); /* * Tell the memory management that we're a "memory allocator", @@ -7724,7 +7312,7 @@ static int kswapd(void *p) WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); atomic_set(&pgdat->nr_writeback_throttled, 0); for ( ; ; ) { - bool ret; + bool was_frozen; alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); highest_zoneidx = kswapd_highest_zoneidx(pgdat, @@ -7741,15 +7329,14 @@ kswapd_try_sleep: WRITE_ONCE(pgdat->kswapd_order, 0); WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); - ret = try_to_freeze(); - if (kthread_should_stop()) + if (kthread_freezable_should_stop(&was_frozen)) break; /* * We can speed up thawing tasks if we don't call balance_pgdat * after returning from the refrigerator */ - if (ret) + if (was_frozen) continue; /* @@ -7805,7 +7392,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, return; /* Hopeless node, leave it to direct reclaim if possible */ - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES || (pgdat_balanced(pgdat, order, highest_zoneidx) && !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* @@ -7873,12 +7460,15 @@ void __meminit kswapd_run(int nid) pgdat_kswapd_lock(pgdat); if (!pgdat->kswapd) { - pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + pgdat->kswapd = kthread_create_on_node(kswapd, pgdat, nid, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ + pr_err("Failed to start kswapd on node %d,ret=%ld\n", + nid, PTR_ERR(pgdat->kswapd)); BUG_ON(system_state < SYSTEM_RUNNING); - pr_err("Failed to start kswapd on node %d\n", nid); pgdat->kswapd = NULL; + } else { + wake_up_process(pgdat->kswapd); } } pgdat_kswapd_unlock(pgdat); @@ -7902,6 +7492,28 @@ void __meminit kswapd_stop(int nid) pgdat_kswapd_unlock(pgdat); } +static const struct ctl_table vmscan_sysctl_table[] = { + { + .procname = "swappiness", + .data = &vm_swappiness, + 
.maxlen = sizeof(vm_swappiness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO_HUNDRED, + }, +#ifdef CONFIG_NUMA + { + .procname = "zone_reclaim_mode", + .data = &node_reclaim_mode, + .maxlen = sizeof(node_reclaim_mode), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + } +#endif +}; + static int __init kswapd_init(void) { int nid; @@ -7909,6 +7521,7 @@ static int __init kswapd_init(void) swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); + register_sysctl_init("vm", vmscan_sysctl_table); return 0; } @@ -7973,9 +7586,11 @@ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) else nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat); - /* If we can't clean pages, remove dirty pages from consideration */ - if (!(node_reclaim_mode & RECLAIM_WRITE)) - delta += node_page_state(pgdat, NR_FILE_DIRTY); + /* + * Since we can't clean folios through reclaim, remove dirty file + * folios from consideration. + */ + delta += node_page_state(pgdat, NR_FILE_DIRTY); /* Watch for any possible underflows due to delta */ if (unlikely(delta > nr_pagecache_reclaimable)) @@ -7987,35 +7602,26 @@ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) /* * Try to free up some pages from this node through reclaim. */ -static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) +static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, + unsigned long nr_pages, + struct scan_control *sc) { - /* Minimum pages needed in order to stay on node */ - const unsigned long nr_pages = 1 << order; struct task_struct *p = current; unsigned int noreclaim_flag; - struct scan_control sc = { - .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = current_gfp_context(gfp_mask), - .order = order, - .priority = NODE_RECLAIM_PRIORITY, - .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), - .may_swap = 1, - .reclaim_idx = gfp_zone(gfp_mask), - }; unsigned long pflags; - trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, - sc.gfp_mask); + trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, sc->order, + sc->gfp_mask); cond_resched(); psi_memstall_enter(&pflags); - fs_reclaim_acquire(sc.gfp_mask); + delayacct_freepages_start(); + fs_reclaim_acquire(sc->gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP */ noreclaim_flag = memalloc_noreclaim_save(); - set_task_reclaim_state(p, &sc.reclaim_state); + set_task_reclaim_state(p, &sc->reclaim_state); if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages || node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) { @@ -8024,23 +7630,36 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in * priorities until we have enough memory freed. 
*/ do { - shrink_node(pgdat, &sc); - } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); + shrink_node(pgdat, sc); + } while (sc->nr_reclaimed < nr_pages && --sc->priority >= 0); } set_task_reclaim_state(p, NULL); memalloc_noreclaim_restore(noreclaim_flag); - fs_reclaim_release(sc.gfp_mask); + fs_reclaim_release(sc->gfp_mask); + delayacct_freepages_end(); psi_memstall_leave(&pflags); - trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); + trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed); - return sc.nr_reclaimed >= nr_pages; + return sc->nr_reclaimed; } int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { int ret; + /* Minimum pages needed in order to stay on node */ + const unsigned long nr_pages = 1 << order; + struct scan_control sc = { + .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), + .gfp_mask = current_gfp_context(gfp_mask), + .order = order, + .priority = NODE_RECLAIM_PRIORITY, + .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), + .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), + .may_swap = 1, + .reclaim_idx = gfp_zone(gfp_mask), + }; /* * Node reclaim reclaims unmapped file backed pages and @@ -8072,17 +7691,127 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) return NODE_RECLAIM_NOSCAN; - if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) + if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) return NODE_RECLAIM_NOSCAN; - ret = __node_reclaim(pgdat, gfp_mask, order); - clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags); + ret = __node_reclaim(pgdat, gfp_mask, nr_pages, &sc) >= nr_pages; + clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags); - if (!ret) + if (ret) + count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS); + else count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); return ret; } + +enum { + MEMORY_RECLAIM_SWAPPINESS = 0, + MEMORY_RECLAIM_SWAPPINESS_MAX, + MEMORY_RECLAIM_NULL, +}; +static const match_table_t tokens = { + { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"}, + { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"}, + { MEMORY_RECLAIM_NULL, NULL }, +}; + +int user_proactive_reclaim(char *buf, + struct mem_cgroup *memcg, pg_data_t *pgdat) +{ + unsigned int nr_retries = MAX_RECLAIM_RETRIES; + unsigned long nr_to_reclaim, nr_reclaimed = 0; + int swappiness = -1; + char *old_buf, *start; + substring_t args[MAX_OPT_ARGS]; + gfp_t gfp_mask = GFP_KERNEL; + + if (!buf || (!memcg && !pgdat) || (memcg && pgdat)) + return -EINVAL; + + buf = strstrip(buf); + + old_buf = buf; + nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; + if (buf == old_buf) + return -EINVAL; + + buf = strstrip(buf); + + while ((start = strsep(&buf, " ")) != NULL) { + if (!strlen(start)) + continue; + switch (match_token(start, tokens, args)) { + case MEMORY_RECLAIM_SWAPPINESS: + if (match_int(&args[0], &swappiness)) + return -EINVAL; + if (swappiness < MIN_SWAPPINESS || + swappiness > MAX_SWAPPINESS) + return -EINVAL; + break; + case MEMORY_RECLAIM_SWAPPINESS_MAX: + swappiness = SWAPPINESS_ANON_ONLY; + break; + default: + return -EINVAL; + } + } + + while (nr_reclaimed < nr_to_reclaim) { + /* Will converge on zero, but reclaim enforces a minimum */ + unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; + unsigned long reclaimed; + + if (signal_pending(current)) + return -EINTR; + + /* + * This is the final attempt, drain percpu lru caches in the + * hope of introducing more evictable pages. 
+		 */
+		if (!nr_retries)
+			lru_add_drain_all();
+
+		if (memcg) {
+			unsigned int reclaim_options;
+
+			reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
+					  MEMCG_RECLAIM_PROACTIVE;
+			reclaimed = try_to_free_mem_cgroup_pages(memcg,
+						batch_size, gfp_mask,
+						reclaim_options,
+						swappiness == -1 ? NULL : &swappiness);
+		} else {
+			struct scan_control sc = {
+				.gfp_mask = current_gfp_context(gfp_mask),
+				.reclaim_idx = gfp_zone(gfp_mask),
+				.proactive_swappiness = swappiness == -1 ? NULL : &swappiness,
+				.priority = DEF_PRIORITY,
+				.may_writepage = !laptop_mode,
+				.nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX),
+				.may_unmap = 1,
+				.may_swap = 1,
+				.proactive = 1,
+			};
+
+			if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED,
+						  &pgdat->flags))
+				return -EBUSY;
+
+			reclaimed = __node_reclaim(pgdat, gfp_mask,
+						   batch_size, &sc);
+			clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+		}
+
+		if (!reclaimed && !nr_retries--)
+			return -EAGAIN;
+
+		nr_reclaimed += reclaimed;
+	}
+
+	return 0;
+}
+
 #endif
 
 /**
@@ -8130,3 +7859,26 @@ void check_move_unevictable_folios(struct folio_batch *fbatch)
 		}
 	}
 }
 EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+static ssize_t reclaim_store(struct device *dev,
+			     struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	int ret, nid = dev->id;
+
+	ret = user_proactive_reclaim((char *)buf, NULL, NODE_DATA(nid));
+	return ret ? -EAGAIN : count;
+}
+
+static DEVICE_ATTR_WO(reclaim);
+int reclaim_register_node(struct node *node)
+{
+	return device_create_file(&node->dev, &dev_attr_reclaim);
+}
+
+void reclaim_unregister_node(struct node *node)
+{
+	return device_remove_file(&node->dev, &dev_attr_reclaim);
+}
+#endif
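
The retry loop in user_proactive_reclaim() converges on the request in shrinking batches: each pass asks for a quarter of what is still outstanding, the reclaim path itself enforces a SWAP_CLUSTER_MAX floor via max(), only passes that make no progress consume the MAX_RECLAIM_RETRIES budget, and the per-CPU LRU caches are drained before the final attempt. The userspace toy below is not kernel code; it is a minimal sketch of that arithmetic only, and fake_reclaim() plus the example goal are illustrative assumptions.

/* Toy model of the batching in user_proactive_reclaim(); not kernel code. */
#include <stdio.h>

#define SWAP_CLUSTER_MAX	32UL
#define MAX_RECLAIM_RETRIES	16

/* Stand-in for the real reclaim call: pretend each pass frees the full batch. */
static unsigned long fake_reclaim(unsigned long batch)
{
	return batch;
}

int main(void)
{
	unsigned long nr_to_reclaim = 131072;	/* e.g. 512MiB in 4KiB pages */
	unsigned long nr_reclaimed = 0;
	int nr_retries = MAX_RECLAIM_RETRIES;

	while (nr_reclaimed < nr_to_reclaim) {
		/* a quarter of the remaining goal, converging toward zero */
		unsigned long batch = (nr_to_reclaim - nr_reclaimed) / 4;
		unsigned long got;

		if (batch < SWAP_CLUSTER_MAX)
			batch = SWAP_CLUSTER_MAX;	/* kernel clamps via max() in scan_control */

		got = fake_reclaim(batch);
		if (!got && !nr_retries--)
			return 1;			/* the kernel would return -EAGAIN here */

		nr_reclaimed += got;
		printf("batch=%lu total=%lu\n", batch, nr_reclaimed);
	}
	return 0;
}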
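
reclaim_register_node() exposes a write-only "reclaim" attribute on each NUMA node device, backed by user_proactive_reclaim(). The snippet below is a userspace usage sketch, not part of the patch: the /sys/devices/system/node/nodeN path follows the usual node-device layout, and the "<size> swappiness=<0-200|max>" request format is inferred from the memparse() and match_token() handling above, so treat both as assumptions.

/* Userspace sketch: ask node 0 to proactively reclaim 512MiB of anon memory. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/devices/system/node/node0/reclaim";
	const char *req = "512M swappiness=max";	/* "max" maps to SWAPPINESS_ANON_ONLY */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* reclaim_store() returns -EAGAIN once the retry budget is exhausted,
	 * which shows up here as write() failing with errno == EAGAIN. */
	if (write(fd, req, strlen(req)) < 0)
		perror("write");
	close(fd);
	return 0;
}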
