Diffstat (limited to 'mm/workingset.c')
-rw-r--r--	mm/workingset.c | 406
1 file changed, 297 insertions(+), 109 deletions(-)
diff --git a/mm/workingset.c b/mm/workingset.c
index 8c03afe1d67c..e9f05634747a 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -16,6 +16,7 @@
 #include <linux/dax.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include "internal.h"
 
 /*
  * Double CLOCK lists
@@ -111,9 +112,20 @@
  *
  *		NR_inactive + (R - E) <= NR_inactive + NR_active
  *
- * which can be further simplified to
+ * If we have swap we should consider about NR_inactive_anon and
+ * NR_active_anon, so for page cache and anonymous respectively:
  *
- *		(R - E) <= NR_active
+ *	NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
+ * + NR_inactive_anon + NR_active_anon
+ *
+ *	NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
+ * + NR_inactive_file + NR_active_file
+ *
+ * Which can be further simplified to:
+ *
+ *	(R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
+ *
+ *	(R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
  *
  * Put into words, the refault distance (out-of-cache) can be seen as
  * a deficit in inactive list space (in-cache). If the inactive list
@@ -130,14 +142,14 @@
  * are no longer in active use.
  *
  * So when a refault distance of (R - E) is observed and there are at
- * least (R - E) active pages, the refaulting page is activated
- * optimistically in the hope that (R - E) active pages are actually
+ * least (R - E) pages in the userspace workingset, the refaulting page
+ * is activated optimistically in the hope that (R - E) pages are actually
  * used less frequently than the refaulting page - or even not used at
  * all anymore.
  *
  * That means if inactive cache is refaulting with a suitable refault
  * distance, we assume the cache workingset is transitioning and put
- * pressure on the current active list.
+ * pressure on the current workingset.
  *
  * If this is wrong and demotion kicks in, the pages which are truly
  * used more frequently will be reactivated while the less frequently
@@ -187,7 +199,6 @@ static unsigned int bucket_order __read_mostly;
 static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
 			 bool workingset)
 {
-	eviction >>= bucket_order;
 	eviction &= EVICTION_MASK;
 	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
 	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
@@ -212,10 +223,125 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
 	*memcgidp = memcgid;
 	*pgdat = NODE_DATA(nid);
-	*evictionp = entry << bucket_order;
+	*evictionp = entry;
 	*workingsetp = workingset;
 }
 
+#ifdef CONFIG_LRU_GEN
+
+static void *lru_gen_eviction(struct folio *folio)
+{
+	int hist;
+	unsigned long token;
+	unsigned long min_seq;
+	struct lruvec *lruvec;
+	struct lru_gen_folio *lrugen;
+	int type = folio_is_file_lru(folio);
+	int delta = folio_nr_pages(folio);
+	int refs = folio_lru_refs(folio);
+	bool workingset = folio_test_workingset(folio);
+	int tier = lru_tier_from_refs(refs, workingset);
+	struct mem_cgroup *memcg = folio_memcg(folio);
+	struct pglist_data *pgdat = folio_pgdat(folio);
+
+	BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
+
+	lruvec = mem_cgroup_lruvec(memcg, pgdat);
+	lrugen = &lruvec->lrugen;
+	min_seq = READ_ONCE(lrugen->min_seq[type]);
+	token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
+
+	hist = lru_hist_from_seq(min_seq);
+	atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
+
+	return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
+}
+
+/*
+ * Tests if the shadow entry is for a folio that was recently evicted.
+ * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
+ */
+static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
+				unsigned long *token, bool *workingset)
+{
+	int memcg_id;
+	unsigned long max_seq;
+	struct mem_cgroup *memcg;
+	struct pglist_data *pgdat;
+
+	unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
+
+	memcg = mem_cgroup_from_id(memcg_id);
+	*lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+	max_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
+	max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH;
+
+	return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
+}
+
+static void lru_gen_refault(struct folio *folio, void *shadow)
+{
+	bool recent;
+	int hist, tier, refs;
+	bool workingset;
+	unsigned long token;
+	struct lruvec *lruvec;
+	struct lru_gen_folio *lrugen;
+	int type = folio_is_file_lru(folio);
+	int delta = folio_nr_pages(folio);
+
+	rcu_read_lock();
+
+	recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset);
+	if (lruvec != folio_lruvec(folio))
+		goto unlock;
+
+	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
+
+	if (!recent)
+		goto unlock;
+
+	lrugen = &lruvec->lrugen;
+
+	hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
+	refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1;
+	tier = lru_tier_from_refs(refs, workingset);
+
+	atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
+
+	/* see folio_add_lru() where folio_set_active() will be called */
+	if (lru_gen_in_fault())
+		mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
+
+	if (workingset) {
+		folio_set_workingset(folio);
+		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
+	} else
+		set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
+unlock:
+	rcu_read_unlock();
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static void *lru_gen_eviction(struct folio *folio)
+{
+	return NULL;
+}
+
+static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
+				unsigned long *token, bool *workingset)
+{
+	return false;
+}
+
+static void lru_gen_refault(struct folio *folio, void *shadow)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
 /**
  * workingset_age_nonresident - age non-resident entries as LRU ages
  * @lruvec: the lruvec that was aged
@@ -245,61 +371,75 @@ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
 }
 
 /**
- * workingset_eviction - note the eviction of a page from memory
+ * workingset_eviction - note the eviction of a folio from memory
  * @target_memcg: the cgroup that is causing the reclaim
- * @page: the page being evicted
+ * @folio: the folio being evicted
  *
- * Return: a shadow entry to be stored in @page->mapping->i_pages in place
- * of the evicted @page so that a later refault can be detected.
+ * Return: a shadow entry to be stored in @folio->mapping->i_pages in place
+ * of the evicted @folio so that a later refault can be detected.
  */
-void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
+void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
 {
-	struct pglist_data *pgdat = page_pgdat(page);
+	struct pglist_data *pgdat = folio_pgdat(folio);
 	unsigned long eviction;
 	struct lruvec *lruvec;
 	int memcgid;
 
-	/* Page is fully exclusive and pins page's memory cgroup pointer */
-	VM_BUG_ON_PAGE(PageLRU(page), page);
-	VM_BUG_ON_PAGE(page_count(page), page);
-	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	/* Folio is fully exclusive and pins folio's memory cgroup pointer */
+	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
+	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+	if (lru_gen_enabled())
+		return lru_gen_eviction(folio);
 
 	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
 	/* XXX: target_memcg can be NULL, go through lruvec */
 	memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
 	eviction = atomic_long_read(&lruvec->nonresident_age);
-	workingset_age_nonresident(lruvec, thp_nr_pages(page));
-	return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
+	eviction >>= bucket_order;
+	workingset_age_nonresident(lruvec, folio_nr_pages(folio));
+	return pack_shadow(memcgid, pgdat, eviction,
+				folio_test_workingset(folio));
 }
 
 /**
- * workingset_refault - Evaluate the refault of a previously evicted folio.
- * @folio: The freshly allocated replacement folio.
- * @shadow: Shadow entry of the evicted folio.
- *
- * Calculates and evaluates the refault distance of the previously
- * evicted folio in the context of the node and the memcg whose memory
- * pressure caused the eviction.
+ * workingset_test_recent - tests if the shadow entry is for a folio that was
+ * recently evicted. Also fills in @workingset with the value unpacked from
+ * shadow.
+ * @shadow: the shadow entry to be tested.
+ * @file: whether the corresponding folio is from the file lru.
+ * @workingset: where the workingset value unpacked from shadow should
+ * be stored.
+ * @flush: whether to flush cgroup rstat.
+ *
+ * Return: true if the shadow is for a recently evicted folio; false otherwise.
  */
-void workingset_refault(struct folio *folio, void *shadow)
+bool workingset_test_recent(void *shadow, bool file, bool *workingset,
+				bool flush)
 {
-	bool file = folio_is_file_lru(folio);
 	struct mem_cgroup *eviction_memcg;
 	struct lruvec *eviction_lruvec;
 	unsigned long refault_distance;
 	unsigned long workingset_size;
-	struct pglist_data *pgdat;
-	struct mem_cgroup *memcg;
-	unsigned long eviction;
-	struct lruvec *lruvec;
 	unsigned long refault;
-	bool workingset;
 	int memcgid;
-	long nr;
+	struct pglist_data *pgdat;
+	unsigned long eviction;
 
-	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+	if (lru_gen_enabled()) {
+		bool recent;
+
+		rcu_read_lock();
+		recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset);
+		rcu_read_unlock();
+		return recent;
+	}
 
 	rcu_read_lock();
+	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
+	eviction <<= bucket_order;
+
 	/*
 	 * Look up the memcg associated with the stored ID. It might
 	 * have been deleted since the folio's eviction.
@@ -317,8 +457,25 @@ void workingset_refault(struct folio *folio, void *shadow)
 	 * configurations instead.
 	 */
 	eviction_memcg = mem_cgroup_from_id(memcgid);
+	if (!mem_cgroup_tryget(eviction_memcg))
+		eviction_memcg = NULL;
+	rcu_read_unlock();
+
 	if (!mem_cgroup_disabled() && !eviction_memcg)
-		goto out;
+		return false;
+
+	/*
+	 * Flush stats (and potentially sleep) outside the RCU read section.
+	 *
+	 * Note that workingset_test_recent() itself might be called in RCU read
+	 * section (for e.g, in cachestat) - these callers need to skip flushing
+	 * stats (via the flush argument).
+	 *
+	 * XXX: With per-memcg flushing and thresholding, is ratelimiting
+	 * still needed here?
+	 */
+	if (flush)
+		mem_cgroup_flush_stats_ratelimited(eviction_memcg);
+
 	eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
 	refault = atomic_long_read(&eviction_lruvec->nonresident_age);
@@ -341,33 +498,18 @@
 	refault_distance = (refault - eviction) & EVICTION_MASK;
 
 	/*
-	 * The activation decision for this folio is made at the level
-	 * where the eviction occurred, as that is where the LRU order
-	 * during folio reclaim is being determined.
-	 *
-	 * However, the cgroup that will own the folio is the one that
-	 * is actually experiencing the refault event.
-	 */
-	nr = folio_nr_pages(folio);
-	memcg = folio_memcg(folio);
-	lruvec = mem_cgroup_lruvec(memcg, pgdat);
-
-	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
-
-	mem_cgroup_flush_stats();
-
-	/*
 	 * Compare the distance to the existing workingset size. We
 	 * don't activate pages that couldn't stay resident even if
 	 * all the memory was available to the workingset. Whether
 	 * workingset competition needs to consider anon or not depends
-	 * on having swap.
+	 * on having free swap space.
	 */
 	workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
 	if (!file) {
 		workingset_size += lruvec_page_state(eviction_lruvec,
 						     NR_INACTIVE_FILE);
 	}
-	if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
+	if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
 		workingset_size += lruvec_page_state(eviction_lruvec,
 						     NR_ACTIVE_ANON);
 		if (file) {
@@ -375,8 +517,54 @@
 			workingset_size += lruvec_page_state(eviction_lruvec,
 						     NR_INACTIVE_ANON);
 		}
 	}
-	if (refault_distance > workingset_size)
-		goto out;
+
+	mem_cgroup_put(eviction_memcg);
+	return refault_distance <= workingset_size;
+}
+
+/**
+ * workingset_refault - Evaluate the refault of a previously evicted folio.
+ * @folio: The freshly allocated replacement folio.
+ * @shadow: Shadow entry of the evicted folio.
+ *
+ * Calculates and evaluates the refault distance of the previously
+ * evicted folio in the context of the node and the memcg whose memory
+ * pressure caused the eviction.
+ */
+void workingset_refault(struct folio *folio, void *shadow)
+{
+	bool file = folio_is_file_lru(folio);
+	struct pglist_data *pgdat;
+	struct mem_cgroup *memcg;
+	struct lruvec *lruvec;
+	bool workingset;
+	long nr;
+
+	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+	if (lru_gen_enabled()) {
+		lru_gen_refault(folio, shadow);
+		return;
+	}
+
+	/*
+	 * The activation decision for this folio is made at the level
+	 * where the eviction occurred, as that is where the LRU order
+	 * during folio reclaim is being determined.
+	 *
+	 * However, the cgroup that will own the folio is the one that
+	 * is actually experiencing the refault event. Make sure the folio is
+	 * locked to guarantee folio_memcg() stability throughout.
+	 */
+	nr = folio_nr_pages(folio);
+	memcg = folio_memcg(folio);
+	pgdat = folio_pgdat(folio);
+	lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
+
+	if (!workingset_test_recent(shadow, file, &workingset, true))
+		return;
 
 	folio_set_active(folio);
 	workingset_age_nonresident(lruvec, nr);
@@ -385,12 +573,13 @@
 	/* Folio was active prior to eviction */
 	if (workingset) {
 		folio_set_workingset(folio);
-		/* XXX: Move to lru_cache_add() when it supports new vs putback */
-		lru_note_cost_folio(folio);
+		/*
+		 * XXX: Move to folio_add_lru() when it supports new vs
+		 * putback
+		 */
+		lru_note_cost_refault(folio);
 		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
 	}
-out:
-	rcu_read_unlock();
 }
 
 /**
@@ -399,22 +588,12 @@ out:
  */
 void workingset_activation(struct folio *folio)
 {
-	struct mem_cgroup *memcg;
-
-	rcu_read_lock();
 	/*
 	 * Filter non-memcg pages here, e.g. unmap can call
 	 * mark_page_accessed() on VDSO pages.
-	 *
-	 * XXX: See workingset_refault() - this should return
-	 * root_mem_cgroup even for !CONFIG_MEMCG.
 	 */
-	memcg = folio_memcg_rcu(folio);
-	if (!mem_cgroup_disabled() && !memcg)
-		goto out;
-	workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
-out:
-	rcu_read_unlock();
+	if (mem_cgroup_disabled() || folio_memcg_charged(folio))
+		workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
 }
 
 /*
@@ -429,10 +608,12 @@ out:
  * point where they would still be useful.
  */
 
-static struct list_lru shadow_nodes;
+struct list_lru shadow_nodes;
 
 void workingset_update_node(struct xa_node *node)
 {
+	struct page *page = virt_to_page(node);
+
 	/*
 	 * Track non-empty nodes that contain only shadow entries;
 	 * unlink those that contain pages or are being freed.
 	 *
 	 * Avoid acquiring the list_lru lock when the nodes are
 	 * already where they should be. The list_empty() test is safe
 	 * as node->private_list is protected by the i_pages lock.
 	 */
-	VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */
+	lockdep_assert_held(&node->array->xa_lock);
 
 	if (node->count && node->count == node->nr_values) {
 		if (list_empty(&node->private_list)) {
-			list_lru_add(&shadow_nodes, &node->private_list);
-			__inc_lruvec_kmem_state(node, WORKINGSET_NODES);
+			list_lru_add_obj(&shadow_nodes, &node->private_list);
+			__inc_node_page_state(page, WORKINGSET_NODES);
 		}
 	} else {
 		if (!list_empty(&node->private_list)) {
-			list_lru_del(&shadow_nodes, &node->private_list);
-			__dec_lruvec_kmem_state(node, WORKINGSET_NODES);
+			list_lru_del_obj(&shadow_nodes, &node->private_list);
+			__dec_node_page_state(page, WORKINGSET_NODES);
 		}
 	}
 }
@@ -494,6 +675,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
 	struct lruvec *lruvec;
 	int i;
 
+	mem_cgroup_flush_stats_ratelimited(sc->memcg);
 	lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
 	for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
 		pages += lruvec_page_state_local(lruvec,
@@ -515,8 +697,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
 
 static enum lru_status shadow_lru_isolate(struct list_head *item,
 					  struct list_lru_one *lru,
-					  spinlock_t *lru_lock,
-					  void *arg) __must_hold(lru_lock)
+					  void *arg) __must_hold(lru->lock)
 {
 	struct xa_node *node = container_of(item, struct xa_node, private_list);
 	struct address_space *mapping;
@@ -525,35 +706,38 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 	/*
 	 * Page cache insertions and deletions synchronously maintain
 	 * the shadow node LRU under the i_pages lock and the
-	 * lru_lock. Because the page cache tree is emptied before
-	 * the inode can be destroyed, holding the lru_lock pins any
+	 * &lru->lock. Because the page cache tree is emptied before
+	 * the inode can be destroyed, holding the &lru->lock pins any
	 * address_space that has nodes on the LRU.
 	 *
 	 * We can then safely transition to the i_pages lock to
 	 * pin only the address_space of the particular node we want
-	 * to reclaim, take the node off-LRU, and drop the lru_lock.
+	 * to reclaim, take the node off-LRU, and drop the &lru->lock.
 	 */
 	mapping = container_of(node->array, struct address_space, i_pages);
 
 	/* Coming from the list, invert the lock order */
 	if (!xa_trylock(&mapping->i_pages)) {
-		spin_unlock_irq(lru_lock);
+		spin_unlock_irq(&lru->lock);
 		ret = LRU_RETRY;
 		goto out;
 	}
 
-	if (!spin_trylock(&mapping->host->i_lock)) {
-		xa_unlock(&mapping->i_pages);
-		spin_unlock_irq(lru_lock);
-		ret = LRU_RETRY;
-		goto out;
+	/* For page cache we need to hold i_lock */
+	if (mapping->host != NULL) {
+		if (!spin_trylock(&mapping->host->i_lock)) {
+			xa_unlock(&mapping->i_pages);
+			spin_unlock_irq(&lru->lock);
+			ret = LRU_RETRY;
+			goto out;
+		}
 	}
 
 	list_lru_isolate(lru, item);
-	__dec_lruvec_kmem_state(node, WORKINGSET_NODES);
+	__dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);
 
-	spin_unlock(lru_lock);
+	spin_unlock(&lru->lock);
 
 	/*
 	 * The nodes should only contain one or more shadow entries,
@@ -565,17 +749,18 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 	if (WARN_ON_ONCE(node->count != node->nr_values))
 		goto out_invalid;
 	xa_delete_node(node, workingset_update_node);
-	__inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
+	mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1);
 
 out_invalid:
 	xa_unlock_irq(&mapping->i_pages);
-	if (mapping_shrinkable(mapping))
-		inode_add_lru(mapping->host);
-	spin_unlock(&mapping->host->i_lock);
+	if (mapping->host != NULL) {
+		if (mapping_shrinkable(mapping))
+			inode_lru_list_add(mapping->host);
+		spin_unlock(&mapping->host->i_lock);
+	}
 	ret = LRU_REMOVED_RETRY;
 out:
 	cond_resched();
-	spin_lock_irq(lru_lock);
 	return ret;
 }
 
@@ -587,13 +772,6 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
 			NULL);
 }
 
-static struct shrinker workingset_shadow_shrinker = {
-	.count_objects = count_shadow_nodes,
-	.scan_objects = scan_shadow_nodes,
-	.seeks = 0,	/* ->count reports only fully expendable nodes */
-	.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
-};
-
 /*
  * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
  * i_pages lock.
  */
@@ -602,9 +780,10 @@ static struct lock_class_key shadow_nodes_key;
 
 static int __init workingset_init(void)
 {
+	struct shrinker *workingset_shadow_shrinker;
 	unsigned int timestamp_bits;
 	unsigned int max_order;
-	int ret;
+	int ret = -ENOMEM;
 
 	BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
 	/*
@@ -621,17 +800,26 @@ static int __init workingset_init(void)
 	pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
 	       timestamp_bits, max_order, bucket_order);
 
-	ret = prealloc_shrinker(&workingset_shadow_shrinker);
-	if (ret)
+	workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
+						    SHRINKER_MEMCG_AWARE,
+						    "mm-shadow");
+	if (!workingset_shadow_shrinker)
 		goto err;
-	ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
-			      &workingset_shadow_shrinker);
+
+	ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker,
+				      &shadow_nodes_key);
 	if (ret)
 		goto err_list_lru;
-	register_shrinker_prepared(&workingset_shadow_shrinker);
+
+	workingset_shadow_shrinker->count_objects = count_shadow_nodes;
+	workingset_shadow_shrinker->scan_objects = scan_shadow_nodes;
+	/* ->count reports only fully expendable nodes */
+	workingset_shadow_shrinker->seeks = 0;
+
+	shrinker_register(workingset_shadow_shrinker);
 	return 0;
 err_list_lru:
-	free_prealloced_shrinker(&workingset_shadow_shrinker);
+	shrinker_free(workingset_shadow_shrinker);
 err:
 	return ret;
 }
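The shadow entry packs an eviction timestamp, memcg ID, node ID and a workingset bit into one unsigned long; this diff only moves the bucket_order shift out of pack_shadow()/unpack_shadow() and into workingset_eviction()/workingset_test_recent(), so the classic and MGLRU paths can store different payloads. A minimal userspace sketch of the round trip, with illustrative field widths (the kernel derives the real ones from MEM_CGROUP_ID_SHIFT, NODES_SHIFT and its xarray entry tagging; the constants below are assumptions):

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative widths; the kernel computes these from its configuration. */
#define MEMCG_ID_BITS	16
#define NODE_BITS	6
#define WS_BITS		1
#define TAG_BITS	2	/* low bits marking the word as a non-pointer entry */

static unsigned long pack_shadow(int memcgid, int nid,
				 unsigned long eviction, bool workingset)
{
	eviction = (eviction << MEMCG_ID_BITS) | memcgid;
	eviction = (eviction << NODE_BITS) | nid;
	eviction = (eviction << WS_BITS) | workingset;
	return (eviction << TAG_BITS) | 1;	/* tag as a shadow entry */
}

static void unpack_shadow(unsigned long entry, int *memcgid, int *nid,
			  unsigned long *eviction, bool *workingset)
{
	entry >>= TAG_BITS;
	*workingset = entry & ((1UL << WS_BITS) - 1);
	entry >>= WS_BITS;
	*nid = entry & ((1UL << NODE_BITS) - 1);
	entry >>= NODE_BITS;
	*memcgid = entry & ((1UL << MEMCG_ID_BITS) - 1);
	*eviction = entry >> MEMCG_ID_BITS;
}

int main(void)
{
	int memcgid, nid;
	unsigned long eviction;
	bool ws;
	unsigned long shadow = pack_shadow(42, 3, 123456, true);

	unpack_shadow(shadow, &memcgid, &nid, &eviction, &ws);
	assert(memcgid == 42 && nid == 3 && eviction == 123456 && ws);
	printf("shadow=%#lx round-trips\n", shadow);
	return 0;
}

Whatever bits remain after the IDs and tags bound how far back an eviction can be dated, which is why bucket_order coarsens the timestamp rather than widening the field.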
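workingset_test_recent() measures the refault distance as (refault - eviction) & EVICTION_MASK: unsigned subtraction modulo the timestamp width stays correct even after the nonresident_age counter wraps. A standalone sketch, assuming a 40-bit timestamp (the real width depends on BITS_PER_LONG and the shadow layout):

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define EVICTION_BITS	40	/* assumed; the kernel computes this at init */
#define EVICTION_MASK	((1UL << EVICTION_BITS) - 1)

/*
 * eviction is the LRU's nonresident_age when the page left memory;
 * refault is its value when the page comes back. The masked difference
 * is the number of evictions in between, regardless of wrap-around.
 */
static bool refault_is_recent(unsigned long eviction, unsigned long refault,
			      unsigned long workingset_size)
{
	unsigned long distance = (refault - eviction) & EVICTION_MASK;

	return distance <= workingset_size;
}

int main(void)
{
	/* eviction just below the wrap point, refault just after it */
	unsigned long eviction = EVICTION_MASK - 10;
	unsigned long refault = 20;	/* the counter wrapped in between */

	assert(((refault - eviction) & EVICTION_MASK) == 31);
	printf("recent: %d\n", refault_is_recent(eviction, refault, 100));
	return 0;
}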
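The activation test then compares that distance against the lists a refaulting page can plausibly displace: always the active file list, plus the inactive file list for anon refaults, plus both anon lists when swap is available, matching the inequalities in the comment block of the diff. A sketch of the same composition (the struct and values are made up for illustration):

#include <stdbool.h>
#include <stdio.h>

struct lruvec_counts {
	unsigned long active_file, inactive_file;
	unsigned long active_anon, inactive_anon;
};

/*
 * Mirror of the workingset_size computation: a refaulting page competes
 * with everything it could displace, which excludes its own inactive
 * list, since pages there are already on their way out.
 */
static unsigned long workingset_size(const struct lruvec_counts *c,
				     bool file, bool have_swap)
{
	unsigned long size = c->active_file;

	if (!file)
		size += c->inactive_file;
	if (have_swap) {
		size += c->active_anon;
		if (file)
			size += c->inactive_anon;
	}
	return size;
}

int main(void)
{
	struct lruvec_counts c = { 1000, 500, 800, 400 };

	printf("file refault, swap:    %lu\n", workingset_size(&c, true, true));   /* 2200 */
	printf("anon refault, no swap: %lu\n", workingset_size(&c, false, false)); /* 1500 */
	return 0;
}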
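Under CONFIG_LRU_GEN, lru_gen_eviction() stores a generation token instead of an LRU age: min_seq shifted up by LRU_REFS_WIDTH with the folio's reference count folded into the low bits, and lru_gen_test_recent() deems the entry recent if that sequence is within MAX_NR_GENS of the current max_seq. A sketch with assumed constant values (the kernel's widths are config dependent, and the real test also truncates max_seq by EVICTION_MASK):

#include <stdbool.h>
#include <stdio.h>

#define LRU_REFS_WIDTH	2	/* assumed */
#define MAX_NR_GENS	4

#define abs_diff(a, b)	((a) > (b) ? (a) - (b) : (b) - (a))
#define max(a, b)	((a) > (b) ? (a) : (b))

/* pack min_seq and (refs - 1) into the token, as lru_gen_eviction() does */
static unsigned long make_token(unsigned long min_seq, int refs)
{
	return (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
}

/* the recency test from lru_gen_test_recent() */
static bool token_is_recent(unsigned long token, unsigned long max_seq)
{
	return abs_diff(max_seq, token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
}

int main(void)
{
	unsigned long token = make_token(7, 2);	/* evicted at generation 7, 2 refs */

	printf("recent at max_seq=9:  %d\n", token_is_recent(token, 9));	/* 1 */
	printf("recent at max_seq=12: %d\n", token_is_recent(token, 12));	/* 0 */
	return 0;
}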
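Finally, shadow_lru_isolate() is entered with lru->lock held, but the established order takes the i_pages lock first; it therefore only trylocks i_pages and, on failure, drops its own lock and returns LRU_RETRY (after this diff it also no longer re-takes the list lock before returning). A pthread sketch of that trylock-and-back-off shape, where the mutexes and names merely stand in for the kernel's spinlocks (build with -pthread):

#include <pthread.h>
#include <stdio.h>

enum lru_status { LRU_RETRY, LRU_REMOVED_RETRY };

static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;	/* stands in for lru->lock */
static pthread_mutex_t i_pages  = PTHREAD_MUTEX_INITIALIZER;	/* stands in for the xarray lock */

/* called with lru_lock held, as the list_lru walker calls its isolate callback */
static enum lru_status isolate_one(void)
{
	/* coming from the list, invert the lock order: trylock only */
	if (pthread_mutex_trylock(&i_pages)) {
		pthread_mutex_unlock(&lru_lock);	/* back off instead of deadlocking */
		return LRU_RETRY;
	}

	/* both locks held without violating the i_pages -> lru_lock order */
	pthread_mutex_unlock(&lru_lock);		/* drop early, as the kernel does */
	/* ... delete the shadow node under i_pages here ... */
	pthread_mutex_unlock(&i_pages);
	return LRU_REMOVED_RETRY;
}

int main(void)
{
	pthread_mutex_lock(&lru_lock);
	printf("status=%d\n", isolate_one());	/* 1 == LRU_REMOVED_RETRY */
	return 0;
}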
