Diffstat (limited to 'mm/workingset.c')
| -rw-r--r-- | mm/workingset.c | 335 |
1 file changed, 205 insertions, 130 deletions
diff --git a/mm/workingset.c b/mm/workingset.c
index 1a86645b7b3c..e9f05634747a 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -16,6 +16,7 @@
 #include <linux/dax.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include "internal.h"
 
 /*
  * Double CLOCK lists
@@ -111,9 +112,20 @@
  *
  *		NR_inactive + (R - E) <= NR_inactive + NR_active
  *
- * which can be further simplified to
+ * If we have swap we should consider about NR_inactive_anon and
+ * NR_active_anon, so for page cache and anonymous respectively:
  *
- *		(R - E) <= NR_active
+ *		NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
+ *			+ NR_inactive_anon + NR_active_anon
+ *
+ *		NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
+ *			+ NR_inactive_file + NR_active_file
+ *
+ * Which can be further simplified to:
+ *
+ *		(R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
+ *
+ *		(R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
  *
  * Put into words, the refault distance (out-of-cache) can be seen as
  * a deficit in inactive list space (in-cache). If the inactive list
@@ -130,14 +142,14 @@
  * are no longer in active use.
  *
  * So when a refault distance of (R - E) is observed and there are at
- * least (R - E) active pages, the refaulting page is activated
- * optimistically in the hope that (R - E) active pages are actually
+ * least (R - E) pages in the userspace workingset, the refaulting page
+ * is activated optimistically in the hope that (R - E) pages are actually
  * used less frequently than the refaulting page - or even not used at
  * all anymore.
  *
  * That means if inactive cache is refaulting with a suitable refault
  * distance, we assume the cache workingset is transitioning and put
- * pressure on the current active list.
+ * pressure on the current workingset.
  *
  * If this is wrong and demotion kicks in, the pages which are truly
  * used more frequently will be reactivated while the less frequently
@@ -223,11 +235,12 @@ static void *lru_gen_eviction(struct folio *folio)
 	unsigned long token;
 	unsigned long min_seq;
 	struct lruvec *lruvec;
-	struct lru_gen_struct *lrugen;
+	struct lru_gen_folio *lrugen;
 	int type = folio_is_file_lru(folio);
 	int delta = folio_nr_pages(folio);
 	int refs = folio_lru_refs(folio);
-	int tier = lru_tier_from_refs(refs);
+	bool workingset = folio_test_workingset(folio);
+	int tier = lru_tier_from_refs(refs, workingset);
 	struct mem_cgroup *memcg = folio_memcg(folio);
 	struct pglist_data *pgdat = folio_pgdat(folio);
 
@@ -241,60 +254,71 @@ static void *lru_gen_eviction(struct folio *folio)
 	hist = lru_hist_from_seq(min_seq);
 	atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
 
-	return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
+	return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
+}
+
+/*
+ * Tests if the shadow entry is for a folio that was recently evicted.
+ * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
+ */
+static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
+				unsigned long *token, bool *workingset)
+{
+	int memcg_id;
+	unsigned long max_seq;
+	struct mem_cgroup *memcg;
+	struct pglist_data *pgdat;
+
+	unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
+
+	memcg = mem_cgroup_from_id(memcg_id);
+	*lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+	max_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
+	max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH;
+
+	return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
 }
 
 static void lru_gen_refault(struct folio *folio, void *shadow)
 {
+	bool recent;
 	int hist, tier, refs;
-	int memcg_id;
 	bool workingset;
 	unsigned long token;
-	unsigned long min_seq;
 	struct lruvec *lruvec;
-	struct lru_gen_struct *lrugen;
-	struct mem_cgroup *memcg;
-	struct pglist_data *pgdat;
+	struct lru_gen_folio *lrugen;
 	int type = folio_is_file_lru(folio);
 	int delta = folio_nr_pages(folio);
 
-	unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
-
-	if (pgdat != folio_pgdat(folio))
-		return;
-
 	rcu_read_lock();
-	memcg = folio_memcg_rcu(folio);
-	if (memcg_id != mem_cgroup_id(memcg))
+	recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset);
+	if (lruvec != folio_lruvec(folio))
 		goto unlock;
 
-	lruvec = mem_cgroup_lruvec(memcg, pgdat);
-	lrugen = &lruvec->lrugen;
+	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
 
-	min_seq = READ_ONCE(lrugen->min_seq[type]);
-	if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
+	if (!recent)
 		goto unlock;
 
-	hist = lru_hist_from_seq(min_seq);
-	/* see the comment in folio_lru_refs() */
-	refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
-	tier = lru_tier_from_refs(refs);
+	lrugen = &lruvec->lrugen;
+
+	hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
+	refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1;
+	tier = lru_tier_from_refs(refs, workingset);
 
 	atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
-	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
 
-	/*
-	 * Count the following two cases as stalls:
-	 * 1. For pages accessed through page tables, hotter pages pushed out
-	 *    hot pages which refaulted immediately.
-	 * 2. For pages accessed multiple times through file descriptors,
-	 *    numbers of accesses might have been out of the range.
-	 */
-	if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) {
+	/* see folio_add_lru() where folio_set_active() will be called */
+	if (lru_gen_in_fault())
+		mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
+
+	if (workingset) {
 		folio_set_workingset(folio);
 		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
-	}
+	} else
+		set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
 unlock:
 	rcu_read_unlock();
 }
@@ -306,6 +330,12 @@ static void *lru_gen_eviction(struct folio *folio)
 	return NULL;
 }
 
+static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
+				unsigned long *token, bool *workingset)
+{
+	return false;
+}
+
 static void lru_gen_refault(struct folio *folio, void *shadow)
 {
 }
@@ -374,39 +404,42 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
 }
 
 /**
- * workingset_refault - Evaluate the refault of a previously evicted folio.
- * @folio: The freshly allocated replacement folio.
- * @shadow: Shadow entry of the evicted folio.
- *
- * Calculates and evaluates the refault distance of the previously
- * evicted folio in the context of the node and the memcg whose memory
- * pressure caused the eviction.
+ * workingset_test_recent - tests if the shadow entry is for a folio that was
+ * recently evicted. Also fills in @workingset with the value unpacked from
+ * shadow.
+ * @shadow: the shadow entry to be tested.
+ * @file: whether the corresponding folio is from the file lru.
+ * @workingset: where the workingset value unpacked from shadow should
+ * be stored.
+ * @flush: whether to flush cgroup rstat.
+ *
+ * Return: true if the shadow is for a recently evicted folio; false otherwise.
  */
-void workingset_refault(struct folio *folio, void *shadow)
+bool workingset_test_recent(void *shadow, bool file, bool *workingset,
+				bool flush)
 {
-	bool file = folio_is_file_lru(folio);
 	struct mem_cgroup *eviction_memcg;
 	struct lruvec *eviction_lruvec;
 	unsigned long refault_distance;
 	unsigned long workingset_size;
-	struct pglist_data *pgdat;
-	struct mem_cgroup *memcg;
-	unsigned long eviction;
-	struct lruvec *lruvec;
 	unsigned long refault;
-	bool workingset;
 	int memcgid;
-	long nr;
+	struct pglist_data *pgdat;
+	unsigned long eviction;
 
 	if (lru_gen_enabled()) {
-		lru_gen_refault(folio, shadow);
-		return;
+		bool recent;
+
+		rcu_read_lock();
+		recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset);
+		rcu_read_unlock();
+		return recent;
 	}
 
-	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+	rcu_read_lock();
+	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
 	eviction <<= bucket_order;
 
-	rcu_read_lock();
 	/*
 	 * Look up the memcg associated with the stored ID. It might
 	 * have been deleted since the folio's eviction.
@@ -424,8 +457,25 @@ void workingset_refault(struct folio *folio, void *shadow)
 	 * configurations instead.
 	 */
 	eviction_memcg = mem_cgroup_from_id(memcgid);
+	if (!mem_cgroup_tryget(eviction_memcg))
+		eviction_memcg = NULL;
+	rcu_read_unlock();
+
 	if (!mem_cgroup_disabled() && !eviction_memcg)
-		goto out;
+		return false;
+
+	/*
+	 * Flush stats (and potentially sleep) outside the RCU read section.
+	 *
+	 * Note that workingset_test_recent() itself might be called in RCU read
+	 * section (for e.g, in cachestat) - these callers need to skip flushing
+	 * stats (via the flush argument).
+	 *
+	 * XXX: With per-memcg flushing and thresholding, is ratelimiting
+	 * still needed here?
+	 */
+	if (flush)
+		mem_cgroup_flush_stats_ratelimited(eviction_memcg);
+
 	eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
 	refault = atomic_long_read(&eviction_lruvec->nonresident_age);
@@ -448,33 +498,18 @@ void workingset_refault(struct folio *folio, void *shadow)
 	refault_distance = (refault - eviction) & EVICTION_MASK;
 
 	/*
-	 * The activation decision for this folio is made at the level
-	 * where the eviction occurred, as that is where the LRU order
-	 * during folio reclaim is being determined.
-	 *
-	 * However, the cgroup that will own the folio is the one that
-	 * is actually experiencing the refault event.
-	 */
-	nr = folio_nr_pages(folio);
-	memcg = folio_memcg(folio);
-	lruvec = mem_cgroup_lruvec(memcg, pgdat);
-
-	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
-
-	mem_cgroup_flush_stats_delayed();
-
-	/*
 	 * Compare the distance to the existing workingset size. We
 	 * don't activate pages that couldn't stay resident even if
 	 * all the memory was available to the workingset. Whether
 	 * workingset competition needs to consider anon or not depends
-	 * on having swap.
+	 * on having free swap space.
 	 */
 	workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
 	if (!file) {
 		workingset_size += lruvec_page_state(eviction_lruvec,
						     NR_INACTIVE_FILE);
 	}
-	if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
+	if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
 		workingset_size += lruvec_page_state(eviction_lruvec,
						     NR_ACTIVE_ANON);
 		if (file) {
@@ -482,8 +517,54 @@ void workingset_refault(struct folio *folio, void *shadow)
 						     NR_INACTIVE_ANON);
 		}
 	}
-	if (refault_distance > workingset_size)
-		goto out;
+
+	mem_cgroup_put(eviction_memcg);
+	return refault_distance <= workingset_size;
+}
+
+/**
+ * workingset_refault - Evaluate the refault of a previously evicted folio.
+ * @folio: The freshly allocated replacement folio.
+ * @shadow: Shadow entry of the evicted folio.
+ *
+ * Calculates and evaluates the refault distance of the previously
+ * evicted folio in the context of the node and the memcg whose memory
+ * pressure caused the eviction.
+ */
+void workingset_refault(struct folio *folio, void *shadow)
+{
+	bool file = folio_is_file_lru(folio);
+	struct pglist_data *pgdat;
+	struct mem_cgroup *memcg;
+	struct lruvec *lruvec;
+	bool workingset;
+	long nr;
+
+	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+	if (lru_gen_enabled()) {
+		lru_gen_refault(folio, shadow);
+		return;
+	}
+
+	/*
+	 * The activation decision for this folio is made at the level
+	 * where the eviction occurred, as that is where the LRU order
+	 * during folio reclaim is being determined.
+	 *
+	 * However, the cgroup that will own the folio is the one that
+	 * is actually experiencing the refault event. Make sure the folio is
+	 * locked to guarantee folio_memcg() stability throughout.
+	 */
+	nr = folio_nr_pages(folio);
+	memcg = folio_memcg(folio);
+	pgdat = folio_pgdat(folio);
+	lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
+
+	if (!workingset_test_recent(shadow, file, &workingset, true))
+		return;
 
 	folio_set_active(folio);
 	workingset_age_nonresident(lruvec, nr);
@@ -499,8 +580,6 @@ void workingset_refault(struct folio *folio, void *shadow)
 		lru_note_cost_refault(folio);
 		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
 	}
-out:
-	rcu_read_unlock();
 }
 
 /**
@@ -509,22 +588,12 @@ out:
  */
 void workingset_activation(struct folio *folio)
 {
-	struct mem_cgroup *memcg;
-
-	rcu_read_lock();
 	/*
 	 * Filter non-memcg pages here, e.g. unmap can call
 	 * mark_page_accessed() on VDSO pages.
-	 *
-	 * XXX: See workingset_refault() - this should return
-	 * root_mem_cgroup even for !CONFIG_MEMCG.
 	 */
-	memcg = folio_memcg_rcu(folio);
-	if (!mem_cgroup_disabled() && !memcg)
-		goto out;
-	workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
-out:
-	rcu_read_unlock();
+	if (mem_cgroup_disabled() || folio_memcg_charged(folio))
+		workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
 }
 
 /*
@@ -543,7 +612,7 @@ struct list_lru shadow_nodes;
 void workingset_update_node(struct xa_node *node)
 {
-	struct address_space *mapping;
+	struct page *page = virt_to_page(node);
 
 	/*
 	 * Track non-empty nodes that contain only shadow entries;
 	 * unlink those that contain pages or are being freed.
 	 *
 	 * Avoid acquiring the list_lru lock when the nodes are
 	 * already where they should be. The list_empty() test is safe
 	 * as node->private_list is protected by the i_pages lock.
*/ - mapping = container_of(node->array, struct address_space, i_pages); - lockdep_assert_held(&mapping->i_pages.xa_lock); + lockdep_assert_held(&node->array->xa_lock); if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { - list_lru_add(&shadow_nodes, &node->private_list); - __inc_lruvec_kmem_state(node, WORKINGSET_NODES); + list_lru_add_obj(&shadow_nodes, &node->private_list); + __inc_node_page_state(page, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { - list_lru_del(&shadow_nodes, &node->private_list); - __dec_lruvec_kmem_state(node, WORKINGSET_NODES); + list_lru_del_obj(&shadow_nodes, &node->private_list); + __dec_node_page_state(page, WORKINGSET_NODES); } } } @@ -607,6 +675,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct lruvec *lruvec; int i; + mem_cgroup_flush_stats_ratelimited(sc->memcg); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_page_state_local(lruvec, @@ -628,8 +697,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, static enum lru_status shadow_lru_isolate(struct list_head *item, struct list_lru_one *lru, - spinlock_t *lru_lock, - void *arg) __must_hold(lru_lock) + void *arg) __must_hold(lru->lock) { struct xa_node *node = container_of(item, struct xa_node, private_list); struct address_space *mapping; @@ -638,35 +706,38 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, /* * Page cache insertions and deletions synchronously maintain * the shadow node LRU under the i_pages lock and the - * lru_lock. Because the page cache tree is emptied before - * the inode can be destroyed, holding the lru_lock pins any + * &lru->lock. Because the page cache tree is emptied before + * the inode can be destroyed, holding the &lru->lock pins any * address_space that has nodes on the LRU. * * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want - * to reclaim, take the node off-LRU, and drop the lru_lock. + * to reclaim, take the node off-LRU, and drop the &lru->lock. 
	 */
 	mapping = container_of(node->array, struct address_space, i_pages);
 
 	/* Coming from the list, invert the lock order */
 	if (!xa_trylock(&mapping->i_pages)) {
-		spin_unlock_irq(lru_lock);
+		spin_unlock_irq(&lru->lock);
 		ret = LRU_RETRY;
 		goto out;
 	}
 
-	if (!spin_trylock(&mapping->host->i_lock)) {
-		xa_unlock(&mapping->i_pages);
-		spin_unlock_irq(lru_lock);
-		ret = LRU_RETRY;
-		goto out;
+	/* For page cache we need to hold i_lock */
+	if (mapping->host != NULL) {
+		if (!spin_trylock(&mapping->host->i_lock)) {
+			xa_unlock(&mapping->i_pages);
+			spin_unlock_irq(&lru->lock);
+			ret = LRU_RETRY;
+			goto out;
+		}
 	}
 
 	list_lru_isolate(lru, item);
-	__dec_lruvec_kmem_state(node, WORKINGSET_NODES);
+	__dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);
 
-	spin_unlock(lru_lock);
+	spin_unlock(&lru->lock);
 
 	/*
 	 * The nodes should only contain one or more shadow entries,
@@ -678,17 +749,18 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 	if (WARN_ON_ONCE(node->count != node->nr_values))
 		goto out_invalid;
 	xa_delete_node(node, workingset_update_node);
-	__inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
+	mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1);
 
 out_invalid:
 	xa_unlock_irq(&mapping->i_pages);
-	if (mapping_shrinkable(mapping))
-		inode_add_lru(mapping->host);
-	spin_unlock(&mapping->host->i_lock);
+	if (mapping->host != NULL) {
+		if (mapping_shrinkable(mapping))
+			inode_lru_list_add(mapping->host);
+		spin_unlock(&mapping->host->i_lock);
+	}
 	ret = LRU_REMOVED_RETRY;
 out:
 	cond_resched();
-	spin_lock_irq(lru_lock);
 	return ret;
 }
@@ -700,13 +772,6 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
 			NULL);
 }
 
-static struct shrinker workingset_shadow_shrinker = {
-	.count_objects = count_shadow_nodes,
-	.scan_objects = scan_shadow_nodes,
-	.seeks = 0, /* ->count reports only fully expendable nodes */
-	.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
-};
-
 /*
  * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
  * i_pages lock.
@@ -715,9 +780,10 @@ static struct lock_class_key shadow_nodes_key;
 
 static int __init workingset_init(void)
 {
+	struct shrinker *workingset_shadow_shrinker;
 	unsigned int timestamp_bits;
 	unsigned int max_order;
-	int ret;
+	int ret = -ENOMEM;
 
 	BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
 	/*
@@ -734,17 +800,26 @@ static int __init workingset_init(void)
 	pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
 	       timestamp_bits, max_order, bucket_order);
 
-	ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow");
-	if (ret)
+	workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
+						    SHRINKER_MEMCG_AWARE,
+						    "mm-shadow");
+	if (!workingset_shadow_shrinker)
 		goto err;
-	ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
-			      &workingset_shadow_shrinker);
+
+	ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker,
+				      &shadow_nodes_key);
 	if (ret)
 		goto err_list_lru;
-	register_shrinker_prepared(&workingset_shadow_shrinker);
+
+	workingset_shadow_shrinker->count_objects = count_shadow_nodes;
+	workingset_shadow_shrinker->scan_objects = scan_shadow_nodes;
+	/* ->count reports only fully expendable nodes */
+	workingset_shadow_shrinker->seeks = 0;
+
+	shrinker_register(workingset_shadow_shrinker);
 	return 0;
 err_list_lru:
-	free_prealloced_shrinker(&workingset_shadow_shrinker);
+	shrinker_free(workingset_shadow_shrinker);
 err:
 	return ret;
 }
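
The heart of the non-MGLRU path in this diff is the comparison done by the new workingset_test_recent(): the refault distance (how much nonresident aging happened between eviction and refault) is measured against the size of the current workingset. The toy program below restates only that comparison outside the kernel; the function name, the 32-bit counter width and the sample numbers are illustrative assumptions, not values taken from mm/workingset.c.

/*
 * Standalone sketch of the refault-distance activation decision.
 * "eviction" is the nonresident-age counter value saved in the shadow
 * entry at eviction time; "refault" is its value when the page returns.
 */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_EVICTION_MASK 0xffffffffUL	/* assume a 32-bit wrapping counter */

static bool should_activate(unsigned long refault, unsigned long eviction,
			    unsigned long workingset_size)
{
	/* distance R - E, computed modulo the counter width so wrap is safe */
	unsigned long refault_distance = (refault - eviction) & DEMO_EVICTION_MASK;

	/*
	 * Activate only if the page could have stayed resident had it been
	 * given the memory currently occupied by the workingset.
	 */
	return refault_distance <= workingset_size;
}

int main(void)
{
	/* evicted at age 1000, refaulted at age 1500, workingset of 800 pages */
	printf("%d\n", should_activate(1500, 1000, 800));	/* 1: activate */
	/* same eviction, but 2000 pages aged out in between */
	printf("%d\n", should_activate(3000, 1000, 800));	/* 0: too distant */
	return 0;
}

In the patch itself this decision now lives in workingset_test_recent(), so workingset_refault() only charges the refault statistics and then asks that helper whether the folio deserves activation.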

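For background on the pack_shadow()/unpack_shadow() calls appearing throughout the diff: an evicted folio leaves a shadow entry in the page cache tree that encodes which memcg and node evicted it, the eviction timestamp, and (after this change) a workingset bit. The sketch below shows only the general bit-packing idea; the field names, widths and layout are invented for illustration and differ from the kernel's actual encoding.

/* Simplified shadow-entry packing demo (not the kernel layout). */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define DEMO_MEMCG_BITS		16
#define DEMO_WORKINGSET_BITS	1

/* Pack the eviction counter, memcg id and workingset bit into one word. */
static void *pack_shadow_demo(unsigned int memcg_id, unsigned long eviction,
			      bool workingset)
{
	uintptr_t val = eviction;

	val = (val << DEMO_MEMCG_BITS) | memcg_id;
	val = (val << DEMO_WORKINGSET_BITS) | workingset;
	return (void *)((val << 1) | 1);	/* low bit marks a non-pointer entry */
}

static void unpack_shadow_demo(void *shadow, unsigned int *memcg_id,
			       unsigned long *eviction, bool *workingset)
{
	uintptr_t val = (uintptr_t)shadow >> 1;

	*workingset = val & ((1UL << DEMO_WORKINGSET_BITS) - 1);
	val >>= DEMO_WORKINGSET_BITS;
	*memcg_id = val & ((1UL << DEMO_MEMCG_BITS) - 1);
	*eviction = val >> DEMO_MEMCG_BITS;
}

int main(void)
{
	unsigned int id;
	unsigned long ev;
	bool ws;

	/* assumes a 64-bit uintptr_t so the demo values fit comfortably */
	unpack_shadow_demo(pack_shadow_demo(42, 123456, true), &id, &ev, &ws);
	assert(id == 42 && ev == 123456 && ws);
	return 0;
}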