summaryrefslogtreecommitdiff
path: root/mm/workingset.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/workingset.c')
-rw-r--r--mm/workingset.c335
1 files changed, 205 insertions, 130 deletions
diff --git a/mm/workingset.c b/mm/workingset.c
index 1a86645b7b3c..e9f05634747a 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -16,6 +16,7 @@
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include "internal.h"
/*
* Double CLOCK lists
@@ -111,9 +112,20 @@
*
* NR_inactive + (R - E) <= NR_inactive + NR_active
*
- * which can be further simplified to
+ * If we have swap we should consider about NR_inactive_anon and
+ * NR_active_anon, so for page cache and anonymous respectively:
*
- * (R - E) <= NR_active
+ * NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
+ * + NR_inactive_anon + NR_active_anon
+ *
+ * NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
+ * + NR_inactive_file + NR_active_file
+ *
+ * Which can be further simplified to:
+ *
+ * (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
+ *
+ * (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
*
* Put into words, the refault distance (out-of-cache) can be seen as
* a deficit in inactive list space (in-cache). If the inactive list
@@ -130,14 +142,14 @@
* are no longer in active use.
*
* So when a refault distance of (R - E) is observed and there are at
- * least (R - E) active pages, the refaulting page is activated
- * optimistically in the hope that (R - E) active pages are actually
+ * least (R - E) pages in the userspace workingset, the refaulting page
+ * is activated optimistically in the hope that (R - E) pages are actually
* used less frequently than the refaulting page - or even not used at
* all anymore.
*
* That means if inactive cache is refaulting with a suitable refault
* distance, we assume the cache workingset is transitioning and put
- * pressure on the current active list.
+ * pressure on the current workingset.
*
* If this is wrong and demotion kicks in, the pages which are truly
* used more frequently will be reactivated while the less frequently
@@ -223,11 +235,12 @@ static void *lru_gen_eviction(struct folio *folio)
unsigned long token;
unsigned long min_seq;
struct lruvec *lruvec;
- struct lru_gen_struct *lrugen;
+ struct lru_gen_folio *lrugen;
int type = folio_is_file_lru(folio);
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
- int tier = lru_tier_from_refs(refs);
+ bool workingset = folio_test_workingset(folio);
+ int tier = lru_tier_from_refs(refs, workingset);
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
@@ -241,60 +254,71 @@ static void *lru_gen_eviction(struct folio *folio)
hist = lru_hist_from_seq(min_seq);
atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
- return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
+}
+
+/*
+ * Tests if the shadow entry is for a folio that was recently evicted.
+ * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
+ */
+static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
+ unsigned long *token, bool *workingset)
+{
+ int memcg_id;
+ unsigned long max_seq;
+ struct mem_cgroup *memcg;
+ struct pglist_data *pgdat;
+
+ unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
+
+ memcg = mem_cgroup_from_id(memcg_id);
+ *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+ max_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
+ max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH;
+
+ return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
}
static void lru_gen_refault(struct folio *folio, void *shadow)
{
+ bool recent;
int hist, tier, refs;
- int memcg_id;
bool workingset;
unsigned long token;
- unsigned long min_seq;
struct lruvec *lruvec;
- struct lru_gen_struct *lrugen;
- struct mem_cgroup *memcg;
- struct pglist_data *pgdat;
+ struct lru_gen_folio *lrugen;
int type = folio_is_file_lru(folio);
int delta = folio_nr_pages(folio);
- unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
-
- if (pgdat != folio_pgdat(folio))
- return;
-
rcu_read_lock();
- memcg = folio_memcg_rcu(folio);
- if (memcg_id != mem_cgroup_id(memcg))
+ recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset);
+ if (lruvec != folio_lruvec(folio))
goto unlock;
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- lrugen = &lruvec->lrugen;
+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
- min_seq = READ_ONCE(lrugen->min_seq[type]);
- if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
+ if (!recent)
goto unlock;
- hist = lru_hist_from_seq(min_seq);
- /* see the comment in folio_lru_refs() */
- refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
- tier = lru_tier_from_refs(refs);
+ lrugen = &lruvec->lrugen;
+
+ hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
+ refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1;
+ tier = lru_tier_from_refs(refs, workingset);
atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
- mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
- /*
- * Count the following two cases as stalls:
- * 1. For pages accessed through page tables, hotter pages pushed out
- * hot pages which refaulted immediately.
- * 2. For pages accessed multiple times through file descriptors,
- * numbers of accesses might have been out of the range.
- */
- if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) {
+ /* see folio_add_lru() where folio_set_active() will be called */
+ if (lru_gen_in_fault())
+ mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
+
+ if (workingset) {
folio_set_workingset(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
- }
+ } else
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
unlock:
rcu_read_unlock();
}
@@ -306,6 +330,12 @@ static void *lru_gen_eviction(struct folio *folio)
return NULL;
}
+static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
+ unsigned long *token, bool *workingset)
+{
+ return false;
+}
+
static void lru_gen_refault(struct folio *folio, void *shadow)
{
}
@@ -374,39 +404,42 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
}
/**
- * workingset_refault - Evaluate the refault of a previously evicted folio.
- * @folio: The freshly allocated replacement folio.
- * @shadow: Shadow entry of the evicted folio.
- *
- * Calculates and evaluates the refault distance of the previously
- * evicted folio in the context of the node and the memcg whose memory
- * pressure caused the eviction.
+ * workingset_test_recent - tests if the shadow entry is for a folio that was
+ * recently evicted. Also fills in @workingset with the value unpacked from
+ * shadow.
+ * @shadow: the shadow entry to be tested.
+ * @file: whether the corresponding folio is from the file lru.
+ * @workingset: where the workingset value unpacked from shadow should
+ * be stored.
+ * @flush: whether to flush cgroup rstat.
+ *
+ * Return: true if the shadow is for a recently evicted folio; false otherwise.
*/
-void workingset_refault(struct folio *folio, void *shadow)
+bool workingset_test_recent(void *shadow, bool file, bool *workingset,
+ bool flush)
{
- bool file = folio_is_file_lru(folio);
struct mem_cgroup *eviction_memcg;
struct lruvec *eviction_lruvec;
unsigned long refault_distance;
unsigned long workingset_size;
- struct pglist_data *pgdat;
- struct mem_cgroup *memcg;
- unsigned long eviction;
- struct lruvec *lruvec;
unsigned long refault;
- bool workingset;
int memcgid;
- long nr;
+ struct pglist_data *pgdat;
+ unsigned long eviction;
if (lru_gen_enabled()) {
- lru_gen_refault(folio, shadow);
- return;
+ bool recent;
+
+ rcu_read_lock();
+ recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset);
+ rcu_read_unlock();
+ return recent;
}
- unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+ rcu_read_lock();
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
eviction <<= bucket_order;
- rcu_read_lock();
/*
* Look up the memcg associated with the stored ID. It might
* have been deleted since the folio's eviction.
@@ -424,8 +457,25 @@ void workingset_refault(struct folio *folio, void *shadow)
* configurations instead.
*/
eviction_memcg = mem_cgroup_from_id(memcgid);
+ if (!mem_cgroup_tryget(eviction_memcg))
+ eviction_memcg = NULL;
+ rcu_read_unlock();
+
if (!mem_cgroup_disabled() && !eviction_memcg)
- goto out;
+ return false;
+ /*
+ * Flush stats (and potentially sleep) outside the RCU read section.
+ *
+ * Note that workingset_test_recent() itself might be called in RCU read
+ * section (for e.g, in cachestat) - these callers need to skip flushing
+ * stats (via the flush argument).
+ *
+ * XXX: With per-memcg flushing and thresholding, is ratelimiting
+ * still needed here?
+ */
+ if (flush)
+ mem_cgroup_flush_stats_ratelimited(eviction_memcg);
+
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
refault = atomic_long_read(&eviction_lruvec->nonresident_age);
@@ -448,33 +498,18 @@ void workingset_refault(struct folio *folio, void *shadow)
refault_distance = (refault - eviction) & EVICTION_MASK;
/*
- * The activation decision for this folio is made at the level
- * where the eviction occurred, as that is where the LRU order
- * during folio reclaim is being determined.
- *
- * However, the cgroup that will own the folio is the one that
- * is actually experiencing the refault event.
- */
- nr = folio_nr_pages(folio);
- memcg = folio_memcg(folio);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
-
- mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
-
- mem_cgroup_flush_stats_delayed();
- /*
* Compare the distance to the existing workingset size. We
* don't activate pages that couldn't stay resident even if
* all the memory was available to the workingset. Whether
* workingset competition needs to consider anon or not depends
- * on having swap.
+ * on having free swap space.
*/
workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
if (!file) {
workingset_size += lruvec_page_state(eviction_lruvec,
NR_INACTIVE_FILE);
}
- if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
+ if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
workingset_size += lruvec_page_state(eviction_lruvec,
NR_ACTIVE_ANON);
if (file) {
@@ -482,8 +517,54 @@ void workingset_refault(struct folio *folio, void *shadow)
NR_INACTIVE_ANON);
}
}
- if (refault_distance > workingset_size)
- goto out;
+
+ mem_cgroup_put(eviction_memcg);
+ return refault_distance <= workingset_size;
+}
+
+/**
+ * workingset_refault - Evaluate the refault of a previously evicted folio.
+ * @folio: The freshly allocated replacement folio.
+ * @shadow: Shadow entry of the evicted folio.
+ *
+ * Calculates and evaluates the refault distance of the previously
+ * evicted folio in the context of the node and the memcg whose memory
+ * pressure caused the eviction.
+ */
+void workingset_refault(struct folio *folio, void *shadow)
+{
+ bool file = folio_is_file_lru(folio);
+ struct pglist_data *pgdat;
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+ bool workingset;
+ long nr;
+
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (lru_gen_enabled()) {
+ lru_gen_refault(folio, shadow);
+ return;
+ }
+
+ /*
+ * The activation decision for this folio is made at the level
+ * where the eviction occurred, as that is where the LRU order
+ * during folio reclaim is being determined.
+ *
+ * However, the cgroup that will own the folio is the one that
+ * is actually experiencing the refault event. Make sure the folio is
+ * locked to guarantee folio_memcg() stability throughout.
+ */
+ nr = folio_nr_pages(folio);
+ memcg = folio_memcg(folio);
+ pgdat = folio_pgdat(folio);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
+
+ if (!workingset_test_recent(shadow, file, &workingset, true))
+ return;
folio_set_active(folio);
workingset_age_nonresident(lruvec, nr);
@@ -499,8 +580,6 @@ void workingset_refault(struct folio *folio, void *shadow)
lru_note_cost_refault(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
}
-out:
- rcu_read_unlock();
}
/**
@@ -509,22 +588,12 @@ out:
*/
void workingset_activation(struct folio *folio)
{
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
/*
* Filter non-memcg pages here, e.g. unmap can call
* mark_page_accessed() on VDSO pages.
- *
- * XXX: See workingset_refault() - this should return
- * root_mem_cgroup even for !CONFIG_MEMCG.
*/
- memcg = folio_memcg_rcu(folio);
- if (!mem_cgroup_disabled() && !memcg)
- goto out;
- workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
-out:
- rcu_read_unlock();
+ if (mem_cgroup_disabled() || folio_memcg_charged(folio))
+ workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
}
/*
@@ -543,7 +612,7 @@ struct list_lru shadow_nodes;
void workingset_update_node(struct xa_node *node)
{
- struct address_space *mapping;
+ struct page *page = virt_to_page(node);
/*
* Track non-empty nodes that contain only shadow entries;
@@ -553,18 +622,17 @@ void workingset_update_node(struct xa_node *node)
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
- mapping = container_of(node->array, struct address_space, i_pages);
- lockdep_assert_held(&mapping->i_pages.xa_lock);
+ lockdep_assert_held(&node->array->xa_lock);
if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {
- list_lru_add(&shadow_nodes, &node->private_list);
- __inc_lruvec_kmem_state(node, WORKINGSET_NODES);
+ list_lru_add_obj(&shadow_nodes, &node->private_list);
+ __inc_node_page_state(page, WORKINGSET_NODES);
}
} else {
if (!list_empty(&node->private_list)) {
- list_lru_del(&shadow_nodes, &node->private_list);
- __dec_lruvec_kmem_state(node, WORKINGSET_NODES);
+ list_lru_del_obj(&shadow_nodes, &node->private_list);
+ __dec_node_page_state(page, WORKINGSET_NODES);
}
}
}
@@ -607,6 +675,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
struct lruvec *lruvec;
int i;
+ mem_cgroup_flush_stats_ratelimited(sc->memcg);
lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
pages += lruvec_page_state_local(lruvec,
@@ -628,8 +697,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
static enum lru_status shadow_lru_isolate(struct list_head *item,
struct list_lru_one *lru,
- spinlock_t *lru_lock,
- void *arg) __must_hold(lru_lock)
+ void *arg) __must_hold(lru->lock)
{
struct xa_node *node = container_of(item, struct xa_node, private_list);
struct address_space *mapping;
@@ -638,35 +706,38 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
/*
* Page cache insertions and deletions synchronously maintain
* the shadow node LRU under the i_pages lock and the
- * lru_lock. Because the page cache tree is emptied before
- * the inode can be destroyed, holding the lru_lock pins any
+ * &lru->lock. Because the page cache tree is emptied before
+ * the inode can be destroyed, holding the &lru->lock pins any
* address_space that has nodes on the LRU.
*
* We can then safely transition to the i_pages lock to
* pin only the address_space of the particular node we want
- * to reclaim, take the node off-LRU, and drop the lru_lock.
+ * to reclaim, take the node off-LRU, and drop the &lru->lock.
*/
mapping = container_of(node->array, struct address_space, i_pages);
/* Coming from the list, invert the lock order */
if (!xa_trylock(&mapping->i_pages)) {
- spin_unlock_irq(lru_lock);
+ spin_unlock_irq(&lru->lock);
ret = LRU_RETRY;
goto out;
}
- if (!spin_trylock(&mapping->host->i_lock)) {
- xa_unlock(&mapping->i_pages);
- spin_unlock_irq(lru_lock);
- ret = LRU_RETRY;
- goto out;
+ /* For page cache we need to hold i_lock */
+ if (mapping->host != NULL) {
+ if (!spin_trylock(&mapping->host->i_lock)) {
+ xa_unlock(&mapping->i_pages);
+ spin_unlock_irq(&lru->lock);
+ ret = LRU_RETRY;
+ goto out;
+ }
}
list_lru_isolate(lru, item);
- __dec_lruvec_kmem_state(node, WORKINGSET_NODES);
+ __dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);
- spin_unlock(lru_lock);
+ spin_unlock(&lru->lock);
/*
* The nodes should only contain one or more shadow entries,
@@ -678,17 +749,18 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
if (WARN_ON_ONCE(node->count != node->nr_values))
goto out_invalid;
xa_delete_node(node, workingset_update_node);
- __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
+ mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1);
out_invalid:
xa_unlock_irq(&mapping->i_pages);
- if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
- spin_unlock(&mapping->host->i_lock);
+ if (mapping->host != NULL) {
+ if (mapping_shrinkable(mapping))
+ inode_lru_list_add(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
+ }
ret = LRU_REMOVED_RETRY;
out:
cond_resched();
- spin_lock_irq(lru_lock);
return ret;
}
@@ -700,13 +772,6 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
NULL);
}
-static struct shrinker workingset_shadow_shrinker = {
- .count_objects = count_shadow_nodes,
- .scan_objects = scan_shadow_nodes,
- .seeks = 0, /* ->count reports only fully expendable nodes */
- .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
-};
-
/*
* Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
* i_pages lock.
@@ -715,9 +780,10 @@ static struct lock_class_key shadow_nodes_key;
static int __init workingset_init(void)
{
+ struct shrinker *workingset_shadow_shrinker;
unsigned int timestamp_bits;
unsigned int max_order;
- int ret;
+ int ret = -ENOMEM;
BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
/*
@@ -734,17 +800,26 @@ static int __init workingset_init(void)
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
timestamp_bits, max_order, bucket_order);
- ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow");
- if (ret)
+ workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
+ SHRINKER_MEMCG_AWARE,
+ "mm-shadow");
+ if (!workingset_shadow_shrinker)
goto err;
- ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
- &workingset_shadow_shrinker);
+
+ ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker,
+ &shadow_nodes_key);
if (ret)
goto err_list_lru;
- register_shrinker_prepared(&workingset_shadow_shrinker);
+
+ workingset_shadow_shrinker->count_objects = count_shadow_nodes;
+ workingset_shadow_shrinker->scan_objects = scan_shadow_nodes;
+ /* ->count reports only fully expendable nodes */
+ workingset_shadow_shrinker->seeks = 0;
+
+ shrinker_register(workingset_shadow_shrinker);
return 0;
err_list_lru:
- free_prealloced_shrinker(&workingset_shadow_shrinker);
+ shrinker_free(workingset_shadow_shrinker);
err:
return ret;
}