summary | refs | log | tree | commit | diff
path: root/mm/workingset.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/workingset.c')
-rw-r--r-- mm/workingset.c 171
1 files changed, 92 insertions, 79 deletions
diff --git a/mm/workingset.c b/mm/workingset.c
index 4686ae363000..e9f05634747a 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -16,6 +16,7 @@
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include "internal.h"
/*
* Double CLOCK lists
@@ -238,7 +239,8 @@ static void *lru_gen_eviction(struct folio *folio)
int type = folio_is_file_lru(folio);
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
- int tier = lru_tier_from_refs(refs);
+ bool workingset = folio_test_workingset(folio);
+ int tier = lru_tier_from_refs(refs, workingset);
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
@@ -252,18 +254,18 @@ static void *lru_gen_eviction(struct folio *folio)
hist = lru_hist_from_seq(min_seq);
atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
- return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
}
/*
* Tests if the shadow entry is for a folio that was recently evicted.
* Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
*/
-static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
+static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
unsigned long *token, bool *workingset)
{
int memcg_id;
- unsigned long min_seq;
+ unsigned long max_seq;
struct mem_cgroup *memcg;
struct pglist_data *pgdat;
@@ -272,8 +274,10 @@ static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
memcg = mem_cgroup_from_id(memcg_id);
*lruvec = mem_cgroup_lruvec(memcg, pgdat);
- min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]);
- return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
+ max_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
+ max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH;
+
+ return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
}
static void lru_gen_refault(struct folio *folio, void *shadow)
@@ -289,7 +293,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
rcu_read_lock();
- recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset);
+ recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset);
if (lruvec != folio_lruvec(folio))
goto unlock;
@@ -301,24 +305,20 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
lrugen = &lruvec->lrugen;
hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
- /* see the comment in folio_lru_refs() */
- refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
- tier = lru_tier_from_refs(refs);
+ refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1;
+ tier = lru_tier_from_refs(refs, workingset);
atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
- mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
- /*
- * Count the following two cases as stalls:
- * 1. For pages accessed through page tables, hotter pages pushed out
- * hot pages which refaulted immediately.
- * 2. For pages accessed multiple times through file descriptors,
- * numbers of accesses might have been out of the range.
- */
- if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) {
+ /* see folio_add_lru() where folio_set_active() will be called */
+ if (lru_gen_in_fault())
+ mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
+
+ if (workingset) {
folio_set_workingset(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
- }
+ } else
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
unlock:
rcu_read_unlock();
}
@@ -330,7 +330,7 @@ static void *lru_gen_eviction(struct folio *folio)
return NULL;
}
-static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
+static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
unsigned long *token, bool *workingset)
{
return false;
@@ -411,10 +411,12 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
* @file: whether the corresponding folio is from the file lru.
* @workingset: where the workingset value unpacked from shadow should
* be stored.
+ * @flush: whether to flush cgroup rstat.
*
* Return: true if the shadow is for a recently evicted folio; false otherwise.
*/
-bool workingset_test_recent(void *shadow, bool file, bool *workingset)
+bool workingset_test_recent(void *shadow, bool file, bool *workingset,
+ bool flush)
{
struct mem_cgroup *eviction_memcg;
struct lruvec *eviction_lruvec;
@@ -425,9 +427,16 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset)
struct pglist_data *pgdat;
unsigned long eviction;
- if (lru_gen_enabled())
- return lru_gen_test_recent(shadow, file, &eviction_lruvec, &eviction, workingset);
+ if (lru_gen_enabled()) {
+ bool recent;
+
+ rcu_read_lock();
+ recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset);
+ rcu_read_unlock();
+ return recent;
+ }
+ rcu_read_lock();
unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
eviction <<= bucket_order;
@@ -448,8 +457,24 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset)
* configurations instead.
*/
eviction_memcg = mem_cgroup_from_id(memcgid);
+ if (!mem_cgroup_tryget(eviction_memcg))
+ eviction_memcg = NULL;
+ rcu_read_unlock();
+
if (!mem_cgroup_disabled() && !eviction_memcg)
return false;
+ /*
+ * Flush stats (and potentially sleep) outside the RCU read section.
+ *
+ * Note that workingset_test_recent() itself might be called in an RCU read
+ * section (e.g., in cachestat) - these callers need to skip flushing
+ * stats (via the flush argument).
+ *
+ * XXX: With per-memcg flushing and thresholding, is ratelimiting
+ * still needed here?
+ */
+ if (flush)
+ mem_cgroup_flush_stats_ratelimited(eviction_memcg);
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
refault = atomic_long_read(&eviction_lruvec->nonresident_age);
@@ -493,6 +518,7 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset)
}
}
+ mem_cgroup_put(eviction_memcg);
return refault_distance <= workingset_size;
}
@@ -514,23 +540,21 @@ void workingset_refault(struct folio *folio, void *shadow)
bool workingset;
long nr;
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
if (lru_gen_enabled()) {
lru_gen_refault(folio, shadow);
return;
}
- /* Flush stats (and potentially sleep) before holding RCU read lock */
- mem_cgroup_flush_stats_ratelimited();
-
- rcu_read_lock();
-
/*
* The activation decision for this folio is made at the level
* where the eviction occurred, as that is where the LRU order
* during folio reclaim is being determined.
*
* However, the cgroup that will own the folio is the one that
- * is actually experiencing the refault event.
+ * is actually experiencing the refault event. Make sure the folio is
+ * locked to guarantee folio_memcg() stability throughout.
*/
nr = folio_nr_pages(folio);
memcg = folio_memcg(folio);
@@ -539,8 +563,8 @@ void workingset_refault(struct folio *folio, void *shadow)
mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
- if (!workingset_test_recent(shadow, file, &workingset))
- goto out;
+ if (!workingset_test_recent(shadow, file, &workingset, true))
+ return;
folio_set_active(folio);
workingset_age_nonresident(lruvec, nr);
@@ -556,8 +580,6 @@ void workingset_refault(struct folio *folio, void *shadow)
lru_note_cost_refault(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
}
-out:
- rcu_read_unlock();
}
/**
@@ -566,22 +588,12 @@ out:
*/
void workingset_activation(struct folio *folio)
{
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
/*
* Filter non-memcg pages here, e.g. unmap can call
* mark_page_accessed() on VDSO pages.
- *
- * XXX: See workingset_refault() - this should return
- * root_mem_cgroup even for !CONFIG_MEMCG.
*/
- memcg = folio_memcg_rcu(folio);
- if (!mem_cgroup_disabled() && !memcg)
- goto out;
- workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
-out:
- rcu_read_unlock();
+ if (mem_cgroup_disabled() || folio_memcg_charged(folio))
+ workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
}
/*
@@ -600,7 +612,7 @@ struct list_lru shadow_nodes;
void workingset_update_node(struct xa_node *node)
{
- struct address_space *mapping;
+ struct page *page = virt_to_page(node);
/*
* Track non-empty nodes that contain only shadow entries;
@@ -610,18 +622,17 @@ void workingset_update_node(struct xa_node *node)
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
- mapping = container_of(node->array, struct address_space, i_pages);
- lockdep_assert_held(&mapping->i_pages.xa_lock);
+ lockdep_assert_held(&node->array->xa_lock);
if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {
- list_lru_add(&shadow_nodes, &node->private_list);
- __inc_lruvec_kmem_state(node, WORKINGSET_NODES);
+ list_lru_add_obj(&shadow_nodes, &node->private_list);
+ __inc_node_page_state(page, WORKINGSET_NODES);
}
} else {
if (!list_empty(&node->private_list)) {
- list_lru_del(&shadow_nodes, &node->private_list);
- __dec_lruvec_kmem_state(node, WORKINGSET_NODES);
+ list_lru_del_obj(&shadow_nodes, &node->private_list);
+ __dec_node_page_state(page, WORKINGSET_NODES);
}
}
}
@@ -664,6 +675,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
struct lruvec *lruvec;
int i;
+ mem_cgroup_flush_stats_ratelimited(sc->memcg);
lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
pages += lruvec_page_state_local(lruvec,
@@ -685,8 +697,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
static enum lru_status shadow_lru_isolate(struct list_head *item,
struct list_lru_one *lru,
- spinlock_t *lru_lock,
- void *arg) __must_hold(lru_lock)
+ void *arg) __must_hold(lru->lock)
{
struct xa_node *node = container_of(item, struct xa_node, private_list);
struct address_space *mapping;
@@ -695,20 +706,20 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
/*
* Page cache insertions and deletions synchronously maintain
* the shadow node LRU under the i_pages lock and the
- * lru_lock. Because the page cache tree is emptied before
- * the inode can be destroyed, holding the lru_lock pins any
+ * &lru->lock. Because the page cache tree is emptied before
+ * the inode can be destroyed, holding the &lru->lock pins any
* address_space that has nodes on the LRU.
*
* We can then safely transition to the i_pages lock to
* pin only the address_space of the particular node we want
- * to reclaim, take the node off-LRU, and drop the lru_lock.
+ * to reclaim, take the node off-LRU, and drop the &lru->lock.
*/
mapping = container_of(node->array, struct address_space, i_pages);
/* Coming from the list, invert the lock order */
if (!xa_trylock(&mapping->i_pages)) {
- spin_unlock_irq(lru_lock);
+ spin_unlock_irq(&lru->lock);
ret = LRU_RETRY;
goto out;
}
@@ -717,16 +728,16 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
if (mapping->host != NULL) {
if (!spin_trylock(&mapping->host->i_lock)) {
xa_unlock(&mapping->i_pages);
- spin_unlock_irq(lru_lock);
+ spin_unlock_irq(&lru->lock);
ret = LRU_RETRY;
goto out;
}
}
list_lru_isolate(lru, item);
- __dec_lruvec_kmem_state(node, WORKINGSET_NODES);
+ __dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);
- spin_unlock(lru_lock);
+ spin_unlock(&lru->lock);
/*
* The nodes should only contain one or more shadow entries,
@@ -738,19 +749,18 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
if (WARN_ON_ONCE(node->count != node->nr_values))
goto out_invalid;
xa_delete_node(node, workingset_update_node);
- __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
+ mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1);
out_invalid:
xa_unlock_irq(&mapping->i_pages);
if (mapping->host != NULL) {
if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
+ inode_lru_list_add(mapping->host);
spin_unlock(&mapping->host->i_lock);
}
ret = LRU_REMOVED_RETRY;
out:
cond_resched();
- spin_lock_irq(lru_lock);
return ret;
}
@@ -762,13 +772,6 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
NULL);
}
-static struct shrinker workingset_shadow_shrinker = {
- .count_objects = count_shadow_nodes,
- .scan_objects = scan_shadow_nodes,
- .seeks = 0, /* ->count reports only fully expendable nodes */
- .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
-};
-
/*
* Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
* i_pages lock.
@@ -777,9 +780,10 @@ static struct lock_class_key shadow_nodes_key;
static int __init workingset_init(void)
{
+ struct shrinker *workingset_shadow_shrinker;
unsigned int timestamp_bits;
unsigned int max_order;
- int ret;
+ int ret = -ENOMEM;
BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
/*
@@ -796,17 +800,26 @@ static int __init workingset_init(void)
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
timestamp_bits, max_order, bucket_order);
- ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow");
- if (ret)
+ workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
+ SHRINKER_MEMCG_AWARE,
+ "mm-shadow");
+ if (!workingset_shadow_shrinker)
goto err;
- ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
- &workingset_shadow_shrinker);
+
+ ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker,
+ &shadow_nodes_key);
if (ret)
goto err_list_lru;
- register_shrinker_prepared(&workingset_shadow_shrinker);
+
+ workingset_shadow_shrinker->count_objects = count_shadow_nodes;
+ workingset_shadow_shrinker->scan_objects = scan_shadow_nodes;
+ /* ->count reports only fully expendable nodes */
+ workingset_shadow_shrinker->seeks = 0;
+
+ shrinker_register(workingset_shadow_shrinker);
return 0;
err_list_lru:
- free_prealloced_shrinker(&workingset_shadow_shrinker);
+ shrinker_free(workingset_shadow_shrinker);
err:
return ret;
}