path: root/mm/workingset.c
Diffstat (limited to 'mm/workingset.c')
-rw-r--r--  mm/workingset.c  532
1 file changed, 393 insertions(+), 139 deletions(-)
diff --git a/mm/workingset.c b/mm/workingset.c
index dcb994f2acc2..e9f05634747a 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -6,6 +6,7 @@
*/
#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/shmem_fs.h>
#include <linux/pagemap.h>
@@ -15,6 +16,7 @@
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include "internal.h"
/*
* Double CLOCK lists
@@ -110,9 +112,20 @@
*
* NR_inactive + (R - E) <= NR_inactive + NR_active
*
- * which can be further simplified to
+ * If we have swap, we should also consider NR_inactive_anon and
+ * NR_active_anon, so for page cache and anonymous pages respectively:
*
- * (R - E) <= NR_active
+ * NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
+ * + NR_inactive_anon + NR_active_anon
+ *
+ * NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
+ * + NR_inactive_file + NR_active_file
+ *
+ * Which can be further simplified to:
+ *
+ * (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
+ *
+ * (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
*
* Put into words, the refault distance (out-of-cache) can be seen as
* a deficit in inactive list space (in-cache). If the inactive list
@@ -129,14 +142,14 @@
* are no longer in active use.
*
* So when a refault distance of (R - E) is observed and there are at
- * least (R - E) active pages, the refaulting page is activated
- * optimistically in the hope that (R - E) active pages are actually
+ * least (R - E) pages in the userspace workingset, the refaulting page
+ * is activated optimistically in the hope that (R - E) pages are actually
* used less frequently than the refaulting page - or even not used at
* all anymore.
*
* That means if inactive cache is refaulting with a suitable refault
* distance, we assume the cache workingset is transitioning and put
- * pressure on the current active list.
+ * pressure on the current workingset.
*
* If this is wrong and demotion kicks in, the pages which are truly
* used more frequently will be reactivated while the less frequently
@@ -156,8 +169,8 @@
*
* Implementation
*
- * For each node's file LRU lists, a counter for inactive evictions
- * and activations is maintained (node->inactive_age).
+ * For each node's LRU lists, a counter for inactive evictions and
+ * activations is maintained (node->nonresident_age).
*
* On eviction, a snapshot of this counter (along with some bits to
* identify the node) is stored in the now empty page cache
@@ -167,8 +180,10 @@
* refault distance will immediately activate the refaulting page.
*/
+#define WORKINGSET_SHIFT 1
#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
- 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
+ WORKINGSET_SHIFT + NODES_SHIFT + \
+ MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
/*
@@ -184,11 +199,10 @@ static unsigned int bucket_order __read_mostly;
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
bool workingset)
{
- eviction >>= bucket_order;
eviction &= EVICTION_MASK;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
- eviction = (eviction << 1) | workingset;
+ eviction = (eviction << WORKINGSET_SHIFT) | workingset;
return xa_mk_value(eviction);
}
@@ -200,8 +214,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
int memcgid, nid;
bool workingset;
- workingset = entry & 1;
- entry >>= 1;
+ workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
+ entry >>= WORKINGSET_SHIFT;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
@@ -209,65 +223,229 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*memcgidp = memcgid;
*pgdat = NODE_DATA(nid);
- *evictionp = entry << bucket_order;
+ *evictionp = entry;
*workingsetp = workingset;
}
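
For illustration, the packing above is just nested shift-and-or: the eviction timestamp occupies the high bits, followed by the memcg ID, the node ID, and finally the workingset bit. The following standalone sketch models the same round trip on a 64-bit build, with made-up field widths standing in for MEM_CGROUP_ID_SHIFT, NODES_SHIFT and WORKINGSET_SHIFT:

/* Illustrative userspace model of pack_shadow()/unpack_shadow().
 * The field widths below are assumptions, not the kernel's values. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define MEMCG_BITS	16
#define NODE_BITS	6
#define WS_BITS		1

static unsigned long pack(int memcgid, int nid, unsigned long eviction, bool ws)
{
	eviction = (eviction << MEMCG_BITS) | memcgid;
	eviction = (eviction << NODE_BITS) | nid;
	eviction = (eviction << WS_BITS) | ws;
	return eviction;
}

static void unpack(unsigned long entry, int *memcgid, int *nid,
		   unsigned long *eviction, bool *ws)
{
	*ws = entry & ((1UL << WS_BITS) - 1);
	entry >>= WS_BITS;
	*nid = entry & ((1UL << NODE_BITS) - 1);
	entry >>= NODE_BITS;
	*memcgid = entry & ((1UL << MEMCG_BITS) - 1);
	entry >>= MEMCG_BITS;
	*eviction = entry;
}

int main(void)
{
	int memcgid, nid;
	unsigned long eviction;
	bool ws;

	unpack(pack(42, 3, 123456, true), &memcgid, &nid, &eviction, &ws);
	assert(memcgid == 42 && nid == 3 && eviction == 123456 && ws);
	printf("round-trip ok\n");
	return 0;
}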
+#ifdef CONFIG_LRU_GEN
+
+static void *lru_gen_eviction(struct folio *folio)
+{
+ int hist;
+ unsigned long token;
+ unsigned long min_seq;
+ struct lruvec *lruvec;
+ struct lru_gen_folio *lrugen;
+ int type = folio_is_file_lru(folio);
+ int delta = folio_nr_pages(folio);
+ int refs = folio_lru_refs(folio);
+ bool workingset = folio_test_workingset(folio);
+ int tier = lru_tier_from_refs(refs, workingset);
+ struct mem_cgroup *memcg = folio_memcg(folio);
+ struct pglist_data *pgdat = folio_pgdat(folio);
+
+ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ lrugen = &lruvec->lrugen;
+ min_seq = READ_ONCE(lrugen->min_seq[type]);
+ token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
+
+ hist = lru_hist_from_seq(min_seq);
+ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
+
+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
+}
+
+/*
+ * Tests if the shadow entry is for a folio that was recently evicted.
+ * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
+ */
+static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
+ unsigned long *token, bool *workingset)
+{
+ int memcg_id;
+ unsigned long max_seq;
+ struct mem_cgroup *memcg;
+ struct pglist_data *pgdat;
+
+ unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
+
+ memcg = mem_cgroup_from_id(memcg_id);
+ *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+ max_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
+ max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH;
+
+ return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
+}
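
lru_gen_eviction() and lru_gen_test_recent() replace the per-lruvec counter with a generation number: the token stores min_seq above the low LRU_REFS_WIDTH reference bits, and a refault is considered recent only if that sequence is still within MAX_NR_GENS of the lruvec's current max_seq. A minimal userspace sketch of that check, with assumed values for LRU_REFS_WIDTH and MAX_NR_GENS:

/* Illustrative model of the MGLRU token recency check; the constants
 * below are assumptions standing in for LRU_REFS_WIDTH and MAX_NR_GENS. */
#include <stdbool.h>
#include <stdio.h>

#define REFS_WIDTH	2
#define MAX_GENS	4

static unsigned long abs_diff_ul(unsigned long a, unsigned long b)
{
	return a > b ? a - b : b - a;
}

static unsigned long make_token(unsigned long min_seq, int refs)
{
	/* mirrors: token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0) */
	return (min_seq << REFS_WIDTH) | (refs > 1 ? refs - 1 : 0);
}

static bool test_recent(unsigned long token, unsigned long max_seq)
{
	return abs_diff_ul(max_seq, token >> REFS_WIDTH) < MAX_GENS;
}

int main(void)
{
	unsigned long token = make_token(100, 2);	/* evicted at min_seq 100 */

	printf("%d\n", test_recent(token, 102));	/* 1: within MAX_GENS */
	printf("%d\n", test_recent(token, 110));	/* 0: generations moved on */
	return 0;
}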
+
+static void lru_gen_refault(struct folio *folio, void *shadow)
+{
+ bool recent;
+ int hist, tier, refs;
+ bool workingset;
+ unsigned long token;
+ struct lruvec *lruvec;
+ struct lru_gen_folio *lrugen;
+ int type = folio_is_file_lru(folio);
+ int delta = folio_nr_pages(folio);
+
+ rcu_read_lock();
+
+ recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset);
+ if (lruvec != folio_lruvec(folio))
+ goto unlock;
+
+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
+
+ if (!recent)
+ goto unlock;
+
+ lrugen = &lruvec->lrugen;
+
+ hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
+ refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1;
+ tier = lru_tier_from_refs(refs, workingset);
+
+ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
+
+ /* see folio_add_lru() where folio_set_active() will be called */
+ if (lru_gen_in_fault())
+ mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
+
+ if (workingset) {
+ folio_set_workingset(folio);
+ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
+ } else
+ set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
+unlock:
+ rcu_read_unlock();
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static void *lru_gen_eviction(struct folio *folio)
+{
+ return NULL;
+}
+
+static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
+ unsigned long *token, bool *workingset)
+{
+ return false;
+}
+
+static void lru_gen_refault(struct folio *folio, void *shadow)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
+/**
+ * workingset_age_nonresident - age non-resident entries as LRU ages
+ * @lruvec: the lruvec that was aged
+ * @nr_pages: the number of pages to count
+ *
+ * As in-memory pages are aged, non-resident pages need to be aged as
+ * well, in order for the refault distances later on to be comparable
+ * to the in-memory dimensions. This function allows reclaim and LRU
+ * operations to drive the non-resident aging along in parallel.
+ */
+void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
+{
+ /*
+ * Reclaiming a cgroup means reclaiming all its children in a
+ * round-robin fashion. That means that each cgroup has an LRU
+ * order that is composed of the LRU orders of its child
+ * cgroups; and every page has an LRU position not just in the
+ * cgroup that owns it, but in all of that group's ancestors.
+ *
+ * So when the physical inactive list of a leaf cgroup ages,
+ * the virtual inactive lists of all its parents, including
+ * the root cgroup's, age as well.
+ */
+ do {
+ atomic_long_add(nr_pages, &lruvec->nonresident_age);
+ } while ((lruvec = parent_lruvec(lruvec)));
+}
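
The loop in workingset_age_nonresident() walks from the leaf lruvec to the root so that each ancestor's virtual clock advances by the same amount. A minimal sketch of the same pattern, using a hypothetical parent pointer in place of parent_lruvec():

/* Illustrative sketch of hierarchical non-resident aging; the structure
 * and parent pointer are stand-ins, not kernel types. */
#include <stdio.h>

struct fake_lruvec {
	unsigned long nonresident_age;
	struct fake_lruvec *parent;	/* NULL at the root */
};

static void age_nonresident(struct fake_lruvec *lruvec, unsigned long nr_pages)
{
	do {
		lruvec->nonresident_age += nr_pages;
	} while ((lruvec = lruvec->parent));
}

int main(void)
{
	struct fake_lruvec root = { 0, NULL };
	struct fake_lruvec parent = { 0, &root };
	struct fake_lruvec leaf = { 0, &parent };

	age_nonresident(&leaf, 8);	/* e.g. evicting an order-3 folio */
	printf("%lu %lu %lu\n", leaf.nonresident_age,
	       parent.nonresident_age, root.nonresident_age);	/* 8 8 8 */
	return 0;
}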
+
/**
- * workingset_eviction - note the eviction of a page from memory
- * @mapping: address space the page was backing
- * @page: the page being evicted
+ * workingset_eviction - note the eviction of a folio from memory
+ * @target_memcg: the cgroup that is causing the reclaim
+ * @folio: the folio being evicted
*
- * Returns a shadow entry to be stored in @mapping->i_pages in place
- * of the evicted @page so that a later refault can be detected.
+ * Return: a shadow entry to be stored in @folio->mapping->i_pages in place
+ * of the evicted @folio so that a later refault can be detected.
*/
-void *workingset_eviction(struct address_space *mapping, struct page *page)
+void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
{
- struct pglist_data *pgdat = page_pgdat(page);
- struct mem_cgroup *memcg = page_memcg(page);
- int memcgid = mem_cgroup_id(memcg);
+ struct pglist_data *pgdat = folio_pgdat(folio);
unsigned long eviction;
struct lruvec *lruvec;
+ int memcgid;
+
+ /* Folio is fully exclusive and pins folio's memory cgroup pointer */
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- /* Page is fully exclusive and pins page->mem_cgroup */
- VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(page_count(page), page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (lru_gen_enabled())
+ return lru_gen_eviction(folio);
- lruvec = mem_cgroup_lruvec(pgdat, memcg);
- eviction = atomic_long_inc_return(&lruvec->inactive_age);
- return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
+ lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
+ /* XXX: target_memcg can be NULL, go through lruvec */
+ memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
+ eviction = atomic_long_read(&lruvec->nonresident_age);
+ eviction >>= bucket_order;
+ workingset_age_nonresident(lruvec, folio_nr_pages(folio));
+ return pack_shadow(memcgid, pgdat, eviction,
+ folio_test_workingset(folio));
}
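
Note how bucket_order is now applied at snapshot time: the counter is shifted down here and shifted back up in workingset_test_recent(), so on machines whose page count exceeds the available timestamp bits the distance is only tracked at bucket granularity. A small illustration of the rounding that trades away (the bucket_order value is arbitrary):

/* Illustrative: coarsening a nonresident_age snapshot by bucket_order.
 * The recovered value is rounded down by at most (1 << bucket_order) - 1. */
#include <stdio.h>

int main(void)
{
	unsigned int bucket_order = 4;	/* arbitrary example value */
	unsigned long age = 1000003;

	unsigned long stored = age >> bucket_order;	  /* as in workingset_eviction() */
	unsigned long recovered = stored << bucket_order; /* as in workingset_test_recent() */

	printf("age=%lu recovered=%lu max error=%lu\n",
	       age, recovered, (1UL << bucket_order) - 1);
	return 0;
}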
/**
- * workingset_refault - evaluate the refault of a previously evicted page
- * @page: the freshly allocated replacement page
- * @shadow: shadow entry of the evicted page
- *
- * Calculates and evaluates the refault distance of the previously
- * evicted page in the context of the node it was allocated in.
+ * workingset_test_recent - tests if the shadow entry is for a folio that was
+ * recently evicted. Also fills in @workingset with the value unpacked from
+ * shadow.
+ * @shadow: the shadow entry to be tested.
+ * @file: whether the corresponding folio is from the file lru.
+ * @workingset: where the workingset value unpacked from shadow should
+ * be stored.
+ * @flush: whether to flush cgroup rstat.
+ *
+ * Return: true if the shadow is for a recently evicted folio; false otherwise.
*/
-void workingset_refault(struct page *page, void *shadow)
+bool workingset_test_recent(void *shadow, bool file, bool *workingset,
+ bool flush)
{
+ struct mem_cgroup *eviction_memcg;
+ struct lruvec *eviction_lruvec;
unsigned long refault_distance;
- struct pglist_data *pgdat;
- unsigned long active_file;
- struct mem_cgroup *memcg;
- unsigned long eviction;
- struct lruvec *lruvec;
+ unsigned long workingset_size;
unsigned long refault;
- bool workingset;
int memcgid;
+ struct pglist_data *pgdat;
+ unsigned long eviction;
- unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+ if (lru_gen_enabled()) {
+ bool recent;
+
+ rcu_read_lock();
+ recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset);
+ rcu_read_unlock();
+ return recent;
+ }
rcu_read_lock();
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
+ eviction <<= bucket_order;
+
/*
* Look up the memcg associated with the stored ID. It might
- * have been deleted since the page's eviction.
+ * have been deleted since the folio's eviction.
*
* Note that in rare events the ID could have been recycled
- * for a new cgroup that refaults a shared page. This is
+ * for a new cgroup that refaults a shared folio. This is
* impossible to tell from the available data. However, this
* should be a rare and limited disturbance, and activations
* are always speculative anyway. Ultimately, it's the aging
@@ -278,78 +456,144 @@ void workingset_refault(struct page *page, void *shadow)
* would be better if the root_mem_cgroup existed in all
* configurations instead.
*/
- memcg = mem_cgroup_from_id(memcgid);
- if (!mem_cgroup_disabled() && !memcg)
- goto out;
- lruvec = mem_cgroup_lruvec(pgdat, memcg);
- refault = atomic_long_read(&lruvec->inactive_age);
- active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
+ eviction_memcg = mem_cgroup_from_id(memcgid);
+ if (!mem_cgroup_tryget(eviction_memcg))
+ eviction_memcg = NULL;
+ rcu_read_unlock();
+
+ if (!mem_cgroup_disabled() && !eviction_memcg)
+ return false;
+ /*
+ * Flush stats (and potentially sleep) outside the RCU read section.
+ *
+ * Note that workingset_test_recent() itself might be called in RCU read
+ * section (e.g., in cachestat) - these callers need to skip flushing
+ * stats (via the flush argument).
+ *
+ * XXX: With per-memcg flushing and thresholding, is ratelimiting
+ * still needed here?
+ */
+ if (flush)
+ mem_cgroup_flush_stats_ratelimited(eviction_memcg);
+
+ eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
+ refault = atomic_long_read(&eviction_lruvec->nonresident_age);
/*
* Calculate the refault distance
*
* The unsigned subtraction here gives an accurate distance
- * across inactive_age overflows in most cases. There is a
+ * across nonresident_age overflows in most cases. There is a
* special case: usually, shadow entries have a short lifetime
* and are either refaulted or reclaimed along with the inode
* before they get too old. But it is not impossible for the
- * inactive_age to lap a shadow entry in the field, which can
- * then result in a false small refault distance, leading to a
- * false activation should this old entry actually refault
- * again. However, earlier kernels used to deactivate
+ * nonresident_age to lap a shadow entry in the field, which
+ * can then result in a false small refault distance, leading
+ * to a false activation should this old entry actually
+ * refault again. However, earlier kernels used to deactivate
* unconditionally with *every* reclaim invocation for the
* longest time, so the occasional inappropriate activation
* leading to pressure on the active list is not a problem.
*/
refault_distance = (refault - eviction) & EVICTION_MASK;
- inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
-
/*
* Compare the distance to the existing workingset size. We
- * don't act on pages that couldn't stay resident even if all
- * the memory was available to the page cache.
+ * don't activate pages that couldn't stay resident even if
+ * all the memory was available to the workingset. Whether
+ * workingset competition needs to consider anon or not depends
+ * on having free swap space.
*/
- if (refault_distance > active_file)
- goto out;
+ workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
+ if (!file) {
+ workingset_size += lruvec_page_state(eviction_lruvec,
+ NR_INACTIVE_FILE);
+ }
+ if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
+ workingset_size += lruvec_page_state(eviction_lruvec,
+ NR_ACTIVE_ANON);
+ if (file) {
+ workingset_size += lruvec_page_state(eviction_lruvec,
+ NR_INACTIVE_ANON);
+ }
+ }
+
+ mem_cgroup_put(eviction_memcg);
+ return refault_distance <= workingset_size;
+}
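
To make the final comparison concrete, here is a worked example with invented counts: a file folio refaults after 120000 pages worth of aging, and with swap available it competes against the active file list plus both anon lists, as in the code above.

/* Illustrative numbers only; they are not taken from the patch. */
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	unsigned long R = 500000, E = 380000;	/* nonresident_age now / at eviction */
	unsigned long distance = R - E;		/* 120000 slots aged since eviction */

	bool file = true, have_swap = true;
	unsigned long active_file = 90000, inactive_file = 40000;
	unsigned long active_anon = 30000, inactive_anon = 20000;

	/* Mirrors the workingset_size computation in workingset_test_recent() */
	unsigned long workingset_size = active_file;
	if (!file)
		workingset_size += inactive_file;
	if (have_swap) {
		workingset_size += active_anon;
		if (file)
			workingset_size += inactive_anon;
	}

	printf("distance=%lu workingset=%lu recent=%d\n",
	       distance, workingset_size, distance <= workingset_size);
	/* 120000 <= 140000: treated as recent; without swap the budget
	 * would only be 90000 and this refault would not activate. */
	return 0;
}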
+
+/**
+ * workingset_refault - Evaluate the refault of a previously evicted folio.
+ * @folio: The freshly allocated replacement folio.
+ * @shadow: Shadow entry of the evicted folio.
+ *
+ * Calculates and evaluates the refault distance of the previously
+ * evicted folio in the context of the node and the memcg whose memory
+ * pressure caused the eviction.
+ */
+void workingset_refault(struct folio *folio, void *shadow)
+{
+ bool file = folio_is_file_lru(folio);
+ struct pglist_data *pgdat;
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+ bool workingset;
+ long nr;
+
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (lru_gen_enabled()) {
+ lru_gen_refault(folio, shadow);
+ return;
+ }
+
+ /*
+ * The activation decision for this folio is made at the level
+ * where the eviction occurred, as that is where the LRU order
+ * during folio reclaim is being determined.
+ *
+ * However, the cgroup that will own the folio is the one that
+ * is actually experiencing the refault event. Make sure the folio is
+ * locked to guarantee folio_memcg() stability throughout.
+ */
+ nr = folio_nr_pages(folio);
+ memcg = folio_memcg(folio);
+ pgdat = folio_pgdat(folio);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
+
+ if (!workingset_test_recent(shadow, file, &workingset, true))
+ return;
- SetPageActive(page);
- atomic_long_inc(&lruvec->inactive_age);
- inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+ folio_set_active(folio);
+ workingset_age_nonresident(lruvec, nr);
+ mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);
- /* Page was active prior to eviction */
+ /* Folio was active prior to eviction */
if (workingset) {
- SetPageWorkingset(page);
- inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
+ folio_set_workingset(folio);
+ /*
+ * XXX: Move to folio_add_lru() when it supports new vs
+ * putback
+ */
+ lru_note_cost_refault(folio);
+ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
}
-out:
- rcu_read_unlock();
}
/**
* workingset_activation - note a page activation
- * @page: page that is being activated
+ * @folio: Folio that is being activated.
*/
-void workingset_activation(struct page *page)
+void workingset_activation(struct folio *folio)
{
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
-
- rcu_read_lock();
/*
* Filter non-memcg pages here, e.g. unmap can call
* mark_page_accessed() on VDSO pages.
- *
- * XXX: See workingset_refault() - this should return
- * root_mem_cgroup even for !CONFIG_MEMCG.
*/
- memcg = page_memcg_rcu(page);
- if (!mem_cgroup_disabled() && !memcg)
- goto out;
- lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
- atomic_long_inc(&lruvec->inactive_age);
-out:
- rcu_read_unlock();
+ if (mem_cgroup_disabled() || folio_memcg_charged(folio))
+ workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
}
/*
@@ -364,10 +608,12 @@ out:
* point where they would still be useful.
*/
-static struct list_lru shadow_nodes;
+struct list_lru shadow_nodes;
void workingset_update_node(struct xa_node *node)
{
+ struct page *page = virt_to_page(node);
+
/*
* Track non-empty nodes that contain only shadow entries;
* unlink those that contain pages or are being freed.
@@ -376,19 +622,17 @@ void workingset_update_node(struct xa_node *node)
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
- VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */
+ lockdep_assert_held(&node->array->xa_lock);
if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {
- list_lru_add(&shadow_nodes, &node->private_list);
- __inc_lruvec_page_state(virt_to_page(node),
- WORKINGSET_NODES);
+ list_lru_add_obj(&shadow_nodes, &node->private_list);
+ __inc_node_page_state(page, WORKINGSET_NODES);
}
} else {
if (!list_empty(&node->private_list)) {
- list_lru_del(&shadow_nodes, &node->private_list);
- __dec_lruvec_page_state(virt_to_page(node),
- WORKINGSET_NODES);
+ list_lru_del_obj(&shadow_nodes, &node->private_list);
+ __dec_node_page_state(page, WORKINGSET_NODES);
}
}
}
@@ -401,6 +645,8 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
unsigned long pages;
nodes = list_lru_shrink_count(&shadow_nodes, sc);
+ if (!nodes)
+ return SHRINK_EMPTY;
/*
* Approximate a reasonable limit for the nodes
@@ -427,21 +673,23 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
#ifdef CONFIG_MEMCG
if (sc->memcg) {
struct lruvec *lruvec;
-
- pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
- LRU_ALL);
- lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg);
- pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
- pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE);
+ int i;
+
+ mem_cgroup_flush_stats_ratelimited(sc->memcg);
+ lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
+ for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
+ pages += lruvec_page_state_local(lruvec,
+ NR_LRU_BASE + i);
+ pages += lruvec_page_state_local(
+ lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
+ pages += lruvec_page_state_local(
+ lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
} else
#endif
pages = node_present_pages(sc->nid);
max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
- if (!nodes)
- return SHRINK_EMPTY;
-
if (nodes <= max_nodes)
return 0;
return nodes - max_nodes;
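
The limit computed above amounts to one shadow xa_node per eight resident pages. Assuming the default XA_CHUNK_SHIFT of 6 (64 slots per node), the arithmetic for a 4 GiB node looks like this:

/* Illustrative arithmetic for the max_nodes limit, assuming the
 * default XA_CHUNK_SHIFT of 6 (64 slots per xarray node). */
#include <stdio.h>

int main(void)
{
	unsigned int xa_chunk_shift = 6;		/* assumed default */
	unsigned long pages = 4UL << (30 - 12);		/* 4 GiB of 4 KiB pages */

	unsigned long max_nodes = pages >> (xa_chunk_shift - 3);

	printf("pages=%lu max_nodes=%lu (one node per %u pages)\n",
	       pages, max_nodes, 1U << (xa_chunk_shift - 3));
	return 0;
}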
@@ -449,39 +697,47 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
static enum lru_status shadow_lru_isolate(struct list_head *item,
struct list_lru_one *lru,
- spinlock_t *lru_lock,
- void *arg) __must_hold(lru_lock)
+ void *arg) __must_hold(lru->lock)
{
struct xa_node *node = container_of(item, struct xa_node, private_list);
- XA_STATE(xas, node->array, 0);
struct address_space *mapping;
int ret;
/*
- * Page cache insertions and deletions synchroneously maintain
+ * Page cache insertions and deletions synchronously maintain
* the shadow node LRU under the i_pages lock and the
- * lru_lock. Because the page cache tree is emptied before
- * the inode can be destroyed, holding the lru_lock pins any
+ * &lru->lock. Because the page cache tree is emptied before
+ * the inode can be destroyed, holding the &lru->lock pins any
* address_space that has nodes on the LRU.
*
* We can then safely transition to the i_pages lock to
* pin only the address_space of the particular node we want
- * to reclaim, take the node off-LRU, and drop the lru_lock.
+ * to reclaim, take the node off-LRU, and drop the &lru->lock.
*/
mapping = container_of(node->array, struct address_space, i_pages);
/* Coming from the list, invert the lock order */
if (!xa_trylock(&mapping->i_pages)) {
- spin_unlock_irq(lru_lock);
+ spin_unlock_irq(&lru->lock);
ret = LRU_RETRY;
goto out;
}
+ /* For page cache we need to hold i_lock */
+ if (mapping->host != NULL) {
+ if (!spin_trylock(&mapping->host->i_lock)) {
+ xa_unlock(&mapping->i_pages);
+ spin_unlock_irq(&lru->lock);
+ ret = LRU_RETRY;
+ goto out;
+ }
+ }
+
list_lru_isolate(lru, item);
- __dec_lruvec_page_state(virt_to_page(node), WORKINGSET_NODES);
+ __dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);
- spin_unlock(lru_lock);
+ spin_unlock(&lru->lock);
/*
* The nodes should only contain one or more shadow entries,
@@ -492,24 +748,19 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
goto out_invalid;
if (WARN_ON_ONCE(node->count != node->nr_values))
goto out_invalid;
- mapping->nrexceptional -= node->nr_values;
- xas.xa_node = xa_parent_locked(&mapping->i_pages, node);
- xas.xa_offset = node->offset;
- xas.xa_shift = node->shift + XA_CHUNK_SHIFT;
- xas_set_update(&xas, workingset_update_node);
- /*
- * We could store a shadow entry here which was the minimum of the
- * shadow entries we were tracking ...
- */
- xas_store(&xas, NULL);
- __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
+ xa_delete_node(node, workingset_update_node);
+ mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1);
out_invalid:
xa_unlock_irq(&mapping->i_pages);
+ if (mapping->host != NULL) {
+ if (mapping_shrinkable(mapping))
+ inode_lru_list_add(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
+ }
ret = LRU_REMOVED_RETRY;
out:
cond_resched();
- spin_lock_irq(lru_lock);
return ret;
}
@@ -521,13 +772,6 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
NULL);
}
-static struct shrinker workingset_shadow_shrinker = {
- .count_objects = count_shadow_nodes,
- .scan_objects = scan_shadow_nodes,
- .seeks = 0, /* ->count reports only fully expendable nodes */
- .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
-};
-
/*
* Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
* i_pages lock.
@@ -536,9 +780,10 @@ static struct lock_class_key shadow_nodes_key;
static int __init workingset_init(void)
{
+ struct shrinker *workingset_shadow_shrinker;
unsigned int timestamp_bits;
unsigned int max_order;
- int ret;
+ int ret = -ENOMEM;
BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
/*
@@ -555,17 +800,26 @@ static int __init workingset_init(void)
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
timestamp_bits, max_order, bucket_order);
- ret = prealloc_shrinker(&workingset_shadow_shrinker);
- if (ret)
+ workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
+ SHRINKER_MEMCG_AWARE,
+ "mm-shadow");
+ if (!workingset_shadow_shrinker)
goto err;
- ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
- &workingset_shadow_shrinker);
+
+ ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker,
+ &shadow_nodes_key);
if (ret)
goto err_list_lru;
- register_shrinker_prepared(&workingset_shadow_shrinker);
+
+ workingset_shadow_shrinker->count_objects = count_shadow_nodes;
+ workingset_shadow_shrinker->scan_objects = scan_shadow_nodes;
+ /* ->count reports only fully expendable nodes */
+ workingset_shadow_shrinker->seeks = 0;
+
+ shrinker_register(workingset_shadow_shrinker);
return 0;
err_list_lru:
- free_prealloced_shrinker(&workingset_shadow_shrinker);
+ shrinker_free(workingset_shadow_shrinker);
err:
return ret;
}
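
For reference, the three values reported by the pr_info above appear to be related as sketched below: timestamp_bits is what remains of an unsigned long after EVICTION_SHIFT, and bucket_order only becomes non-zero once max_order (the log2 of the machine's page count) exceeds it. The NODES_SHIFT and MEM_CGROUP_ID_SHIFT values here are assumptions, and the relationship itself is inferred rather than visible in this hunk:

/* Illustrative only; the shift constants are assumed example values,
 * and the bucket_order relation is inferred, not quoted from the patch. */
#include <stdio.h>

int main(void)
{
	unsigned int bits_per_long = 64, bits_per_xa_value = 63;
	unsigned int workingset_shift = 1, nodes_shift = 6, memcg_id_shift = 16;

	unsigned int eviction_shift = (bits_per_long - bits_per_xa_value) +
				      workingset_shift + nodes_shift + memcg_id_shift;
	unsigned int timestamp_bits = bits_per_long - eviction_shift;

	/* A 16 GiB machine has 1 << 22 pages of 4 KiB, i.e. max_order 22,
	 * well below timestamp_bits, so bucket_order stays 0 here. */
	unsigned int max_order = 22;
	unsigned int bucket_order = max_order > timestamp_bits ?
				    max_order - timestamp_bits : 0;

	printf("timestamp_bits=%u max_order=%u bucket_order=%u\n",
	       timestamp_bits, max_order, bucket_order);
	return 0;
}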