summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
authorYu Zhao <yuzhao@google.com>2024-12-30 21:35:37 -0700
committerAndrew Morton <akpm@linux-foundation.org>2025-01-25 20:22:39 -0800
commit4d5d14a01e2c9091b128fb46e1d07475e9a7bb72 (patch)
tree2385dd6c0eebf80cdeed4d76104f86b047e32e63 /include/linux
parentb1a71694fb00c9a7bad788a0b49198eab20621b3 (diff)
mm/mglru: rework workingset protection
With the aging feedback no longer considering the distribution of folios in each generation, rework workingset protection to better distribute folios across MAX_NR_GENS. This is achieved by reusing PG_workingset and PG_referenced/LRU_REFS_FLAGS in a slightly different way. For folios accessed multiple times through file descriptors, make lru_gen_inc_refs() set additional bits of LRU_REFS_WIDTH in folio->flags after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily promoted into the second oldest generation in the eviction path. And when folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that lru_gen_inc_refs() can start over. For this case, LRU_REFS_MASK is only valid when PG_referenced is set. For folios accessed multiple times through page tables, folio_update_gen() from a page table walk or lru_gen_set_refs() from a rmap walk sets PG_referenced after the accessed bit is cleared for the first time. Thereafter, those two paths set PG_workingset and promote folios to the youngest generation. Like folio_inc_gen(), when folio_update_gen() does that, it also clears PG_referenced. For this case, LRU_REFS_MASK is not used. For both of the cases, after PG_workingset is set on a folio, it remains until this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It can be set again if lru_gen_test_recent() returns true upon a refault. When adding folios to the LRU lists, lru_gen_folio_seq() distributes them as follows: +---------------------------------+---------------------------------+ | Accessed thru page tables | Accessed thru file descriptors | +---------------------------------+---------------------------------+ | PG_active (set while isolated) | | +----------------+----------------+----------------+----------------+ | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS | +---------------------------------+---------------------------------+ |<--------- MIN_NR_GENS --------->| | |<-------------------------- MAX_NR_GENS -------------------------->| After this patch, some typical client and server workloads showed improvements under heavy memory pressure. For example, Python TPC-C, which was used to benchmark a different approach [1] to better detect refault distances, showed a significant decrease in total refaults: Before After Change Time (seconds) 10801 10801 0% Executed (transactions) 41472 43663 +5% workingset_nodes 109070 120244 +10% workingset_refault_anon 5019627 7281831 +45% workingset_refault_file 1294678786 554855564 -57% workingset_refault_total 1299698413 562137395 -57% [1] https://lore.kernel.org/20230920190244.16839-1-ryncsn@gmail.com/ Link: https://lkml.kernel.org/r/20241231043538.4075764-7-yuzhao@google.com Signed-off-by: Yu Zhao <yuzhao@google.com> Reported-by: Kairui Song <kasong@tencent.com> Closes: https://lore.kernel.org/CAOUHufahuWcKf5f1Sg3emnqX+cODuR=2TQo7T4Gr-QYLujn4RA@mail.gmail.com/ Tested-by: Kalesh Singh <kaleshsingh@google.com> Cc: Barry Song <v-songbaohua@oppo.com> Cc: Bharata B Rao <bharata@amd.com> Cc: David Stevens <stevensd@chromium.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/mm_inline.h88
-rw-r--r--include/linux/mmzone.h82
2 files changed, 94 insertions, 76 deletions
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 34e5097182a0..f9157a0c42a5 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -133,31 +133,25 @@ static inline int lru_hist_from_seq(unsigned long seq)
return seq % NR_HIST_GENS;
}
-static inline int lru_tier_from_refs(int refs)
+static inline int lru_tier_from_refs(int refs, bool workingset)
{
VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));
- /* see the comment in folio_lru_refs() */
- return order_base_2(refs + 1);
+ /* see the comment on MAX_NR_TIERS */
+ return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs);
}
static inline int folio_lru_refs(struct folio *folio)
{
unsigned long flags = READ_ONCE(folio->flags);
- bool workingset = flags & BIT(PG_workingset);
+ if (!(flags & BIT(PG_referenced)))
+ return 0;
/*
- * Return the number of accesses beyond PG_referenced, i.e., N-1 if the
- * total number of accesses is N>1, since N=0,1 both map to the first
- * tier. lru_tier_from_refs() will account for this off-by-one. Also see
- * the comment on MAX_NR_TIERS.
+ * Return the total number of accesses including PG_referenced. Also see
+ * the comment on LRU_REFS_FLAGS.
*/
- return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
-}
-
-static inline void folio_clear_lru_refs(struct folio *folio)
-{
- set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
+ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1;
}
static inline int folio_lru_gen(struct folio *folio)
@@ -223,11 +217,43 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli
VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
}
+static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio,
+ bool reclaiming)
+{
+ int gen;
+ int type = folio_is_file_lru(folio);
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+
+ /*
+ * +-----------------------------------+-----------------------------------+
+ * | Accessed through page tables and | Accessed through file descriptors |
+ * | promoted by folio_update_gen() | and protected by folio_inc_gen() |
+ * +-----------------------------------+-----------------------------------+
+ * | PG_active (set while isolated) | |
+ * +-----------------+-----------------+-----------------+-----------------+
+ * | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS |
+ * +-----------------------------------+-----------------------------------+
+ * |<---------- MIN_NR_GENS ---------->| |
+ * |<---------------------------- MAX_NR_GENS ---------------------------->|
+ */
+ if (folio_test_active(folio))
+ gen = MIN_NR_GENS - folio_test_workingset(folio);
+ else if (reclaiming)
+ gen = MAX_NR_GENS;
+ else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) ||
+ (folio_test_reclaim(folio) &&
+ (folio_test_dirty(folio) || folio_test_writeback(folio))))
+ gen = MIN_NR_GENS;
+ else
+ gen = MAX_NR_GENS - folio_test_workingset(folio);
+
+ return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type]));
+}
+
static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
unsigned long seq;
unsigned long flags;
- unsigned long mask;
int gen = folio_lru_gen(folio);
int type = folio_is_file_lru(folio);
int zone = folio_zonenum(folio);
@@ -237,40 +263,12 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
if (folio_test_unevictable(folio) || !lrugen->enabled)
return false;
- /*
- * There are four common cases for this page:
- * 1. If it's hot, i.e., freshly faulted in, add it to the youngest
- * generation, and it's protected over the rest below.
- * 2. If it can't be evicted immediately, i.e., a dirty page pending
- * writeback, add it to the second youngest generation.
- * 3. If it should be evicted first, e.g., cold and clean from
- * folio_rotate_reclaimable(), add it to the oldest generation.
- * 4. Everything else falls between 2 & 3 above and is added to the
- * second oldest generation if it's considered inactive, or the
- * oldest generation otherwise. See lru_gen_is_active().
- */
- if (folio_test_active(folio))
- seq = lrugen->max_seq;
- else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
- (folio_test_reclaim(folio) &&
- (folio_test_dirty(folio) || folio_test_writeback(folio))))
- seq = lrugen->max_seq - 1;
- else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq)
- seq = lrugen->min_seq[type];
- else
- seq = lrugen->min_seq[type] + 1;
+ seq = lru_gen_folio_seq(lruvec, folio, reclaiming);
gen = lru_gen_from_seq(seq);
flags = (gen + 1UL) << LRU_GEN_PGOFF;
/* see the comment on MIN_NR_GENS about PG_active */
- mask = LRU_GEN_MASK;
- /*
- * Don't clear PG_workingset here because it can affect PSI accounting
- * if the activation is due to workingset refault.
- */
- if (folio_test_active(folio))
- mask |= LRU_REFS_MASK | BIT(PG_referenced) | BIT(PG_active);
- set_mask_bits(&folio->flags, mask, flags);
+ set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
lru_gen_update_size(lruvec, folio, -1, gen);
/* for folio_rotate_reclaimable() */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8245ecb0400b..9540b41894da 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -332,66 +332,88 @@ enum lruvec_flags {
#endif /* !__GENERATING_BOUNDS_H */
/*
- * Evictable pages are divided into multiple generations. The youngest and the
+ * Evictable folios are divided into multiple generations. The youngest and the
* oldest generation numbers, max_seq and min_seq, are monotonically increasing.
* They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
* offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
* corresponding generation. The gen counter in folio->flags stores gen+1 while
- * a page is on one of lrugen->folios[]. Otherwise it stores 0.
+ * a folio is on one of lrugen->folios[]. Otherwise it stores 0.
*
- * A page is added to the youngest generation on faulting. The aging needs to
- * check the accessed bit at least twice before handing this page over to the
- * eviction. The first check takes care of the accessed bit set on the initial
- * fault; the second check makes sure this page hasn't been used since then.
- * This process, AKA second chance, requires a minimum of two generations,
- * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
- * LRU, e.g., /proc/vmstat, these two generations are considered active; the
- * rest of generations, if they exist, are considered inactive. See
- * lru_gen_is_active().
+ * After a folio is faulted in, the aging needs to check the accessed bit at
+ * least twice before handing this folio over to the eviction. The first check
+ * clears the accessed bit from the initial fault; the second check makes sure
+ * this folio hasn't been used since then. This process, AKA second chance,
+ * requires a minimum of two generations, hence MIN_NR_GENS. And to maintain ABI
+ * compatibility with the active/inactive LRU, e.g., /proc/vmstat, these two
+ * generations are considered active; the rest of generations, if they exist,
+ * are considered inactive. See lru_gen_is_active().
*
- * PG_active is always cleared while a page is on one of lrugen->folios[] so
- * that the aging needs not to worry about it. And it's set again when a page
- * considered active is isolated for non-reclaiming purposes, e.g., migration.
- * See lru_gen_add_folio() and lru_gen_del_folio().
+ * PG_active is always cleared while a folio is on one of lrugen->folios[] so
+ * that the sliding window needs not to worry about it. And it's set again when
+ * a folio considered active is isolated for non-reclaiming purposes, e.g.,
+ * migration. See lru_gen_add_folio() and lru_gen_del_folio().
*
* MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
* number of categories of the active/inactive LRU when keeping track of
* accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
- * in folio->flags.
+ * in folio->flags, masked by LRU_GEN_MASK.
*/
#define MIN_NR_GENS 2U
#define MAX_NR_GENS 4U
/*
- * Each generation is divided into multiple tiers. A page accessed N times
- * through file descriptors is in tier order_base_2(N). A page in the first tier
- * (N=0,1) is marked by PG_referenced unless it was faulted in through page
- * tables or read ahead. A page in any other tier (N>1) is marked by
- * PG_referenced and PG_workingset. This implies a minimum of two tiers is
- * supported without using additional bits in folio->flags.
+ * Each generation is divided into multiple tiers. A folio accessed N times
+ * through file descriptors is in tier order_base_2(N). A folio in the first
+ * tier (N=0,1) is marked by PG_referenced unless it was faulted in through page
+ * tables or read ahead. A folio in the last tier (MAX_NR_TIERS-1) is marked by
+ * PG_workingset. A folio in any other tier (1<N<5) between the first and last
+ * is marked by additional bits of LRU_REFS_WIDTH in folio->flags.
*
* In contrast to moving across generations which requires the LRU lock, moving
* across tiers only involves atomic operations on folio->flags and therefore
* has a negligible cost in the buffered access path. In the eviction path,
- * comparisons of refaulted/(evicted+protected) from the first tier and the
- * rest infer whether pages accessed multiple times through file descriptors
- * are statistically hot and thus worth protecting.
+ * comparisons of refaulted/(evicted+protected) from the first tier and the rest
+ * infer whether folios accessed multiple times through file descriptors are
+ * statistically hot and thus worth protecting.
*
* MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
* number of categories of the active/inactive LRU when keeping track of
* accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in
- * folio->flags.
+ * folio->flags, masked by LRU_REFS_MASK.
*/
#define MAX_NR_TIERS 4U
#ifndef __GENERATING_BOUNDS_H
-struct lruvec;
-struct page_vma_mapped_walk;
-
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
+/*
+ * For folios accessed multiple times through file descriptors,
+ * lru_gen_inc_refs() sets additional bits of LRU_REFS_WIDTH in folio->flags
+ * after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its
+ * bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily
+ * promoted into the second oldest generation in the eviction path. And when
+ * folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that
+ * lru_gen_inc_refs() can start over. Note that for this case, LRU_REFS_MASK is
+ * only valid when PG_referenced is set.
+ *
+ * For folios accessed multiple times through page tables, folio_update_gen()
+ * from a page table walk or lru_gen_set_refs() from a rmap walk sets
+ * PG_referenced after the accessed bit is cleared for the first time.
+ * Thereafter, those two paths set PG_workingset and promote folios to the
+ * youngest generation. Like folio_inc_gen(), folio_update_gen() also clears
+ * PG_referenced. Note that for this case, LRU_REFS_MASK is not used.
+ *
+ * For both cases above, after PG_workingset is set on a folio, it remains until
+ * this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It
+ * can be set again if lru_gen_test_recent() returns true upon a refault.
+ */
+#define LRU_REFS_FLAGS (LRU_REFS_MASK | BIT(PG_referenced))
+
+struct lruvec;
+struct page_vma_mapped_walk;
+
#ifdef CONFIG_LRU_GEN
enum {
@@ -406,8 +428,6 @@ enum {
NR_LRU_GEN_CAPS
};
-#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
-
#define MIN_LRU_BATCH BITS_PER_LONG
#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)