Diffstat (limited to 'include/linux/mm_inline.h')
-rw-r--r--  include/linux/mm_inline.h  238
1 file changed, 158 insertions, 80 deletions
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index ff3f3f23f649..fa2d6ba811b5 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -4,10 +4,11 @@
#include <linux/atomic.h>
#include <linux/huge_mm.h>
+#include <linux/mm_types.h>
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/userfaultfd_k.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
/**
* folio_is_file_lru - Should the folio be on a file LRU or anon LRU?
@@ -24,7 +25,7 @@
* 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise
* ram or swap backed folio.
*/
-static inline int folio_is_file_lru(struct folio *folio)
+static inline int folio_is_file_lru(const struct folio *folio)
{
return !folio_test_swapbacked(folio);
}
@@ -43,7 +44,7 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec,
lockdep_assert_held(&lruvec->lru_lock);
WARN_ON_ONCE(nr_pages != (int)nr_pages);
- __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
+ mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
__mod_zone_page_state(&pgdat->node_zones[zid],
NR_ZONE_LRU_BASE + lru, nr_pages);
}
@@ -83,7 +84,7 @@ static __always_inline void __folio_clear_lru_flags(struct folio *folio)
* Return: The LRU list a folio should be on, as an index
* into the array of LRU lists.
*/
-static __always_inline enum lru_list folio_lru_list(struct folio *folio)
+static __always_inline enum lru_list folio_lru_list(const struct folio *folio)
{
enum lru_list lru;
@@ -132,36 +133,35 @@ static inline int lru_hist_from_seq(unsigned long seq)
return seq % NR_HIST_GENS;
}
-static inline int lru_tier_from_refs(int refs)
+static inline int lru_tier_from_refs(int refs, bool workingset)
{
VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));
- /* see the comment in folio_lru_refs() */
- return order_base_2(refs + 1);
+ /* see the comment on MAX_NR_TIERS */
+ return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs);
}
-static inline int folio_lru_refs(struct folio *folio)
+static inline int folio_lru_refs(const struct folio *folio)
{
- unsigned long flags = READ_ONCE(folio->flags);
- bool workingset = flags & BIT(PG_workingset);
+ unsigned long flags = READ_ONCE(folio->flags.f);
+ if (!(flags & BIT(PG_referenced)))
+ return 0;
/*
- * Return the number of accesses beyond PG_referenced, i.e., N-1 if the
- * total number of accesses is N>1, since N=0,1 both map to the first
- * tier. lru_tier_from_refs() will account for this off-by-one. Also see
- * the comment on MAX_NR_TIERS.
+ * Return the total number of accesses including PG_referenced. Also see
+ * the comment on LRU_REFS_FLAGS.
*/
- return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
+ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1;
}
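
Illustrative sketch, not part of this patch: with the reworked accounting, a caller derives the tier from the raw reference count plus the workingset bit. The wrapper name folio_lru_tier() below is hypothetical; it only combines the two helpers shown above.

	/* Sketch only: tier selection from the reworked refs accounting. */
	static inline int folio_lru_tier(const struct folio *folio)
	{
		int refs = folio_lru_refs(folio);	/* 0 without PG_referenced, else count incl. it */
		bool workingset = folio_test_workingset(folio);

		/* PG_workingset folios always land in the last tier (MAX_NR_TIERS - 1). */
		return lru_tier_from_refs(refs, workingset);
	}
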
-static inline int folio_lru_gen(struct folio *folio)
+static inline int folio_lru_gen(const struct folio *folio)
{
- unsigned long flags = READ_ONCE(folio->flags);
+ unsigned long flags = READ_ONCE(folio->flags.f);
return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}
-static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+static inline bool lru_gen_is_active(const struct lruvec *lruvec, int gen)
{
unsigned long max_seq = lruvec->lrugen.max_seq;
@@ -178,7 +178,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli
int zone = folio_zonenum(folio);
int delta = folio_nr_pages(folio);
enum lru_list lru = type * LRU_INACTIVE_FILE;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
@@ -217,6 +217,40 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli
VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
}
+static inline unsigned long lru_gen_folio_seq(const struct lruvec *lruvec,
+ const struct folio *folio,
+ bool reclaiming)
+{
+ int gen;
+ int type = folio_is_file_lru(folio);
+ const struct lru_gen_folio *lrugen = &lruvec->lrugen;
+
+ /*
+ * +-----------------------------------+-----------------------------------+
+ * | Accessed through page tables and | Accessed through file descriptors |
+ * | promoted by folio_update_gen() | and protected by folio_inc_gen() |
+ * +-----------------------------------+-----------------------------------+
+ * | PG_active (set while isolated) | |
+ * +-----------------+-----------------+-----------------+-----------------+
+ * | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS |
+ * +-----------------------------------+-----------------------------------+
+ * |<---------- MIN_NR_GENS ---------->| |
+ * |<---------------------------- MAX_NR_GENS ---------------------------->|
+ */
+ if (folio_test_active(folio))
+ gen = MIN_NR_GENS - folio_test_workingset(folio);
+ else if (reclaiming)
+ gen = MAX_NR_GENS;
+ else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) ||
+ (folio_test_reclaim(folio) &&
+ (folio_test_dirty(folio) || folio_test_writeback(folio))))
+ gen = MIN_NR_GENS;
+ else
+ gen = MAX_NR_GENS - folio_test_workingset(folio);
+
+ return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type]));
+}
+
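
A worked example, not part of the patch, assuming the default MIN_NR_GENS == 2 and MAX_NR_GENS == 4 with all four generations populated (max_seq - min_seq == 3, so the max() clamp is a no-op):

	/*
	 * Worked example of the branches above (illustrative only):
	 *
	 *   active,  workingset                    gen = 1  ->  seq = max_seq      (youngest)
	 *   active, !workingset                    gen = 2  ->  seq = max_seq - 1
	 *   reclaiming                             gen = 4  ->  seq = max_seq - 3  (== min_seq)
	 *   anon w/o swapcache, or dirty/writeback
	 *   under reclaim                          gen = 2  ->  seq = max_seq - 1
	 *   otherwise,  workingset                 gen = 3  ->  seq = max_seq - 2
	 *   otherwise, !workingset                 gen = 4  ->  seq = max_seq - 3  (== min_seq)
	 */
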
static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
unsigned long seq;
@@ -224,41 +258,25 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
int gen = folio_lru_gen(folio);
int type = folio_is_file_lru(folio);
int zone = folio_zonenum(folio);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
if (folio_test_unevictable(folio) || !lrugen->enabled)
return false;
- /*
- * There are three common cases for this page:
- * 1. If it's hot, e.g., freshly faulted in or previously hot and
- * migrated, add it to the youngest generation.
- * 2. If it's cold but can't be evicted immediately, i.e., an anon page
- * not in swapcache or a dirty page pending writeback, add it to the
- * second oldest generation.
- * 3. Everything else (clean, cold) is added to the oldest generation.
- */
- if (folio_test_active(folio))
- seq = lrugen->max_seq;
- else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
- (folio_test_reclaim(folio) &&
- (folio_test_dirty(folio) || folio_test_writeback(folio))))
- seq = lrugen->min_seq[type] + 1;
- else
- seq = lrugen->min_seq[type];
+ seq = lru_gen_folio_seq(lruvec, folio, reclaiming);
gen = lru_gen_from_seq(seq);
flags = (gen + 1UL) << LRU_GEN_PGOFF;
/* see the comment on MIN_NR_GENS about PG_active */
- set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
+ set_mask_bits(&folio->flags.f, LRU_GEN_MASK | BIT(PG_active), flags);
lru_gen_update_size(lruvec, folio, -1, gen);
/* for folio_rotate_reclaimable() */
if (reclaiming)
- list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]);
+ list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
else
- list_add(&folio->lru, &lrugen->lists[gen][type][zone]);
+ list_add(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
}
@@ -276,7 +294,7 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio,
/* for folio_migrate_flags() */
flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
- flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags);
+ flags = set_mask_bits(&folio->flags.f, LRU_GEN_MASK, flags);
gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
lru_gen_update_size(lruvec, folio, gen, -1);
@@ -285,6 +303,12 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio,
return true;
}
+static inline void folio_migrate_refs(struct folio *new, const struct folio *old)
+{
+ unsigned long refs = READ_ONCE(old->flags.f) & LRU_REFS_MASK;
+
+ set_mask_bits(&new->flags.f, LRU_REFS_MASK, refs);
+}
#else /* !CONFIG_LRU_GEN */
static inline bool lru_gen_enabled(void)
@@ -307,6 +331,10 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio,
return false;
}
+static inline void folio_migrate_refs(struct folio *new, const struct folio *old)
+{
+
+}
#endif /* CONFIG_LRU_GEN */
static __always_inline
@@ -323,12 +351,6 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
list_add(&folio->lru, &lruvec->lists[lru]);
}
-static __always_inline void add_page_to_lru_list(struct page *page,
- struct lruvec *lruvec)
-{
- lruvec_add_folio(lruvec, page_folio(page));
-}
-
static __always_inline
void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
{
@@ -357,22 +379,7 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
-folio_nr_pages(folio));
}
-static __always_inline void del_page_from_lru_list(struct page *page,
- struct lruvec *lruvec)
-{
- lruvec_del_folio(lruvec, page_folio(page));
-}
-
#ifdef CONFIG_ANON_VMA_NAME
-/*
- * mmap_lock should be read-locked when calling anon_vma_name(). Caller should
- * either keep holding the lock while using the returned pointer or it should
- * raise anon_vma_name refcount before releasing the lock.
- */
-extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
-extern struct anon_vma_name *anon_vma_name_alloc(const char *name);
-extern void anon_vma_name_free(struct kref *kref);
-
/* mmap_lock should be read-locked */
static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
{
@@ -427,16 +434,6 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
}
#else /* CONFIG_ANON_VMA_NAME */
-static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
-{
- return NULL;
-}
-
-static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
-{
- return NULL;
-}
-
static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
@@ -451,6 +448,8 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
#endif /* CONFIG_ANON_VMA_NAME */
+void pfnmap_track_ctx_release(struct kref *ref);
+
static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
atomic_set(&mm->tlb_flush_pending, 0);
@@ -510,7 +509,7 @@ static inline void dec_tlb_flush_pending(struct mm_struct *mm)
atomic_dec(&mm->tlb_flush_pending);
}
-static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
+static inline bool mm_tlb_flush_pending(const struct mm_struct *mm)
{
/*
* Must be called after having acquired the PTL; orders against that
@@ -523,7 +522,7 @@ static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
return atomic_read(&mm->tlb_flush_pending);
}
-static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
+static inline bool mm_tlb_flush_nested(const struct mm_struct *mm)
{
/*
* Similar to mm_tlb_flush_pending(), we must have acquired the PTL
@@ -535,6 +534,26 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
return atomic_read(&mm->tlb_flush_pending) > 1;
}
+#ifdef CONFIG_MMU
+/*
+ * Computes the pte marker to copy from the given source entry into dst_vma.
+ * If no marker should be copied, returns 0.
+ * The caller should insert a new pte created with make_pte_marker().
+ */
+static inline pte_marker copy_pte_marker(
+ softleaf_t entry, struct vm_area_struct *dst_vma)
+{
+ const pte_marker srcm = softleaf_to_marker(entry);
+ /* Always copy error entries. */
+ pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD);
+
+ /* Only copy PTE markers if UFFD register matches. */
+ if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma))
+ dstm |= PTE_MARKER_UFFD_WP;
+
+ return dstm;
+}
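
Illustrative sketch, not part of this patch: per the comment above, a caller (e.g. a fork-time copy of a non-present pte) turns a non-zero result back into a marker pte; dst_addr and dst_pte below are made-up names.

	/* Sketch only: consuming copy_pte_marker() as the comment prescribes. */
	pte_marker marker = copy_pte_marker(entry, dst_vma);

	if (marker)
		set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, make_pte_marker(marker));
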
+
/*
* If this pte is wr-protected by uffd-wp in any form, arm the special pte to
* replace a none pte. NOTE! This should only be called when *pte is already
@@ -545,20 +564,28 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
* Must be called with pgtable lock held so that no thread will see the none
* pte, and if they see it, they'll fault and serialize at the pgtable lock.
*
- * This function is a no-op if PTE_MARKER_UFFD_WP is not enabled.
+ * Returns true if an uffd-wp pte was installed, false otherwise.
*/
-static inline void
+static inline bool
pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
pte_t *pte, pte_t pteval)
{
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
bool arm_uffd_pte = false;
+ if (!uffd_supports_wp_marker())
+ return false;
+
/* The current status of the pte should be "cleared" before calling */
- WARN_ON_ONCE(!pte_none(*pte));
+ WARN_ON_ONCE(!pte_none(ptep_get(pte)));
+ /*
+ * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole
+ * thing, because when zapping either it means it's dropping the
+ * page, or in TTU where the present pte will be quickly replaced
+ * with a swap pte. There's no way of leaking the bit.
+ */
if (vma_is_anonymous(vma) || !userfaultfd_wp(vma))
- return;
+ return false;
/* A uffd-wp wr-protected normal pte */
if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval)))
@@ -571,10 +598,61 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
if (unlikely(pte_swp_uffd_wp_any(pteval)))
arm_uffd_pte = true;
- if (unlikely(arm_uffd_pte))
+ if (unlikely(arm_uffd_pte)) {
set_pte_at(vma->vm_mm, addr, pte,
make_pte_marker(PTE_MARKER_UFFD_WP));
+ return true;
+ }
+
+ return false;
+}
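
Illustrative sketch, not part of this patch: the new bool return value lets a zap path notice when a marker pte replaced the one it just cleared (the counter below is made up).

	/* Sketch only: acting on the new return value after clearing a pte. */
	pte_t pteval = ptep_get_and_clear(vma->vm_mm, addr, pte);

	if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval))
		nr_uffd_wp_markers++;	/* hypothetical bookkeeping */
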
+
+static inline bool vma_has_recency(const struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))
+ return false;
+
+ if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE))
+ return false;
+
+ return true;
+}
#endif
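
Illustrative sketch, not part of this patch: a fault or rmap path might gate recency feedback on the new helper, e.g.:

	/* Sketch only: honour VM_SEQ_READ/VM_RAND_READ and FMODE_NOREUSE hints. */
	if (vma_has_recency(vma))
		folio_mark_accessed(folio);
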
+
+/**
+ * num_pages_contiguous() - determine the number of contiguous pages
+ * that represent contiguous PFNs
+ * @pages: an array of page pointers
+ * @nr_pages: length of the array, at least 1
+ *
+ * Determine the number of contiguous pages that represent contiguous PFNs
+ * in @pages, starting from the first page.
+ *
+ * In some kernel configs contiguous PFNs will not have contiguous struct
+ * pages. In these configurations num_pages_contiguous() will return a
+ * number smaller than the ideal one. The caller should continue to check for pfn
+ * contiguity after each call to num_pages_contiguous().
+ *
+ * Returns the number of contiguous pages.
+ */
+static inline size_t num_pages_contiguous(struct page **pages, size_t nr_pages)
+{
+ struct page *cur_page = pages[0];
+ unsigned long section = memdesc_section(cur_page->flags);
+ size_t i;
+
+ for (i = 1; i < nr_pages; i++) {
+ if (++cur_page != pages[i])
+ break;
+ /*
+ * In unproblematic kernel configs, page_to_section() == 0 and
+ * the whole check will get optimized out.
+ */
+ if (memdesc_section(cur_page->flags) != section)
+ break;
+ }
+
+ return i;
}
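
Illustrative sketch, not part of this patch: as the kernel-doc above says, a caller keeps re-checking contiguity after each call; map_pfn_range() below is a hypothetical helper.

	/* Sketch only: consume the array in PFN-contiguous runs. */
	size_t done = 0;

	while (done < nr_pages) {
		size_t chunk = num_pages_contiguous(pages + done, nr_pages - done);

		map_pfn_range(page_to_pfn(pages[done]), chunk);	/* hypothetical */
		done += chunk;
	}
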
#endif