summaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c2598
1 files changed, 1632 insertions, 966 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9ecf99190ea2..822e05f1a964 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -88,6 +88,9 @@ typedef int __bitwise fpi_t;
*/
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+/* Free the page without taking locks. Rely on trylock only. */
+#define FPI_TRYLOCK ((__force fpi_t)BIT(2))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
@@ -96,9 +99,12 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
/*
* On SMP, spin_trylock is sufficient protection.
* On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ * Pass flags to a no-op inline function to typecheck and silence the unused
+ * variable warning.
*/
-#define pcp_trylock_prepare(flags) do { } while (0)
-#define pcp_trylock_finish(flag) do { } while (0)
+static inline void __pcp_trylock_noop(unsigned long *flags) { }
+#define pcp_trylock_prepare(flags) __pcp_trylock_noop(&(flags))
+#define pcp_trylock_finish(flags) __pcp_trylock_noop(&(flags))
#else
/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
@@ -126,15 +132,6 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
* Generic helper to lookup and a per-cpu variable with an embedded spinlock.
* Return value should be used with equivalent unlock helper.
*/
-#define pcpu_spin_lock(type, member, ptr) \
-({ \
- type *_ret; \
- pcpu_task_pin(); \
- _ret = this_cpu_ptr(ptr); \
- spin_lock(&_ret->member); \
- _ret; \
-})
-
#define pcpu_spin_trylock(type, member, ptr) \
({ \
type *_ret; \
@@ -154,14 +151,21 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
})
/* struct per_cpu_pages specific helpers. */
-#define pcp_spin_lock(ptr) \
- pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
-
-#define pcp_spin_trylock(ptr) \
- pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)
+#define pcp_spin_trylock(ptr, UP_flags) \
+({ \
+ struct per_cpu_pages *__ret; \
+ pcp_trylock_prepare(UP_flags); \
+ __ret = pcpu_spin_trylock(struct per_cpu_pages, lock, ptr); \
+ if (!__ret) \
+ pcp_trylock_finish(UP_flags); \
+ __ret; \
+})
-#define pcp_spin_unlock(ptr) \
- pcpu_spin_unlock(lock, ptr)
+#define pcp_spin_unlock(ptr, UP_flags) \
+({ \
+ pcpu_spin_unlock(lock, ptr); \
+ pcp_trylock_finish(UP_flags); \
+})
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
@@ -273,6 +277,7 @@ int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
static int watermark_boost_factor __read_mostly = 15000;
static int watermark_scale_factor = 10;
+int defrag_mode;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -286,9 +291,8 @@ EXPORT_SYMBOL(nr_online_nodes);
#endif
static bool page_contains_unaccepted(struct page *page, unsigned int order);
-static void accept_page(struct page *page, unsigned int order);
-static bool try_to_accept_memory(struct zone *zone, unsigned int order);
-static inline bool has_unaccepted_memory(void);
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags);
static bool __free_unaccepted(struct page *page);
int page_group_by_mobility_disabled __read_mostly;
@@ -322,6 +326,11 @@ static inline bool deferred_pages_enabled(void)
{
return false;
}
+
+static inline bool _deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+ return false;
+}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
/* Return a pointer to the bitmap storing bits affecting a block of pages */
@@ -345,81 +354,224 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
+static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit)
+{
+ return pb_bit >= PB_compact_skip && pb_bit < __NR_PAGEBLOCK_BITS;
+}
+
+static __always_inline void
+get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn,
+ unsigned long **bitmap_word, unsigned long *bitidx)
+{
+ unsigned long *bitmap;
+ unsigned long word_bitidx;
+
+#ifdef CONFIG_MEMORY_ISOLATION
+ BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 8);
+#else
+ BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+#endif
+ BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK);
+ VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
+
+ bitmap = get_pageblock_bitmap(page, pfn);
+ *bitidx = pfn_to_bitidx(page, pfn);
+ word_bitidx = *bitidx / BITS_PER_LONG;
+ *bitidx &= (BITS_PER_LONG - 1);
+ *bitmap_word = &bitmap[word_bitidx];
+}
+
+
/**
- * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * __get_pfnblock_flags_mask - Return the requested group of flags for
+ * a pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @pfn: The target page frame number
* @mask: mask of bits that the caller is interested in
*
* Return: pageblock_bits flags
*/
-unsigned long get_pfnblock_flags_mask(const struct page *page,
- unsigned long pfn, unsigned long mask)
+static unsigned long __get_pfnblock_flags_mask(const struct page *page,
+ unsigned long pfn,
+ unsigned long mask)
{
- unsigned long *bitmap;
- unsigned long bitidx, word_bitidx;
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
unsigned long word;
- bitmap = get_pageblock_bitmap(page, pfn);
- bitidx = pfn_to_bitidx(page, pfn);
- word_bitidx = bitidx / BITS_PER_LONG;
- bitidx &= (BITS_PER_LONG-1);
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
/*
- * This races, without locks, with set_pfnblock_flags_mask(). Ensure
+ * This races, without locks, with set_pfnblock_migratetype(). Ensure
* a consistent read of the memory array, so that results, even though
* racy, are not corrupted.
*/
- word = READ_ONCE(bitmap[word_bitidx]);
+ word = READ_ONCE(*bitmap_word);
return (word >> bitidx) & mask;
}
-static __always_inline int get_pfnblock_migratetype(const struct page *page,
- unsigned long pfn)
+/**
+ * get_pfnblock_bit - Check if a standalone bit of a pageblock is set
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @pb_bit: pageblock bit to check
+ *
+ * Return: true if the bit is set, otherwise false
+ */
+bool get_pfnblock_bit(const struct page *page, unsigned long pfn,
+ enum pageblock_bits pb_bit)
{
- return get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
+
+ if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
+ return false;
+
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
+
+ return test_bit(bitidx + pb_bit, bitmap_word);
}
/**
- * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * get_pfnblock_migratetype - Return the migratetype of a pageblock
* @page: The page within the block of interest
- * @flags: The flags to set
* @pfn: The target page frame number
- * @mask: mask of bits that the caller is interested in
+ *
+ * Return: The migratetype of the pageblock
+ *
+ * Use get_pfnblock_migratetype() if caller already has both @page and @pfn
+ * to save a call to page_to_pfn().
*/
-void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
- unsigned long pfn,
- unsigned long mask)
+__always_inline enum migratetype
+get_pfnblock_migratetype(const struct page *page, unsigned long pfn)
{
- unsigned long *bitmap;
- unsigned long bitidx, word_bitidx;
- unsigned long word;
+ unsigned long mask = MIGRATETYPE_AND_ISO_MASK;
+ unsigned long flags;
- BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
- BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
+ flags = __get_pfnblock_flags_mask(page, pfn, mask);
- bitmap = get_pageblock_bitmap(page, pfn);
- bitidx = pfn_to_bitidx(page, pfn);
- word_bitidx = bitidx / BITS_PER_LONG;
- bitidx &= (BITS_PER_LONG-1);
+#ifdef CONFIG_MEMORY_ISOLATION
+ if (flags & BIT(PB_migrate_isolate))
+ return MIGRATE_ISOLATE;
+#endif
+ return flags & MIGRATETYPE_MASK;
+}
- VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
+/**
+ * __set_pfnblock_flags_mask - Set the requested group of flags for
+ * a pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @flags: The flags to set
+ * @mask: mask of bits that the caller is interested in
+ */
+static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn,
+ unsigned long flags, unsigned long mask)
+{
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
+ unsigned long word;
+
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
mask <<= bitidx;
flags <<= bitidx;
- word = READ_ONCE(bitmap[word_bitidx]);
+ word = READ_ONCE(*bitmap_word);
do {
- } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags));
+ } while (!try_cmpxchg(bitmap_word, &word, (word & ~mask) | flags));
+}
+
+/**
+ * set_pfnblock_bit - Set a standalone bit of a pageblock
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @pb_bit: pageblock bit to set
+ */
+void set_pfnblock_bit(const struct page *page, unsigned long pfn,
+ enum pageblock_bits pb_bit)
+{
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
+
+ if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
+ return;
+
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
+
+ set_bit(bitidx + pb_bit, bitmap_word);
+}
+
+/**
+ * clear_pfnblock_bit - Clear a standalone bit of a pageblock
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @pb_bit: pageblock bit to clear
+ */
+void clear_pfnblock_bit(const struct page *page, unsigned long pfn,
+ enum pageblock_bits pb_bit)
+{
+ unsigned long *bitmap_word;
+ unsigned long bitidx;
+
+ if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
+ return;
+
+ get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
+
+ clear_bit(bitidx + pb_bit, bitmap_word);
+}
+
+/**
+ * set_pageblock_migratetype - Set the migratetype of a pageblock
+ * @page: The page within the block of interest
+ * @migratetype: migratetype to set
+ */
+static void set_pageblock_migratetype(struct page *page,
+ enum migratetype migratetype)
+{
+ if (unlikely(page_group_by_mobility_disabled &&
+ migratetype < MIGRATE_PCPTYPES))
+ migratetype = MIGRATE_UNMOVABLE;
+
+#ifdef CONFIG_MEMORY_ISOLATION
+ if (migratetype == MIGRATE_ISOLATE) {
+ VM_WARN_ONCE(1,
+ "Use set_pageblock_isolate() for pageblock isolation");
+ return;
+ }
+ VM_WARN_ONCE(get_pageblock_isolate(page),
+ "Use clear_pageblock_isolate() to unisolate pageblock");
+ /* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */
+#endif
+ __set_pfnblock_flags_mask(page, page_to_pfn(page),
+ (unsigned long)migratetype,
+ MIGRATETYPE_AND_ISO_MASK);
}
-void set_pageblock_migratetype(struct page *page, int migratetype)
+void __meminit init_pageblock_migratetype(struct page *page,
+ enum migratetype migratetype,
+ bool isolate)
{
+ unsigned long flags;
+
if (unlikely(page_group_by_mobility_disabled &&
migratetype < MIGRATE_PCPTYPES))
migratetype = MIGRATE_UNMOVABLE;
- set_pfnblock_flags_mask(page, (unsigned long)migratetype,
- page_to_pfn(page), MIGRATETYPE_MASK);
+ flags = migratetype;
+
+#ifdef CONFIG_MEMORY_ISOLATION
+ if (migratetype == MIGRATE_ISOLATE) {
+ VM_WARN_ONCE(
+ 1,
+ "Set isolate=true to isolate pageblock with a migratetype");
+ return;
+ }
+ if (isolate)
+ flags |= BIT(PB_migrate_isolate);
+#endif
+ __set_pfnblock_flags_mask(page, page_to_pfn(page), flags,
+ MIGRATETYPE_AND_ISO_MASK);
}
#ifdef CONFIG_DEBUG_VM
@@ -498,15 +650,16 @@ static void bad_page(struct page *page, const char *reason)
dump_stack();
out:
/* Leave bad fields for debug, except PageBuddy could make trouble */
- page_mapcount_reset(page); /* remove PageBuddy */
+ if (PageBuddy(page))
+ __ClearPageBuddy(page);
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
static inline unsigned int order_to_pindex(int migratetype, int order)
{
- bool __maybe_unused movable;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ bool movable;
if (order > PAGE_ALLOC_COSTLY_ORDER) {
VM_BUG_ON(order != HPAGE_PMD_ORDER);
@@ -610,6 +763,10 @@ compaction_capture(struct capture_control *capc, struct page *page,
capc->cc->migratetype != MIGRATE_MOVABLE)
return false;
+ if (migratetype != capc->cc->migratetype)
+ trace_mm_page_alloc_extfrag(page, capc->cc->order, order,
+ capc->cc->migratetype, migratetype);
+
capc->page = page;
return true;
}
@@ -631,6 +788,8 @@ compaction_capture(struct capture_control *capc, struct page *page,
static inline void account_freepages(struct zone *zone, int nr_pages,
int migratetype)
{
+ lockdep_assert_held(&zone->lock);
+
if (is_migrate_isolate(migratetype))
return;
@@ -638,6 +797,9 @@ static inline void account_freepages(struct zone *zone, int nr_pages,
if (is_migrate_cma(migratetype))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
+ else if (migratetype == MIGRATE_HIGHATOMIC)
+ WRITE_ONCE(zone->nr_free_highatomic,
+ zone->nr_free_highatomic + nr_pages);
}
/* Used for pages not on another list */
@@ -646,16 +808,20 @@ static inline void __add_to_free_list(struct page *page, struct zone *zone,
bool tail)
{
struct free_area *area = &zone->free_area[order];
+ int nr_pages = 1 << order;
VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
- "page type is %lu, passed migratetype is %d (nr=%d)\n",
- get_pageblock_migratetype(page), migratetype, 1 << order);
+ "page type is %d, passed migratetype is %d (nr=%d)\n",
+ get_pageblock_migratetype(page), migratetype, nr_pages);
if (tail)
list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
else
list_add(&page->buddy_list, &area->free_list[migratetype]);
area->nr_free++;
+
+ if (order >= pageblock_order && !is_migrate_isolate(migratetype))
+ __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
}
/*
@@ -667,24 +833,34 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
unsigned int order, int old_mt, int new_mt)
{
struct free_area *area = &zone->free_area[order];
+ int nr_pages = 1 << order;
/* Free page moving can fail, so it happens before the type update */
VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt,
- "page type is %lu, passed migratetype is %d (nr=%d)\n",
- get_pageblock_migratetype(page), old_mt, 1 << order);
+ "page type is %d, passed migratetype is %d (nr=%d)\n",
+ get_pageblock_migratetype(page), old_mt, nr_pages);
list_move_tail(&page->buddy_list, &area->free_list[new_mt]);
- account_freepages(zone, -(1 << order), old_mt);
- account_freepages(zone, 1 << order, new_mt);
+ account_freepages(zone, -nr_pages, old_mt);
+ account_freepages(zone, nr_pages, new_mt);
+
+ if (order >= pageblock_order &&
+ is_migrate_isolate(old_mt) != is_migrate_isolate(new_mt)) {
+ if (!is_migrate_isolate(old_mt))
+ nr_pages = -nr_pages;
+ __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
+ }
}
static inline void __del_page_from_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
+ int nr_pages = 1 << order;
+
VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
- "page type is %lu, passed migratetype is %d (nr=%d)\n",
- get_pageblock_migratetype(page), migratetype, 1 << order);
+ "page type is %d, passed migratetype is %d (nr=%d)\n",
+ get_pageblock_migratetype(page), migratetype, nr_pages);
/* clear reported state and update reported page count */
if (page_reported(page))
@@ -694,6 +870,9 @@ static inline void __del_page_from_free_list(struct page *page, struct zone *zon
__ClearPageBuddy(page);
set_page_private(page, 0);
zone->free_area[order].nr_free--;
+
+ if (order >= pageblock_order && !is_migrate_isolate(migratetype))
+ __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, -nr_pages);
}
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -711,12 +890,12 @@ static inline struct page *get_page_from_free_area(struct free_area *area,
}
/*
- * If this is not the largest possible page, check if the buddy
- * of the next-highest order is free. If it is, it's possible
+ * If this is less than the 2nd largest possible page, check if the buddy
+ * of the next-higher order is free. If it is, it's possible
* that pages are being freed that will coalesce soon. In case,
* that is happening, add the free page to the tail of the list
* so it's less likely to be used soon and more likely to be merged
- * as a higher order page
+ * as a 2-level higher order page
*/
static inline bool
buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
@@ -771,7 +950,7 @@ static inline void __free_one_page(struct page *page,
bool to_tail;
VM_BUG_ON(!zone_is_initialized(zone));
- VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
+ VM_BUG_ON_PAGE(page->flags.f & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
@@ -863,10 +1042,8 @@ static inline bool page_expected_state(struct page *page,
#ifdef CONFIG_MEMCG
page->memcg_data |
#endif
-#ifdef CONFIG_PAGE_POOL
- ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) |
-#endif
- (page->flags & check_flags)))
+ page_pool_page_is_pp(page) |
+ (page->flags.f & check_flags)))
return false;
return true;
@@ -882,7 +1059,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
bad_reason = "non-NULL mapping";
if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _refcount";
- if (unlikely(page->flags & flags)) {
+ if (unlikely(page->flags.f & flags)) {
if (flags == PAGE_FLAGS_CHECK_AT_PREP)
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
else
@@ -892,26 +1069,18 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
if (unlikely(page->memcg_data))
bad_reason = "page still charged to cgroup";
#endif
-#ifdef CONFIG_PAGE_POOL
- if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE))
+ if (unlikely(page_pool_page_is_pp(page)))
bad_reason = "page_pool leak";
-#endif
return bad_reason;
}
-static void free_page_is_bad_report(struct page *page)
-{
- bad_page(page,
- page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
-}
-
static inline bool free_page_is_bad(struct page *page)
{
if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
return false;
/* Something has gone sideways, find it */
- free_page_is_bad_report(page);
+ bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
return true;
}
@@ -938,21 +1107,34 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
switch (page - head_page) {
case 1:
/* the first tail page: these may be in place of ->mapping */
- if (unlikely(folio_entire_mapcount(folio))) {
- bad_page(page, "nonzero entire_mapcount");
- goto out;
- }
if (unlikely(folio_large_mapcount(folio))) {
bad_page(page, "nonzero large_mapcount");
goto out;
}
- if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT) &&
+ unlikely(atomic_read(&folio->_nr_pages_mapped))) {
bad_page(page, "nonzero nr_pages_mapped");
goto out;
}
- if (unlikely(atomic_read(&folio->_pincount))) {
- bad_page(page, "nonzero pincount");
- goto out;
+ if (IS_ENABLED(CONFIG_MM_ID)) {
+ if (unlikely(folio->_mm_id_mapcount[0] != -1)) {
+ bad_page(page, "nonzero mm mapcount 0");
+ goto out;
+ }
+ if (unlikely(folio->_mm_id_mapcount[1] != -1)) {
+ bad_page(page, "nonzero mm mapcount 1");
+ goto out;
+ }
+ }
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
+ bad_page(page, "nonzero entire_mapcount");
+ goto out;
+ }
+ if (unlikely(atomic_read(&folio->_pincount))) {
+ bad_page(page, "nonzero pincount");
+ goto out;
+ }
}
break;
case 2:
@@ -961,7 +1143,22 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
bad_page(page, "on deferred list");
goto out;
}
+ if (!IS_ENABLED(CONFIG_64BIT)) {
+ if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
+ bad_page(page, "nonzero entire_mapcount");
+ goto out;
+ }
+ if (unlikely(atomic_read(&folio->_pincount))) {
+ bad_page(page, "nonzero pincount");
+ goto out;
+ }
+ }
break;
+ case 3:
+ /* the third tail page: hugetlb specifics overlap ->mappings */
+ if (IS_ENABLED(CONFIG_HUGETLB_PAGE))
+ break;
+ fallthrough;
default:
if (page->mapping != TAIL_MAPPING) {
bad_page(page, "corrupted mapping in tail page");
@@ -1032,6 +1229,79 @@ static void kernel_init_pages(struct page *page, int numpages)
kasan_enable_current();
}
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+void __clear_page_tag_ref(struct page *page)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ set_codetag_empty(&ref);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline
+void __pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+}
+
+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr)
+{
+ if (mem_alloc_profiling_enabled())
+ __pgalloc_tag_add(page, task, nr);
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline
+void __pgalloc_tag_sub(struct page *page, unsigned int nr)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ alloc_tag_sub(&ref, PAGE_SIZE * nr);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+}
+
+static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
+{
+ if (mem_alloc_profiling_enabled())
+ __pgalloc_tag_sub(page, nr);
+}
+
+/* When tag is not NULL, assuming mem_alloc_profiling_enabled */
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
+{
+ if (tag)
+ this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
+}
+
+#else /* CONFIG_MEM_ALLOC_PROFILING */
+
+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr) {}
+static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
+
+#endif /* CONFIG_MEM_ALLOC_PROFILING */
+
__always_inline bool free_pages_prepare(struct page *page,
unsigned int order)
{
@@ -1039,6 +1309,7 @@ __always_inline bool free_pages_prepare(struct page *page,
bool skip_kasan_poison = should_skip_kasan_poison(page);
bool init = want_init_on_free();
bool compound = PageCompound(page);
+ struct folio *folio = page_folio(page);
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -1048,11 +1319,32 @@ __always_inline bool free_pages_prepare(struct page *page,
if (memcg_kmem_online() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
+ /*
+ * In rare cases, when truncation or holepunching raced with
+ * munlock after VM_LOCKED was cleared, Mlocked may still be
+ * found set here. This does not indicate a problem, unless
+ * "unevictable_pgs_cleared" appears worryingly large.
+ */
+ if (unlikely(folio_test_mlocked(folio))) {
+ long nr_pages = folio_nr_pages(folio);
+
+ __folio_clear_mlocked(folio);
+ zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
+ count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
+ }
+
if (unlikely(PageHWPoison(page)) && !order) {
/* Do not let hwpoison pages hit pcplists/buddy */
reset_page_owner(page, order);
page_table_check_free(page, order);
pgalloc_tag_sub(page, 1 << order);
+
+ /*
+ * The page is isolated and accounted for.
+ * Mark the codetag as empty to avoid accounting error
+ * when the page is freed by unpoison_memory().
+ */
+ clear_page_tag_ref(page);
return false;
}
@@ -1065,8 +1357,12 @@ __always_inline bool free_pages_prepare(struct page *page,
if (unlikely(order)) {
int i;
- if (compound)
- page[1].flags &= ~PAGE_FLAGS_SECOND;
+ if (compound) {
+ page[1].flags.f &= ~PAGE_FLAGS_SECOND;
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+ folio->_nr_pages = 0;
+#endif
+ }
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_page_prepare(page, page + i);
@@ -1076,11 +1372,17 @@ __always_inline bool free_pages_prepare(struct page *page,
continue;
}
}
- (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ (page + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
}
}
- if (PageMappingFlags(page))
- page->mapping = NULL;
+ if (folio_test_anon(folio)) {
+ mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
+ folio->mapping = NULL;
+ }
+ if (unlikely(page_has_type(page)))
+ /* Reset the page_type (which overlays _mapcount) */
+ page->page_type = UINT_MAX;
+
if (is_check_pages_enabled()) {
if (free_page_is_bad(page))
bad++;
@@ -1089,7 +1391,7 @@ __always_inline bool free_pages_prepare(struct page *page,
}
page_cpupid_reset_last(page);
- page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
reset_page_owner(page, order);
page_table_check_free(page, order);
pgalloc_tag_sub(page, 1 << order);
@@ -1191,17 +1493,73 @@ static void free_pcppages_bulk(struct zone *zone, int count,
spin_unlock_irqrestore(&zone->lock, flags);
}
+/* Split a multi-block free page into its individual pageblocks. */
+static void split_large_buddy(struct zone *zone, struct page *page,
+ unsigned long pfn, int order, fpi_t fpi)
+{
+ unsigned long end = pfn + (1 << order);
+
+ VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order));
+ /* Caller removed page from freelist, buddy info cleared! */
+ VM_WARN_ON_ONCE(PageBuddy(page));
+
+ if (order > pageblock_order)
+ order = pageblock_order;
+
+ do {
+ int mt = get_pfnblock_migratetype(page, pfn);
+
+ __free_one_page(page, pfn, zone, order, mt, fpi);
+ pfn += 1 << order;
+ if (pfn == end)
+ break;
+ page = pfn_to_page(pfn);
+ } while (1);
+}
+
+static void add_page_to_zone_llist(struct zone *zone, struct page *page,
+ unsigned int order)
+{
+ /* Remember the order */
+ page->private = order;
+ /* Add the page to the free list */
+ llist_add(&page->pcp_llist, &zone->trylock_free_pages);
+}
+
static void free_one_page(struct zone *zone, struct page *page,
unsigned long pfn, unsigned int order,
fpi_t fpi_flags)
{
+ struct llist_head *llhead;
unsigned long flags;
- int migratetype;
- spin_lock_irqsave(&zone->lock, flags);
- migratetype = get_pfnblock_migratetype(page, pfn);
- __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
+ if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ if (!spin_trylock_irqsave(&zone->lock, flags)) {
+ add_page_to_zone_llist(zone, page, order);
+ return;
+ }
+ } else {
+ spin_lock_irqsave(&zone->lock, flags);
+ }
+
+ /* The lock succeeded. Process deferred pages. */
+ llhead = &zone->trylock_free_pages;
+ if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) {
+ struct llist_node *llnode;
+ struct page *p, *tmp;
+
+ llnode = llist_del_all(llhead);
+ llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
+ unsigned int p_order = p->private;
+
+ split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
+ __count_vm_events(PGFREE, 1 << p_order);
+ }
+ }
+ split_large_buddy(zone, page, pfn, order, fpi_flags);
spin_unlock_irqrestore(&zone->lock, flags);
+
+ __count_vm_events(PGFREE, 1 << order);
}
static void __free_pages_ok(struct page *page, unsigned int order,
@@ -1210,15 +1568,12 @@ static void __free_pages_ok(struct page *page, unsigned int order,
unsigned long pfn = page_to_pfn(page);
struct zone *zone = page_zone(page);
- if (!free_pages_prepare(page, order))
- return;
-
- free_one_page(zone, page, pfn, order, fpi_flags);
-
- __count_vm_events(PGFREE, 1 << order);
+ if (free_pages_prepare(page, order))
+ free_one_page(zone, page, pfn, order, fpi_flags);
}
-void __free_pages_core(struct page *page, unsigned int order)
+void __meminit __free_pages_core(struct page *page, unsigned int order,
+ enum meminit_context context)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
@@ -1228,23 +1583,34 @@ void __free_pages_core(struct page *page, unsigned int order)
* When initializing the memmap, __init_single_page() sets the refcount
* of all pages to 1 ("allocated"/"not free"). We have to set the
* refcount of all involved pages to 0.
+ *
+ * Note that hotplugged memory pages are initialized to PageOffline().
+ * Pages freed from memblock might be marked as reserved.
*/
- prefetchw(p);
- for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
- prefetchw(p + 1);
- __ClearPageReserved(p);
- set_page_count(p, 0);
- }
- __ClearPageReserved(p);
- set_page_count(p, 0);
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) &&
+ unlikely(context == MEMINIT_HOTPLUG)) {
+ for (loop = 0; loop < nr_pages; loop++, p++) {
+ VM_WARN_ON_ONCE(PageReserved(p));
+ __ClearPageOffline(p);
+ set_page_count(p, 0);
+ }
+
+ adjust_managed_page_count(page, nr_pages);
+ } else {
+ for (loop = 0; loop < nr_pages; loop++, p++) {
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
+ }
- atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
+ /* memblock adjusts totalram_pages() manually. */
+ atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
+ }
if (page_contains_unaccepted(page, order)) {
if (order == MAX_PAGE_ORDER && __free_unaccepted(page))
return;
- accept_page(page, order);
+ accept_memory(page_to_phys(page), PAGE_SIZE << order);
}
/*
@@ -1320,11 +1686,11 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
*
* -- nyc
*/
-static inline void expand(struct zone *zone, struct page *page,
- int low, int high, int migratetype)
+static inline unsigned int expand(struct zone *zone, struct page *page, int low,
+ int high, int migratetype)
{
- unsigned long size = 1 << high;
- unsigned long nr_added = 0;
+ unsigned int size = 1 << high;
+ unsigned int nr_added = 0;
while (high > low) {
high--;
@@ -1344,14 +1710,27 @@ static inline void expand(struct zone *zone, struct page *page,
set_buddy_order(&page[size], high);
nr_added += size;
}
- account_freepages(zone, nr_added, migratetype);
+
+ return nr_added;
+}
+
+static __always_inline void page_del_and_expand(struct zone *zone,
+ struct page *page, int low,
+ int high, int migratetype)
+{
+ int nr_pages = 1 << high;
+
+ __del_page_from_free_list(page, zone, high, migratetype);
+ nr_pages -= expand(zone, page, low, high, migratetype);
+ account_freepages(zone, -nr_pages, migratetype);
}
static void check_new_page_bad(struct page *page)
{
- if (unlikely(page->flags & __PG_HWPOISON)) {
+ if (unlikely(PageHWPoison(page))) {
/* Don't complain about hwpoisoned pages */
- page_mapcount_reset(page); /* remove PageBuddy */
+ if (PageBuddy(page))
+ __ClearPageBuddy(page);
return;
}
@@ -1423,7 +1802,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
int i;
set_page_private(page, 0);
- set_page_refcounted(page);
arch_alloc_page(page, order);
debug_pagealloc_map_pages(page, 1 << order);
@@ -1445,14 +1823,9 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
* If memory tags should be zeroed
* (which happens only when memory should be initialized as well).
*/
- if (zero_tags) {
- /* Initialize both memory and memory tags. */
- for (i = 0; i != 1 << order; ++i)
- tag_clear_highpage(page + i);
+ if (zero_tags)
+ init = !tag_clear_highpages(page, 1 << order);
- /* Take note that memory was initialized by the loop above. */
- init = false;
- }
if (!should_skip_kasan_unpoison(gfp_flags) &&
kasan_unpoison_pages(page, order, init)) {
/* Take note that memory was initialized by KASAN. */
@@ -1513,8 +1886,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
- del_page_from_free_list(page, zone, current_order, migratetype);
- expand(zone, page, order, current_order, migratetype);
+
+ page_del_and_expand(zone, page, order, current_order,
+ migratetype);
trace_mm_page_alloc_zone_locked(page, order, migratetype,
pcp_allowed_order(order) &&
migratetype < MIGRATE_PCPTYPES);
@@ -1549,8 +1923,8 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
#endif
/*
- * Change the type of a block and move all its free pages to that
- * type's freelist.
+ * Move all free pages of a block to new type's freelist. Caller needs to
+ * change the block type.
*/
static int __move_freepages_block(struct zone *zone, unsigned long start_pfn,
int old_mt, int new_mt)
@@ -1582,8 +1956,6 @@ static int __move_freepages_block(struct zone *zone, unsigned long start_pfn,
pages_moved += 1 << order;
}
- set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);
-
return pages_moved;
}
@@ -1628,7 +2000,7 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page,
* migration are movable. But we don't actually try
* isolating, as that would be expensive.
*/
- if (PageLRU(page) || __PageMovable(page))
+ if (PageLRU(page) || page_has_movable_ops(page))
(*num_movable)++;
pfn++;
}
@@ -1641,18 +2013,29 @@ static int move_freepages_block(struct zone *zone, struct page *page,
int old_mt, int new_mt)
{
unsigned long start_pfn;
+ int res;
if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL))
return -1;
- return __move_freepages_block(zone, start_pfn, old_mt, new_mt);
+ res = __move_freepages_block(zone, start_pfn, old_mt, new_mt);
+ set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);
+
+ return res;
+
}
#ifdef CONFIG_MEMORY_ISOLATION
/* Look for a buddy that straddles start_pfn */
static unsigned long find_large_buddy(unsigned long start_pfn)
{
- int order = 0;
+ /*
+ * If start_pfn is not an order-0 PageBuddy, next PageBuddy containing
+ * start_pfn has minimal order of __ffs(start_pfn) + 1. Start checking
+ * the order with __ffs(start_pfn). If start_pfn is order-0 PageBuddy,
+ * the starting order does not matter.
+ */
+ int order = start_pfn ? __ffs(start_pfn) : MAX_PAGE_ORDER;
struct page *page;
unsigned long pfn = start_pfn;
@@ -1673,32 +2056,19 @@ static unsigned long find_large_buddy(unsigned long start_pfn)
return start_pfn;
}
-/* Split a multi-block free page into its individual pageblocks */
-static void split_large_buddy(struct zone *zone, struct page *page,
- unsigned long pfn, int order)
+static inline void toggle_pageblock_isolate(struct page *page, bool isolate)
{
- unsigned long end_pfn = pfn + (1 << order);
-
- VM_WARN_ON_ONCE(order <= pageblock_order);
- VM_WARN_ON_ONCE(pfn & (pageblock_nr_pages - 1));
-
- /* Caller removed page from freelist, buddy info cleared! */
- VM_WARN_ON_ONCE(PageBuddy(page));
-
- while (pfn != end_pfn) {
- int mt = get_pfnblock_migratetype(page, pfn);
-
- __free_one_page(page, pfn, zone, pageblock_order, mt, FPI_NONE);
- pfn += pageblock_nr_pages;
- page = pfn_to_page(pfn);
- }
+ if (isolate)
+ set_pageblock_isolate(page);
+ else
+ clear_pageblock_isolate(page);
}
/**
- * move_freepages_block_isolate - move free pages in block for page isolation
+ * __move_freepages_block_isolate - move free pages in block for page isolation
* @zone: the zone
* @page: the pageblock page
- * @migratetype: migratetype to set on the pageblock
+ * @isolate: to isolate the given pageblock or unisolate it
*
* This is similar to move_freepages_block(), but handles the special
* case encountered in page isolation, where the block of interest
@@ -1713,10 +2083,19 @@ static void split_large_buddy(struct zone *zone, struct page *page,
*
* Returns %true if pages could be moved, %false otherwise.
*/
-bool move_freepages_block_isolate(struct zone *zone, struct page *page,
- int migratetype)
+static bool __move_freepages_block_isolate(struct zone *zone,
+ struct page *page, bool isolate)
{
- unsigned long start_pfn, pfn;
+ unsigned long start_pfn, buddy_pfn;
+ int from_mt;
+ int to_mt;
+ struct page *buddy;
+
+ if (isolate == get_pageblock_isolate(page)) {
+ VM_WARN_ONCE(1, "%s a pageblock that is already in that state",
+ isolate ? "Isolate" : "Unisolate");
+ return false;
+ }
if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL))
return false;
@@ -1725,35 +2104,47 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page,
if (pageblock_order == MAX_PAGE_ORDER)
goto move;
- /* We're a tail block in a larger buddy */
- pfn = find_large_buddy(start_pfn);
- if (pfn != start_pfn) {
- struct page *buddy = pfn_to_page(pfn);
+ buddy_pfn = find_large_buddy(start_pfn);
+ buddy = pfn_to_page(buddy_pfn);
+ /* We're a part of a larger buddy */
+ if (PageBuddy(buddy) && buddy_order(buddy) > pageblock_order) {
int order = buddy_order(buddy);
del_page_from_free_list(buddy, zone, order,
- get_pfnblock_migratetype(buddy, pfn));
- set_pageblock_migratetype(page, migratetype);
- split_large_buddy(zone, buddy, pfn, order);
+ get_pfnblock_migratetype(buddy, buddy_pfn));
+ toggle_pageblock_isolate(page, isolate);
+ split_large_buddy(zone, buddy, buddy_pfn, order, FPI_NONE);
return true;
}
- /* We're the starting block of a larger buddy */
- if (PageBuddy(page) && buddy_order(page) > pageblock_order) {
- int order = buddy_order(page);
-
- del_page_from_free_list(page, zone, order,
- get_pfnblock_migratetype(page, pfn));
- set_pageblock_migratetype(page, migratetype);
- split_large_buddy(zone, page, pfn, order);
- return true;
- }
move:
- __move_freepages_block(zone, start_pfn,
- get_pfnblock_migratetype(page, start_pfn),
- migratetype);
+ /* Use MIGRATETYPE_MASK to get non-isolate migratetype */
+ if (isolate) {
+ from_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page),
+ MIGRATETYPE_MASK);
+ to_mt = MIGRATE_ISOLATE;
+ } else {
+ from_mt = MIGRATE_ISOLATE;
+ to_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page),
+ MIGRATETYPE_MASK);
+ }
+
+ __move_freepages_block(zone, start_pfn, from_mt, to_mt);
+ toggle_pageblock_isolate(pfn_to_page(start_pfn), isolate);
+
return true;
}
+
+bool pageblock_isolate_and_move_free_pages(struct zone *zone, struct page *page)
+{
+ return __move_freepages_block_isolate(zone, page, true);
+}
+
+bool pageblock_unisolate_and_move_free_pages(struct zone *zone, struct page *page)
+{
+ return __move_freepages_block_isolate(zone, page, false);
+}
+
#endif /* CONFIG_MEMORY_ISOLATION */
static void change_pageblock_range(struct page *pageblock_page,
@@ -1767,39 +2158,6 @@ static void change_pageblock_range(struct page *pageblock_page,
}
}
-/*
- * When we are falling back to another migratetype during allocation, try to
- * steal extra free pages from the same pageblocks to satisfy further
- * allocations, instead of polluting multiple pageblocks.
- *
- * If we are stealing a relatively large buddy page, it is likely there will
- * be more free pages in the pageblock, so try to steal them all. For
- * reclaimable and unmovable allocations, we steal regardless of page size,
- * as fragmentation caused by those allocations polluting movable pageblocks
- * is worse than movable allocations stealing from unmovable and reclaimable
- * pageblocks.
- */
-static bool can_steal_fallback(unsigned int order, int start_mt)
-{
- /*
- * Leaving this order check is intended, although there is
- * relaxed order check in next check. The reason is that
- * we can actually steal whole pageblock if this condition met,
- * but, below check doesn't guarantee it and that is just heuristic
- * so could be changed anytime.
- */
- if (order >= pageblock_order)
- return true;
-
- if (order >= pageblock_order / 2 ||
- start_mt == MIGRATE_RECLAIMABLE ||
- start_mt == MIGRATE_UNMOVABLE ||
- page_group_by_mobility_disabled)
- return true;
-
- return false;
-}
-
static inline bool boost_watermark(struct zone *zone)
{
unsigned long max_boost;
@@ -1838,36 +2196,102 @@ static inline bool boost_watermark(struct zone *zone)
}
/*
- * This function implements actual steal behaviour. If order is large enough, we
- * can claim the whole pageblock for the requested migratetype. If not, we check
- * the pageblock for constituent pages; if at least half of the pages are free
- * or compatible, we can still claim the whole block, so pages freed in the
- * future will be put on the correct free list. Otherwise, we isolate exactly
- * the order we need from the fallback block and leave its migratetype alone.
+ * When we are falling back to another migratetype during allocation, should we
+ * try to claim an entire block to satisfy further allocations, instead of
+ * polluting multiple pageblocks?
*/
-static struct page *
-steal_suitable_fallback(struct zone *zone, struct page *page,
- int current_order, int order, int start_type,
- unsigned int alloc_flags, bool whole_block)
+static bool should_try_claim_block(unsigned int order, int start_mt)
{
- int free_pages, movable_pages, alike_pages;
- unsigned long start_pfn;
- int block_type;
+ /*
+ * Leaving this order check is intended, although there is
+ * relaxed order check in next check. The reason is that
+ * we can actually claim the whole pageblock if this condition met,
+ * but, below check doesn't guarantee it and that is just heuristic
+ * so could be changed anytime.
+ */
+ if (order >= pageblock_order)
+ return true;
+
+ /*
+ * Above a certain threshold, always try to claim, as it's likely there
+ * will be more free pages in the pageblock.
+ */
+ if (order >= pageblock_order / 2)
+ return true;
+
+ /*
+ * Unmovable/reclaimable allocations would cause permanent
+ * fragmentations if they fell back to allocating from a movable block
+ * (polluting it), so we try to claim the whole block regardless of the
+ * allocation size. Later movable allocations can always steal from this
+ * block, which is less problematic.
+ */
+ if (start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE)
+ return true;
- block_type = get_pageblock_migratetype(page);
+ if (page_group_by_mobility_disabled)
+ return true;
/*
- * This can happen due to races and we want to prevent broken
- * highatomic accounting.
+ * Movable pages won't cause permanent fragmentation, so when you alloc
+ * small pages, we just need to temporarily steal unmovable or
+ * reclaimable pages that are closest to the request size. After a
+ * while, memory compaction may occur to form large contiguous pages,
+ * and the next movable allocation may not need to steal.
*/
- if (is_migrate_highatomic(block_type))
- goto single_page;
+ return false;
+}
+
+/*
+ * Check whether there is a suitable fallback freepage with requested order.
+ * If claimable is true, this function returns fallback_mt only if
+ * we would do this whole-block claiming. This would help to reduce
+ * fragmentation due to mixed migratetype pages in one pageblock.
+ */
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool claimable)
+{
+ int i;
+
+ if (claimable && !should_try_claim_block(order, migratetype))
+ return -2;
+
+ if (area->nr_free == 0)
+ return -1;
+
+ for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
+ int fallback_mt = fallbacks[migratetype][i];
+
+ if (!free_area_empty(area, fallback_mt))
+ return fallback_mt;
+ }
+
+ return -1;
+}
+
+/*
+ * This function implements actual block claiming behaviour. If order is large
+ * enough, we can claim the whole pageblock for the requested migratetype. If
+ * not, we check the pageblock for constituent pages; if at least half of the
+ * pages are free or compatible, we can still claim the whole block, so pages
+ * freed in the future will be put on the correct free list.
+ */
+static struct page *
+try_to_claim_block(struct zone *zone, struct page *page,
+ int current_order, int order, int start_type,
+ int block_type, unsigned int alloc_flags)
+{
+ int free_pages, movable_pages, alike_pages;
+ unsigned long start_pfn;
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
+ unsigned int nr_added;
+
del_page_from_free_list(page, zone, current_order, block_type);
change_pageblock_range(page, current_order, start_type);
- expand(zone, page, order, current_order, start_type);
+ nr_added = expand(zone, page, order, current_order, start_type);
+ account_freepages(zone, nr_added, start_type);
return page;
}
@@ -1879,14 +2303,10 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
- /* We are not allowed to try stealing from the whole block */
- if (!whole_block)
- goto single_page;
-
/* moving whole block can fail due to zone boundary conditions */
if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages,
&movable_pages))
- goto single_page;
+ return NULL;
/*
* Determine how many pages are compatible with our allocation.
@@ -1916,205 +2336,23 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled) {
__move_freepages_block(zone, start_pfn, block_type, start_type);
+ set_pageblock_migratetype(pfn_to_page(start_pfn), start_type);
return __rmqueue_smallest(zone, order, start_type);
}
-single_page:
- del_page_from_free_list(page, zone, current_order, block_type);
- expand(zone, page, order, current_order, block_type);
- return page;
-}
-
-/*
- * Check whether there is a suitable fallback freepage with requested order.
- * If only_stealable is true, this function returns fallback_mt only if
- * we can steal other freepages all together. This would help to reduce
- * fragmentation due to mixed migratetype pages in one pageblock.
- */
-int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool only_stealable, bool *can_steal)
-{
- int i;
- int fallback_mt;
-
- if (area->nr_free == 0)
- return -1;
-
- *can_steal = false;
- for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
- fallback_mt = fallbacks[migratetype][i];
- if (free_area_empty(area, fallback_mt))
- continue;
-
- if (can_steal_fallback(order, migratetype))
- *can_steal = true;
-
- if (!only_stealable)
- return fallback_mt;
-
- if (*can_steal)
- return fallback_mt;
- }
-
- return -1;
-}
-
-/*
- * Reserve the pageblock(s) surrounding an allocation request for
- * exclusive use of high-order atomic allocations if there are no
- * empty page blocks that contain a page with a suitable order
- */
-static void reserve_highatomic_pageblock(struct page *page, int order,
- struct zone *zone)
-{
- int mt;
- unsigned long max_managed, flags;
-
- /*
- * The number reserved as: minimum is 1 pageblock, maximum is
- * roughly 1% of a zone. But if 1% of a zone falls below a
- * pageblock size, then don't reserve any pageblocks.
- * Check is race-prone but harmless.
- */
- if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages)
- return;
- max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages);
- if (zone->nr_reserved_highatomic >= max_managed)
- return;
-
- spin_lock_irqsave(&zone->lock, flags);
-
- /* Recheck the nr_reserved_highatomic limit under the lock */
- if (zone->nr_reserved_highatomic >= max_managed)
- goto out_unlock;
-
- /* Yoink! */
- mt = get_pageblock_migratetype(page);
- /* Only reserve normal pageblocks (i.e., they can merge with others) */
- if (!migratetype_is_mergeable(mt))
- goto out_unlock;
-
- if (order < pageblock_order) {
- if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1)
- goto out_unlock;
- zone->nr_reserved_highatomic += pageblock_nr_pages;
- } else {
- change_pageblock_range(page, order, MIGRATE_HIGHATOMIC);
- zone->nr_reserved_highatomic += 1 << order;
- }
-
-out_unlock:
- spin_unlock_irqrestore(&zone->lock, flags);
-}
-
-/*
- * Used when an allocation is about to fail under memory pressure. This
- * potentially hurts the reliability of high-order allocations when under
- * intense memory pressure but failed atomic allocations should be easier
- * to recover from than an OOM.
- *
- * If @force is true, try to unreserve pageblocks even though highatomic
- * pageblock is exhausted.
- */
-static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
- bool force)
-{
- struct zonelist *zonelist = ac->zonelist;
- unsigned long flags;
- struct zoneref *z;
- struct zone *zone;
- struct page *page;
- int order;
- int ret;
-
- for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
- ac->nodemask) {
- /*
- * Preserve at least one pageblock unless memory pressure
- * is really high.
- */
- if (!force && zone->nr_reserved_highatomic <=
- pageblock_nr_pages)
- continue;
-
- spin_lock_irqsave(&zone->lock, flags);
- for (order = 0; order < NR_PAGE_ORDERS; order++) {
- struct free_area *area = &(zone->free_area[order]);
- int mt;
-
- page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
- if (!page)
- continue;
-
- mt = get_pageblock_migratetype(page);
- /*
- * In page freeing path, migratetype change is racy so
- * we can counter several free pages in a pageblock
- * in this loop although we changed the pageblock type
- * from highatomic to ac->migratetype. So we should
- * adjust the count once.
- */
- if (is_migrate_highatomic(mt)) {
- unsigned long size;
- /*
- * It should never happen but changes to
- * locking could inadvertently allow a per-cpu
- * drain to add pages to MIGRATE_HIGHATOMIC
- * while unreserving so be safe and watch for
- * underflows.
- */
- size = max(pageblock_nr_pages, 1UL << order);
- size = min(size, zone->nr_reserved_highatomic);
- zone->nr_reserved_highatomic -= size;
- }
-
- /*
- * Convert to ac->migratetype and avoid the normal
- * pageblock stealing heuristics. Minimally, the caller
- * is doing the work and needs the pages. More
- * importantly, if the block was always converted to
- * MIGRATE_UNMOVABLE or another type then the number
- * of pageblocks that cannot be completely freed
- * may increase.
- */
- if (order < pageblock_order)
- ret = move_freepages_block(zone, page, mt,
- ac->migratetype);
- else {
- move_to_free_list(page, zone, order, mt,
- ac->migratetype);
- change_pageblock_range(page, order,
- ac->migratetype);
- ret = 1;
- }
- /*
- * Reserving the block(s) already succeeded,
- * so this should not fail on zone boundaries.
- */
- WARN_ON_ONCE(ret == -1);
- if (ret > 0) {
- spin_unlock_irqrestore(&zone->lock, flags);
- return ret;
- }
- }
- spin_unlock_irqrestore(&zone->lock, flags);
- }
-
- return false;
+ return NULL;
}
/*
- * Try finding a free buddy page on the fallback list and put it on the free
- * list of requested migratetype, possibly along with other pages from the same
- * block, depending on fragmentation avoidance heuristics. Returns true if
- * fallback was found so that __rmqueue_smallest() can grab it.
+ * Try to allocate from some fallback migratetype by claiming the entire block,
+ * i.e. converting it to the allocation's start migratetype.
*
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
*/
static __always_inline struct page *
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+__rmqueue_claim(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
struct free_area *area;
@@ -2122,7 +2360,6 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
int min_order = order;
struct page *page;
int fallback_mt;
- bool can_steal;
/*
* Do not steal pages from freelists belonging to other pageblocks
@@ -2141,62 +2378,73 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false, &can_steal);
+ start_migratetype, true);
+
+ /* No block in that order */
if (fallback_mt == -1)
continue;
- /*
- * We cannot steal all free pages from the pageblock and the
- * requested migratetype is movable. In that case it's better to
- * steal and split the smallest available page instead of the
- * largest available page, because even if the next movable
- * allocation falls back into a different pageblock than this
- * one, it won't cause permanent fragmentation.
- */
- if (!can_steal && start_migratetype == MIGRATE_MOVABLE
- && current_order > order)
- goto find_smallest;
+ /* Advanced into orders too low to claim, abort */
+ if (fallback_mt == -2)
+ break;
- goto do_steal;
+ page = get_page_from_free_area(area, fallback_mt);
+ page = try_to_claim_block(zone, page, current_order, order,
+ start_migratetype, fallback_mt,
+ alloc_flags);
+ if (page) {
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
+ }
}
return NULL;
+}
+
+/*
+ * Try to steal a single page from some fallback migratetype. Leave the rest of
+ * the block as its current migratetype, potentially causing fragmentation.
+ */
+static __always_inline struct page *
+__rmqueue_steal(struct zone *zone, int order, int start_migratetype)
+{
+ struct free_area *area;
+ int current_order;
+ struct page *page;
+ int fallback_mt;
-find_smallest:
for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false, &can_steal);
- if (fallback_mt != -1)
- break;
- }
-
- /*
- * This should not happen - we already found a suitable fallback
- * when looking for the largest page.
- */
- VM_BUG_ON(current_order > MAX_PAGE_ORDER);
-
-do_steal:
- page = get_page_from_free_area(area, fallback_mt);
-
- /* take off list, maybe claim block, expand remainder */
- page = steal_suitable_fallback(zone, page, current_order, order,
- start_migratetype, alloc_flags, can_steal);
+ start_migratetype, false);
+ if (fallback_mt == -1)
+ continue;
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, fallback_mt);
+ page = get_page_from_free_area(area, fallback_mt);
+ page_del_and_expand(zone, page, order, current_order, fallback_mt);
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
+ }
- return page;
+ return NULL;
}
+enum rmqueue_mode {
+ RMQUEUE_NORMAL,
+ RMQUEUE_CMA,
+ RMQUEUE_CLAIM,
+ RMQUEUE_STEAL,
+};
+
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
- unsigned int alloc_flags)
+ unsigned int alloc_flags, enum rmqueue_mode *mode)
{
struct page *page;
@@ -2215,16 +2463,48 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
}
}
- page = __rmqueue_smallest(zone, order, migratetype);
- if (unlikely(!page)) {
- if (alloc_flags & ALLOC_CMA)
+ /*
+ * First try the freelists of the requested migratetype, then try
+ * fallbacks modes with increasing levels of fragmentation risk.
+ *
+ * The fallback logic is expensive and rmqueue_bulk() calls in
+ * a loop with the zone->lock held, meaning the freelists are
+ * not subject to any outside changes. Remember in *mode where
+ * we found pay dirt, to save us the search on the next call.
+ */
+ switch (*mode) {
+ case RMQUEUE_NORMAL:
+ page = __rmqueue_smallest(zone, order, migratetype);
+ if (page)
+ return page;
+ fallthrough;
+ case RMQUEUE_CMA:
+ if (alloc_flags & ALLOC_CMA) {
page = __rmqueue_cma_fallback(zone, order);
-
- if (!page)
- page = __rmqueue_fallback(zone, order, migratetype,
- alloc_flags);
+ if (page) {
+ *mode = RMQUEUE_CMA;
+ return page;
+ }
+ }
+ fallthrough;
+ case RMQUEUE_CLAIM:
+ page = __rmqueue_claim(zone, order, migratetype, alloc_flags);
+ if (page) {
+ /* Replenished preferred freelist, back to normal mode. */
+ *mode = RMQUEUE_NORMAL;
+ return page;
+ }
+ fallthrough;
+ case RMQUEUE_STEAL:
+ if (!(alloc_flags & ALLOC_NOFRAGMENT)) {
+ page = __rmqueue_steal(zone, order, migratetype);
+ if (page) {
+ *mode = RMQUEUE_STEAL;
+ return page;
+ }
+ }
}
- return page;
+ return NULL;
}
/*
@@ -2236,13 +2516,19 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
unsigned long flags;
int i;
- spin_lock_irqsave(&zone->lock, flags);
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK)) {
+ if (!spin_trylock_irqsave(&zone->lock, flags))
+ return 0;
+ } else {
+ spin_lock_irqsave(&zone->lock, flags);
+ }
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
- alloc_flags);
+ alloc_flags, &rmqm);
if (unlikely(page == NULL))
break;
@@ -2267,10 +2553,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* Called from the vmstat counter updater to decay the PCP high.
* Return whether there are addition works to do.
*/
-int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
+bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
{
- int high_min, to_drain, batch;
- int todo = 0;
+ int high_min, to_drain, to_drain_batched, batch;
+ bool todo = false;
high_min = READ_ONCE(pcp->high_min);
batch = READ_ONCE(pcp->batch);
@@ -2283,15 +2569,18 @@ int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
pcp->high - (pcp->high >> 3), high_min);
if (pcp->high > high_min)
- todo++;
+ todo = true;
}
to_drain = pcp->count - pcp->high;
- if (to_drain > 0) {
+ while (to_drain > 0) {
+ to_drain_batched = min(to_drain, batch);
spin_lock(&pcp->lock);
- free_pcppages_bulk(zone, to_drain, pcp, 0);
+ free_pcppages_bulk(zone, to_drain_batched, pcp, 0);
spin_unlock(&pcp->lock);
- todo++;
+ todo = true;
+
+ to_drain -= to_drain_batched;
}
return todo;
@@ -2323,16 +2612,20 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
- int count = READ_ONCE(pcp->count);
-
- while (count) {
- int to_drain = min(count, pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX);
- count -= to_drain;
+ int count;
+ do {
spin_lock(&pcp->lock);
- free_pcppages_bulk(zone, to_drain, pcp, 0);
+ count = pcp->count;
+ if (count) {
+ int to_drain = min(count,
+ pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX);
+
+ free_pcppages_bulk(zone, to_drain, pcp, 0);
+ count -= to_drain;
+ }
spin_unlock(&pcp->lock);
- }
+ } while (count);
}
/*
@@ -2521,12 +2814,22 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
return high;
}
-static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
- struct page *page, int migratetype,
- unsigned int order)
+/*
+ * Tune pcp alloc factor and adjust count & free_count. Free pages to bring the
+ * pcp's watermarks below high.
+ *
+ * May return a freed pcp, if during page freeing the pcp spinlock cannot be
+ * reacquired. Return true if pcp is locked, false otherwise.
+ */
+static bool free_frozen_page_commit(struct zone *zone,
+ struct per_cpu_pages *pcp, struct page *page, int migratetype,
+ unsigned int order, fpi_t fpi_flags, unsigned long *UP_flags)
{
int high, batch;
+ int to_free, to_free_batched;
int pindex;
+ int cpu = smp_processor_id();
+ int ret = true;
bool free_high = false;
/*
@@ -2548,40 +2851,92 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
* stops will be drained from vmstat refresh context.
*/
if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
- free_high = (pcp->free_count >= batch &&
+ free_high = (pcp->free_count >= (batch + pcp->high_min / 2) &&
(pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
(!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
- pcp->count >= READ_ONCE(batch)));
+ pcp->count >= batch));
pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
} else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
}
if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
pcp->free_count += (1 << order);
+
+ if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ /*
+ * Do not attempt to take a zone lock. Let pcp->count get
+ * over high mark temporarily.
+ */
+ return true;
+ }
+
high = nr_pcp_high(pcp, zone, batch, free_high);
- if (pcp->count >= high) {
- free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
- pcp, pindex);
- if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
- zone_watermark_ok(zone, 0, high_wmark_pages(zone),
- ZONE_MOVABLE, 0))
- clear_bit(ZONE_BELOW_HIGH, &zone->flags);
+ if (pcp->count < high)
+ return true;
+
+ to_free = nr_pcp_free(pcp, batch, high, free_high);
+ while (to_free > 0 && pcp->count > 0) {
+ to_free_batched = min(to_free, batch);
+ free_pcppages_bulk(zone, to_free_batched, pcp, pindex);
+ to_free -= to_free_batched;
+
+ if (to_free == 0 || pcp->count == 0)
+ break;
+
+ pcp_spin_unlock(pcp, *UP_flags);
+
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset, *UP_flags);
+ if (!pcp) {
+ ret = false;
+ break;
+ }
+
+ /*
+ * Check if this thread has been migrated to a different CPU.
+ * If that is the case, give up and indicate that the pcp is
+ * returned in an unlocked state.
+ */
+ if (smp_processor_id() != cpu) {
+ pcp_spin_unlock(pcp, *UP_flags);
+ ret = false;
+ break;
+ }
+ }
+
+ if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
+ zone_watermark_ok(zone, 0, high_wmark_pages(zone),
+ ZONE_MOVABLE, 0)) {
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ clear_bit(ZONE_BELOW_HIGH, &zone->flags);
+
+ /*
+ * Assume that memory pressure on this node is gone and may be
+ * in a reclaimable state. If a memory fallback node exists,
+ * direct reclaim may not have been triggered, causing a
+ * 'hopeless node' to stay in that state for a while. Let
+ * kswapd work again by resetting kswapd_failures.
+ */
+ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES &&
+ next_memory_node(pgdat->node_id) < MAX_NUMNODES)
+ atomic_set(&pgdat->kswapd_failures, 0);
}
+ return ret;
}
/*
* Free a pcp page
*/
-void free_unref_page(struct page *page, unsigned int order)
+static void __free_frozen_pages(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
{
- unsigned long __maybe_unused UP_flags;
+ unsigned long UP_flags;
struct per_cpu_pages *pcp;
struct zone *zone;
unsigned long pfn = page_to_pfn(page);
int migratetype;
if (!pcp_allowed_order(order)) {
- __free_pages_ok(page, order, FPI_NONE);
+ __free_pages_ok(page, order, fpi_flags);
return;
}
@@ -2595,25 +2950,35 @@ void free_unref_page(struct page *page, unsigned int order)
* get those areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
+ zone = page_zone(page);
migratetype = get_pfnblock_migratetype(page, pfn);
if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(page_zone(page), page, pfn, order, FPI_NONE);
+ free_one_page(zone, page, pfn, order, fpi_flags);
return;
}
migratetype = MIGRATE_MOVABLE;
}
- zone = page_zone(page);
- pcp_trylock_prepare(UP_flags);
- pcp = pcp_spin_trylock(zone->per_cpu_pageset);
+ if (unlikely((fpi_flags & FPI_TRYLOCK) && IS_ENABLED(CONFIG_PREEMPT_RT)
+ && (in_nmi() || in_hardirq()))) {
+ add_page_to_zone_llist(zone, page, order);
+ return;
+ }
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
if (pcp) {
- free_unref_page_commit(zone, pcp, page, migratetype, order);
- pcp_spin_unlock(pcp);
+ if (!free_frozen_page_commit(zone, pcp, page, migratetype,
+ order, fpi_flags, &UP_flags))
+ return;
+ pcp_spin_unlock(pcp, UP_flags);
} else {
- free_one_page(zone, page, pfn, order, FPI_NONE);
+ free_one_page(zone, page, pfn, order, fpi_flags);
}
- pcp_trylock_finish(UP_flags);
+}
+
+void free_frozen_pages(struct page *page, unsigned int order)
+{
+ __free_frozen_pages(page, order, FPI_NONE);
}
/*
@@ -2621,7 +2986,7 @@ void free_unref_page(struct page *page, unsigned int order)
*/
void free_unref_folios(struct folio_batch *folios)
{
- unsigned long __maybe_unused UP_flags;
+ unsigned long UP_flags;
struct per_cpu_pages *pcp = NULL;
struct zone *locked_zone = NULL;
int i, j;
@@ -2632,8 +2997,6 @@ void free_unref_folios(struct folio_batch *folios)
unsigned long pfn = folio_pfn(folio);
unsigned int order = folio_order(folio);
- if (order > 0 && folio_test_large_rmappable(folio))
- folio_undo_large_rmappable(folio);
if (!free_pages_prepare(&folio->page, order))
continue;
/*
@@ -2666,15 +3029,14 @@ void free_unref_folios(struct folio_batch *folios)
if (zone != locked_zone ||
is_migrate_isolate(migratetype)) {
if (pcp) {
- pcp_spin_unlock(pcp);
- pcp_trylock_finish(UP_flags);
+ pcp_spin_unlock(pcp, UP_flags);
locked_zone = NULL;
pcp = NULL;
}
/*
* Free isolated pages directly to the
- * allocator, see comment in free_unref_page.
+ * allocator, see comment in free_frozen_pages.
*/
if (is_migrate_isolate(migratetype)) {
free_one_page(zone, &folio->page, pfn,
@@ -2686,10 +3048,8 @@ void free_unref_folios(struct folio_batch *folios)
* trylock is necessary as folios may be getting freed
* from IRQ or SoftIRQ context after an IO completion.
*/
- pcp_trylock_prepare(UP_flags);
- pcp = pcp_spin_trylock(zone->per_cpu_pageset);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
if (unlikely(!pcp)) {
- pcp_trylock_finish(UP_flags);
free_one_page(zone, &folio->page, pfn,
order, FPI_NONE);
continue;
@@ -2705,14 +3065,15 @@ void free_unref_folios(struct folio_batch *folios)
migratetype = MIGRATE_MOVABLE;
trace_mm_page_free_batched(&folio->page);
- free_unref_page_commit(zone, pcp, &folio->page, migratetype,
- order);
+ if (!free_frozen_page_commit(zone, pcp, &folio->page,
+ migratetype, order, FPI_NONE, &UP_flags)) {
+ pcp = NULL;
+ locked_zone = NULL;
+ }
}
- if (pcp) {
- pcp_spin_unlock(pcp);
- pcp_trylock_finish(UP_flags);
- }
+ if (pcp)
+ pcp_spin_unlock(pcp, UP_flags);
folio_batch_reinit(folios);
}
@@ -2734,8 +3095,8 @@ void split_page(struct page *page, unsigned int order)
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
split_page_owner(page, order, 0);
- pgalloc_tag_split(page, 1 << order);
- split_page_memcg(page, order, 0);
+ pgalloc_tag_split(page_folio(page), order, 0);
+ split_page_memcg(page, order);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -2837,19 +3198,26 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
do {
page = NULL;
- spin_lock_irqsave(&zone->lock, flags);
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK)) {
+ if (!spin_trylock_irqsave(&zone->lock, flags))
+ return NULL;
+ } else {
+ spin_lock_irqsave(&zone->lock, flags);
+ }
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
- page = __rmqueue(zone, order, migratetype, alloc_flags);
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
+
+ page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm);
/*
- * If the allocation fails, allow OOM handling access
- * to HIGHATOMIC reserves as failing now is worse than
- * failing a high-order atomic allocation in the
- * future.
+ * If the allocation fails, allow OOM handling and
+ * order-0 (atomic) allocs access to HIGHATOMIC
+ * reserves as failing now is worse than failing a
+ * high-order atomic allocation in the future.
*/
- if (!page && (alloc_flags & ALLOC_OOM))
+ if (!page && (alloc_flags & (ALLOC_OOM|ALLOC_NON_BLOCK)))
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
@@ -2956,15 +3324,12 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct per_cpu_pages *pcp;
struct list_head *list;
struct page *page;
- unsigned long __maybe_unused UP_flags;
+ unsigned long UP_flags;
/* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
- pcp_trylock_prepare(UP_flags);
- pcp = pcp_spin_trylock(zone->per_cpu_pageset);
- if (!pcp) {
- pcp_trylock_finish(UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
+ if (!pcp)
return NULL;
- }
/*
* On allocation, reduce the number of pages that are batch freed.
@@ -2974,8 +3339,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
pcp->free_count >>= 1;
list = &pcp->lists[order_to_pindex(migratetype, order)];
page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
- pcp_spin_unlock(pcp);
- pcp_trylock_finish(UP_flags);
+ pcp_spin_unlock(pcp, UP_flags);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
@@ -2990,7 +3354,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
/*
* Do not instrument rmqueue() with KMSAN. This function may call
- * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
+ * __msan_poison_alloca() through a call to set_pfnblock_migratetype().
* If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
* may call rmqueue() again, which will result in a deadlock.
*/
@@ -3003,12 +3367,6 @@ struct page *rmqueue(struct zone *preferred_zone,
{
struct page *page;
- /*
- * We most definitely don't want callers attempting to
- * allocate greater than order-1 page units with __GFP_NOFAIL.
- */
- WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
-
if (likely(pcp_allowed_order(order))) {
page = rmqueue_pcplist(preferred_zone, zone, order,
migratetype, alloc_flags);
@@ -3031,11 +3389,141 @@ out:
return page;
}
-noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+/*
+ * Reserve the pageblock(s) surrounding an allocation request for
+ * exclusive use of high-order atomic allocations if there are no
+ * empty page blocks that contain a page with a suitable order
+ */
+static void reserve_highatomic_pageblock(struct page *page, int order,
+ struct zone *zone)
{
- return __should_fail_alloc_page(gfp_mask, order);
+ int mt;
+ unsigned long max_managed, flags;
+
+ /*
+ * The number reserved as: minimum is 1 pageblock, maximum is
+ * roughly 1% of a zone. But if 1% of a zone falls below a
+ * pageblock size, then don't reserve any pageblocks.
+ * Check is race-prone but harmless.
+ */
+ if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages)
+ return;
+ max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages);
+ if (zone->nr_reserved_highatomic >= max_managed)
+ return;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ /* Recheck the nr_reserved_highatomic limit under the lock */
+ if (zone->nr_reserved_highatomic >= max_managed)
+ goto out_unlock;
+
+ /* Yoink! */
+ mt = get_pageblock_migratetype(page);
+ /* Only reserve normal pageblocks (i.e., they can merge with others) */
+ if (!migratetype_is_mergeable(mt))
+ goto out_unlock;
+
+ if (order < pageblock_order) {
+ if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1)
+ goto out_unlock;
+ zone->nr_reserved_highatomic += pageblock_nr_pages;
+ } else {
+ change_pageblock_range(page, order, MIGRATE_HIGHATOMIC);
+ zone->nr_reserved_highatomic += 1 << order;
+ }
+
+out_unlock:
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/*
+ * Used when an allocation is about to fail under memory pressure. This
+ * potentially hurts the reliability of high-order allocations when under
+ * intense memory pressure but failed atomic allocations should be easier
+ * to recover from than an OOM.
+ *
+ * If @force is true, try to unreserve pageblocks even though highatomic
+ * pageblock is exhausted.
+ */
+static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
+ bool force)
+{
+ struct zonelist *zonelist = ac->zonelist;
+ unsigned long flags;
+ struct zoneref *z;
+ struct zone *zone;
+ struct page *page;
+ int order;
+ int ret;
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
+ ac->nodemask) {
+ /*
+ * Preserve at least one pageblock unless memory pressure
+ * is really high.
+ */
+ if (!force && zone->nr_reserved_highatomic <=
+ pageblock_nr_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < NR_PAGE_ORDERS; order++) {
+ struct free_area *area = &(zone->free_area[order]);
+ unsigned long size;
+
+ page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
+ if (!page)
+ continue;
+
+ size = max(pageblock_nr_pages, 1UL << order);
+ /*
+ * It should never happen but changes to
+ * locking could inadvertently allow a per-cpu
+ * drain to add pages to MIGRATE_HIGHATOMIC
+ * while unreserving so be safe and watch for
+ * underflows.
+ */
+ if (WARN_ON_ONCE(size > zone->nr_reserved_highatomic))
+ size = zone->nr_reserved_highatomic;
+ zone->nr_reserved_highatomic -= size;
+
+ /*
+ * Convert to ac->migratetype and avoid the normal
+ * pageblock stealing heuristics. Minimally, the caller
+ * is doing the work and needs the pages. More
+ * importantly, if the block was always converted to
+ * MIGRATE_UNMOVABLE or another type then the number
+ * of pageblocks that cannot be completely freed
+ * may increase.
+ */
+ if (order < pageblock_order)
+ ret = move_freepages_block(zone, page,
+ MIGRATE_HIGHATOMIC,
+ ac->migratetype);
+ else {
+ move_to_free_list(page, zone, order,
+ MIGRATE_HIGHATOMIC,
+ ac->migratetype);
+ change_pageblock_range(page, order,
+ ac->migratetype);
+ ret = 1;
+ }
+ /*
+ * Reserving the block(s) already succeeded,
+ * so this should not fail on zone boundaries.
+ */
+ WARN_ON_ONCE(ret == -1);
+ if (ret > 0) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return ret;
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+
+ return false;
}
-ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
static inline long __zone_watermark_unusable_free(struct zone *z,
unsigned int order, unsigned int alloc_flags)
@@ -3044,20 +3532,16 @@ static inline long __zone_watermark_unusable_free(struct zone *z,
/*
* If the caller does not have rights to reserves below the min
- * watermark then subtract the high-atomic reserves. This will
- * over-estimate the size of the atomic reserve but it avoids a search.
+ * watermark then subtract the free pages reserved for highatomic.
*/
if (likely(!(alloc_flags & ALLOC_RESERVES)))
- unusable_free += z->nr_reserved_highatomic;
+ unusable_free += READ_ONCE(z->nr_free_highatomic);
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
if (!(alloc_flags & ALLOC_CMA))
unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
-#ifdef CONFIG_UNACCEPTED_MEMORY
- unusable_free += zone_page_state(z, NR_UNACCEPTED);
-#endif
return unusable_free;
}
@@ -3198,18 +3682,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
return false;
}
-bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
- unsigned long mark, int highest_zoneidx)
-{
- long free_pages = zone_page_state(z, NR_FREE_PAGES);
-
- if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
- free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
-
- return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
- free_pages);
-}
-
#ifdef CONFIG_NUMA
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
@@ -3244,6 +3716,11 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
*/
alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
+ if (defrag_mode) {
+ alloc_flags |= ALLOC_NOFRAGMENT;
+ return alloc_flags;
+ }
+
#ifdef CONFIG_ZONE_DMA32
if (!zone)
return alloc_flags;
@@ -3289,11 +3766,13 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
struct pglist_data *last_pgdat = NULL;
bool last_pgdat_dirty_ok = false;
bool no_fallback;
+ bool skip_kswapd_nodes = nr_online_nodes > 1;
+ bool skipped_kswapd_nodes = false;
retry:
/*
* Scan zonelist, looking for a zone with enough free.
- * See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
+ * See also cpuset_current_node_allowed() comment in kernel/cgroup/cpuset.c.
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
@@ -3335,8 +3814,8 @@ retry:
continue;
}
- if (no_fallback && nr_online_nodes > 1 &&
- zone != ac->preferred_zoneref->zone) {
+ if (no_fallback && !defrag_mode && nr_online_nodes > 1 &&
+ zone != zonelist_zone(ac->preferred_zoneref)) {
int local_nid;
/*
@@ -3344,7 +3823,7 @@ retry:
* fragmenting fallbacks. Locality is more important
* than fragmentation avoidance.
*/
- local_nid = zone_to_nid(ac->preferred_zoneref->zone);
+ local_nid = zonelist_node_idx(ac->preferred_zoneref);
if (zone_to_nid(zone) != local_nid) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
@@ -3352,6 +3831,21 @@ retry:
}
/*
+ * If kswapd is already active on a node, keep looking
+ * for other nodes that might be idle. This can happen
+ * if another process has NUMA bindings and is causing
+ * kswapd wakeups on only some nodes. Avoid accidental
+ * "node_reclaim_mode"-like behavior in this case.
+ */
+ if (skip_kswapd_nodes &&
+ !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) {
+ skipped_kswapd_nodes = true;
+ continue;
+ }
+
+ cond_accept_memory(zone, order, alloc_flags);
+
+ /*
* Detect whether the number of free pages is below high
* watermark. If so, we will decrease pcp->high and free
* PCP pages in free path to reduce the possibility of
@@ -3376,12 +3870,9 @@ check_alloc_wmark:
gfp_mask)) {
int ret;
- if (has_unaccepted_memory()) {
- if (try_to_accept_memory(zone, order))
- goto try_this_zone;
- }
+ if (cond_accept_memory(zone, order, alloc_flags))
+ goto try_this_zone;
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* Watermark failed for this zone, but see if we can
* grow this zone if it contains deferred pages.
@@ -3390,14 +3881,13 @@ check_alloc_wmark:
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
-#endif
/* Checked here to keep the fast path fast */
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
if (!node_reclaim_enabled() ||
- !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
+ !zone_allows_reclaim(zonelist_zone(ac->preferred_zoneref), zone))
continue;
ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
@@ -3419,7 +3909,7 @@ check_alloc_wmark:
}
try_this_zone:
- page = rmqueue(ac->preferred_zoneref->zone, zone, order,
+ page = rmqueue(zonelist_zone(ac->preferred_zoneref), zone, order,
gfp_mask, alloc_flags, ac->migratetype);
if (page) {
prep_new_page(page, order, gfp_mask, alloc_flags);
@@ -3433,26 +3923,31 @@ try_this_zone:
return page;
} else {
- if (has_unaccepted_memory()) {
- if (try_to_accept_memory(zone, order))
- goto try_this_zone;
- }
+ if (cond_accept_memory(zone, order, alloc_flags))
+ goto try_this_zone;
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
if (deferred_pages_enabled()) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
-#endif
}
}
/*
+ * If we skipped over nodes with active kswapds and found no
+ * idle nodes, retry and place anywhere the watermarks permit.
+ */
+ if (skip_kswapd_nodes && skipped_kswapd_nodes) {
+ skip_kswapd_nodes = false;
+ goto retry;
+ }
+
+ /*
* It's possible on a UMA machine to get through all zones that are
* fragmented. If avoiding fragmentation, reset and try again.
*/
- if (no_fallback) {
+ if (no_fallback && !defrag_mode) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
@@ -3477,6 +3972,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
filter &= ~SHOW_MEM_FILTER_NODES;
__show_mem(filter, nodemask, gfp_zone(gfp_mask));
+ mem_cgroup_show_protected_memory(NULL);
}
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
@@ -3520,7 +4016,6 @@ __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
if (!page)
page = get_page_from_freelist(gfp_mask, order,
alloc_flags, ac);
-
return page;
}
@@ -3754,7 +4249,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
}
static inline bool
-should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
+should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
enum compact_result compact_result,
enum compact_priority *compact_priority,
int *compaction_retries)
@@ -3932,15 +4427,21 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
struct zone *zone;
pg_data_t *last_pgdat = NULL;
enum zone_type highest_zoneidx = ac->highest_zoneidx;
+ unsigned int reclaim_order;
+
+ if (defrag_mode)
+ reclaim_order = max(order, pageblock_order);
+ else
+ reclaim_order = order;
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
ac->nodemask) {
if (!managed_zone(zone))
continue;
- if (last_pgdat != zone->zone_pgdat) {
- wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
- last_pgdat = zone->zone_pgdat;
- }
+ if (last_pgdat == zone->zone_pgdat)
+ continue;
+ wakeup_kswapd(zone, gfp_mask, reclaim_order, highest_zoneidx);
+ last_pgdat = zone->zone_pgdat;
}
}
@@ -3974,22 +4475,25 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
alloc_flags |= ALLOC_NON_BLOCK;
- if (order > 0)
+ if (order > 0 && (alloc_flags & ALLOC_MIN_RESERVE))
alloc_flags |= ALLOC_HIGHATOMIC;
}
/*
* Ignore cpuset mems for non-blocking __GFP_HIGH (probably
* GFP_ATOMIC) rather than fail, see the comment for
- * cpuset_node_allowed().
+ * cpuset_current_node_allowed().
*/
if (alloc_flags & ALLOC_MIN_RESERVE)
alloc_flags &= ~ALLOC_CPUSET;
- } else if (unlikely(rt_task(current)) && in_task())
+ } else if (unlikely(rt_or_dl_task(current)) && in_task())
alloc_flags |= ALLOC_MIN_RESERVE;
alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
+ if (defrag_mode)
+ alloc_flags |= ALLOC_NOFRAGMENT;
+
return alloc_flags;
}
@@ -4081,6 +4585,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
unsigned long min_wmark = min_wmark_pages(zone);
bool wmark;
+ if (cpusets_enabled() &&
+ (alloc_flags & ALLOC_CPUSET) &&
+ !__cpuset_zone_allowed(zone, gfp_mask))
+ continue;
+
available = reclaimable = zone_reclaimable_pages(zone);
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
@@ -4156,6 +4665,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
bool can_compact = gfp_compaction_allowed(gfp_mask);
+ bool nofail = gfp_mask & __GFP_NOFAIL;
const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
unsigned int alloc_flags;
@@ -4168,9 +4678,24 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned int zonelist_iter_cookie;
int reserve_flags;
+ if (unlikely(nofail)) {
+ /*
+ * Also we don't support __GFP_NOFAIL without __GFP_DIRECT_RECLAIM,
+ * otherwise, we may result in lockup.
+ */
+ WARN_ON_ONCE(!can_direct_reclaim);
+ /*
+ * PF_MEMALLOC request from this context is rather bizarre
+ * because we cannot reclaim anything and only can loop waiting
+ * for somebody to do a work for us.
+ */
+ WARN_ON_ONCE(current->flags & PF_MEMALLOC);
+ }
+
restart:
compaction_retries = 0;
no_progress_loops = 0;
+ compact_result = COMPACT_SKIPPED;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
zonelist_iter_cookie = zonelist_iter_begin();
@@ -4190,7 +4715,7 @@ restart:
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
- if (!ac->preferred_zoneref->zone)
+ if (!zonelist_zone(ac->preferred_zoneref))
goto nopage;
/*
@@ -4202,7 +4727,7 @@ restart:
struct zoneref *z = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx,
&cpuset_current_mems_allowed);
- if (!z->zone)
+ if (!zonelist_zone(z))
goto nopage;
}
@@ -4273,6 +4798,14 @@ restart:
}
retry:
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * infinite retries.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
+
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
@@ -4346,6 +4879,11 @@ retry:
&compaction_retries))
goto retry;
+ /* Reclaim/compaction failed to prevent the fallback */
+ if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT)) {
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
+ goto retry;
+ }
/*
* Deal with possible cpuset update races or zonelist updates to avoid
@@ -4385,30 +4923,16 @@ nopage:
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
* we always retry
*/
- if (gfp_mask & __GFP_NOFAIL) {
+ if (unlikely(nofail)) {
/*
- * All existing users of the __GFP_NOFAIL are blockable, so warn
- * of any new users that actually require GFP_NOWAIT
+ * Lacking direct_reclaim we can't do anything to reclaim memory,
+ * we disregard these unreasonable nofail requests and still
+ * return NULL
*/
- if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
+ if (!can_direct_reclaim)
goto fail;
/*
- * PF_MEMALLOC request from this context is rather bizarre
- * because we cannot reclaim anything and only can loop waiting
- * for somebody to do a work for us
- */
- WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask);
-
- /*
- * non failing costly orders are a hard requirement which we
- * are not prepared for much so let's warn about these users
- * so that we can identify them and convert them to something
- * else.
- */
- WARN_ON_ONCE_GFP(costly_order, gfp_mask);
-
- /*
* Help non-failing allocations by giving some access to memory
* reserves normally used for high priority non-blocking
* allocations but do not use ALLOC_NO_WATERMARKS because this
@@ -4453,7 +4977,12 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
might_alloc(gfp_mask);
- if (should_fail_alloc_page(gfp_mask, order))
+ /*
+ * Don't invoke should_fail logic, since it may call
+ * get_random_u32() and printk() which need to spin_lock.
+ */
+ if (!(*alloc_flags & ALLOC_TRYLOCK) &&
+ should_fail_alloc_page(gfp_mask, order))
return false;
*alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
@@ -4473,32 +5002,32 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
}
/*
- * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
+ * __alloc_pages_bulk - Allocate a number of order-0 pages to an array
* @gfp: GFP flags for the allocation
* @preferred_nid: The preferred NUMA node ID to allocate from
* @nodemask: Set of nodes to allocate from, may be NULL
- * @nr_pages: The number of pages desired on the list or array
- * @page_list: Optional list to store the allocated pages
- * @page_array: Optional array to store the pages
+ * @nr_pages: The number of pages desired in the array
+ * @page_array: Array to store the pages
*
- * This is a batched version of the page allocator that attempts to
- * allocate nr_pages quickly. Pages are added to page_list if page_list
- * is not NULL, otherwise it is assumed that the page_array is valid.
+ * This is a batched version of the page allocator that attempts to allocate
+ * @nr_pages quickly. Pages are added to @page_array.
*
- * For lists, nr_pages is the number of pages that should be allocated.
+ * Note that only the elements in @page_array that were cleared to %NULL on
+ * entry are populated with newly allocated pages. @nr_pages is the maximum
+ * number of pages that will be stored in the array.
*
- * For arrays, only NULL elements are populated with pages and nr_pages
- * is the maximum number of pages that will be stored in the array.
- *
- * Returns the number of pages on the list or array.
+ * Returns the number of pages in @page_array, including ones already
+ * allocated on entry. This can be less than the number requested in @nr_pages,
+ * but all empty slots are filled from the beginning. I.e., if all slots in
+ * @page_array were set to %NULL on entry, the slots from 0 to the return value
+ * - 1 will be filled.
*/
unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
nodemask_t *nodemask, int nr_pages,
- struct list_head *page_list,
struct page **page_array)
{
struct page *page;
- unsigned long __maybe_unused UP_flags;
+ unsigned long UP_flags;
struct zone *zone;
struct zoneref *z;
struct per_cpu_pages *pcp;
@@ -4512,7 +5041,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
* Skip populated array elements to determine if any pages need
* to be allocated before disabling IRQs.
*/
- while (page_array && nr_populated < nr_pages && page_array[nr_populated])
+ while (nr_populated < nr_pages && page_array[nr_populated])
nr_populated++;
/* No pages requested? */
@@ -4520,7 +5049,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
goto out;
/* Already populated array? */
- if (unlikely(page_array && nr_pages - nr_populated == 0))
+ if (unlikely(nr_pages - nr_populated == 0))
goto out;
/* Bulk allocator does not support memcg accounting. */
@@ -4551,7 +5080,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
gfp = alloc_gfp;
/* Find an allowed local zone that meets the low watermark. */
- for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
+ z = ac.preferred_zoneref;
+ for_next_zone_zonelist_nodemask(zone, z, ac.highest_zoneidx, ac.nodemask) {
unsigned long mark;
if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
@@ -4559,17 +5089,28 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
continue;
}
- if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
- zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
+ if (nr_online_nodes > 1 && zone != zonelist_zone(ac.preferred_zoneref) &&
+ zone_to_nid(zone) != zonelist_node_idx(ac.preferred_zoneref)) {
goto failed;
}
+ cond_accept_memory(zone, 0, alloc_flags);
+retry_this_zone:
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
if (zone_watermark_fast(zone, 0, mark,
zonelist_zone_idx(ac.preferred_zoneref),
alloc_flags, gfp)) {
break;
}
+
+ if (cond_accept_memory(zone, 0, alloc_flags))
+ goto retry_this_zone;
+
+ /* Try again if zone has deferred pages */
+ if (deferred_pages_enabled()) {
+ if (_deferred_grow_zone(zone, 0))
+ goto retry_this_zone;
+ }
}
/*
@@ -4580,17 +5121,16 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
goto failed;
/* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
- pcp_trylock_prepare(UP_flags);
- pcp = pcp_spin_trylock(zone->per_cpu_pageset);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
if (!pcp)
- goto failed_irq;
+ goto failed;
/* Attempt the batch allocation */
pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
while (nr_populated < nr_pages) {
/* Skip existing pages */
- if (page_array && page_array[nr_populated]) {
+ if (page_array[nr_populated]) {
nr_populated++;
continue;
}
@@ -4600,43 +5140,30 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
if (unlikely(!page)) {
/* Try and allocate at least one page */
if (!nr_account) {
- pcp_spin_unlock(pcp);
- goto failed_irq;
+ pcp_spin_unlock(pcp, UP_flags);
+ goto failed;
}
break;
}
nr_account++;
prep_new_page(page, 0, gfp, 0);
- if (page_list)
- list_add(&page->lru, page_list);
- else
- page_array[nr_populated] = page;
- nr_populated++;
+ set_page_refcounted(page);
+ page_array[nr_populated++] = page;
}
- pcp_spin_unlock(pcp);
- pcp_trylock_finish(UP_flags);
+ pcp_spin_unlock(pcp, UP_flags);
__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
- zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
+ zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account);
out:
return nr_populated;
-failed_irq:
- pcp_trylock_finish(UP_flags);
-
failed:
page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask);
- if (page) {
- if (page_list)
- list_add(&page->lru, page_list);
- else
- page_array[nr_populated] = page;
- nr_populated++;
- }
-
+ if (page)
+ page_array[nr_populated++] = page;
goto out;
}
EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
@@ -4644,8 +5171,8 @@ EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
/*
* This is the 'heart' of the zoned buddy allocator.
*/
-struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
- int preferred_nid, nodemask_t *nodemask)
+struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order,
+ int preferred_nid, nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
@@ -4677,7 +5204,7 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
* Forbid the first pass from falling back to types that fragment
* memory until all local zones are considered.
*/
- alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);
+ alloc_flags |= alloc_flags_nofragment(zonelist_zone(ac.preferred_zoneref), gfp);
/* First allocation attempt */
page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
@@ -4698,7 +5225,7 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
out:
if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
- __free_pages(page, order);
+ free_frozen_pages(page, order);
page = NULL;
}
@@ -4707,6 +5234,18 @@ out:
return page;
}
+EXPORT_SYMBOL(__alloc_frozen_pages_noprof);
+
+struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
+ int preferred_nid, nodemask_t *nodemask)
+{
+ struct page *page;
+
+ page = __alloc_frozen_pages_noprof(gfp, order, preferred_nid, nodemask);
+ if (page)
+ set_page_refcounted(page);
+ return page;
+}
EXPORT_SYMBOL(__alloc_pages_noprof);
struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
@@ -4740,6 +5279,31 @@ unsigned long get_zeroed_page_noprof(gfp_t gfp_mask)
}
EXPORT_SYMBOL(get_zeroed_page_noprof);
+static void ___free_pages(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
+{
+ /* get PageHead before we drop reference */
+ int head = PageHead(page);
+ /* get alloc tag in case the page is released by others */
+ struct alloc_tag *tag = pgalloc_tag_get(page);
+
+ if (put_page_testzero(page))
+ __free_frozen_pages(page, order, fpi_flags);
+ else if (!head) {
+ pgalloc_tag_sub_pages(tag, (1 << order) - 1);
+ while (order-- > 0) {
+ /*
+ * The "tail" pages of this non-compound high-order
+ * page will have no code tags, so to avoid warnings
+ * mark them as empty.
+ */
+ clear_page_tag_ref(page + (1 << order));
+ __free_frozen_pages(page + (1 << order), order,
+ fpi_flags);
+ }
+ }
+}
+
/**
* __free_pages - Free pages allocated with alloc_pages().
* @page: The page pointer returned from alloc_pages().
@@ -4762,165 +5326,37 @@ EXPORT_SYMBOL(get_zeroed_page_noprof);
*/
void __free_pages(struct page *page, unsigned int order)
{
- /* get PageHead before we drop reference */
- int head = PageHead(page);
- struct alloc_tag *tag = pgalloc_tag_get(page);
-
- if (put_page_testzero(page))
- free_unref_page(page, order);
- else if (!head) {
- pgalloc_tag_sub_pages(tag, (1 << order) - 1);
- while (order-- > 0)
- free_unref_page(page + (1 << order), order);
- }
+ ___free_pages(page, order, FPI_NONE);
}
EXPORT_SYMBOL(__free_pages);
-void free_pages(unsigned long addr, unsigned int order)
-{
- if (addr != 0) {
- VM_BUG_ON(!virt_addr_valid((void *)addr));
- __free_pages(virt_to_page((void *)addr), order);
- }
-}
-
-EXPORT_SYMBOL(free_pages);
-
/*
- * Page Fragment:
- * An arbitrary-length arbitrary-offset area of memory which resides
- * within a 0 or higher order page. Multiple fragments within that page
- * are individually refcounted, in the page's reference counter.
- *
- * The page_frag functions below provide a simple allocation framework for
- * page fragments. This is used by the network stack and network device
- * drivers to provide a backing region of memory for use as either an
- * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
+ * Can be called while holding raw_spin_lock or from IRQ and NMI for any
+ * page type (not only those that came from alloc_pages_nolock)
*/
-static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
- gfp_t gfp_mask)
-{
- struct page *page = NULL;
- gfp_t gfp = gfp_mask;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
- gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP |
- __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
- page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
- PAGE_FRAG_CACHE_MAX_ORDER);
- nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
-#endif
- if (unlikely(!page))
- page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
-
- nc->va = page ? page_address(page) : NULL;
-
- return page;
-}
-
-void page_frag_cache_drain(struct page_frag_cache *nc)
-{
- if (!nc->va)
- return;
-
- __page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
- nc->va = NULL;
-}
-EXPORT_SYMBOL(page_frag_cache_drain);
-
-void __page_frag_cache_drain(struct page *page, unsigned int count)
+void free_pages_nolock(struct page *page, unsigned int order)
{
- VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
-
- if (page_ref_sub_and_test(page, count))
- free_unref_page(page, compound_order(page));
+ ___free_pages(page, order, FPI_TRYLOCK);
}
-EXPORT_SYMBOL(__page_frag_cache_drain);
-void *__page_frag_alloc_align(struct page_frag_cache *nc,
- unsigned int fragsz, gfp_t gfp_mask,
- unsigned int align_mask)
+/**
+ * free_pages - Free pages allocated with __get_free_pages().
+ * @addr: The virtual address tied to a page returned from __get_free_pages().
+ * @order: The order of the allocation.
+ *
+ * This function behaves the same as __free_pages(). Use this function
+ * to free pages when you only have a valid virtual address. If you have
+ * the page, call __free_pages() instead.
+ */
+void free_pages(unsigned long addr, unsigned int order)
{
- unsigned int size = PAGE_SIZE;
- struct page *page;
- int offset;
-
- if (unlikely(!nc->va)) {
-refill:
- page = __page_frag_cache_refill(nc, gfp_mask);
- if (!page)
- return NULL;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
- /* if size can vary use size else just use PAGE_SIZE */
- size = nc->size;
-#endif
- /* Even if we own the page, we do not use atomic_set().
- * This would break get_page_unless_zero() users.
- */
- page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
-
- /* reset page count bias and offset to start of new frag */
- nc->pfmemalloc = page_is_pfmemalloc(page);
- nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
- nc->offset = size;
- }
-
- offset = nc->offset - fragsz;
- if (unlikely(offset < 0)) {
- page = virt_to_page(nc->va);
-
- if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
- goto refill;
-
- if (unlikely(nc->pfmemalloc)) {
- free_unref_page(page, compound_order(page));
- goto refill;
- }
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
- /* if size can vary use size else just use PAGE_SIZE */
- size = nc->size;
-#endif
- /* OK, page count is 0, we can safely set it */
- set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
-
- /* reset page count bias and offset to start of new frag */
- nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
- offset = size - fragsz;
- if (unlikely(offset < 0)) {
- /*
- * The caller is trying to allocate a fragment
- * with fragsz > PAGE_SIZE but the cache isn't big
- * enough to satisfy the request, this may
- * happen in low memory conditions.
- * We don't release the cache page because
- * it could make memory pressure worse
- * so we simply return NULL here.
- */
- return NULL;
- }
+ if (addr != 0) {
+ VM_BUG_ON(!virt_addr_valid((void *)addr));
+ __free_pages(virt_to_page((void *)addr), order);
}
-
- nc->pagecnt_bias--;
- offset &= align_mask;
- nc->offset = offset;
-
- return nc->va + offset;
}
-EXPORT_SYMBOL(__page_frag_alloc_align);
-
-/*
- * Frees a page fragment allocated out of either a compound or order 0 page.
- */
-void page_frag_free(void *addr)
-{
- struct page *page = virt_to_head_page(addr);
- if (unlikely(put_page_testzero(page)))
- free_unref_page(page, compound_order(page));
-}
-EXPORT_SYMBOL(page_frag_free);
+EXPORT_SYMBOL(free_pages);
static void *make_alloc_exact(unsigned long addr, unsigned int order,
size_t size)
@@ -4931,8 +5367,8 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
struct page *last = page + nr;
split_page_owner(page, order, 0);
- pgalloc_tag_split(page, 1 << order);
- split_page_memcg(page, order, 0);
+ pgalloc_tag_split(page_folio(page), order, 0);
+ split_page_memcg(page, order);
while (page < --last)
set_page_refcounted(last);
@@ -5114,7 +5550,7 @@ static char numa_zonelist_order[] = "Node";
/*
* sysctl handler for numa_zonelist_order
*/
-static int numa_zonelist_order_handler(struct ctl_table *table, int write,
+static int numa_zonelist_order_handler(const struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
if (write)
@@ -5213,7 +5649,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
}
/*
- * Build gfp_thisnode zonelists
+ * Build __GFP_THISNODE zonelists
*/
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
@@ -5227,13 +5663,6 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
zonerefs->zone_idx = 0;
}
-/*
- * Build zonelists ordered by zone and nodes within zones.
- * This results in conserving DMA zone[s] until all Normal memory is
- * exhausted, but results in overflowing to remote node while memory
- * may still exist in local DMA zone.
- */
-
static void build_zonelists(pg_data_t *pgdat)
{
static int node_order[MAX_NUMNODES];
@@ -5282,7 +5711,7 @@ int local_memory_node(int node)
z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
gfp_zone(GFP_KERNEL),
NULL);
- return zone_to_nid(z->zone);
+ return zonelist_node_idx(z);
}
#endif
@@ -5445,7 +5874,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
nr_online_nodes,
- page_group_by_mobility_disabled ? "off" : "on",
+ str_off_on(page_group_by_mobility_disabled),
vm_total_pages);
#ifdef CONFIG_NUMA
pr_info("Policy zone: %s\n", zone_names[policy_zone]);
@@ -5458,15 +5887,14 @@ static int zone_batchsize(struct zone *zone)
int batch;
/*
- * The number of pages to batch allocate is either ~0.1%
- * of the zone or 1MB, whichever is smaller. The batch
+ * The number of pages to batch allocate is either ~0.025%
+ * of the zone or 256KB, whichever is smaller. The batch
* size is striking a balance between allocation latency
* and zone lock contention.
*/
- batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
- batch /= 4; /* We effectively *= 4 below */
- if (batch < 1)
- batch = 1;
+ batch = min(zone_managed_pages(zone) >> 12, SZ_256K / PAGE_SIZE);
+ if (batch <= 1)
+ return 1;
/*
* Clamp the batch to a 2^n - 1 value. Having a power
@@ -5595,7 +6023,6 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
pcp->high_min = BOOT_PAGESET_HIGH;
pcp->high_max = BOOT_PAGESET_HIGH;
pcp->batch = BOOT_PAGESET_BATCH;
- pcp->free_count = 0;
}
static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min,
@@ -5618,7 +6045,7 @@ static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
{
int new_high_min, new_high_max, new_batch;
- new_batch = max(1, zone_batchsize(zone));
+ new_batch = zone_batchsize(zone);
if (percpu_pagelist_high_fraction) {
new_high_min = zone_highsize(zone, new_batch, cpu_online,
percpu_pagelist_high_fraction);
@@ -5758,14 +6185,13 @@ __meminit void zone_pcp_init(struct zone *zone)
zone->present_pages, zone_batchsize(zone));
}
+static void setup_per_zone_lowmem_reserve(void);
+
void adjust_managed_page_count(struct page *page, long count)
{
atomic_long_add(count, &page_zone(page)->managed_pages);
totalram_pages_add(count);
-#ifdef CONFIG_HIGHMEM
- if (PageHighMem(page))
- totalhigh_pages_add(count);
-#endif
+ setup_per_zone_lowmem_reserve();
}
EXPORT_SYMBOL(adjust_managed_page_count);
@@ -5805,6 +6231,16 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
return pages;
}
+void free_reserved_page(struct page *page)
+{
+ clear_page_tag_ref(page);
+ ClearPageReserved(page);
+ init_page_count(page);
+ __free_page(page);
+ adjust_managed_page_count(page, 1);
+}
+EXPORT_SYMBOL(free_reserved_page);
+
static int page_alloc_cpu_dead(unsigned int cpu)
{
struct zone *zone;
@@ -5875,17 +6311,25 @@ static void calculate_totalreserve_pages(void)
long max = 0;
unsigned long managed_pages = zone_managed_pages(zone);
- /* Find valid and maximum lowmem_reserve in the zone */
- for (j = i; j < MAX_NR_ZONES; j++) {
- if (zone->lowmem_reserve[j] > max)
- max = zone->lowmem_reserve[j];
- }
+ /*
+ * lowmem_reserve[j] is monotonically non-decreasing
+ * in j for a given zone (see
+ * setup_per_zone_lowmem_reserve()). The maximum
+ * valid reserve lives at the highest index with a
+ * non-zero value, so scan backwards and stop at the
+ * first hit.
+ */
+ for (j = MAX_NR_ZONES - 1; j > i; j--) {
+ if (!zone->lowmem_reserve[j])
+ continue;
+ max = zone->lowmem_reserve[j];
+ break;
+ }
/* we treat the high watermark as reserved pages. */
max += high_wmark_pages(zone);
- if (max > managed_pages)
- max = managed_pages;
+ max = min_t(unsigned long, max, managed_pages);
pgdat->totalreserve_pages += max;
@@ -5893,6 +6337,7 @@ static void calculate_totalreserve_pages(void)
}
}
totalreserve_pages = reserve_pages;
+ trace_mm_calculate_totalreserve_pages(totalreserve_pages);
}
/*
@@ -5905,7 +6350,21 @@ static void setup_per_zone_lowmem_reserve(void)
{
struct pglist_data *pgdat;
enum zone_type i, j;
-
+ /*
+ * For a given zone node_zones[i], lowmem_reserve[j] (j > i)
+ * represents how many pages in zone i must effectively be kept
+ * in reserve when deciding whether an allocation class that is
+ * allowed to allocate from zones up to j may fall back into
+ * zone i.
+ *
+ * As j increases, the allocation class can use a strictly larger
+ * set of fallback zones and therefore must not be allowed to
+ * deplete low zones more aggressively than a less flexible one.
+ * As a result, lowmem_reserve[j] is required to be monotonically
+ * non-decreasing in j for each zone i. Callers such as
+ * calculate_totalreserve_pages() rely on this monotonicity when
+ * selecting the maximum reserve entry.
+ */
for_each_online_pgdat(pgdat) {
for (i = 0; i < MAX_NR_ZONES - 1; i++) {
struct zone *zone = &pgdat->node_zones[i];
@@ -5915,14 +6374,15 @@ static void setup_per_zone_lowmem_reserve(void)
for (j = i + 1; j < MAX_NR_ZONES; j++) {
struct zone *upper_zone = &pgdat->node_zones[j];
- bool empty = !zone_managed_pages(upper_zone);
managed_pages += zone_managed_pages(upper_zone);
- if (clear || empty)
+ if (clear)
zone->lowmem_reserve[j] = 0;
else
zone->lowmem_reserve[j] = managed_pages / ratio;
+ trace_mm_setup_per_zone_lowmem_reserve(zone, upper_zone,
+ zone->lowmem_reserve[j]);
}
}
}
@@ -5986,6 +6446,7 @@ static void __setup_per_zone_wmarks(void)
zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
+ trace_mm_setup_per_zone_wmarks(zone);
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -6081,7 +6542,7 @@ postcore_initcall(init_per_zone_wmark_min)
* that we can call two helper functions whenever min_free_kbytes
* changes.
*/
-static int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
+static int min_free_kbytes_sysctl_handler(const struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -6097,7 +6558,7 @@ static int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
-static int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+static int watermark_scale_factor_sysctl_handler(const struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -6127,7 +6588,7 @@ static void setup_min_unmapped_ratio(void)
}
-static int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
+static int sysctl_min_unmapped_ratio_sysctl_handler(const struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -6154,7 +6615,7 @@ static void setup_min_slab_ratio(void)
sysctl_min_slab_ratio) / 100;
}
-static int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
+static int sysctl_min_slab_ratio_sysctl_handler(const struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -6178,7 +6639,7 @@ static int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int wri
* minimum watermarks. The lowmem reserve ratio can only make sense
* if in function of the boot time zone sizes.
*/
-static int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table,
+static int lowmem_reserve_ratio_sysctl_handler(const struct ctl_table *table,
int write, void *buffer, size_t *length, loff_t *ppos)
{
int i;
@@ -6199,7 +6660,7 @@ static int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table,
* cpu. It is the fraction of total pages in each zone that a hot per cpu
* pagelist can have before it gets flushed back to buddy allocator.
*/
-static int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
+static int percpu_pagelist_high_fraction_sysctl_handler(const struct ctl_table *table,
int write, void *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
@@ -6232,7 +6693,7 @@ out:
return ret;
}
-static struct ctl_table page_alloc_sysctl_table[] = {
+static const struct ctl_table page_alloc_sysctl_table[] = {
{
.procname = "min_free_kbytes",
.data = &min_free_kbytes,
@@ -6259,6 +6720,15 @@ static struct ctl_table page_alloc_sysctl_table[] = {
.extra2 = SYSCTL_THREE_THOUSAND,
},
{
+ .procname = "defrag_mode",
+ .data = &defrag_mode,
+ .maxlen = sizeof(defrag_mode),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
.procname = "percpu_pagelist_high_fraction",
.data = &percpu_pagelist_high_fraction,
.maxlen = sizeof(percpu_pagelist_high_fraction),
@@ -6322,14 +6792,9 @@ static void alloc_contig_dump_pages(struct list_head *page_list)
}
}
-/*
- * [start, end) must belong to a single zone.
- * @migratetype: using migratetype to filter the type of migration in
- * trace_mm_alloc_contig_migrate_range_info.
- */
-int __alloc_contig_migrate_range(struct compact_control *cc,
- unsigned long start, unsigned long end,
- int migratetype)
+/* [start, end) must belong to a single zone. */
+static int __alloc_contig_migrate_range(struct compact_control *cc,
+ unsigned long start, unsigned long end)
{
/* This function is based on compact_zone() from compaction.c. */
unsigned int nr_reclaimed;
@@ -6338,13 +6803,9 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
int ret = 0;
struct migration_target_control mtc = {
.nid = zone_to_nid(cc->zone),
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ .gfp_mask = cc->gfp_mask,
.reason = MR_CONTIG_RANGE,
};
- struct page *page;
- unsigned long total_mapped = 0;
- unsigned long total_migrated = 0;
- unsigned long total_reclaimed = 0;
lru_cache_disable();
@@ -6370,22 +6831,9 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
&cc->migratepages);
cc->nr_migratepages -= nr_reclaimed;
- if (trace_mm_alloc_contig_migrate_range_info_enabled()) {
- total_reclaimed += nr_reclaimed;
- list_for_each_entry(page, &cc->migratepages, lru) {
- struct folio *folio = page_folio(page);
-
- total_mapped += folio_mapped(folio) *
- folio_nr_pages(folio);
- }
- }
-
ret = migrate_pages(&cc->migratepages, alloc_migration_target,
NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
- if (trace_mm_alloc_contig_migrate_range_info_enabled() && !ret)
- total_migrated += cc->nr_migratepages;
-
/*
* On -ENOMEM, migrate_pages() bails out right away. It is pointless
* to retry again over this error, so do the same here.
@@ -6401,22 +6849,77 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
putback_movable_pages(&cc->migratepages);
}
- trace_mm_alloc_contig_migrate_range_info(start, end, migratetype,
- total_migrated,
- total_reclaimed,
- total_mapped);
return (ret < 0) ? ret : 0;
}
+static void split_free_pages(struct list_head *list, gfp_t gfp_mask)
+{
+ int order;
+
+ for (order = 0; order < NR_PAGE_ORDERS; order++) {
+ struct page *page, *next;
+ int nr_pages = 1 << order;
+
+ list_for_each_entry_safe(page, next, &list[order], lru) {
+ int i;
+
+ post_alloc_hook(page, order, gfp_mask);
+ set_page_refcounted(page);
+ if (!order)
+ continue;
+
+ split_page(page, order);
+
+ /* Add all subpages to the order-0 head, in sequence. */
+ list_del(&page->lru);
+ for (i = 0; i < nr_pages; i++)
+ list_add_tail(&page[i].lru, &list[0]);
+ }
+ }
+}
+
+static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
+{
+ const gfp_t reclaim_mask = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
+ const gfp_t action_mask = __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWARN |
+ __GFP_ZERO | __GFP_ZEROTAGS | __GFP_SKIP_ZERO;
+ const gfp_t cc_action_mask = __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
+
+ /*
+ * We are given the range to allocate; node, mobility and placement
+ * hints are irrelevant at this point. We'll simply ignore them.
+ */
+ gfp_mask &= ~(GFP_ZONEMASK | __GFP_RECLAIMABLE | __GFP_WRITE |
+ __GFP_HARDWALL | __GFP_THISNODE | __GFP_MOVABLE);
+
+ /*
+ * We only support most reclaim flags (but not NOFAIL/NORETRY), and
+ * selected action flags.
+ */
+ if (gfp_mask & ~(reclaim_mask | action_mask))
+ return -EINVAL;
+
+ /*
+ * Flags to control page compaction/migration/reclaim, to free up our
+ * page range. Migratable pages are movable, __GFP_MOVABLE is implied
+ * for them.
+ *
+ * Traditionally we always had __GFP_RETRY_MAYFAIL set, keep doing that
+ * to not degrade callers.
+ */
+ *gfp_cc_mask = (gfp_mask & (reclaim_mask | cc_action_mask)) |
+ __GFP_MOVABLE | __GFP_RETRY_MAYFAIL;
+ return 0;
+}
+
/**
* alloc_contig_range() -- tries to allocate given range of pages
* @start: start PFN to allocate
* @end: one-past-the-last PFN to allocate
- * @migratetype: migratetype of the underlying pageblocks (either
- * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
- * in range must have the same migratetype and it must
- * be either of the two.
- * @gfp_mask: GFP mask to use during compaction
+ * @alloc_flags: allocation information
+ * @gfp_mask: GFP mask. Node/zone/placement hints are ignored; only some
+ * action and reclaim modifiers are supported. Reclaim modifiers
+ * control allocation behavior during compaction/migration/reclaim.
*
* The PFN range does not have to be pageblock aligned. The PFN range must
* belong to a single zone.
@@ -6430,8 +6933,9 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
* need to be freed with free_contig_range().
*/
int alloc_contig_range_noprof(unsigned long start, unsigned long end,
- unsigned migratetype, gfp_t gfp_mask)
+ acr_flags_t alloc_flags, gfp_t gfp_mask)
{
+ const unsigned int order = ilog2(end - start);
unsigned long outer_start, outer_end;
int ret = 0;
@@ -6442,10 +6946,24 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
.no_set_skip_hint = true,
- .gfp_mask = current_gfp_context(gfp_mask),
.alloc_contig = true,
};
INIT_LIST_HEAD(&cc.migratepages);
+ enum pb_isolate_mode mode = (alloc_flags & ACR_FLAGS_CMA) ?
+ PB_ISOLATE_MODE_CMA_ALLOC :
+ PB_ISOLATE_MODE_OTHER;
+
+ /*
+ * In contrast to the buddy, we allow for orders here that exceed
+ * MAX_PAGE_ORDER, so we must manually make sure that we are not
+ * exceeding the maximum folio order.
+ */
+ if (WARN_ON_ONCE((gfp_mask & __GFP_COMP) && order > MAX_FOLIO_ORDER))
+ return -EINVAL;
+
+ gfp_mask = current_gfp_context(gfp_mask);
+ if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask))
+ return -EINVAL;
/*
* What we do here is we mark all pageblocks in range as
@@ -6468,7 +6986,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
* put back to page allocator so that buddy can use them.
*/
- ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
+ ret = start_isolate_page_range(start, end, mode);
if (ret)
goto done;
@@ -6484,10 +7002,20 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
* allocated. So, if we fall through be sure to clear ret so that
* -EBUSY is not accidentally used or returned to caller.
*/
- ret = __alloc_contig_migrate_range(&cc, start, end, migratetype);
+ ret = __alloc_contig_migrate_range(&cc, start, end);
if (ret && ret != -EBUSY)
goto done;
- ret = 0;
+
+ /*
+ * When in-use hugetlb pages are migrated, they may simply be released
+ * back into the free hugepage pool instead of being returned to the
+ * buddy system. After the migration of in-use huge pages is completed,
+ * we will invoke replace_free_hugepage_folios() to ensure that these
+ * hugepages are properly released to the buddy system.
+ */
+ ret = replace_free_hugepage_folios(start, end);
+ if (ret)
+ goto done;
/*
* Pages from [start, end) are within a pageblock_nr_pages
@@ -6508,7 +7036,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
outer_start = find_large_buddy(start);
/* Make sure the range is really isolated. */
- if (test_pages_isolated(outer_start, end, 0)) {
+ if (test_pages_isolated(outer_start, end, mode)) {
ret = -EBUSY;
goto done;
}
@@ -6520,14 +7048,27 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
goto done;
}
- /* Free head and tail (if any) */
- if (start != outer_start)
- free_contig_range(outer_start, start - outer_start);
- if (end != outer_end)
- free_contig_range(end, outer_end - end);
+ if (!(gfp_mask & __GFP_COMP)) {
+ split_free_pages(cc.freepages, gfp_mask);
+
+ /* Free head and tail (if any) */
+ if (start != outer_start)
+ free_contig_range(outer_start, start - outer_start);
+ if (end != outer_end)
+ free_contig_range(end, outer_end - end);
+ } else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) {
+ struct page *head = pfn_to_page(start);
+ check_new_pages(head, order);
+ prep_new_page(head, order, gfp_mask, 0);
+ set_page_refcounted(head);
+ } else {
+ ret = -EINVAL;
+ WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n",
+ start, end, outer_start, outer_end);
+ }
done:
- undo_isolate_page_range(start, end, migratetype);
+ undo_isolate_page_range(start, end);
return ret;
}
EXPORT_SYMBOL(alloc_contig_range_noprof);
@@ -6537,8 +7078,8 @@ static int __alloc_contig_pages(unsigned long start_pfn,
{
unsigned long end_pfn = start_pfn + nr_pages;
- return alloc_contig_range_noprof(start_pfn, end_pfn, MIGRATE_MOVABLE,
- gfp_mask);
+ return alloc_contig_range_noprof(start_pfn, end_pfn, ACR_FLAGS_NONE,
+ gfp_mask);
}
static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
@@ -6575,7 +7116,9 @@ static bool zone_spans_last_pfn(const struct zone *zone,
/**
* alloc_contig_pages() -- tries to find and allocate contiguous range of pages
* @nr_pages: Number of contiguous pages to allocate
- * @gfp_mask: GFP mask to limit search and used during compaction
+ * @gfp_mask: GFP mask. Node/zone/placement hints limit the search; only some
+ * action and reclaim modifiers are supported. Reclaim modifiers
+ * control allocation behavior during compaction/migration/reclaim.
* @nid: Target node
* @nodemask: Mask for other possible nodes
*
@@ -6634,6 +7177,18 @@ struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
void free_contig_range(unsigned long pfn, unsigned long nr_pages)
{
unsigned long count = 0;
+ struct folio *folio = pfn_folio(pfn);
+
+ if (folio_test_large(folio)) {
+ int expected = folio_nr_pages(folio);
+
+ if (nr_pages == expected)
+ folio_put(folio);
+ else
+ WARN(true, "PFN %lu: nr_pages %lu != expected %d\n",
+ pfn, nr_pages, expected);
+ return;
+ }
for (; nr_pages--; pfn++) {
struct page *page = pfn_to_page(pfn);
@@ -6690,14 +7245,19 @@ void zone_pcp_reset(struct zone *zone)
/*
* All pages in the range must be in a single zone, must not contain holes,
* must span full sections, and must be isolated before calling this function.
+ *
+ * Returns the number of managed (non-PageOffline()) pages in the range: the
+ * number of pages for which memory offlining code must adjust managed page
+ * counters using adjust_managed_page_count().
*/
-void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
+unsigned long __offline_isolated_pages(unsigned long start_pfn,
+ unsigned long end_pfn)
{
+ unsigned long already_offline = 0, flags;
unsigned long pfn = start_pfn;
struct page *page;
struct zone *zone;
unsigned int order;
- unsigned long flags;
offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
@@ -6719,6 +7279,7 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
if (PageOffline(page)) {
BUG_ON(page_count(page));
BUG_ON(PageBuddy(page));
+ already_offline++;
pfn++;
continue;
}
@@ -6731,6 +7292,8 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
+
+ return end_pfn - start_pfn - already_offline;
}
#endif
@@ -6872,9 +7435,6 @@ bool has_managed_dma(void)
#ifdef CONFIG_UNACCEPTED_MEMORY
-/* Counts number of zones with unaccepted pages. */
-static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
-
static bool lazy_accept = true;
static int __init accept_memory_parse(char *p)
@@ -6894,26 +7454,43 @@ early_param("accept_memory", accept_memory_parse);
static bool page_contains_unaccepted(struct page *page, unsigned int order)
{
phys_addr_t start = page_to_phys(page);
- phys_addr_t end = start + (PAGE_SIZE << order);
- return range_contains_unaccepted_memory(start, end);
+ return range_contains_unaccepted_memory(start, PAGE_SIZE << order);
}
-static void accept_page(struct page *page, unsigned int order)
+static void __accept_page(struct zone *zone, unsigned long *flags,
+ struct page *page)
{
- phys_addr_t start = page_to_phys(page);
+ list_del(&page->lru);
+ account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
+ __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
+ __ClearPageUnaccepted(page);
+ spin_unlock_irqrestore(&zone->lock, *flags);
+
+ accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER);
+
+ __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
+}
+
+void accept_page(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ unsigned long flags;
- accept_memory(start, start + (PAGE_SIZE << order));
+ spin_lock_irqsave(&zone->lock, flags);
+ if (!PageUnaccepted(page)) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return;
+ }
+
+ /* Unlocks zone->lock */
+ __accept_page(zone, &flags, page);
}
static bool try_to_accept_memory_one(struct zone *zone)
{
unsigned long flags;
struct page *page;
- bool last;
-
- if (list_empty(&zone->unaccepted_pages))
- return false;
spin_lock_irqsave(&zone->lock, flags);
page = list_first_entry_or_null(&zone->unaccepted_pages,
@@ -6923,68 +7500,66 @@ static bool try_to_accept_memory_one(struct zone *zone)
return false;
}
- list_del(&page->lru);
- last = list_empty(&zone->unaccepted_pages);
+ /* Unlocks zone->lock */
+ __accept_page(zone, &flags, page);
- account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
- __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
- spin_unlock_irqrestore(&zone->lock, flags);
+ return true;
+}
- accept_page(page, MAX_PAGE_ORDER);
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags)
+{
+ long to_accept, wmark;
+ bool ret = false;
- __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
+ if (list_empty(&zone->unaccepted_pages))
+ return false;
- if (last)
- static_branch_dec(&zones_with_unaccepted_pages);
+ /* Bailout, since try_to_accept_memory_one() needs to take a lock */
+ if (alloc_flags & ALLOC_TRYLOCK)
+ return false;
- return true;
-}
+ wmark = promo_wmark_pages(zone);
-static bool try_to_accept_memory(struct zone *zone, unsigned int order)
-{
- long to_accept;
- int ret = false;
+ /*
+ * Watermarks have not been initialized yet.
+ *
+ * Accepting one MAX_ORDER page to ensure progress.
+ */
+ if (!wmark)
+ return try_to_accept_memory_one(zone);
- /* How much to accept to get to high watermark? */
- to_accept = high_wmark_pages(zone) -
+ /* How much to accept to get to promo watermark? */
+ to_accept = wmark -
(zone_page_state(zone, NR_FREE_PAGES) -
- __zone_watermark_unusable_free(zone, order, 0));
+ __zone_watermark_unusable_free(zone, order, 0) -
+ zone_page_state(zone, NR_UNACCEPTED));
- /* Accept at least one page */
- do {
+ while (to_accept > 0) {
if (!try_to_accept_memory_one(zone))
break;
ret = true;
to_accept -= MAX_ORDER_NR_PAGES;
- } while (to_accept > 0);
+ }
return ret;
}
-static inline bool has_unaccepted_memory(void)
-{
- return static_branch_unlikely(&zones_with_unaccepted_pages);
-}
-
static bool __free_unaccepted(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long flags;
- bool first = false;
if (!lazy_accept)
return false;
spin_lock_irqsave(&zone->lock, flags);
- first = list_empty(&zone->unaccepted_pages);
list_add_tail(&page->lru, &zone->unaccepted_pages);
account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
+ __SetPageUnaccepted(page);
spin_unlock_irqrestore(&zone->lock, flags);
- if (first)
- static_branch_inc(&zones_with_unaccepted_pages);
-
return true;
}
@@ -6995,16 +7570,8 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order)
return false;
}
-static void accept_page(struct page *page, unsigned int order)
-{
-}
-
-static bool try_to_accept_memory(struct zone *zone, unsigned int order)
-{
- return false;
-}
-
-static inline bool has_unaccepted_memory(void)
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags)
{
return false;
}
@@ -7016,3 +7583,102 @@ static bool __free_unaccepted(struct page *page)
}
#endif /* CONFIG_UNACCEPTED_MEMORY */
+
+struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order)
+{
+ /*
+ * Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed.
+ * Do not specify __GFP_KSWAPD_RECLAIM either, since wake up of kswapd
+ * is not safe in arbitrary context.
+ *
+ * These two are the conditions for gfpflags_allow_spinning() being true.
+ *
+ * Specify __GFP_NOWARN since failing alloc_pages_nolock() is not a reason
+ * to warn. Also warn would trigger printk() which is unsafe from
+ * various contexts. We cannot use printk_deferred_enter() to mitigate,
+ * since the running context is unknown.
+ *
+ * Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below
+ * is safe in any context. Also zeroing the page is mandatory for
+ * BPF use cases.
+ *
+ * Though __GFP_NOMEMALLOC is not checked in the code path below,
+ * specify it here to highlight that alloc_pages_nolock()
+ * doesn't want to deplete reserves.
+ */
+ gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC | __GFP_COMP
+ | gfp_flags;
+ unsigned int alloc_flags = ALLOC_TRYLOCK;
+ struct alloc_context ac = { };
+ struct page *page;
+
+ VM_WARN_ON_ONCE(gfp_flags & ~__GFP_ACCOUNT);
+ /*
+ * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is
+ * unsafe in NMI. If spin_trylock() is called from hard IRQ the current
+ * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will
+ * mark the task as the owner of another rt_spin_lock which will
+ * confuse PI logic, so return immediately if called form hard IRQ or
+ * NMI.
+ *
+ * Note, irqs_disabled() case is ok. This function can be called
+ * from raw_spin_lock_irqsave region.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
+ return NULL;
+ if (!pcp_allowed_order(order))
+ return NULL;
+
+ /* Bailout, since _deferred_grow_zone() needs to take a lock */
+ if (deferred_pages_enabled())
+ return NULL;
+
+ if (nid == NUMA_NO_NODE)
+ nid = numa_node_id();
+
+ prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
+ &alloc_gfp, &alloc_flags);
+
+ /*
+ * Best effort allocation from percpu free list.
+ * If it's empty attempt to spin_trylock zone->lock.
+ */
+ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
+
+ /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
+
+ if (memcg_kmem_online() && page && (gfp_flags & __GFP_ACCOUNT) &&
+ unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) {
+ __free_frozen_pages(page, order, FPI_TRYLOCK);
+ page = NULL;
+ }
+ trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
+ kmsan_alloc_page(page, order, alloc_gfp);
+ return page;
+}
+/**
+ * alloc_pages_nolock - opportunistic reentrant allocation from any context
+ * @gfp_flags: GFP flags. Only __GFP_ACCOUNT allowed.
+ * @nid: node to allocate from
+ * @order: allocation order size
+ *
+ * Allocates pages of a given order from the given node. This is safe to
+ * call from any context (from atomic, NMI, and also reentrant
+ * allocator -> tracepoint -> alloc_pages_nolock_noprof).
+ * Allocation is best effort and to be expected to fail easily so nobody should
+ * rely on the success. Failures are not reported via warn_alloc().
+ * See always fail conditions below.
+ *
+ * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN.
+ * It means ENOMEM. There is no reason to call it again and expect !NULL.
+ */
+struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order)
+{
+ struct page *page;
+
+ page = alloc_frozen_pages_nolock_noprof(gfp_flags, nid, order);
+ if (page)
+ set_page_refcounted(page);
+ return page;
+}
+EXPORT_SYMBOL_GPL(alloc_pages_nolock_noprof);