Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  219
1 file changed, 135 insertions(+), 84 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4074c07d02ca..822e05f1a964 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -99,9 +99,12 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
/*
* On SMP, spin_trylock is sufficient protection.
* On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ * Pass flags to a no-op inline function to typecheck and silence the unused
+ * variable warning.
*/
-#define pcp_trylock_prepare(flags) do { } while (0)
-#define pcp_trylock_finish(flag) do { } while (0)
+static inline void __pcp_trylock_noop(unsigned long *flags) { }
+#define pcp_trylock_prepare(flags) __pcp_trylock_noop(&(flags))
+#define pcp_trylock_finish(flags) __pcp_trylock_noop(&(flags))
#else
/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
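Taking the address of flags in a no-op inline both marks the variable as used and forces its type to unsigned long, at zero runtime cost. A minimal standalone sketch of the trick, with illustrative names that are not the kernel's:

    /* No-op that exists only for its prototype: taking the address marks
     * the variable as used and requires it to be an unsigned long. */
    static inline void noop_check_flags(unsigned long *flags) { }
    #define lock_prepare(flags)  noop_check_flags(&(flags))

    int main(void)
    {
            unsigned long flags;    /* no -Wunused-variable warning */

            lock_prepare(flags);    /* an `int flags` here would draw an
                                     * incompatible-pointer-type diagnostic */
            return 0;
    }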
@@ -129,15 +132,6 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
 * Generic helper to look up a per-cpu variable with an embedded spinlock.
 * Return value should be used with the equivalent unlock helper.
*/
-#define pcpu_spin_lock(type, member, ptr) \
-({ \
- type *_ret; \
- pcpu_task_pin(); \
- _ret = this_cpu_ptr(ptr); \
- spin_lock(&_ret->member); \
- _ret; \
-})
-
#define pcpu_spin_trylock(type, member, ptr) \
({ \
type *_ret; \
@@ -157,14 +151,21 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
})
/* struct per_cpu_pages specific helpers. */
-#define pcp_spin_lock(ptr) \
- pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
-
-#define pcp_spin_trylock(ptr) \
- pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)
+#define pcp_spin_trylock(ptr, UP_flags) \
+({ \
+ struct per_cpu_pages *__ret; \
+ pcp_trylock_prepare(UP_flags); \
+ __ret = pcpu_spin_trylock(struct per_cpu_pages, lock, ptr); \
+ if (!__ret) \
+ pcp_trylock_finish(UP_flags); \
+ __ret; \
+})
-#define pcp_spin_unlock(ptr) \
- pcpu_spin_unlock(lock, ptr)
+#define pcp_spin_unlock(ptr, UP_flags) \
+({ \
+ pcpu_spin_unlock(lock, ptr); \
+ pcp_trylock_finish(UP_flags); \
+})
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
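With pcp_trylock_prepare()/pcp_trylock_finish() folded into the lock helpers, every caller reduces to one shape; a sketch of the resulting pattern, mirroring the __free_frozen_pages() hunk below:

    unsigned long UP_flags;
    struct per_cpu_pages *pcp;

    pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
    if (pcp) {
            /* operate on this CPU's pcp lists under the lock */
            pcp_spin_unlock(pcp, UP_flags);
    } else {
            /* trylock failed: fall back to the zone's free/alloc path */
    }

On failure the macro has already called pcp_trylock_finish() itself, which is what lets the later hunks drop their failed_irq-style unwind labels.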
@@ -2552,10 +2553,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* Called from the vmstat counter updater to decay the PCP high.
* Return whether there is more work to do.
*/
-int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
+bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
{
- int high_min, to_drain, batch;
- int todo = 0;
+ int high_min, to_drain, to_drain_batched, batch;
+ bool todo = false;
high_min = READ_ONCE(pcp->high_min);
batch = READ_ONCE(pcp->batch);
@@ -2568,15 +2569,18 @@ int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
pcp->high - (pcp->high >> 3), high_min);
if (pcp->high > high_min)
- todo++;
+ todo = true;
}
to_drain = pcp->count - pcp->high;
- if (to_drain > 0) {
+ while (to_drain > 0) {
+ to_drain_batched = min(to_drain, batch);
spin_lock(&pcp->lock);
- free_pcppages_bulk(zone, to_drain, pcp, 0);
+ free_pcppages_bulk(zone, to_drain_batched, pcp, 0);
spin_unlock(&pcp->lock);
- todo++;
+ todo = true;
+
+ to_drain -= to_drain_batched;
}
return todo;
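The drain is now chunked so that pcp->lock is never held for more than batch pages at a time. A standalone sketch of the arithmetic, with illustrative numbers not taken from the patch:

    /* Each hold of pcp->lock frees at most `batch` pages instead of the
     * whole excess in one go. */
    static int drain_lock_holds(int count, int high, int batch)
    {
            int to_drain = count - high, holds = 0;

            while (to_drain > 0) {
                    int chunk = to_drain < batch ? to_drain : batch;
                    /* lock; free_pcppages_bulk(chunk); unlock */
                    to_drain -= chunk;
                    holds++;
            }
            return holds;
    }

    /* e.g. count = 1000, high = 200, batch = 63: thirteen short lock
     * holds of at most 63 pages each, instead of one hold freeing 800. */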
@@ -2810,12 +2814,22 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
return high;
}
-static void free_frozen_page_commit(struct zone *zone,
+/*
+ * Tune the pcp alloc factor and adjust count & free_count, then free pages
+ * to bring pcp->count back below the pcp high watermark.
+ *
+ * May return with the pcp lock dropped if it cannot be reacquired while
+ * freeing pages. Return true if the pcp is still locked, false otherwise.
+ */
+static bool free_frozen_page_commit(struct zone *zone,
struct per_cpu_pages *pcp, struct page *page, int migratetype,
- unsigned int order, fpi_t fpi_flags)
+ unsigned int order, fpi_t fpi_flags, unsigned long *UP_flags)
{
int high, batch;
+ int to_free, to_free_batched;
int pindex;
+ int cpu = smp_processor_id();
+ bool ret = true;
bool free_high = false;
/*
@@ -2853,15 +2867,42 @@ static void free_frozen_page_commit(struct zone *zone,
* Do not attempt to take a zone lock. Let pcp->count get
* over high mark temporarily.
*/
- return;
+ return true;
}
high = nr_pcp_high(pcp, zone, batch, free_high);
if (pcp->count < high)
- return;
+ return true;
+
+ to_free = nr_pcp_free(pcp, batch, high, free_high);
+ while (to_free > 0 && pcp->count > 0) {
+ to_free_batched = min(to_free, batch);
+ free_pcppages_bulk(zone, to_free_batched, pcp, pindex);
+ to_free -= to_free_batched;
+
+ if (to_free == 0 || pcp->count == 0)
+ break;
+
+ pcp_spin_unlock(pcp, *UP_flags);
+
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset, *UP_flags);
+ if (!pcp) {
+ ret = false;
+ break;
+ }
+
+ /*
+ * Check if this thread has been migrated to a different CPU.
+ * If that is the case, give up and indicate that the pcp is
+ * returned in an unlocked state.
+ */
+ if (smp_processor_id() != cpu) {
+ pcp_spin_unlock(pcp, *UP_flags);
+ ret = false;
+ break;
+ }
+ }
- free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
- pcp, pindex);
if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
zone_watermark_ok(zone, 0, high_wmark_pages(zone),
ZONE_MOVABLE, 0)) {
@@ -2879,6 +2920,7 @@ static void free_frozen_page_commit(struct zone *zone,
next_memory_node(pgdat->node_id) < MAX_NUMNODES)
atomic_set(&pgdat->kswapd_failures, 0);
}
+ return ret;
}
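Why the smp_processor_id() check above is needed: pcp_spin_trylock() pins the task and returns the current CPU's pageset, so each time the lock is dropped mid-drain the task may migrate, and the freshly looked-up pcp would then belong to a different CPU than the one this invocation's bookkeeping was computed for. Condensed shape of the bail-out, as a sketch:

    cpu = smp_processor_id();                  /* recorded while pinned and locked */
    ...
    pcp_spin_unlock(pcp, *UP_flags);           /* migration window opens here */
    pcp = pcp_spin_trylock(zone->per_cpu_pageset, *UP_flags);
    if (!pcp)
            return false;                      /* could not relock: report unlocked */
    if (smp_processor_id() != cpu) {
            pcp_spin_unlock(pcp, *UP_flags);   /* another CPU's pcp: give it back */
            return false;
    }

A false return therefore always means "the pcp lock is no longer held", which the callers in the hunks below rely on when they skip pcp_spin_unlock().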
/*
@@ -2887,7 +2929,7 @@ static void free_frozen_page_commit(struct zone *zone,
static void __free_frozen_pages(struct page *page, unsigned int order,
fpi_t fpi_flags)
{
- unsigned long __maybe_unused UP_flags;
+ unsigned long UP_flags;
struct per_cpu_pages *pcp;
struct zone *zone;
unsigned long pfn = page_to_pfn(page);
@@ -2923,15 +2965,15 @@ static void __free_frozen_pages(struct page *page, unsigned int order,
add_page_to_zone_llist(zone, page, order);
return;
}
- pcp_trylock_prepare(UP_flags);
- pcp = pcp_spin_trylock(zone->per_cpu_pageset);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
if (pcp) {
- free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags);
- pcp_spin_unlock(pcp);
+ if (!free_frozen_page_commit(zone, pcp, page, migratetype,
+ order, fpi_flags, &UP_flags))
+ return;
+ pcp_spin_unlock(pcp, UP_flags);
} else {
free_one_page(zone, page, pfn, order, fpi_flags);
}
- pcp_trylock_finish(UP_flags);
}
void free_frozen_pages(struct page *page, unsigned int order)
@@ -2944,7 +2986,7 @@ void free_frozen_pages(struct page *page, unsigned int order)
*/
void free_unref_folios(struct folio_batch *folios)
{
- unsigned long __maybe_unused UP_flags;
+ unsigned long UP_flags;
struct per_cpu_pages *pcp = NULL;
struct zone *locked_zone = NULL;
int i, j;
@@ -2987,8 +3029,7 @@ void free_unref_folios(struct folio_batch *folios)
if (zone != locked_zone ||
is_migrate_isolate(migratetype)) {
if (pcp) {
- pcp_spin_unlock(pcp);
- pcp_trylock_finish(UP_flags);
+ pcp_spin_unlock(pcp, UP_flags);
locked_zone = NULL;
pcp = NULL;
}
@@ -3007,10 +3048,8 @@ void free_unref_folios(struct folio_batch *folios)
* trylock is necessary as folios may be getting freed
* from IRQ or SoftIRQ context after an IO completion.
*/
- pcp_trylock_prepare(UP_flags);
- pcp = pcp_spin_trylock(zone->per_cpu_pageset);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
if (unlikely(!pcp)) {
- pcp_trylock_finish(UP_flags);
free_one_page(zone, &folio->page, pfn,
order, FPI_NONE);
continue;
@@ -3026,14 +3065,15 @@ void free_unref_folios(struct folio_batch *folios)
migratetype = MIGRATE_MOVABLE;
trace_mm_page_free_batched(&folio->page);
- free_frozen_page_commit(zone, pcp, &folio->page, migratetype,
- order, FPI_NONE);
+ if (!free_frozen_page_commit(zone, pcp, &folio->page,
+ migratetype, order, FPI_NONE, &UP_flags)) {
+ pcp = NULL;
+ locked_zone = NULL;
+ }
}
- if (pcp) {
- pcp_spin_unlock(pcp);
- pcp_trylock_finish(UP_flags);
- }
+ if (pcp)
+ pcp_spin_unlock(pcp, UP_flags);
folio_batch_reinit(folios);
}
@@ -3284,15 +3324,12 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct per_cpu_pages *pcp;
struct list_head *list;
struct page *page;
- unsigned long __maybe_unused UP_flags;
+ unsigned long UP_flags;
/* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
- pcp_trylock_prepare(UP_flags);
- pcp = pcp_spin_trylock(zone->per_cpu_pageset);
- if (!pcp) {
- pcp_trylock_finish(UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
+ if (!pcp)
return NULL;
- }
/*
* On allocation, reduce the number of pages that are batch freed.
@@ -3302,8 +3339,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
pcp->free_count >>= 1;
list = &pcp->lists[order_to_pindex(migratetype, order)];
page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
- pcp_spin_unlock(pcp);
- pcp_trylock_finish(UP_flags);
+ pcp_spin_unlock(pcp, UP_flags);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
@@ -3936,6 +3972,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
filter &= ~SHOW_MEM_FILTER_NODES;
__show_mem(filter, nodemask, gfp_zone(gfp_mask));
+ mem_cgroup_show_protected_memory(NULL);
}
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
@@ -4643,11 +4680,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
if (unlikely(nofail)) {
/*
- * We most definitely don't want callers attempting to
- * allocate greater than order-1 page units with __GFP_NOFAIL.
- */
- WARN_ON_ONCE(order > 1);
- /*
* Also, we don't support __GFP_NOFAIL without __GFP_DIRECT_RECLAIM;
* otherwise we may end up in a lockup.
*/
@@ -4995,7 +5027,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
struct page **page_array)
{
struct page *page;
- unsigned long __maybe_unused UP_flags;
+ unsigned long UP_flags;
struct zone *zone;
struct zoneref *z;
struct per_cpu_pages *pcp;
@@ -5089,10 +5121,9 @@ retry_this_zone:
goto failed;
/* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
- pcp_trylock_prepare(UP_flags);
- pcp = pcp_spin_trylock(zone->per_cpu_pageset);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
if (!pcp)
- goto failed_irq;
+ goto failed;
/* Attempt the batch allocation */
pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
@@ -5109,8 +5140,8 @@ retry_this_zone:
if (unlikely(!page)) {
/* Try and allocate at least one page */
if (!nr_account) {
- pcp_spin_unlock(pcp);
- goto failed_irq;
+ pcp_spin_unlock(pcp, UP_flags);
+ goto failed;
}
break;
}
@@ -5121,8 +5152,7 @@ retry_this_zone:
page_array[nr_populated++] = page;
}
- pcp_spin_unlock(pcp);
- pcp_trylock_finish(UP_flags);
+ pcp_spin_unlock(pcp, UP_flags);
__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account);
@@ -5130,9 +5160,6 @@ retry_this_zone:
out:
return nr_populated;
-failed_irq:
- pcp_trylock_finish(UP_flags);
-
failed:
page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask);
if (page)
@@ -5860,15 +5887,14 @@ static int zone_batchsize(struct zone *zone)
int batch;
/*
- * The number of pages to batch allocate is either ~0.1%
- * of the zone or 1MB, whichever is smaller. The batch
+ * The number of pages to batch allocate is either ~0.025%
+ * of the zone or 256KB, whichever is smaller. The batch
* size strikes a balance between allocation latency
* and zone lock contention.
*/
- batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
- batch /= 4; /* We effectively *= 4 below */
- if (batch < 1)
- batch = 1;
+ batch = min(zone_managed_pages(zone) >> 12, SZ_256K / PAGE_SIZE);
+ if (batch <= 1)
+ return 1;
/*
* Clamp the batch to a 2^n - 1 value. Having a power
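A worked example of the new sizing, assuming 4 KiB pages; the 2^n - 1 clamp in the sketch mirrors the existing rounddown_pow_of_two(batch + batch/2) - 1 step (an assumption here, since that code sits outside this hunk):

    /* Sketch of the new batch sizing, assuming PAGE_SIZE == 4096. */
    static unsigned long batch_for(unsigned long managed_pages)
    {
            unsigned long batch = managed_pages >> 12;  /* ~0.025% of the zone */
            unsigned long pow2 = 1;

            if (batch > 64)                 /* SZ_256K / 4096 == 64 pages */
                    batch = 64;
            if (batch <= 1)
                    return 1;
            /* clamp to a 2^n - 1 value, e.g. 64 -> 63 */
            while (pow2 * 2 <= batch + batch / 2)
                    pow2 *= 2;
            return pow2 - 1;
    }

    /* An 8 GiB zone has 2097152 managed pages: 2097152 >> 12 = 512,
     * min(512, 64) = 64, clamped to 63. */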
@@ -6019,7 +6045,7 @@ static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
{
int new_high_min, new_high_max, new_batch;
- new_batch = max(1, zone_batchsize(zone));
+ new_batch = zone_batchsize(zone);
if (percpu_pagelist_high_fraction) {
new_high_min = zone_highsize(zone, new_batch, cpu_online,
percpu_pagelist_high_fraction);
@@ -6285,10 +6311,21 @@ static void calculate_totalreserve_pages(void)
long max = 0;
unsigned long managed_pages = zone_managed_pages(zone);
- /* Find valid and maximum lowmem_reserve in the zone */
- for (j = i; j < MAX_NR_ZONES; j++)
- max = max(max, zone->lowmem_reserve[j]);
+ /*
+ * lowmem_reserve[j] is monotonically non-decreasing
+ * in j for a given zone (see
+ * setup_per_zone_lowmem_reserve()). The maximum
+ * valid reserve lives at the highest index with a
+ * non-zero value, so scan backwards and stop at the
+ * first hit.
+ */
+ for (j = MAX_NR_ZONES - 1; j > i; j--) {
+ if (!zone->lowmem_reserve[j])
+ continue;
+ max = zone->lowmem_reserve[j];
+ break;
+ }
/* we treat the high watermark as reserved pages. */
max += high_wmark_pages(zone);
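An illustrative case (values invented for the example): for node_zones[1] with lowmem_reserve[] = { 0, 0, 768, 768 }, the backward scan tests j = 3 first, finds 768, and stops; the old forward loop computed max() over the same entries with the same result. Zero entries (e.g. from an empty upper zone) are skipped rather than treated as the maximum, which is why the scan continues past them instead of stopping at the top index unconditionally.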
@@ -6313,7 +6350,21 @@ static void setup_per_zone_lowmem_reserve(void)
{
struct pglist_data *pgdat;
enum zone_type i, j;
-
+ /*
+ * For a given zone node_zones[i], lowmem_reserve[j] (j > i)
+ * represents how many pages in zone i must effectively be kept
+ * in reserve when deciding whether an allocation class that is
+ * allowed to allocate from zones up to j may fall back into
+ * zone i.
+ *
+ * As j increases, the allocation class can use a strictly larger
+ * set of fallback zones and therefore must not be allowed to
+ * deplete low zones more aggressively than a less flexible one.
+ * As a result, lowmem_reserve[j] is required to be monotonically
+ * non-decreasing in j for each zone i. Callers such as
+ * calculate_totalreserve_pages() rely on this monotonicity when
+ * selecting the maximum reserve entry.
+ */
for_each_online_pgdat(pgdat) {
for (i = 0; i < MAX_NR_ZONES - 1; i++) {
struct zone *zone = &pgdat->node_zones[i];
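The monotonicity asserted above follows from how the loop builds each entry; as a sketch of the upstream scheme, which this note assumes rather than shows (the loop body is beyond this hunk):

    /* For zone i, with ratio = sysctl_lowmem_reserve_ratio[i]:
     *
     *   managed = 0;
     *   for (j = i + 1; j < MAX_NR_ZONES; j++) {
     *           managed += zone_managed_pages(&node_zones[j]);
     *           lowmem_reserve[j] = cleared_or_empty ? 0 : managed / ratio;
     *   }
     *
     * The accumulated sum only grows with j while the divisor is fixed,
     * so non-zero entries are non-decreasing; empty upper zones yield 0,
     * which calculate_totalreserve_pages() deliberately skips. */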