Diffstat (limited to 'mm/compaction.c')
-rw-r--r-- | mm/compaction.c | 453
1 file changed, 260 insertions(+), 193 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index b961db601df4..a3203d97123e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -23,6 +23,7 @@
 #include <linux/freezer.h>
 #include <linux/page_owner.h>
 #include <linux/psi.h>
+#include <linux/cpuset.h>
 #include "internal.h"
 
 #ifdef CONFIG_COMPACTION
@@ -40,9 +41,22 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
 {
 	count_vm_events(item, delta);
 }
+
+/*
+ * order == -1 is expected when compacting proactively via
+ * 1. /proc/sys/vm/compact_memory
+ * 2. /sys/devices/system/node/nodex/compact
+ * 3. /proc/sys/vm/compaction_proactiveness
+ */
+static inline bool is_via_compact_memory(int order)
+{
+	return order == -1;
+}
+
 #else
 #define count_compact_event(item) do { } while (0)
 #define count_compact_events(item, delta) do { } while (0)
+static inline bool is_via_compact_memory(int order) { return false; }
 #endif
 
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -66,45 +80,37 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
 #define COMPACTION_HPAGE_ORDER	(PMD_SHIFT - PAGE_SHIFT)
 #endif
 
-static unsigned long release_freepages(struct list_head *freelist)
+static struct page *mark_allocated_noprof(struct page *page, unsigned int order, gfp_t gfp_flags)
 {
-	struct page *page, *next;
-	unsigned long high_pfn = 0;
-
-	list_for_each_entry_safe(page, next, freelist, lru) {
-		unsigned long pfn = page_to_pfn(page);
-		list_del(&page->lru);
-		__free_page(page);
-		if (pfn > high_pfn)
-			high_pfn = pfn;
-	}
-
-	return high_pfn;
+	post_alloc_hook(page, order, __GFP_MOVABLE);
+	set_page_refcounted(page);
+	return page;
 }
+#define mark_allocated(...)	alloc_hooks(mark_allocated_noprof(__VA_ARGS__))
 
-static void split_map_pages(struct list_head *list)
+static unsigned long release_free_list(struct list_head *freepages)
 {
-	unsigned int i, order, nr_pages;
-	struct page *page, *next;
-	LIST_HEAD(tmp_list);
-
-	list_for_each_entry_safe(page, next, list, lru) {
-		list_del(&page->lru);
+	int order;
+	unsigned long high_pfn = 0;
 
-		order = page_private(page);
-		nr_pages = 1 << order;
+	for (order = 0; order < NR_PAGE_ORDERS; order++) {
+		struct page *page, *next;
 
-		post_alloc_hook(page, order, __GFP_MOVABLE);
-		if (order)
-			split_page(page, order);
+		list_for_each_entry_safe(page, next, &freepages[order], lru) {
+			unsigned long pfn = page_to_pfn(page);
 
-		for (i = 0; i < nr_pages; i++) {
-			list_add(&page->lru, &tmp_list);
-			page++;
+			list_del(&page->lru);
+			/*
+			 * Convert free pages into post allocation pages, so
+			 * that we can free them via __free_page.
+			 */
+			mark_allocated(page, order, __GFP_MOVABLE);
+			__free_pages(page, order);
+			if (pfn > high_pfn)
+				high_pfn = pfn;
 		}
 	}
-
-	list_splice(&tmp_list, list);
+	return high_pfn;
 }
 
 #ifdef CONFIG_COMPACTION
@@ -625,7 +631,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 		if (PageCompound(page)) {
 			const unsigned int order = compound_order(page);
 
-			if (blockpfn + (1UL << order) <= end_pfn) {
+			if ((order <= MAX_PAGE_ORDER) &&
+			    (blockpfn + (1UL << order) <= end_pfn)) {
 				blockpfn += (1UL << order) - 1;
 				page += (1UL << order) - 1;
 				nr_scanned += (1UL << order) - 1;
@@ -657,7 +664,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 		nr_scanned += isolated - 1;
 		total_isolated += isolated;
 		cc->nr_freepages += isolated;
-		list_add_tail(&page->lru, freelist);
+		list_add_tail(&page->lru, &freelist[order]);
 
 		if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
 			blockpfn += isolated;
@@ -711,18 +718,21 @@ isolate_fail:
  *
  * Non-free pages, invalid PFNs, or zone boundaries within the
  * [start_pfn, end_pfn) range are considered errors, cause function to
- * undo its actions and return zero.
+ * undo its actions and return zero. cc->freepages[] are empty.
  *
  * Otherwise, function returns one-past-the-last PFN of isolated page
 * (which may be greater then end_pfn if end fell in a middle of
- * a free page).
+ * a free page). cc->freepages[] contain free pages isolated.
 */
 unsigned long
 isolate_freepages_range(struct compact_control *cc,
 			unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
-	LIST_HEAD(freelist);
+	int order;
+
+	for (order = 0; order < NR_PAGE_ORDERS; order++)
+		INIT_LIST_HEAD(&cc->freepages[order]);
 
 	pfn = start_pfn;
 	block_start_pfn = pageblock_start_pfn(pfn);
@@ -753,7 +763,7 @@ isolate_freepages_range(struct compact_control *cc,
 			break;
 
 		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
-					block_end_pfn, &freelist, 0, true);
+					block_end_pfn, cc->freepages, 0, true);
 
 		/*
 		 * In strict mode, isolate_freepages_block() returns 0 if
@@ -770,12 +780,9 @@ isolate_freepages_range(struct compact_control *cc,
 		 */
 	}
 
-	/* __isolate_free_page() does not map the pages */
-	split_map_pages(&freelist);
-
 	if (pfn < end_pfn) {
 		/* Loop terminated early, cleanup. */
-		release_freepages(&freelist);
+		release_free_list(cc->freepages);
 		return 0;
 	}
 
@@ -817,6 +824,32 @@ static bool too_many_isolated(struct compact_control *cc)
 }
 
 /**
+ * skip_isolation_on_order() - determine when to skip folio isolation based on
+ *			       folio order and compaction target order
+ * @order: to-be-isolated folio order
+ * @target_order: compaction target order
+ *
+ * This avoids unnecessary folio isolations during compaction.
+ */
+static bool skip_isolation_on_order(int order, int target_order)
+{
+	/*
+	 * Unless we are performing global compaction (i.e.,
+	 * is_via_compact_memory), skip any folios that are larger than the
+	 * target order: we wouldn't be here if we'd have a free folio with
+	 * the desired target_order, so migrating this folio would likely fail
+	 * later.
+	 */
+	if (!is_via_compact_memory(target_order) && order >= target_order)
+		return true;
+	/*
+	 * We limit memory compaction to pageblocks and won't try
+	 * creating free blocks of memory that are larger than that.
+	 */
+	return order >= pageblock_order;
+}
+
+/**
 * isolate_migratepages_block() - isolate all migrate-able pages within
 *				  a single pageblock
 * @cc:	Compaction control structure.
@@ -947,7 +980,22 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 			valid_page = page;
 		}
 
-		if (PageHuge(page) && cc->alloc_contig) {
+		if (PageHuge(page)) {
+			/*
+			 * skip hugetlbfs if we are not compacting for pages
+			 * bigger than its order. THPs and other compound pages
+			 * are handled below.
+			 */
+			if (!cc->alloc_contig) {
+				const unsigned int order = compound_order(page);
+
+				if (order <= MAX_PAGE_ORDER) {
+					low_pfn += (1UL << order) - 1;
+					nr_scanned += (1UL << order) - 1;
+				}
+				goto isolate_fail;
+			}
+			/* for alloc_contig case */
 			if (locked) {
 				unlock_page_lruvec_irqrestore(locked, flags);
 				locked = NULL;
@@ -1008,21 +1056,24 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		}
 
 		/*
-		 * Regardless of being on LRU, compound pages such as THP and
-		 * hugetlbfs are not to be compacted unless we are attempting
-		 * an allocation much larger than the huge page size (eg CMA).
-		 * We can potentially save a lot of iterations if we skip them
-		 * at once. The check is racy, but we can consider only valid
-		 * values and the only danger is skipping too much.
+		 * Regardless of being on LRU, compound pages such as THP
+		 * (hugetlbfs is handled above) are not to be compacted unless
+		 * we are attempting an allocation larger than the compound
+		 * page size. We can potentially save a lot of iterations if we
+		 * skip them at once. The check is racy, but we can consider
+		 * only valid values and the only danger is skipping too much.
 		 */
 		if (PageCompound(page) && !cc->alloc_contig) {
 			const unsigned int order = compound_order(page);
 
-			if (likely(order <= MAX_PAGE_ORDER)) {
-				low_pfn += (1UL << order) - 1;
-				nr_scanned += (1UL << order) - 1;
+			/* Skip based on page order and compaction target order. */
+			if (skip_isolation_on_order(order, cc->order)) {
+				if (order <= MAX_PAGE_ORDER) {
+					low_pfn += (1UL << order) - 1;
+					nr_scanned += (1UL << order) - 1;
+				}
+				goto isolate_fail;
 			}
-			goto isolate_fail;
 		}
 
 		/*
@@ -1100,22 +1151,22 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		if (((mode & ISOLATE_ASYNC_MIGRATE) && is_dirty) ||
 		    (mapping && is_unevictable)) {
 			bool migrate_dirty = true;
-			bool is_unmovable;
+			bool is_inaccessible;
 
 			/*
 			 * Only folios without mappings or that have
 			 * a ->migrate_folio callback are possible to migrate
 			 * without blocking.
 			 *
-			 * Folios from unmovable mappings are not migratable.
+			 * Folios from inaccessible mappings are not migratable.
 			 *
 			 * However, we can be racing with truncation, which can
 			 * free the mapping that we need to check. Truncation
 			 * holds the folio lock until after the folio is removed
 			 * from the page so holding it ourselves is sufficient.
 			 *
-			 * To avoid locking the folio just to check unmovable,
-			 * assume every unmovable folio is also unevictable,
+			 * To avoid locking the folio just to check inaccessible,
+			 * assume every inaccessible folio is also unevictable,
 			 * which is a cheaper test. If our assumption goes
 			 * wrong, it's not a correctness bug, just potentially
 			 * wasted cycles.
@@ -1128,9 +1179,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 				migrate_dirty = !mapping ||
 					mapping->a_ops->migrate_folio;
 			}
-			is_unmovable = mapping && mapping_unmovable(mapping);
+			is_inaccessible = mapping && mapping_inaccessible(mapping);
 			folio_unlock(folio);
-			if (!migrate_dirty || is_unmovable)
+			if (!migrate_dirty || is_inaccessible)
 				goto isolate_fail_put;
 		}
@@ -1165,10 +1216,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		}
 
 		/*
-		 * folio become large since the non-locked check,
-		 * and it's on LRU.
+		 * Check LRU folio order under the lock
 		 */
-		if (unlikely(folio_test_large(folio) && !cc->alloc_contig)) {
+		if (unlikely(skip_isolation_on_order(folio_order(folio),
+						     cc->order) &&
+			     !cc->alloc_contig)) {
 			low_pfn += folio_nr_pages(folio) - 1;
 			nr_scanned += folio_nr_pages(folio) - 1;
 			folio_set_lru(folio);
@@ -1365,12 +1417,14 @@ static bool suitable_migration_target(struct compact_control *cc,
 {
 	/* If the page is a large free page, then disallow migration */
 	if (PageBuddy(page)) {
+		int order = cc->order > 0 ? cc->order : pageblock_order;
+
 		/*
 		 * We are checking page_order without zone->lock taken. But
 		 * the only small danger is that we skip a potentially suitable
 		 * pageblock, so it's not worth to check order for valid range.
 		 */
-		if (buddy_order_unsafe(page) >= pageblock_order)
+		if (buddy_order_unsafe(page) >= order)
 			return false;
 	}
 
@@ -1458,7 +1512,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn)
 	if (!page)
 		return;
 
-	isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
+	isolate_freepages_block(cc, &start_pfn, end_pfn, cc->freepages, 1, false);
 
 	/* Skip this pageblock in the future as it's full or nearly full */
 	if (start_pfn == end_pfn && !cc->no_set_skip_hint)
@@ -1587,7 +1641,7 @@ static void fast_isolate_freepages(struct compact_control *cc)
 			nr_scanned += nr_isolated - 1;
 			total_isolated += nr_isolated;
 			cc->nr_freepages += nr_isolated;
-			list_add_tail(&page->lru, &cc->freepages);
+			list_add_tail(&page->lru, &cc->freepages[order]);
 			count_compact_events(COMPACTISOLATED, nr_isolated);
 		} else {
 			/* If isolation fails, abort the search */
@@ -1664,13 +1718,12 @@ static void isolate_freepages(struct compact_control *cc)
 	unsigned long isolate_start_pfn; /* exact pfn we start at */
 	unsigned long block_end_pfn;	/* end of current pageblock */
 	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
-	struct list_head *freelist = &cc->freepages;
 	unsigned int stride;
 
 	/* Try a small search of the free lists for a candidate */
 	fast_isolate_freepages(cc);
 	if (cc->nr_freepages)
-		goto splitmap;
+		return;
 
 	/*
 	 * Initialise the free scanner. The starting point is where we last
@@ -1730,7 +1783,7 @@ static void isolate_freepages(struct compact_control *cc)
 
 		/* Found a block suitable for isolating free pages from. */
 		nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn,
-					block_end_pfn, freelist, stride, false);
+					block_end_pfn, cc->freepages, stride, false);
 
 		/* Update the skip hint if the full pageblock was scanned */
 		if (isolate_start_pfn == block_end_pfn)
@@ -1771,33 +1824,63 @@ static void isolate_freepages(struct compact_control *cc)
 	 * and the loop terminated due to isolate_start_pfn < low_pfn
 	 */
 	cc->free_pfn = isolate_start_pfn;
-
-splitmap:
-	/* __isolate_free_page() does not map the pages */
-	split_map_pages(freelist);
 }
 
 /*
 * This is a migrate-callback that "allocates" freepages by taking pages
 * from the isolated freelists in the block we are migrating to.
 */
-static struct folio *compaction_alloc(struct folio *src, unsigned long data)
+static struct folio *compaction_alloc_noprof(struct folio *src, unsigned long data)
 {
 	struct compact_control *cc = (struct compact_control *)data;
 	struct folio *dst;
+	int order = folio_order(src);
+	bool has_isolated_pages = false;
+	int start_order;
+	struct page *freepage;
+	unsigned long size;
+
+again:
+	for (start_order = order; start_order < NR_PAGE_ORDERS; start_order++)
+		if (!list_empty(&cc->freepages[start_order]))
+			break;
 
-	if (list_empty(&cc->freepages)) {
+	/* no free pages in the list */
+	if (start_order == NR_PAGE_ORDERS) {
+		if (has_isolated_pages)
+			return NULL;
 		isolate_freepages(cc);
+		has_isolated_pages = true;
+		goto again;
+	}
 
-		if (list_empty(&cc->freepages))
-			return NULL;
+	freepage = list_first_entry(&cc->freepages[start_order], struct page,
+				lru);
+	size = 1 << start_order;
+
+	list_del(&freepage->lru);
+
+	while (start_order > order) {
+		start_order--;
+		size >>= 1;
+
+		list_add(&freepage[size].lru, &cc->freepages[start_order]);
+		set_page_private(&freepage[size], start_order);
 	}
+	dst = (struct folio *)freepage;
 
-	dst = list_entry(cc->freepages.next, struct folio, lru);
-	list_del(&dst->lru);
-	cc->nr_freepages--;
+	post_alloc_hook(&dst->page, order, __GFP_MOVABLE);
+	set_page_refcounted(&dst->page);
+	if (order)
+		prep_compound_page(&dst->page, order);
+	cc->nr_freepages -= 1 << order;
+	cc->nr_migratepages -= 1 << order;
+	return page_rmappable_folio(&dst->page);
+}
 
-	return dst;
+static struct folio *compaction_alloc(struct folio *src, unsigned long data)
+{
+	return alloc_hooks(compaction_alloc_noprof(src, data));
 }
 
 /*
@@ -1808,9 +1891,19 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data)
 static void compaction_free(struct folio *dst, unsigned long data)
 {
 	struct compact_control *cc = (struct compact_control *)data;
+	int order = folio_order(dst);
+	struct page *page = &dst->page;
 
-	list_add(&dst->lru, &cc->freepages);
-	cc->nr_freepages++;
+	if (folio_put_testzero(dst)) {
+		free_pages_prepare(page, order);
+		list_add(&dst->lru, &cc->freepages[order]);
+		cc->nr_freepages += 1 << order;
+	}
+	cc->nr_migratepages += 1 << order;
+	/*
+	 * someone else has referenced the page, we cannot take it back to our
+	 * free list.
+	 */
 }
 
 /* possible outcome of isolate_migratepages */
@@ -2087,17 +2180,6 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
 }
 
 /*
- * order == -1 is expected when compacting proactively via
- * 1. /proc/sys/vm/compact_memory
- * 2. /sys/devices/system/node/nodex/compact
- * 3. /proc/sys/vm/compaction_proactiveness
- */
-static inline bool is_via_compact_memory(int order)
-{
-	return order == -1;
-}
-
-/*
 * Determine whether kswapd is (or recently was!) running on this node.
 *
 * pgdat_kswapd_lock() pins pgdat->kswapd, so a concurrent kswapd_stop() can't
@@ -2409,7 +2491,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
 */
 static enum compact_result
 compaction_suit_allocation_order(struct zone *zone, unsigned int order,
-				 int highest_zoneidx, unsigned int alloc_flags)
+				 int highest_zoneidx, unsigned int alloc_flags,
+				 bool async)
 {
 	unsigned long watermark;
 
@@ -2418,6 +2501,23 @@ compaction_suit_allocation_order(struct zone *zone, unsigned int order,
 			      alloc_flags))
 		return COMPACT_SUCCESS;
 
+	/*
+	 * For unmovable allocations (without ALLOC_CMA), check if there is enough
+	 * free memory in the non-CMA pageblocks. Otherwise compaction could form
+	 * the high-order page in CMA pageblocks, which would not help the
+	 * allocation to succeed. However, limit the check to costly order async
+	 * compaction (such as opportunistic THP attempts) because there is the
+	 * possibility that compaction would migrate pages from non-CMA to CMA
+	 * pageblock.
+	 */
+	if (order > PAGE_ALLOC_COSTLY_ORDER && async &&
+	    !(alloc_flags & ALLOC_CMA)) {
+		watermark = low_wmark_pages(zone) + compact_gap(order);
+		if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
+					 0, zone_page_state(zone, NR_FREE_PAGES)))
+			return COMPACT_SKIPPED;
+	}
+
 	if (!compaction_suitable(zone, order, highest_zoneidx))
 		return COMPACT_SKIPPED;
 
@@ -2433,7 +2533,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
 	unsigned long last_migrated_pfn;
 	const bool sync = cc->mode != MIGRATE_ASYNC;
 	bool update_cached;
-	unsigned int nr_succeeded = 0;
+	unsigned int nr_succeeded = 0, nr_migratepages;
+	int order;
 
 	/*
 	 * These counters track activities during zone compaction. Initialize
@@ -2443,7 +2544,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
 	cc->total_free_scanned = 0;
 	cc->nr_migratepages = 0;
 	cc->nr_freepages = 0;
-	INIT_LIST_HEAD(&cc->freepages);
+	for (order = 0; order < NR_PAGE_ORDERS; order++)
+		INIT_LIST_HEAD(&cc->freepages[order]);
 	INIT_LIST_HEAD(&cc->migratepages);
 
 	cc->migratetype = gfp_migratetype(cc->gfp_mask);
@@ -2451,7 +2553,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
 	if (!is_via_compact_memory(cc->order)) {
 		ret = compaction_suit_allocation_order(cc->zone, cc->order,
 						       cc->highest_zoneidx,
-						       cc->alloc_flags);
+						       cc->alloc_flags,
+						       cc->mode == MIGRATE_ASYNC);
 		if (ret != COMPACT_CONTINUE)
 			return ret;
 	}
@@ -2551,11 +2654,17 @@ rescan:
 				pageblock_start_pfn(cc->migrate_pfn - 1));
 		}
 
+		/*
+		 * Record the number of pages to migrate since the
+		 * compaction_alloc/free() will update cc->nr_migratepages
+		 * properly.
+		 */
+		nr_migratepages = cc->nr_migratepages;
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
 				compaction_free, (unsigned long)cc, cc->mode,
 				MR_COMPACTION, &nr_succeeded);
 
-		trace_mm_compaction_migratepages(cc, nr_succeeded);
+		trace_mm_compaction_migratepages(nr_migratepages, nr_succeeded);
 
 		/* All pages were either migrated or will be released */
 		cc->nr_migratepages = 0;
@@ -2629,7 +2738,7 @@ out:
 	 * so we don't leave any returned pages behind in the next attempt.
 	 */
 	if (cc->nr_freepages > 0) {
-		unsigned long free_pfn = release_freepages(&cc->freepages);
+		unsigned long free_pfn = release_free_list(cc->freepages);
 
 		cc->nr_freepages = 0;
 		VM_BUG_ON(free_pfn == 0);
@@ -2648,7 +2757,6 @@ out:
 
 	trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret);
 
-	VM_BUG_ON(!list_empty(&cc->freepages));
 	VM_BUG_ON(!list_empty(&cc->migratepages));
 
 	return ret;
@@ -2737,6 +2845,11 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 					ac->highest_zoneidx, ac->nodemask) {
 		enum compact_result status;
 
+		if (cpusets_enabled() &&
+		    (alloc_flags & ALLOC_CPUSET) &&
+		    !__cpuset_zone_allowed(zone, gfp_mask))
+			continue;
+
 		if (prio > MIN_COMPACT_PRIORITY
 					&& compaction_deferred(zone, order)) {
 			rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
@@ -2783,25 +2896,27 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 }
 
 /*
- * Compact all zones within a node till each zone's fragmentation score
- * reaches within proactive compaction thresholds (as determined by the
- * proactiveness tunable).
+ * compact_node() - compact all zones within a node
+ * @pgdat: The node page data
+ * @proactive: Whether the compaction is proactive
 *
- * It is possible that the function returns before reaching score targets
- * due to various back-off conditions, such as, contention on per-node or
- * per-zone locks.
+ * For proactive compaction, compact till each zone's fragmentation score
+ * reaches within proactive compaction thresholds (as determined by the
+ * proactiveness tunable), it is possible that the function returns before
+ * reaching score targets due to various back-off conditions, such as,
+ * contention on per-node or per-zone locks.
 */
-static void proactive_compact_node(pg_data_t *pgdat)
+static int compact_node(pg_data_t *pgdat, bool proactive)
 {
 	int zoneid;
 	struct zone *zone;
 	struct compact_control cc = {
 		.order = -1,
-		.mode = MIGRATE_SYNC_LIGHT,
+		.mode = proactive ? MIGRATE_SYNC_LIGHT : MIGRATE_SYNC,
 		.ignore_skip_hint = true,
 		.whole_zone = true,
 		.gfp_mask = GFP_KERNEL,
-		.proactive_compaction = true,
+		.proactive_compaction = proactive,
 	};
 
 	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
@@ -2809,57 +2924,42 @@ static void proactive_compact_node(pg_data_t *pgdat)
 		if (!populated_zone(zone))
 			continue;
 
+		if (fatal_signal_pending(current))
+			return -EINTR;
+
 		cc.zone = zone;
 
 		compact_zone(&cc, NULL);
 
-		count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
-				     cc.total_migrate_scanned);
-		count_compact_events(KCOMPACTD_FREE_SCANNED,
-				     cc.total_free_scanned);
+		if (proactive) {
+			count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
+					     cc.total_migrate_scanned);
+			count_compact_events(KCOMPACTD_FREE_SCANNED,
+					     cc.total_free_scanned);
+		}
 	}
-}
-
-/* Compact all zones within a node */
-static void compact_node(int nid)
-{
-	pg_data_t *pgdat = NODE_DATA(nid);
-	int zoneid;
-	struct zone *zone;
-	struct compact_control cc = {
-		.order = -1,
-		.mode = MIGRATE_SYNC,
-		.ignore_skip_hint = true,
-		.whole_zone = true,
-		.gfp_mask = GFP_KERNEL,
-	};
-
-
-	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
-
-		zone = &pgdat->node_zones[zoneid];
-		if (!populated_zone(zone))
-			continue;
-		cc.zone = zone;
-
-		compact_zone(&cc, NULL);
-	}
+	return 0;
 }
 
-/* Compact all nodes in the system */
-static void compact_nodes(void)
+/* Compact all zones of all nodes in the system */
+static int compact_nodes(void)
 {
-	int nid;
+	int ret, nid;
 
 	/* Flush pending updates to the LRU lists */
 	lru_add_drain_all();
 
-	for_each_online_node(nid)
-		compact_node(nid);
+	for_each_online_node(nid) {
+		ret = compact_node(NODE_DATA(nid), false);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
 }
 
-static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
+static int compaction_proactiveness_sysctl_handler(const struct ctl_table *table, int write,
 		void *buffer, size_t *length, loff_t *ppos)
 {
 	int rc, nid;
@@ -2889,7 +2989,7 @@ static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int
 * This is the entry point for compacting all nodes via
 * /proc/sys/vm/compact_memory
 */
-static int sysctl_compaction_handler(struct ctl_table *table, int write,
+static int sysctl_compaction_handler(const struct ctl_table *table, int write,
 			void *buffer, size_t *length, loff_t *ppos)
 {
 	int ret;
@@ -2902,9 +3002,9 @@ static int sysctl_compaction_handler(struct ctl_table *table, int write,
 		return -EINVAL;
 
 	if (write)
-		compact_nodes();
+		ret = compact_nodes();
 
-	return 0;
+	return ret;
 }
 
 #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
@@ -2918,7 +3018,7 @@ static ssize_t compact_store(struct device *dev,
 		/* Flush pending updates to the LRU lists */
 		lru_add_drain_all();
 
-		compact_node(nid);
+		compact_node(NODE_DATA(nid), false);
 	}
 
 	return count;
@@ -2957,7 +3057,8 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
 
 		ret = compaction_suit_allocation_order(zone,
 				pgdat->kcompactd_max_order,
-				highest_zoneidx, ALLOC_WMARK_MIN);
+				highest_zoneidx, ALLOC_WMARK_MIN,
+				false);
 		if (ret == COMPACT_CONTINUE)
 			return true;
 	}
@@ -2998,7 +3099,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 			continue;
 
 		ret = compaction_suit_allocation_order(zone,
-				cc.order, zoneid, ALLOC_WMARK_MIN);
+				cc.order, zoneid, ALLOC_WMARK_MIN,
+				false);
 		if (ret != COMPACT_CONTINUE)
 			continue;
 
@@ -3076,15 +3178,10 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
 static int kcompactd(void *p)
 {
 	pg_data_t *pgdat = (pg_data_t *)p;
-	struct task_struct *tsk = current;
 	long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC);
 	long timeout = default_timeout;
 
-	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
-
-	if (!cpumask_empty(cpumask))
-		set_cpus_allowed_ptr(tsk, cpumask);
-
+	current->flags |= PF_KCOMPACTD;
 	set_freezable();
 
 	pgdat->kcompactd_max_order = 0;
@@ -3127,7 +3224,7 @@ static int kcompactd(void *p)
 			unsigned int prev_score, score;
 
 			prev_score = fragmentation_score_node(pgdat);
-			proactive_compact_node(pgdat);
+			compact_node(pgdat, true);
 			score = fragmentation_score_node(pgdat);
 			/*
 			 * Defer proactive compaction if the fragmentation
@@ -3141,6 +3238,8 @@ static int kcompactd(void *p)
 			pgdat->proactive_compact_trigger = false;
 	}
 
+	current->flags &= ~PF_KCOMPACTD;
+
 	return 0;
 }
 
@@ -3155,10 +3254,12 @@ void __meminit kcompactd_run(int nid)
 	if (pgdat->kcompactd)
 		return;
 
-	pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
+	pgdat->kcompactd = kthread_create_on_node(kcompactd, pgdat, nid, "kcompactd%d", nid);
 	if (IS_ERR(pgdat->kcompactd)) {
 		pr_err("Failed to start kcompactd on node %d\n", nid);
 		pgdat->kcompactd = NULL;
+	} else {
+		wake_up_process(pgdat->kcompactd);
 	}
 }
 
@@ -3176,31 +3277,7 @@ void __meminit kcompactd_stop(int nid)
 	}
 }
 
-/*
- * It's optimal to keep kcompactd on the same CPUs as their memory, but
- * not required for correctness. So if the last cpu in a node goes
- * away, we get changed to run anywhere: as the first one comes back,
- * restore their cpu bindings.
- */
-static int kcompactd_cpu_online(unsigned int cpu)
-{
-	int nid;
-
-	for_each_node_state(nid, N_MEMORY) {
-		pg_data_t *pgdat = NODE_DATA(nid);
-		const struct cpumask *mask;
-
-		mask = cpumask_of_node(pgdat->node_id);
-
-		if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
-			/* One of our CPUs online: restore mask */
-			if (pgdat->kcompactd)
-				set_cpus_allowed_ptr(pgdat->kcompactd, mask);
-	}
-	return 0;
-}
-
-static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
+static int proc_dointvec_minmax_warn_RT_change(const struct ctl_table *table,
 		int write, void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret, old;
@@ -3219,7 +3296,7 @@ static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
 	return ret;
 }
 
-static struct ctl_table vm_compaction[] = {
+static const struct ctl_table vm_compaction[] = {
 	{
 		.procname	= "compact_memory",
 		.data		= &sysctl_compact_memory,
@@ -3254,21 +3331,11 @@ static struct ctl_table vm_compaction[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_ONE,
 	},
-	{ }
 };
 
 static int __init kcompactd_init(void)
 {
 	int nid;
-	int ret;
-
-	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
-					"mm/compaction:online",
-					kcompactd_cpu_online, NULL);
-	if (ret < 0) {
-		pr_err("kcompactd: failed to register hotplug callbacks.\n");
-		return ret;
-	}
 
 	for_each_node_state(nid, N_MEMORY)
 		kcompactd_run(nid);
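
Notes on the larger hunks above. All code below is illustrative, stand-alone userspace C, not kernel code; structure names, constants and helper names are assumptions made for the examples.

The new compaction_alloc_noprof() keeps isolated free pages on per-order lists (cc->freepages[NR_PAGE_ORDERS]) and, when no block of the requested order is available, takes the smallest larger block and splits it down, returning the upper halves to the lower-order lists. A minimal sketch of that splitting loop, assuming 11 orders and identifying a block only by its starting pfn:

#include <stdio.h>
#include <stdlib.h>

#define NR_PAGE_ORDERS 11		/* assumption: orders 0..10 */

/* A free block is identified by its starting pfn and kept on a per-order list. */
struct block {
	unsigned long pfn;
	struct block *next;
};

static struct block *freelist[NR_PAGE_ORDERS];

static void push(int order, unsigned long pfn)
{
	struct block *b = malloc(sizeof(*b));

	b->pfn = pfn;
	b->next = freelist[order];
	freelist[order] = b;
}

/*
 * Take the smallest available block of at least @order pages and split it
 * down, handing the upper halves back to the lower-order lists -- the same
 * shape as the splitting loop in compaction_alloc_noprof().
 */
static long alloc_order(int order)
{
	int start_order;
	struct block *b;
	unsigned long pfn, size;

	for (start_order = order; start_order < NR_PAGE_ORDERS; start_order++)
		if (freelist[start_order])
			break;

	if (start_order == NR_PAGE_ORDERS)
		return -1;	/* the caller would isolate more free pages here */

	b = freelist[start_order];
	freelist[start_order] = b->next;
	pfn = b->pfn;
	size = 1UL << start_order;
	free(b);

	while (start_order > order) {
		start_order--;
		size >>= 1;
		/* the upper half becomes a free block of the lower order */
		push(start_order, pfn + size);
	}
	return (long)pfn;
}

int main(void)
{
	push(9, 512);		/* one pageblock-sized free chunk at pfn 512 */
	printf("order-2 block at pfn %ld\n", alloc_order(2));
	for (int o = 0; o < NR_PAGE_ORDERS; o++)
		for (struct block *b = freelist[o]; b; b = b->next)
			printf("  leftover: order %d at pfn %lu\n", o, b->pfn);
	return 0;
}

Running this allocates the order-2 block at pfn 512 and leaves one leftover block at each order from 2 up to 8, mirroring how the kernel hands the buddy halves back via set_page_private().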
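release_free_list(), which replaces release_freepages(), walks every order's list, frees each block (after mark_allocated() has turned it back into a normally allocated page so __free_pages() can take it) and reports the highest starting PFN it returned so the free scanner can be rewound. A toy version over plain malloc'd nodes, with the per-order array and the pfn tracking as the only point being illustrated:

#include <stdio.h>
#include <stdlib.h>

#define NR_PAGE_ORDERS 11	/* assumption: orders 0..10 */

struct block {
	unsigned long pfn;
	struct block *next;
};

/*
 * Walk every order's list, release each block and remember the highest
 * starting pfn seen -- the caller uses it to rewind the free scanner.
 */
static unsigned long release_free_list_model(struct block *freepages[NR_PAGE_ORDERS])
{
	unsigned long high_pfn = 0;

	for (int order = 0; order < NR_PAGE_ORDERS; order++) {
		struct block *b = freepages[order];

		while (b) {
			struct block *next = b->next;

			if (b->pfn > high_pfn)
				high_pfn = b->pfn;
			free(b);	/* stands in for __free_pages(page, order) */
			b = next;
		}
		freepages[order] = NULL;
	}
	return high_pfn;
}

int main(void)
{
	struct block *freepages[NR_PAGE_ORDERS] = { NULL };
	unsigned long pfns[] = { 4096, 128, 70000 };

	for (unsigned i = 0; i < sizeof(pfns) / sizeof(pfns[0]); i++) {
		struct block *b = malloc(sizeof(*b));

		b->pfn = pfns[i];
		b->next = freepages[i % 3];	/* spread across a few orders */
		freepages[i % 3] = b;
	}
	printf("highest released pfn: %lu\n", release_free_list_model(freepages));
	return 0;
}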
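skip_isolation_on_order() decides whether a large folio is worth isolating at all: unless this is global (order == -1) compaction, anything at or above the compaction target order is skipped, and anything at or above pageblock order is always skipped. A small standalone model of that predicate, with pageblock_order hard-coded to 9 purely for illustration:

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_ORDER 9	/* assumption: a typical value */

static bool is_via_compact_memory(int order)
{
	return order == -1;	/* global compaction via the proc/sysfs knobs */
}

static bool skip_isolation_on_order(int order, int target_order)
{
	/* A folio >= the target order cannot help form a target-order block. */
	if (!is_via_compact_memory(target_order) && order >= target_order)
		return true;
	/* Compaction never tries to build blocks larger than a pageblock. */
	return order >= PAGEBLOCK_ORDER;
}

int main(void)
{
	int folio_orders[] = { 0, 2, 4, 9 };
	int targets[] = { -1, 4, 9 };

	for (unsigned i = 0; i < sizeof(targets) / sizeof(targets[0]); i++)
		for (unsigned j = 0; j < sizeof(folio_orders) / sizeof(folio_orders[0]); j++)
			printf("target %2d, folio order %d -> %s\n",
			       targets[i], folio_orders[j],
			       skip_isolation_on_order(folio_orders[j], targets[i]) ?
			       "skip" : "isolate");
	return 0;
}

The same predicate is applied twice in isolate_migratepages_block(): once racily before taking the LRU lock and once again under the lock, which is why the locked re-check in the diff calls it with folio_order(folio).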
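The new bool async parameter of compaction_suit_allocation_order() gates an extra watermark check: costly-order async compaction for an allocation that may not use CMA is skipped when the free memory outside CMA cannot cover the low watermark plus the compaction gap. The following is a hedged model of just that decision; compact_gap() as 2 << order mirrors the helper in mm/internal.h, while the real code calls __zone_watermark_ok() without ALLOC_CMA (so CMA free pages are excluded inside it) rather than taking a plain "non-CMA free pages" number as done here:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER 3	/* as in the kernel */

enum compact_result { COMPACT_CONTINUE, COMPACT_SKIPPED };

/* Room compaction needs above the watermark to make progress. */
static unsigned long compact_gap(unsigned int order)
{
	return 2UL << order;
}

/*
 * Model of the gate: only costly-order *async* compaction for allocations
 * that may not use CMA pageblocks performs the extra non-CMA watermark check.
 */
static enum compact_result
suit_allocation_order(unsigned int order, bool async, bool alloc_cma,
		      unsigned long free_non_cma_pages, unsigned long low_wmark)
{
	if (order > PAGE_ALLOC_COSTLY_ORDER && async && !alloc_cma) {
		unsigned long watermark = low_wmark + compact_gap(order);

		if (free_non_cma_pages < watermark)
			return COMPACT_SKIPPED;
	}
	return COMPACT_CONTINUE;
}

int main(void)
{
	/* order-9 THP attempt, async, no CMA allowed, little non-CMA memory free */
	printf("tight non-CMA memory: %s\n",
	       suit_allocation_order(9, true, false, 1500, 1024) == COMPACT_SKIPPED ?
	       "COMPACT_SKIPPED" : "COMPACT_CONTINUE");
	/* same request once enough non-CMA memory is free */
	printf("plenty of non-CMA memory: %s\n",
	       suit_allocation_order(9, true, false, 4096, 1024) == COMPACT_SKIPPED ?
	       "COMPACT_SKIPPED" : "COMPACT_CONTINUE");
	return 0;
}

compact_zone() passes cc->mode == MIGRATE_ASYNC for this parameter, while the kcompactd callers pass false, so the extra bail-out only affects opportunistic async attempts.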
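compaction_free() now has to cope with an unused destination folio whose reference count may have been raised by someone else: only when the put drops the count to zero does the block go back on the per-order free list, while cc->nr_migratepages is credited either way because the source folio was not migrated. A simplified sketch of that control flow, with a plain counter standing in for the folio refcount and the free-list handling reduced to book-keeping:

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the folio and the compact_control book-keeping. */
struct fake_folio {
	int refcount;
	int order;
};

struct fake_cc {
	unsigned long nr_freepages;
	unsigned long nr_migratepages;
};

/* Drop one reference; true when the caller held the last one. */
static bool put_testzero(struct fake_folio *folio)
{
	return --folio->refcount == 0;
}

static void compaction_free_model(struct fake_cc *cc, struct fake_folio *dst)
{
	if (put_testzero(dst)) {
		/* last reference: the block can go back on the free list */
		cc->nr_freepages += 1UL << dst->order;
	}
	/* the source folio was not migrated, so it stays to be migrated again */
	cc->nr_migratepages += 1UL << dst->order;
}

int main(void)
{
	struct fake_cc cc = { 0, 0 };
	struct fake_folio only_ours = { .refcount = 1, .order = 2 };
	struct fake_folio shared    = { .refcount = 2, .order = 2 };

	compaction_free_model(&cc, &only_ours);
	compaction_free_model(&cc, &shared);
	printf("freepages credited: %lu, migratepages credited: %lu\n",
	       cc.nr_freepages, cc.nr_migratepages);	/* 4 and 8 */
	return 0;
}

This asymmetric accounting is also why compact_zone() now snapshots nr_migratepages before migrate_pages() and passes the snapshot to the tracepoint.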
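Finally, proactive_compact_node() and the old compact_node() are folded into one compact_node(pgdat, proactive): the single bool selects the migration mode, the KCOMPACTD_* event accounting and whether a fatal signal aborts the walk with -EINTR (which the compact_memory sysctl now propagates). A small model of that shape; the types, the interrupted flag and the printout are invented for the example:

#include <stdbool.h>
#include <stdio.h>

enum migrate_mode { MIGRATE_SYNC_LIGHT, MIGRATE_SYNC };

struct node { int id; bool populated; };

/* Stand-in for compact_zone() plus the proactive-only vmstat accounting. */
static void compact_one(int node_id, enum migrate_mode mode, bool proactive)
{
	printf("node %d: mode=%s%s\n", node_id,
	       mode == MIGRATE_SYNC ? "MIGRATE_SYNC" : "MIGRATE_SYNC_LIGHT",
	       proactive ? ", counting KCOMPACTD_* events" : "");
}

/*
 * One entry point for both callers; the bool selects the migration mode and
 * the accounting, and interrupted stands in for fatal_signal_pending().
 */
static int compact_node_model(const struct node *n, bool proactive, bool interrupted)
{
	if (!n->populated)
		return 0;
	if (interrupted)
		return -4;	/* -EINTR, as the sysctl path now reports */
	compact_one(n->id, proactive ? MIGRATE_SYNC_LIGHT : MIGRATE_SYNC, proactive);
	return 0;
}

int main(void)
{
	struct node node0 = { 0, true };

	compact_node_model(&node0, false, false);	/* /proc/sys/vm/compact_memory path */
	compact_node_model(&node0, true, false);	/* kcompactd proactive path */
	printf("interrupted run returns %d\n", compact_node_model(&node0, false, true));
	return 0;
}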