Diffstat (limited to 'mm/mempolicy.c')
| -rw-r--r-- | mm/mempolicy.c | 2197 |
1 file changed, 1487 insertions, 710 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 02c8a712282f..68a98ba57882 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -8,7 +8,7 @@ * NUMA policy allows the user to give hints in which node(s) memory should * be allocated. * - * Support four policies per VMA and per process: + * Support six policies per VMA and per process: * * The VMA policy has priority over the process policy for a page fault. * @@ -19,13 +19,20 @@ * for anonymous memory. For process policy an process counter * is used. * + * weighted interleave + * Allocate memory interleaved over a set of nodes based on + * a set of weights (per-node), with normal fallback if it + * fails. Otherwise operates the same as interleave. + * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated + * on node 0 for every 1 page allocated on node 1. + * * bind Only allocate memory on a specific set of nodes, * no fallback. * FIXME: memory is allocated starting with the first node * to the last. It would be better if bind would truly restrict * the allocation to memory nodes instead * - * preferred Try a specific node first before normal fallback. + * preferred Try a specific node first before normal fallback. * As a special case NUMA_NO_NODE here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default @@ -52,7 +59,7 @@ * on systems with highmem kernel lowmem allocation don't get policied. * Same with GFP_DMA allocations. * - * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between + * For shmem/tmpfs shared memory the policy is shared between * all users and remembered even when nobody has memory mapped. */ @@ -78,6 +85,7 @@ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> +#include <linux/sched/sysctl.h> #include <linux/sched/task.h> #include <linux/nodemask.h> #include <linux/cpuset.h> @@ -92,6 +100,7 @@ #include <linux/swap.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> +#include <linux/memory-tiers.h> #include <linux/migrate.h> #include <linux/ksm.h> #include <linux/rmap.h> @@ -101,17 +110,20 @@ #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/printk.h> -#include <linux/swapops.h> +#include <linux/leafops.h> +#include <linux/gcd.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <linux/uaccess.h> +#include <linux/memory.h> #include "internal.h" /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ -#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ +#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ +#define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */ static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; @@ -130,23 +142,191 @@ static struct mempolicy default_policy = { static struct mempolicy preferred_node_policy[MAX_NUMNODES]; +/* + * weightiness balances the tradeoff between small weights (cycles through nodes + * faster, more fair/even distribution) and large weights (smaller errors + * between actual bandwidth ratios and weight ratios). 32 is a number that has + * been found to perform at a reasonable compromise between the two goals. + */ +static const int weightiness = 32; + +/* + * A null weighted_interleave_state is interpreted as having .mode="auto", + * and .iw_table is interpreted as an array of 1s with length nr_node_ids. 
+ */ +struct weighted_interleave_state { + bool mode_auto; + u8 iw_table[]; +}; +static struct weighted_interleave_state __rcu *wi_state; +static unsigned int *node_bw_table; + +/* + * wi_state_lock protects both wi_state and node_bw_table. + * node_bw_table is only used by writers to update wi_state. + */ +static DEFINE_MUTEX(wi_state_lock); + +static u8 get_il_weight(int node) +{ + struct weighted_interleave_state *state; + u8 weight = 1; + + rcu_read_lock(); + state = rcu_dereference(wi_state); + if (state) + weight = state->iw_table[node]; + rcu_read_unlock(); + return weight; +} + +/* + * Convert bandwidth values into weighted interleave weights. + * Call with wi_state_lock. + */ +static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw) +{ + u64 sum_bw = 0; + unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0; + int nid; + + for_each_node_state(nid, N_MEMORY) + sum_bw += bw[nid]; + + /* Scale bandwidths to whole numbers in the range [1, weightiness] */ + for_each_node_state(nid, N_MEMORY) { + /* + * Try not to perform 64-bit division. + * If sum_bw < scaling_factor, then sum_bw < U32_MAX. + * If sum_bw > scaling_factor, then round the weight up to 1. + */ + scaling_factor = weightiness * bw[nid]; + if (bw[nid] && sum_bw < scaling_factor) { + cast_sum_bw = (unsigned int)sum_bw; + new_iw[nid] = scaling_factor / cast_sum_bw; + } else { + new_iw[nid] = 1; + } + if (!iw_gcd) + iw_gcd = new_iw[nid]; + iw_gcd = gcd(iw_gcd, new_iw[nid]); + } + + /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */ + for_each_node_state(nid, N_MEMORY) + new_iw[nid] /= iw_gcd; +} + +int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords) +{ + struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; + unsigned int *old_bw, *new_bw; + unsigned int bw_val; + int i; + + bw_val = min(coords->read_bandwidth, coords->write_bandwidth); + new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL); + if (!new_bw) + return -ENOMEM; + + new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids), + GFP_KERNEL); + if (!new_wi_state) { + kfree(new_bw); + return -ENOMEM; + } + new_wi_state->mode_auto = true; + for (i = 0; i < nr_node_ids; i++) + new_wi_state->iw_table[i] = 1; + + /* + * Update bandwidth info, even in manual mode. That way, when switching + * to auto mode in the future, iw_table can be overwritten using + * accurate bw data. + */ + mutex_lock(&wi_state_lock); + + old_bw = node_bw_table; + if (old_bw) + memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw)); + new_bw[node] = bw_val; + node_bw_table = new_bw; + + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + if (old_wi_state && !old_wi_state->mode_auto) { + /* Manual mode; skip reducing weights and updating wi_state */ + mutex_unlock(&wi_state_lock); + kfree(new_wi_state); + goto out; + } + + /* NULL wi_state assumes auto=true; reduce weights and update wi_state*/ + reduce_interleave_weights(new_bw, new_wi_state->iw_table); + rcu_assign_pointer(wi_state, new_wi_state); + + mutex_unlock(&wi_state_lock); + if (old_wi_state) { + synchronize_rcu(); + kfree(old_wi_state); + } +out: + kfree(old_bw); + return 0; +} + /** - * numa_map_to_online_node - Find closest online node + * numa_nearest_node - Find nearest node by state * @node: Node id to start the search + * @state: State to filter the search * - * Lookup the next closest node by distance if @nid is not online. + * Lookup the closest node by distance if @nid is not in state. 
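[Editor's note, not part of the patch] The reduce_interleave_weights() hunk above scales each node's bandwidth into the [1, weightiness] range and then divides by the GCD of the resulting weights. A minimal userspace sketch of that arithmetic, using made-up bandwidth numbers (24 GB/s DRAM vs 8 GB/s CXL):

#include <stdio.h>

#define WEIGHTINESS 32	/* mirrors the kernel's "weightiness" constant */

static unsigned int gcd_u(unsigned int a, unsigned int b)
{
	while (b) {
		unsigned int t = a % b;

		a = b;
		b = t;
	}
	return a;
}

int main(void)
{
	/* hypothetical min(read_bw, write_bw) per node, in MB/s */
	unsigned int bw[] = { 24000, 8000 };
	unsigned int n = sizeof(bw) / sizeof(bw[0]);
	unsigned long long sum_bw = 0;
	unsigned int iw[2], iw_gcd = 0;

	for (unsigned int i = 0; i < n; i++)
		sum_bw += bw[i];

	/* scale bandwidths to whole numbers in the range [1, WEIGHTINESS] */
	for (unsigned int i = 0; i < n; i++) {
		unsigned long long scaled = (unsigned long long)WEIGHTINESS * bw[i];

		iw[i] = (bw[i] && sum_bw < scaled) ? (unsigned int)(scaled / sum_bw) : 1;
		iw_gcd = iw_gcd ? gcd_u(iw_gcd, iw[i]) : iw[i];
	}

	/* reduce by the GCD: 24:8 becomes 3:1 */
	for (unsigned int i = 0; i < n; i++)
		printf("node %u: bw %u -> weight %u\n", i, bw[i], iw[i] / iw_gcd);
	return 0;
}

With these inputs the printed weights are 3 and 1, i.e. three pages allocated on node 0 for every one on node 1.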
* - * Return: this @node if it is online, otherwise the closest node by distance + * Return: this @node if it is in state, otherwise the closest node by distance */ -int numa_map_to_online_node(int node) +int numa_nearest_node(int node, unsigned int state) { int min_dist = INT_MAX, dist, n, min_node; - if (node == NUMA_NO_NODE || node_online(node)) + if (state >= NR_NODE_STATES) + return -EINVAL; + + if (node == NUMA_NO_NODE || node_state(node, state)) return node; min_node = node; - for_each_online_node(n) { + for_each_node_state(n, state) { + dist = node_distance(node, n); + if (dist < min_dist) { + min_dist = dist; + min_node = n; + } + } + + return min_node; +} +EXPORT_SYMBOL_GPL(numa_nearest_node); + +/** + * nearest_node_nodemask - Find the node in @mask at the nearest distance + * from @node. + * + * @node: a valid node ID to start the search from. + * @mask: a pointer to a nodemask representing the allowed nodes. + * + * This function iterates over all nodes in @mask and calculates the + * distance from the starting @node, then it returns the node ID that is + * the closest to @node, or MAX_NUMNODES if no node is found. + * + * Note that @node must be a valid node ID usable with node_distance(), + * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes + * or unexpected behavior. + */ +int nearest_node_nodemask(int node, nodemask_t *mask) +{ + int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES; + + for_each_node_mask(n, *mask) { dist = node_distance(node, n); if (dist < min_dist) { min_dist = dist; @@ -156,7 +336,7 @@ int numa_map_to_online_node(int node) return min_node; } -EXPORT_SYMBOL_GPL(numa_map_to_online_node); +EXPORT_SYMBOL_GPL(nearest_node_nodemask); struct mempolicy *get_task_policy(struct task_struct *p) { @@ -176,6 +356,7 @@ struct mempolicy *get_task_policy(struct task_struct *p) return &default_policy; } +EXPORT_SYMBOL_FOR_MODULES(get_task_policy, "kvm"); static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); @@ -263,9 +444,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, { struct mempolicy *policy; - pr_debug("setting mode %d flags %d nodes[0] %lx\n", - mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE); - if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); @@ -293,6 +471,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, return ERR_PTR(-EINVAL); } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); + policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); @@ -305,12 +484,13 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, } /* Slow path of a mpol destructor. */ -void __mpol_put(struct mempolicy *p) +void __mpol_put(struct mempolicy *pol) { - if (!atomic_dec_and_test(&p->refcnt)) + if (!atomic_dec_and_test(&pol->refcnt)) return; - kmem_cache_free(policy_cache, p); + kmem_cache_free(policy_cache, pol); } +EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm"); static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { @@ -366,7 +546,6 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) * * Called with task's alloc_lock held. */ - void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { mpol_rebind_policy(tsk->mempolicy, new); @@ -377,15 +556,16 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) * * Call holding a reference to mm. 
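[Editor's note, not part of the patch] numa_nearest_node() and nearest_node_nodemask() above both do a linear scan of node_distance() looking for the minimum. A self-contained sketch of the same search over a hypothetical 4-node SLIT-style distance table:

#include <limits.h>
#include <stdio.h>

#define NR_NODES 4

static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 80 },
	{ 40, 40, 80, 10 },
};

/* return the node in @mask (one bit per node) nearest to @from, or -1 */
static int nearest_in_mask(int from, unsigned int mask)
{
	int best = -1, best_dist = INT_MAX;

	for (int n = 0; n < NR_NODES; n++) {
		if (!(mask & (1u << n)))
			continue;
		if (node_distance[from][n] < best_dist) {
			best_dist = node_distance[from][n];
			best = n;
		}
	}
	return best;
}

int main(void)
{
	/* e.g. only nodes 2 and 3 have memory: which is closer to node 0? */
	printf("nearest to node 0 in {2,3}: node %d\n",
	       nearest_in_mask(0, (1u << 2) | (1u << 3)));
	return 0;
}

Like the kernel loop, ties keep the first (lowest-numbered) node found, so this prints node 2.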
Takes mm->mmap_lock during call. */ - void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); mmap_write_lock(mm); - for_each_vma(vmi, vma) + for_each_vma(vmi, vma) { + vma_start_write(vma); mpol_rebind_policy(vma->vm_policy, new); + } mmap_write_unlock(mm); } @@ -412,10 +592,31 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_preferred, }, + [MPOL_WEIGHTED_INTERLEAVE] = { + .create = mpol_new_nodemask, + .rebind = mpol_rebind_nodemask, + }, }; -static int migrate_page_add(struct page *page, struct list_head *pagelist, +static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); +static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, + pgoff_t ilx, int *nid); + +static bool strictly_unmovable(unsigned long flags) +{ + /* + * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO + * if any misplaced page is found. + */ + return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == + MPOL_MF_STRICT; +} + +struct migration_mpol { /* for alloc_migration_target_by_mpol() */ + struct mempolicy *pol; + pgoff_t ilx; +}; struct queue_pages { struct list_head *pagelist; @@ -424,200 +625,254 @@ struct queue_pages { unsigned long start; unsigned long end; struct vm_area_struct *first; + struct folio *large; /* note last large folio encountered */ + long nr_failed; /* could not be isolated at this time */ }; /* - * Check if the page's nid is in qp->nmask. + * Check if the folio's nid is in qp->nmask. * * If MPOL_MF_INVERT is set in qp->flags, check if the nid is * in the invert of qp->nmask. */ -static inline bool queue_pages_required(struct page *page, +static inline bool queue_folio_required(struct folio *folio, struct queue_pages *qp) { - int nid = page_to_nid(page); + int nid = folio_nid(folio); unsigned long flags = qp->flags; return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); } -/* - * queue_pages_pmd() has three possible return values: - * 0 - pages are placed on the right node or queued successfully, or - * special page is met, i.e. huge zero page. - * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were - * specified. - * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an - * existing page was already on a node that does not follow the - * policy. 
- */ -static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, - unsigned long end, struct mm_walk *walk) - __releases(ptl) +static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) { - int ret = 0; - struct page *page; + struct folio *folio; struct queue_pages *qp = walk->private; - unsigned long flags; - if (unlikely(is_pmd_migration_entry(*pmd))) { - ret = -EIO; - goto unlock; + if (unlikely(pmd_is_migration_entry(*pmd))) { + qp->nr_failed++; + return; } - page = pmd_page(*pmd); - if (is_huge_zero_page(page)) { + folio = pmd_folio(*pmd); + if (is_huge_zero_folio(folio)) { walk->action = ACTION_CONTINUE; - goto unlock; + return; } - if (!queue_pages_required(page, qp)) - goto unlock; - - flags = qp->flags; - /* go to thp migration */ - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - if (!vma_migratable(walk->vma) || - migrate_page_add(page, qp->pagelist, flags)) { - ret = 1; - goto unlock; - } - } else - ret = -EIO; -unlock: - spin_unlock(ptl); - return ret; + if (!queue_folio_required(folio, qp)) + return; + if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || + !vma_migratable(walk->vma) || + !migrate_folio_add(folio, qp->pagelist, qp->flags)) + qp->nr_failed++; } /* - * Scan through pages checking if pages follow certain conditions, - * and move them to the pagelist if they do. + * Scan through folios, checking if they satisfy the required conditions, + * moving them from LRU to local pagelist for migration if they do (or not). * - * queue_pages_pte_range() has three possible return values: - * 0 - pages are placed on the right node or queued successfully, or - * special page is met, i.e. zero page. - * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were - * specified. - * -EIO - only MPOL_MF_STRICT was specified and an existing page was already - * on a node that does not follow the policy. + * queue_folios_pte_range() has two possible return values: + * 0 - continue walking to scan for more, even if an existing folio on the + * wrong node could not be isolated and queued for migration. + * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL, + * and an existing folio was on a node that does not follow the policy. 
*/ -static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, +static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; - struct page *page; + struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; - bool has_unmovable = false; pte_t *pte, *mapped_pte; + pte_t ptent; spinlock_t *ptl; + int max_nr, nr; ptl = pmd_trans_huge_lock(pmd, vma); - if (ptl) - return queue_pages_pmd(pmd, ptl, addr, end, walk); + if (ptl) { + queue_folios_pmd(pmd, walk); + spin_unlock(ptl); + goto out; + } - if (pmd_trans_unstable(pmd)) + mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; return 0; + } + for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) { + max_nr = (end - addr) >> PAGE_SHIFT; + nr = 1; + ptent = ptep_get(pte); + if (pte_none(ptent)) + continue; + if (!pte_present(ptent)) { + const softleaf_t entry = softleaf_from_pte(ptent); - mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); - for (; addr != end; pte++, addr += PAGE_SIZE) { - if (!pte_present(*pte)) + if (softleaf_is_migration(entry)) + qp->nr_failed++; continue; - page = vm_normal_page(vma, addr, *pte); - if (!page || is_zone_device_page(page)) + } + folio = vm_normal_folio(vma, addr, ptent); + if (!folio || folio_is_zone_device(folio)) continue; + if (folio_test_large(folio) && max_nr != 1) + nr = folio_pte_batch(folio, pte, ptent, max_nr); /* - * vm_normal_page() filters out zero pages, but there might - * still be PageReserved pages to skip, perhaps in a VDSO. + * vm_normal_folio() filters out zero pages, but there might + * still be reserved folios to skip, perhaps in a VDSO. */ - if (PageReserved(page)) + if (folio_test_reserved(folio)) continue; - if (!queue_pages_required(page, qp)) + if (!queue_folio_required(folio, qp)) continue; - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - /* MPOL_MF_STRICT must be specified if we get here */ - if (!vma_migratable(vma)) { - has_unmovable = true; - break; - } - + if (folio_test_large(folio)) { /* - * Do not abort immediately since there may be - * temporary off LRU pages in the range. Still - * need migrate other LRU pages. + * A large folio can only be isolated from LRU once, + * but may be mapped by many PTEs (and Copy-On-Write may + * intersperse PTEs of other, order 0, folios). This is + * a common case, so don't mistake it for failure (but + * there can be other cases of multi-mapped pages which + * this quick check does not help to filter out - and a + * search of the pagelist might grow to be prohibitive). + * + * migrate_pages(&pagelist) returns nr_failed folios, so + * check "large" now so that queue_pages_range() returns + * a comparable nr_failed folios. This does imply that + * if folio could not be isolated for some racy reason + * at its first PTE, later PTEs will not give it another + * chance of isolation; but keeps the accounting simple. */ - if (migrate_page_add(page, qp->pagelist, flags)) - has_unmovable = true; - } else - break; + if (folio == qp->large) + continue; + qp->large = folio; + } + if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || + !vma_migratable(vma) || + !migrate_folio_add(folio, qp->pagelist, flags)) { + qp->nr_failed += nr; + if (strictly_unmovable(flags)) + break; + } } pte_unmap_unlock(mapped_pte, ptl); cond_resched(); - - if (has_unmovable) - return 1; - - return addr != end ? 
-EIO : 0; +out: + if (qp->nr_failed && strictly_unmovable(flags)) + return -EIO; + return 0; } -static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, +static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { - int ret = 0; #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; - unsigned long flags = (qp->flags & MPOL_MF_VALID); - struct page *page; + unsigned long flags = qp->flags; + struct folio *folio; spinlock_t *ptl; - pte_t entry; + pte_t ptep; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); - entry = huge_ptep_get(pte); - if (!pte_present(entry)) - goto unlock; - page = pte_page(entry); - if (!queue_pages_required(page, qp)) - goto unlock; + ptep = huge_ptep_get(walk->mm, addr, pte); + if (!pte_present(ptep)) { + if (!huge_pte_none(ptep)) { + const softleaf_t entry = softleaf_from_pte(ptep); - if (flags == MPOL_MF_STRICT) { - /* - * STRICT alone means only detecting misplaced page and no - * need to further check other vma. - */ - ret = -EIO; - goto unlock; - } + if (unlikely(softleaf_is_migration(entry))) + qp->nr_failed++; + } - if (!vma_migratable(walk->vma)) { - /* - * Must be STRICT with MOVE*, otherwise .test_walk() have - * stopped walking current vma. - * Detecting misplaced page but allow migrating pages which - * have been queued. - */ - ret = 1; goto unlock; } - - /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ - if (flags & (MPOL_MF_MOVE_ALL) || - (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) { - if (isolate_hugetlb(page, qp->pagelist) && - (flags & MPOL_MF_STRICT)) - /* - * Failed to isolate page but allow migrating pages - * which have been queued. - */ - ret = 1; + folio = pfn_folio(pte_pfn(ptep)); + if (!queue_folio_required(folio, qp)) + goto unlock; + if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || + !vma_migratable(walk->vma)) { + qp->nr_failed++; + goto unlock; } + /* + * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. + * Choosing not to migrate a shared folio is not counted as a failure. + * + * See folio_maybe_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. + */ + if ((flags & MPOL_MF_MOVE_ALL) || + (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) + if (!folio_isolate_hugetlb(folio, qp->pagelist)) + qp->nr_failed++; unlock: spin_unlock(ptl); -#else - BUG(); + if (qp->nr_failed && strictly_unmovable(flags)) + return -EIO; #endif - return ret; + return 0; } #ifdef CONFIG_NUMA_BALANCING +/** + * folio_can_map_prot_numa() - check whether the folio can map prot numa + * @folio: The folio whose mapping considered for being made NUMA hintable + * @vma: The VMA that the folio belongs to. + * @is_private_single_threaded: Is this a single-threaded private VMA or not + * + * This function checks to see if the folio actually indicates that + * we need to make the mapping one which causes a NUMA hinting fault, + * as there are cases where it's simply unnecessary, and the folio's + * access time is adjusted for memory tiering if prot numa needed. + * + * Return: True if the mapping of the folio needs to be changed, false otherwise. 
+ */ +bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma, + bool is_private_single_threaded) +{ + int nid; + + if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio)) + return false; + + /* Also skip shared copy-on-write folios */ + if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio)) + return false; + + /* Folios are pinned and can't be migrated */ + if (folio_maybe_dma_pinned(folio)) + return false; + + /* + * While migration can move some dirty folios, + * it cannot move them all from MIGRATE_ASYNC + * context. + */ + if (folio_is_file_lru(folio) && folio_test_dirty(folio)) + return false; + + /* + * Don't mess with PTEs if folio is already on the node + * a single-threaded process is running on. + */ + nid = folio_nid(folio); + if (is_private_single_threaded && (nid == numa_node_id())) + return false; + + /* + * Skip scanning top tier node if normal numa + * balancing is disabled + */ + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && + node_is_toptier(nid)) + return false; + + if (folio_use_access_time(folio)) + folio_xchg_access_time(folio, jiffies_to_msecs(jiffies)); + + return true; +} + /* * This is used to mark a range of virtual addresses to be inaccessible. * These are later cleared by a NUMA hinting fault. Depending on these @@ -631,25 +886,20 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct mmu_gather tlb; - int nr_updated; + long nr_updated; tlb_gather_mmu(&tlb, vma->vm_mm); - nr_updated = change_protection(&tlb, vma, addr, end, PAGE_NONE, - MM_CP_PROT_NUMA); - if (nr_updated) + nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); + if (nr_updated > 0) { count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); + count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated); + } tlb_finish_mmu(&tlb); return nr_updated; } -#else -static unsigned long change_prot_numa(struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - return 0; -} #endif /* CONFIG_NUMA_BALANCING */ static int queue_pages_test_walk(unsigned long start, unsigned long end, @@ -657,7 +907,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, { struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; - unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; /* range check first */ @@ -685,45 +934,44 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, !(flags & MPOL_MF_STRICT)) return 1; - if (endvma > end) - endvma = end; - - if (flags & MPOL_MF_LAZY) { - /* Similar to task_numa_work, skip inaccessible VMAs */ - if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) && - !(vma->vm_flags & VM_MIXEDMAP)) - change_prot_numa(vma, start, endvma); - return 1; - } - - /* queue pages from current vma */ - if (flags & MPOL_MF_VALID) + /* + * Check page nodes, and queue pages to move, in the current vma. + * But if no moving, and no strict checking, the scan can be skipped. 
+ */ + if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return 0; return 1; } static const struct mm_walk_ops queue_pages_walk_ops = { - .hugetlb_entry = queue_pages_hugetlb, - .pmd_entry = queue_pages_pte_range, + .hugetlb_entry = queue_folios_hugetlb, + .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, + .walk_lock = PGWALK_RDLOCK, +}; + +static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { + .hugetlb_entry = queue_folios_hugetlb, + .pmd_entry = queue_folios_pte_range, + .test_walk = queue_pages_test_walk, + .walk_lock = PGWALK_WRLOCK, }; /* * Walk through page tables and collect pages to be migrated. * - * If pages found in a given range are on a set of nodes (determined by - * @nodes and @flags,) it's isolated and queued to the pagelist which is - * passed via @private. + * If pages found in a given range are not on the required set of @nodes, + * and migration is allowed, they are isolated and queued to @pagelist. * - * queue_pages_range() has three possible return values: - * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were - * specified. - * 0 - queue pages successfully or no misplaced page. - * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or - * memory range specified by nodemask and maxnode points outside - * your accessible address space (-EFAULT) + * queue_pages_range() may return: + * 0 - all pages already on the right node, or successfully queued for moving + * (or neither strict checking nor moving requested: only range checking). + * >0 - this number of misplaced folios could not be queued for moving + * (a hugetlbfs page or a transparent huge page being counted as 1). + * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs. + * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified. */ -static int +static long queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, unsigned long flags, struct list_head *pagelist) @@ -737,14 +985,16 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, .end = end, .first = NULL, }; + const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ? + &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; - err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp); + err = walk_page_range(mm, start, end, ops, &qp); if (!qp.first) /* whole range in hole */ err = -EFAULT; - return err; + return err ? : qp.nr_failed; } /* @@ -752,16 +1002,13 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, * This must be called with the mmap_lock held for writing. */ static int vma_replace_policy(struct vm_area_struct *vma, - struct mempolicy *pol) + struct mempolicy *pol) { int err; struct mempolicy *old; struct mempolicy *new; - pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", - vma->vm_start, vma->vm_end, vma->vm_pgoff, - vma->vm_ops, vma->vm_file, - vma->vm_ops ? vma->vm_ops->set_policy : NULL); + vma_assert_write_locked(vma); new = mpol_dup(pol); if (IS_ERR(new)) @@ -783,70 +1030,32 @@ static int vma_replace_policy(struct vm_area_struct *vma, return err; } -/* Step 2: apply policy to a range and do splits. 
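[Editor's note, not part of the patch] The return-value rules above surface in userspace through mbind(2): MPOL_MF_STRICT alone only reports misplaced pages, while adding MPOL_MF_MOVE asks for migration and only fails if some pages could not be moved. A hedged sketch using libnuma's <numaif.h> (build with -lnuma); error handling is minimal:

#define _GNU_SOURCE
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4UL << 20;
	unsigned long nodemask = 1UL << 0;	/* bind the range to node 0 */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 0, len);	/* fault pages in first, possibly on other nodes */

	/* STRICT alone: fails with EIO if any existing page is misplaced */
	if (mbind(p, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
		  MPOL_MF_STRICT))
		perror("mbind(MPOL_MF_STRICT)");

	/* STRICT|MOVE: migrate misplaced pages, EIO only if some could not be */
	if (mbind(p, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
		  MPOL_MF_STRICT | MPOL_MF_MOVE))
		perror("mbind(MPOL_MF_STRICT|MPOL_MF_MOVE)");
	return 0;
}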
*/ -static int mbind_range(struct mm_struct *mm, unsigned long start, - unsigned long end, struct mempolicy *new_pol) +/* Split or merge the VMA (if required) and apply the new policy */ +static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, struct mempolicy *new_pol) { - MA_STATE(mas, &mm->mm_mt, start, start); - struct vm_area_struct *prev; - struct vm_area_struct *vma; - int err = 0; - pgoff_t pgoff; + unsigned long vmstart, vmend; - prev = mas_prev(&mas, 0); - if (unlikely(!prev)) - mas_set(&mas, start); + vmend = min(end, vma->vm_end); + if (start > vma->vm_start) { + *prev = vma; + vmstart = start; + } else { + vmstart = vma->vm_start; + } - vma = mas_find(&mas, end - 1); - if (WARN_ON(!vma)) + if (mpol_equal(vma->vm_policy, new_pol)) { + *prev = vma; return 0; - - if (start > vma->vm_start) - prev = vma; - - for (; vma; vma = mas_next(&mas, end - 1)) { - unsigned long vmstart = max(start, vma->vm_start); - unsigned long vmend = min(end, vma->vm_end); - - if (mpol_equal(vma_policy(vma), new_pol)) - goto next; - - pgoff = vma->vm_pgoff + - ((vmstart - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, - new_pol, vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); - if (prev) { - /* vma_merge() invalidated the mas */ - mas_pause(&mas); - vma = prev; - goto replace; - } - if (vma->vm_start != vmstart) { - err = split_vma(vma->vm_mm, vma, vmstart, 1); - if (err) - goto out; - /* split_vma() invalidated the mas */ - mas_pause(&mas); - } - if (vma->vm_end != vmend) { - err = split_vma(vma->vm_mm, vma, vmend, 0); - if (err) - goto out; - /* split_vma() invalidated the mas */ - mas_pause(&mas); - } -replace: - err = vma_replace_policy(vma, new_pol); - if (err) - goto out; -next: - prev = vma; } -out: - return err; + vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + *prev = vma; + return vma_replace_policy(vma, new_pol); } /* Set the process memory policy */ @@ -876,8 +1085,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, old = current->mempolicy; current->mempolicy = new; - if (new && new->mode == MPOL_INTERLEAVE) + if (new && (new->mode == MPOL_INTERLEAVE || + new->mode == MPOL_WEIGHTED_INTERLEAVE)) { current->il_prev = MAX_NUMNODES-1; + current->il_weight = 0; + } task_unlock(current); mpol_put(old); ret = 0; @@ -891,18 +1103,19 @@ out: * * Called with task's alloc_lock held */ -static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) +static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) { nodes_clear(*nodes); - if (p == &default_policy) + if (pol == &default_policy) return; - switch (p->mode) { + switch (pol->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: - *nodes = p->nodes; + case MPOL_WEIGHTED_INTERLEAVE: + *nodes = pol->nodes; break; case MPOL_LOCAL: /* return empty node mask for local allocation */ @@ -949,6 +1162,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } if (flags & MPOL_F_ADDR) { + pgoff_t ilx; /* ignored here */ /* * Do NOT fall back to task policy if the * vma/shared policy at addr is NULL. 
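[Editor's note, not part of the patch] do_set_mempolicy() above resets il_prev and il_weight whenever an (optionally weighted) interleave policy is installed; from userspace this is reached via set_mempolicy(2). A hedged sketch using the raw syscall, with MPOL_WEIGHTED_INTERLEAVE's value copied from the kernel UAPI header (include/uapi/linux/mempolicy.h) in case the installed <numaif.h> predates it:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MPOL_WEIGHTED_INTERLEAVE
#define MPOL_WEIGHTED_INTERLEAVE 6	/* assumed UAPI value, check your headers */
#endif

int main(void)
{
	unsigned long nodemask = (1UL << 0) | (1UL << 1);  /* interleave over nodes 0-1 */

	if (syscall(SYS_set_mempolicy, MPOL_WEIGHTED_INTERLEAVE,
		    &nodemask, 8 * sizeof(nodemask)))
		perror("set_mempolicy");  /* EINVAL on kernels without this policy */
	return 0;
}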
We @@ -960,10 +1174,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, mmap_read_unlock(mm); return -EFAULT; } - if (vma->vm_ops && vma->vm_ops->get_policy) - pol = vma->vm_ops->get_policy(vma, addr); - else - pol = vma->vm_policy; + pol = __get_vma_policy(vma, addr, &ilx); } else if (addr) return -EINVAL; @@ -988,6 +1199,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { *policy = next_node_in(current->il_prev, pol->nodes); + } else if (pol == current->mempolicy && + pol->mode == MPOL_WEIGHTED_INTERLEAVE) { + if (current->il_weight) + *policy = current->il_prev; + else + *policy = next_node_in(current->il_prev, + pol->nodes); } else { err = -EINVAL; goto out; @@ -1023,73 +1241,84 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } #ifdef CONFIG_MIGRATION -/* - * page migration, thp tail pages can be passed. - */ -static int migrate_page_add(struct page *page, struct list_head *pagelist, +static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { - struct page *head = compound_head(page); /* - * Avoid migrating a page that is shared with others. + * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. + * Choosing not to migrate a shared folio is not counted as a failure. + * + * See folio_maybe_mapped_shared() on possible imprecision when we + * cannot easily detect if a folio is shared. */ - if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) { - if (!isolate_lru_page(head)) { - list_add_tail(&head->lru, pagelist); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_lru(head), - thp_nr_pages(head)); - } else if (flags & MPOL_MF_STRICT) { + if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) { + if (folio_isolate_lru(folio)) { + list_add_tail(&folio->lru, foliolist); + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + folio_nr_pages(folio)); + } else { /* - * Non-movable page may reach here. And, there may be - * temporary off LRU pages or non-LRU movable pages. - * Treat them as unmovable pages since they can't be - * isolated, so they can't be moved at the moment. It - * should return -EIO for this case too. + * Non-movable folio may reach here. And, there may be + * temporary off LRU folios or non-LRU movable folios. + * Treat them as unmovable folios since they can't be + * isolated, so they can't be moved at the moment. */ - return -EIO; + return false; } } - - return 0; + return true; } /* * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. */ -static int migrate_to_node(struct mm_struct *mm, int source, int dest, - int flags) +static long migrate_to_node(struct mm_struct *mm, int source, int dest, + int flags) { nodemask_t nmask; struct vm_area_struct *vma; LIST_HEAD(pagelist); - int err = 0; + long nr_failed; + long err = 0; struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, + .reason = MR_SYSCALL, }; nodes_clear(nmask); node_set(source, nmask); + VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); + + mmap_read_lock(mm); + vma = find_vma(mm, 0); + if (unlikely(!vma)) { + mmap_read_unlock(mm); + return 0; + } + /* - * This does not "check" the range but isolates all pages that + * This does not migrate the range, but isolates all pages that * need migration. 
Between passing in the full user address - * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. + * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail, + * but passes back the count of pages which could not be isolated. */ - vma = find_vma(mm, 0); - VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); - queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, - flags | MPOL_MF_DISCONTIG_OK, &pagelist); + nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); + mmap_read_unlock(mm); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(&pagelist); } + if (err >= 0) + err += nr_failed; return err; } @@ -1102,14 +1331,12 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { - int busy = 0; - int err = 0; + long nr_failed = 0; + long err = 0; nodemask_t tmp; lru_cache_disable(); - mmap_read_lock(mm); - /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' @@ -1185,60 +1412,57 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, node_clear(source, tmp); err = migrate_to_node(mm, source, dest, flags); if (err > 0) - busy += err; + nr_failed += err; if (err < 0) break; } - mmap_read_unlock(mm); lru_cache_enable(); if (err < 0) return err; - return busy; - + return (nr_failed < INT_MAX) ? nr_failed : INT_MAX; } /* - * Allocate a new page for page migration based on vma policy. - * Start by assuming the page is mapped by the same vma as contains @start. - * Search forward from there, if not. N.B., this assumes that the - * list of pages handed to migrate_pages()--which is how we get here-- - * is in virtual address order. + * Allocate a new folio for page migration, according to NUMA mempolicy. 
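[Editor's note, not part of the patch] do_migrate_pages() above is what backs the migrate_pages(2) syscall, and its "number of pages not migrated" result is what userspace sees as a positive return value. A sketch using libnuma's <numaif.h> (build with -lnuma), moving the calling process's pages from node 1 to node 0:

#include <numaif.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned long old_nodes = 1UL << 1;	/* move everything off node 1 ... */
	unsigned long new_nodes = 1UL << 0;	/* ... onto node 0 */
	long ret = migrate_pages(getpid(), 8 * sizeof(unsigned long),
				 &old_nodes, &new_nodes);

	if (ret < 0)
		perror("migrate_pages");
	else
		printf("%ld page(s) could not be migrated\n", ret);
	return ret < 0;
}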
*/ -static struct page *new_page(struct page *page, unsigned long start) +static struct folio *alloc_migration_target_by_mpol(struct folio *src, + unsigned long private) { - struct folio *dst, *src = page_folio(page); - struct vm_area_struct *vma; - unsigned long address; - VMA_ITERATOR(vmi, current->mm, start); - gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL; + struct migration_mpol *mmpol = (struct migration_mpol *)private; + struct mempolicy *pol = mmpol->pol; + pgoff_t ilx = mmpol->ilx; + unsigned int order; + int nid = numa_node_id(); + gfp_t gfp; - for_each_vma(vmi, vma) { - address = page_address_in_vma(page, vma); - if (address != -EFAULT) - break; - } + order = folio_order(src); + ilx += src->index >> order; - if (folio_test_hugetlb(src)) - return alloc_huge_page_vma(page_hstate(&src->page), - vma, address); + if (folio_test_hugetlb(src)) { + nodemask_t *nodemask; + struct hstate *h; + + h = folio_hstate(src); + gfp = htlb_alloc_mask(h); + nodemask = policy_nodemask(gfp, pol, ilx, &nid); + return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp, + htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND)); + } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; + else + gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; - /* - * if !vma, vma_alloc_folio() will use task or system default policy - */ - dst = vma_alloc_folio(gfp, folio_order(src), vma, address, - folio_test_large(src)); - return &dst->page; + return folio_alloc_mpol(gfp, order, pol, ilx, nid); } #else -static int migrate_page_add(struct page *page, struct list_head *pagelist, +static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { - return -EIO; + return false; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, @@ -1247,7 +1471,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct page *new_page(struct page *page, unsigned long start) +static struct folio *alloc_migration_target_by_mpol(struct folio *src, + unsigned long private) { return NULL; } @@ -1258,10 +1483,13 @@ static long do_mbind(unsigned long start, unsigned long len, nodemask_t *nmask, unsigned long flags) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; + struct vma_iterator vmi; + struct migration_mpol mmpol; struct mempolicy *new; unsigned long end; - int err; - int ret; + long err; + long nr_failed; LIST_HEAD(pagelist); if (flags & ~(unsigned long)MPOL_MF_VALID) @@ -1287,9 +1515,6 @@ static long do_mbind(unsigned long start, unsigned long len, if (IS_ERR(new)) return PTR_ERR(new); - if (flags & MPOL_MF_LAZY) - new->flags |= MPOL_F_MOF; - /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all @@ -1297,14 +1522,8 @@ static long do_mbind(unsigned long start, unsigned long len, if (!new) flags |= MPOL_MF_DISCONTIG_OK; - pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", - start, start + len, mode, mode_flags, - nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE); - - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_disable(); - } { NODEMASK_SCRATCH(scratch); if (scratch) { @@ -1319,36 +1538,84 @@ static long do_mbind(unsigned long start, unsigned long len, if (err) goto mpol_out; - ret = queue_pages_range(mm, start, end, nmask, - flags | MPOL_MF_INVERT, &pagelist); + /* + * Lock the VMAs before scanning for pages to migrate, + * to ensure we don't miss a concurrently inserted page. 
+ */ + nr_failed = queue_pages_range(mm, start, end, nmask, + flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist); - if (ret < 0) { - err = ret; - goto up_out; + if (nr_failed < 0) { + err = nr_failed; + nr_failed = 0; + } else { + vma_iter_init(&vmi, mm, start); + prev = vma_prev(&vmi); + for_each_vma_range(vmi, vma, end) { + err = mbind_range(&vmi, vma, &prev, start, end, new); + if (err) + break; + } } - err = mbind_range(mm, start, end, new); - - if (!err) { - int nr_failed = 0; - - if (!list_empty(&pagelist)) { - WARN_ON_ONCE(flags & MPOL_MF_LAZY); - nr_failed = migrate_pages(&pagelist, new_page, NULL, - start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); - if (nr_failed) - putback_movable_pages(&pagelist); + if (!err && !list_empty(&pagelist)) { + /* Convert MPOL_DEFAULT's NULL to task or default policy */ + if (!new) { + new = get_task_policy(current); + mpol_get(new); } + mmpol.pol = new; + mmpol.ilx = 0; - if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT))) - err = -EIO; - } else { -up_out: - if (!list_empty(&pagelist)) - putback_movable_pages(&pagelist); + /* + * In the interleaved case, attempt to allocate on exactly the + * targeted nodes, for the first VMA to be migrated; for later + * VMAs, the nodes will still be interleaved from the targeted + * nodemask, but one by one may be selected differently. + */ + if (new->mode == MPOL_INTERLEAVE || + new->mode == MPOL_WEIGHTED_INTERLEAVE) { + struct folio *folio; + unsigned int order; + unsigned long addr = -EFAULT; + + list_for_each_entry(folio, &pagelist, lru) { + if (!folio_test_ksm(folio)) + break; + } + if (!list_entry_is_head(folio, &pagelist, lru)) { + vma_iter_init(&vmi, mm, start); + for_each_vma_range(vmi, vma, end) { + addr = page_address_in_vma(folio, + folio_page(folio, 0), vma); + if (addr != -EFAULT) + break; + } + } + if (addr != -EFAULT) { + order = folio_order(folio); + /* We already know the pol, but not the ilx */ + mpol_cond_put(get_vma_policy(vma, addr, order, + &mmpol.ilx)); + /* Set base from which to increment by index */ + mmpol.ilx -= folio->index >> order; + } + } } mmap_write_unlock(mm); + + if (!err && !list_empty(&pagelist)) { + nr_failed |= migrate_pages(&pagelist, + alloc_migration_target_by_mpol, NULL, + (unsigned long)&mmpol, MIGRATE_SYNC, + MR_MEMPOLICY_MBIND, NULL); + } + + if (nr_failed && (flags & MPOL_MF_STRICT)) + err = -EIO; + if (!list_empty(&pagelist)) + putback_movable_pages(&pagelist); mpol_out: mpol_put(new); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) @@ -1456,9 +1723,10 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; if (*flags & MPOL_F_NUMA_BALANCING) { - if (*mode != MPOL_BIND) + if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY) + *flags |= (MPOL_F_MOF | MPOL_F_MORON); + else return -EINVAL; - *flags |= (MPOL_F_MOF | MPOL_F_MORON); } return 0; } @@ -1488,10 +1756,8 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le unsigned long, home_node, unsigned long, flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - struct mempolicy *new; - unsigned long vmstart; - unsigned long vmend; + struct vm_area_struct *vma, *prev; + struct mempolicy *new, *old; unsigned long end; int err = -ENOENT; VMA_ITERATOR(vmi, mm, start); @@ -1520,33 +1786,31 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le if (end == start) return 0; mmap_write_lock(mm); + prev = vma_prev(&vmi); 
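[Editor's note, not part of the patch] The sanitize_mpol_flags() hunk above extends MPOL_F_NUMA_BALANCING from MPOL_BIND to MPOL_PREFERRED_MANY. A hedged userspace sketch via the raw set_mempolicy syscall; the fallback constants are copied from the kernel UAPI header (include/uapi/linux/mempolicy.h) in case <numaif.h> lacks them, so treat those values as assumptions:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MPOL_PREFERRED_MANY
#define MPOL_PREFERRED_MANY	5		/* assumed UAPI value */
#endif
#ifndef MPOL_F_NUMA_BALANCING
#define MPOL_F_NUMA_BALANCING	(1 << 13)	/* assumed UAPI value */
#endif

int main(void)
{
	unsigned long nodemask = (1UL << 0) | (1UL << 1);  /* prefer nodes 0-1 */

	if (syscall(SYS_set_mempolicy,
		    MPOL_PREFERRED_MANY | MPOL_F_NUMA_BALANCING,
		    &nodemask, 8 * sizeof(nodemask)))
		perror("set_mempolicy");  /* EINVAL on kernels without this change */
	return 0;
}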
for_each_vma_range(vmi, vma, end) { - vmstart = max(start, vma->vm_start); - vmend = min(end, vma->vm_end); - new = mpol_dup(vma_policy(vma)); - if (IS_ERR(new)) { - err = PTR_ERR(new); - break; - } - /* - * Only update home node if there is an existing vma policy - */ - if (!new) - continue; - /* * If any vma in the range got policy other than MPOL_BIND * or MPOL_PREFERRED_MANY we return error. We don't reset * the home node for vmas we already updated before. */ - if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) { - mpol_put(new); + old = vma_policy(vma); + if (!old) { + prev = vma; + continue; + } + if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) { err = -EOPNOTSUPP; break; } + new = mpol_dup(old); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } + vma_start_write(vma); new->home_node = home_node; - err = mbind_range(mm, vmstart, vmend, new); + err = mbind_range(&vmi, vma, &prev, start, end, new); mpol_put(new); if (err) break; @@ -1673,7 +1937,6 @@ out: out_put: put_task_struct(task); goto out; - } SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, @@ -1683,7 +1946,6 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); } - /* Retrieve NUMA policy */ static int kernel_get_mempolicy(int __user *policy, unsigned long __user *nmask, @@ -1750,34 +2012,20 @@ bool vma_migratable(struct vm_area_struct *vma) } struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, pgoff_t *ilx) { - struct mempolicy *pol = NULL; - - if (vma) { - if (vma->vm_ops && vma->vm_ops->get_policy) { - pol = vma->vm_ops->get_policy(vma, addr); - } else if (vma->vm_policy) { - pol = vma->vm_policy; - - /* - * shmem_alloc_page() passes MPOL_F_SHARED policy with - * a pseudo vma whose vma->vm_ops=NULL. Take a reference - * count on these policies which will be dropped by - * mpol_cond_put() later - */ - if (mpol_needs_cond_ref(pol)) - mpol_get(pol); - } - } - - return pol; + *ilx = 0; + return (vma->vm_ops && vma->vm_ops->get_policy) ? + vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; } /* - * get_vma_policy(@vma, @addr) + * get_vma_policy(@vma, @addr, @order, @ilx) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup + * @order: 0, or appropriate huge_page_order for interleaving + * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or + * MPOL_WEIGHTED_INTERLEAVE * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. @@ -1786,14 +2034,19 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. 
*/ -static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, - unsigned long addr) +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr, int order, pgoff_t *ilx) { - struct mempolicy *pol = __get_vma_policy(vma, addr); + struct mempolicy *pol; + pol = __get_vma_policy(vma, addr, ilx); if (!pol) pol = get_task_policy(current); - + if (pol->mode == MPOL_INTERLEAVE || + pol->mode == MPOL_WEIGHTED_INTERLEAVE) { + *ilx += vma->vm_pgoff >> order; + *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); + } return pol; } @@ -1803,8 +2056,9 @@ bool vma_policy_mof(struct vm_area_struct *vma) if (vma->vm_ops && vma->vm_ops->get_policy) { bool ret = false; + pgoff_t ilx; /* ignored here */ - pol = vma->vm_ops->get_policy(vma, vma->vm_start); + pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); if (pol && (pol->flags & MPOL_F_MOF)) ret = true; mpol_cond_put(pol); @@ -1839,64 +2093,43 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) return zone >= dynamic_policy_zone; } -/* - * Return a nodemask representing a mempolicy for filtering nodes for - * page allocation - */ -nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) +static unsigned int weighted_interleave_nodes(struct mempolicy *policy) { - int mode = policy->mode; - - /* Lower zones don't get a nodemask applied for MPOL_BIND */ - if (unlikely(mode == MPOL_BIND) && - apply_policy_zone(policy, gfp_zone(gfp)) && - cpuset_nodemask_valid_mems_allowed(&policy->nodes)) - return &policy->nodes; + unsigned int node; + unsigned int cpuset_mems_cookie; - if (mode == MPOL_PREFERRED_MANY) - return &policy->nodes; - - return NULL; -} - -/* - * Return the preferred node id for 'prefer' mempolicy, and return - * the given id for all other policies. - * - * policy_node() is always coupled with policy_nodemask(), which - * secures the nodemask limit for 'bind' and 'prefer-many' policy. - */ -static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) -{ - if (policy->mode == MPOL_PREFERRED) { - nd = first_node(policy->nodes); - } else { - /* - * __GFP_THISNODE shouldn't even be used with the bind policy - * because we might easily break the expectation to stay on the - * requested node and not break the policy. 
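[Editor's note, not part of the patch] The ilx arithmetic in the new get_vma_policy() above converts a faulting address into a count of order-sized units from the start of the backing object, so the interleave choice stays stable for a given offset. A small worked example with hypothetical numbers:

#include <stdio.h>

#define PAGE_SHIFT 12	/* 4 KiB base pages assumed */

int main(void)
{
	unsigned long vm_start = 0x700000000000UL;
	unsigned long vm_pgoff = 0x200;			/* VMA starts 0x200 pages into the file */
	unsigned long addr = vm_start + (5UL << 21);	/* fault 5 PMD-sized units into the VMA */
	int order = 9;					/* PMD order with 4 KiB pages (2 MiB) */
	unsigned long ilx = 0;

	ilx += vm_pgoff >> order;			  /* 0x200 >> 9 = 1 */
	ilx += (addr - vm_start) >> (PAGE_SHIFT + order); /* (5 << 21) >> 21 = 5 */
	printf("ilx = %lu\n", ilx);			  /* prints 6 */
	return 0;
}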
- */ - WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); +retry: + /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */ + cpuset_mems_cookie = read_mems_allowed_begin(); + node = current->il_prev; + if (!current->il_weight || !node_isset(node, policy->nodes)) { + node = next_node_in(node, policy->nodes); + if (read_mems_allowed_retry(cpuset_mems_cookie)) + goto retry; + if (node == MAX_NUMNODES) + return node; + current->il_prev = node; + current->il_weight = get_il_weight(node); } - - if ((policy->mode == MPOL_BIND || - policy->mode == MPOL_PREFERRED_MANY) && - policy->home_node != NUMA_NO_NODE) - return policy->home_node; - - return nd; + current->il_weight--; + return node; } /* Do dynamic interleaving for a process */ -static unsigned interleave_nodes(struct mempolicy *policy) +static unsigned int interleave_nodes(struct mempolicy *policy) { - unsigned next; - struct task_struct *me = current; + unsigned int nid; + unsigned int cpuset_mems_cookie; + + /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */ + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + nid = next_node_in(current->il_prev, policy->nodes); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); - next = next_node_in(me->il_prev, policy->nodes); - if (next < MAX_NUMNODES) - me->il_prev = next; - return next; + if (nid < MAX_NUMNODES) + current->il_prev = nid; + return nid; } /* @@ -1922,6 +2155,9 @@ unsigned int mempolicy_slab_node(void) case MPOL_INTERLEAVE: return interleave_nodes(policy); + case MPOL_WEIGHTED_INTERLEAVE: + return weighted_interleave_nodes(policy); + case MPOL_BIND: case MPOL_PREFERRED_MANY: { @@ -1936,7 +2172,7 @@ unsigned int mempolicy_slab_node(void) zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->nodes); - return z->zone ? zone_to_nid(z->zone) : node; + return zonelist_zone(z) ? zonelist_node_idx(z) : node; } case MPOL_LOCAL: return node; @@ -1946,56 +2182,128 @@ unsigned int mempolicy_slab_node(void) } } +static unsigned int read_once_policy_nodemask(struct mempolicy *pol, + nodemask_t *mask) +{ + /* + * barrier stabilizes the nodemask locally so that it can be iterated + * over safely without concern for changes. Allocators validate node + * selection does not violate mems_allowed, so this is safe. + */ + barrier(); + memcpy(mask, &pol->nodes, sizeof(nodemask_t)); + barrier(); + return nodes_weight(*mask); +} + +static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) +{ + struct weighted_interleave_state *state; + nodemask_t nodemask; + unsigned int target, nr_nodes; + u8 *table = NULL; + unsigned int weight_total = 0; + u8 weight; + int nid = 0; + + nr_nodes = read_once_policy_nodemask(pol, &nodemask); + if (!nr_nodes) + return numa_node_id(); + + rcu_read_lock(); + + state = rcu_dereference(wi_state); + /* Uninitialized wi_state means we should assume all weights are 1 */ + if (state) + table = state->iw_table; + + /* calculate the total weight */ + for_each_node_mask(nid, nodemask) + weight_total += table ? table[nid] : 1; + + /* Calculate the node offset based on totals */ + target = ilx % weight_total; + nid = first_node(nodemask); + while (target) { + /* detect system default usage */ + weight = table ? table[nid] : 1; + if (target < weight) + break; + target -= weight; + nid = next_node_in(nid, nodemask); + } + rcu_read_unlock(); + return nid; +} + /* - * Do static interleaving for a VMA with known offset @n. 
Returns the n'th - * node in pol->nodes (starting from n=0), wrapping around if n exceeds the - * number of present nodes. + * Do static interleaving for interleave index @ilx. Returns the ilx'th + * node in pol->nodes (starting from ilx=0), wrapping around if ilx + * exceeds the number of present nodes. */ -static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) +static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { - nodemask_t nodemask = pol->nodes; + nodemask_t nodemask; unsigned int target, nnodes; int i; int nid; - /* - * The barrier will stabilize the nodemask in a register or on - * the stack so that it will stop changing under the code. - * - * Between first_node() and next_node(), pol->nodes could be changed - * by other threads. So we put pol->nodes in a local stack. - */ - barrier(); - nnodes = nodes_weight(nodemask); + nnodes = read_once_policy_nodemask(pol, &nodemask); if (!nnodes) return numa_node_id(); - target = (unsigned int)n % nnodes; + target = ilx % nnodes; nid = first_node(nodemask); for (i = 0; i < target; i++) nid = next_node(nid, nodemask); return nid; } -/* Determine a node number for interleave */ -static inline unsigned interleave_nid(struct mempolicy *pol, - struct vm_area_struct *vma, unsigned long addr, int shift) +/* + * Return a nodemask representing a mempolicy for filtering nodes for + * page allocation, together with preferred node id (or the input node id). + */ +static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, + pgoff_t ilx, int *nid) { - if (vma) { - unsigned long off; + nodemask_t *nodemask = NULL; + switch (pol->mode) { + case MPOL_PREFERRED: + /* Override input node id */ + *nid = first_node(pol->nodes); + break; + case MPOL_PREFERRED_MANY: + nodemask = &pol->nodes; + if (pol->home_node != NUMA_NO_NODE) + *nid = pol->home_node; + break; + case MPOL_BIND: + /* Restrict to nodemask (but not on lower zones) */ + if (apply_policy_zone(pol, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&pol->nodes)) + nodemask = &pol->nodes; + if (pol->home_node != NUMA_NO_NODE) + *nid = pol->home_node; /* - * for small pages, there is no difference between - * shift and PAGE_SHIFT, so the bit-shift is safe. - * for huge pages, since vm_pgoff is in units of small - * pages, we need to shift off the always 0 bits to get - * a useful offset. + * __GFP_THISNODE shouldn't even be used with the bind policy + * because we might easily break the expectation to stay on the + * requested node and not break the policy. */ - BUG_ON(shift < PAGE_SHIFT); - off = vma->vm_pgoff >> (shift - PAGE_SHIFT); - off += (addr - vma->vm_start) >> shift; - return offset_il_node(pol, off); - } else - return interleave_nodes(pol); + WARN_ON_ONCE(gfp & __GFP_THISNODE); + break; + case MPOL_INTERLEAVE: + /* Override input node id */ + *nid = (ilx == NO_INTERLEAVE_INDEX) ? + interleave_nodes(pol) : interleave_nid(pol, ilx); + break; + case MPOL_WEIGHTED_INTERLEAVE: + *nid = (ilx == NO_INTERLEAVE_INDEX) ? + weighted_interleave_nodes(pol) : + weighted_interleave_nid(pol, ilx); + break; + } + + return nodemask; } #ifdef CONFIG_HUGETLBFS @@ -2011,27 +2319,16 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'bind' or 'prefer-many', returns a pointer * to the mempolicy's @nodemask for filtering the zonelist. 
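[Editor's note, not part of the patch] weighted_interleave_nid() above reduces the interleave index modulo the total weight and then walks the per-node weights. With nodes {0,1} and weights {3,1} (the 3:1 ratio from the earlier bandwidth example), indices 0-2 land on node 0 and index 3 on node 1, repeating. A minimal sketch of that walk:

#include <stdio.h>

static int weighted_nid(unsigned long ilx, const int *nodes,
			const unsigned char *weights, int nr)
{
	unsigned int total = 0, target;
	int i = 0;

	for (int n = 0; n < nr; n++)
		total += weights[n];
	target = ilx % total;
	while (target >= weights[i]) {	/* same effect as the kernel's loop */
		target -= weights[i];
		i++;
	}
	return nodes[i];
}

int main(void)
{
	const int nodes[] = { 0, 1 };
	const unsigned char weights[] = { 3, 1 };

	for (unsigned long ilx = 0; ilx < 8; ilx++)
		printf("ilx %lu -> node %d\n", ilx,
		       weighted_nid(ilx, nodes, weights, 2));
	return 0;
}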
- * - * Must be protected by read_mems_allowed_begin() */ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, - struct mempolicy **mpol, nodemask_t **nodemask) + struct mempolicy **mpol, nodemask_t **nodemask) { + pgoff_t ilx; int nid; - int mode; - *mpol = get_vma_policy(vma, addr); - *nodemask = NULL; - mode = (*mpol)->mode; - - if (unlikely(mode == MPOL_INTERLEAVE)) { - nid = interleave_nid(*mpol, vma, addr, - huge_page_shift(hstate_vma(vma))); - } else { - nid = policy_node(gfp_flags, *mpol, numa_node_id()); - if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY) - *nodemask = &(*mpol)->nodes; - } + nid = numa_node_id(); + *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); + *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); return nid; } @@ -2065,6 +2362,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: *mask = mempolicy->nodes; break; @@ -2109,27 +2407,8 @@ bool mempolicy_in_oom_domain(struct task_struct *tsk, return ret; } -/* Allocate a page in interleaved policy. - Own path because it needs to do special accounting. */ -static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, - unsigned nid) -{ - struct page *page; - - page = __alloc_pages(gfp, order, nid, NULL); - /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ - if (!static_branch_likely(&vm_numa_stat_key)) - return page; - if (page && page_to_nid(page) == nid) { - preempt_disable(); - __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); - preempt_enable(); - } - return page; -} - static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, - int nid, struct mempolicy *pol) + int nid, nodemask_t *nodemask) { struct page *page; gfp_t preferred_gfp; @@ -2142,69 +2421,37 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); + page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask); if (!page) - page = __alloc_pages(gfp, order, nid, NULL); + page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL); return page; } /** - * vma_alloc_folio - Allocate a folio for a VMA. + * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. * @gfp: GFP flags. - * @order: Order of the folio. - * @vma: Pointer to VMA or NULL if not available. - * @addr: Virtual address of the allocation. Must be inside @vma. - * @hugepage: For hugepages try only the preferred node if possible. + * @order: Order of the page allocation. + * @pol: Pointer to the NUMA mempolicy. + * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). + * @nid: Preferred node (usually numa_node_id() but @mpol may override it). * - * Allocate a folio for a specific address in @vma, using the appropriate - * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock - * of the mm_struct of the VMA to prevent it from going away. Should be - * used for all allocations for folios that will be mapped into user space. - * - * Return: The folio on success or NULL if allocation fails. + * Return: The page on success or NULL if allocation fails. 
*/ -struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage) +static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, + struct mempolicy *pol, pgoff_t ilx, int nid) { - struct mempolicy *pol; - int node = numa_node_id(); - struct folio *folio; - int preferred_nid; - nodemask_t *nmask; - - pol = get_vma_policy(vma, addr); - - if (pol->mode == MPOL_INTERLEAVE) { - struct page *page; - unsigned nid; - - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); - mpol_cond_put(pol); - gfp |= __GFP_COMP; - page = alloc_page_interleave(gfp, order, nid); - if (page && order > 1) - prep_transhuge_page(page); - folio = (struct folio *)page; - goto out; - } + nodemask_t *nodemask; + struct page *page; - if (pol->mode == MPOL_PREFERRED_MANY) { - struct page *page; + nodemask = policy_nodemask(gfp, pol, ilx, &nid); - node = policy_node(gfp, pol, node); - gfp |= __GFP_COMP; - page = alloc_pages_preferred_many(gfp, order, node, pol); - mpol_cond_put(pol); - if (page && order > 1) - prep_transhuge_page(page); - folio = (struct folio *)page; - goto out; - } - - if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { - int hpage_node = node; + if (pol->mode == MPOL_PREFERRED_MANY) + return alloc_pages_preferred_many(gfp, order, nid, nodemask); + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + /* filter "hugepage" allocation, unless from alloc_pages() */ + order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred @@ -2215,41 +2462,101 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. */ - if (pol->mode == MPOL_PREFERRED) - hpage_node = first_node(pol->nodes); - - nmask = policy_nodemask(gfp, pol); - if (!nmask || node_isset(hpage_node, *nmask)) { - mpol_cond_put(pol); + if (pol->mode != MPOL_INTERLEAVE && + pol->mode != MPOL_WEIGHTED_INTERLEAVE && + (!nodemask || node_isset(nid, *nodemask))) { /* * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ - folio = __folio_alloc_node(gfp | __GFP_THISNODE | - __GFP_NORETRY, order, hpage_node); - + page = __alloc_frozen_pages_noprof( + gfp | __GFP_THISNODE | __GFP_NORETRY, order, + nid, NULL); + if (page || !(gfp & __GFP_DIRECT_RECLAIM)) + return page; /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote * memory with both reclaim and compact as well. 
*/ - if (!folio && (gfp & __GFP_DIRECT_RECLAIM)) - folio = __folio_alloc(gfp, order, hpage_node, - nmask); + } + } - goto out; + page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask); + + if (unlikely(pol->mode == MPOL_INTERLEAVE || + pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) { + /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ + if (static_branch_likely(&vm_numa_stat_key) && + page_to_nid(page) == nid) { + preempt_disable(); + __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); + preempt_enable(); } } - nmask = policy_nodemask(gfp, pol); - preferred_nid = policy_node(gfp, pol, node); - folio = __folio_alloc(gfp, order, preferred_nid, nmask); + return page; +} + +struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *pol, pgoff_t ilx, int nid) +{ + struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, + ilx, nid); + if (!page) + return NULL; + + set_page_refcounted(page); + return page_rmappable_folio(page); +} + +/** + * vma_alloc_folio - Allocate a folio for a VMA. + * @gfp: GFP flags. + * @order: Order of the folio. + * @vma: Pointer to VMA. + * @addr: Virtual address of the allocation. Must be inside @vma. + * + * Allocate a folio for a specific address in @vma, using the appropriate + * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the + * VMA to prevent it from going away. Should be used for all allocations + * for folios that will be mapped into user space, excepting hugetlbfs, and + * excepting where direct use of folio_alloc_mpol() is more appropriate. + * + * Return: The folio on success or NULL if allocation fails. + */ +struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr) +{ + struct mempolicy *pol; + pgoff_t ilx; + struct folio *folio; + + if (vma->vm_flags & VM_DROPPABLE) + gfp |= __GFP_NOWARN; + + pol = get_vma_policy(vma, addr, order, &ilx); + folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id()); mpol_cond_put(pol); -out: return folio; } -EXPORT_SYMBOL(vma_alloc_folio); +EXPORT_SYMBOL(vma_alloc_folio_noprof); + +struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order) +{ + struct mempolicy *pol = &default_policy; + + /* + * No reference counting needed for current->mempolicy + * nor system default_policy + */ + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) + pol = get_task_policy(current); + + return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, + numa_node_id()); +} /** * alloc_pages - Allocate pages. @@ -2265,43 +2572,23 @@ EXPORT_SYMBOL(vma_alloc_folio); * flags are used. * Return: The page on success or NULL if allocation fails. 
*/ -struct page *alloc_pages(gfp_t gfp, unsigned order) +struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) { - struct mempolicy *pol = &default_policy; - struct page *page; - - if (!in_interrupt() && !(gfp & __GFP_THISNODE)) - pol = get_task_policy(current); - - /* - * No reference counting needed for current->mempolicy - * nor system default_policy - */ - if (pol->mode == MPOL_INTERLEAVE) - page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); - else if (pol->mode == MPOL_PREFERRED_MANY) - page = alloc_pages_preferred_many(gfp, order, - policy_node(gfp, pol, numa_node_id()), pol); - else - page = __alloc_pages(gfp, order, - policy_node(gfp, pol, numa_node_id()), - policy_nodemask(gfp, pol)); + struct page *page = alloc_frozen_pages_noprof(gfp, order); + if (page) + set_page_refcounted(page); return page; } -EXPORT_SYMBOL(alloc_pages); +EXPORT_SYMBOL(alloc_pages_noprof); -struct folio *folio_alloc(gfp_t gfp, unsigned order) +struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { - struct page *page = alloc_pages(gfp | __GFP_COMP, order); - - if (page && order > 1) - prep_transhuge_page(page); - return (struct folio *)page; + return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order)); } -EXPORT_SYMBOL(folio_alloc); +EXPORT_SYMBOL(folio_alloc_noprof); -static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, +static unsigned long alloc_pages_bulk_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { @@ -2318,15 +2605,15 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, for (i = 0; i < nodes; i++) { if (delta) { - nr_allocated = __alloc_pages_bulk(gfp, + nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, - nr_pages_per_node + 1, NULL, + nr_pages_per_node + 1, page_array); delta--; } else { - nr_allocated = __alloc_pages_bulk(gfp, + nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, - nr_pages_per_node, NULL, page_array); + nr_pages_per_node, page_array); } page_array += nr_allocated; @@ -2336,7 +2623,125 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, return total_allocated; } -static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, +static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, + struct mempolicy *pol, unsigned long nr_pages, + struct page **page_array) +{ + struct weighted_interleave_state *state; + struct task_struct *me = current; + unsigned int cpuset_mems_cookie; + unsigned long total_allocated = 0; + unsigned long nr_allocated = 0; + unsigned long rounds; + unsigned long node_pages, delta; + u8 *weights, weight; + unsigned int weight_total = 0; + unsigned long rem_pages = nr_pages; + nodemask_t nodes; + int nnodes, node; + int resume_node = MAX_NUMNODES - 1; + u8 resume_weight = 0; + int prev_node; + int i; + + if (!nr_pages) + return 0; + + /* read the nodes onto the stack, retry if done during rebind */ + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + nnodes = read_once_policy_nodemask(pol, &nodes); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); + + /* if the nodemask has become invalid, we cannot do anything */ + if (!nnodes) + return 0; + + /* Continue allocating from most recent node and adjust the nr_pages */ + node = me->il_prev; + weight = me->il_weight; + if (weight && node_isset(node, nodes)) { + node_pages = min(rem_pages, weight); + nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, + page_array); + page_array += 
nr_allocated; + total_allocated += nr_allocated; + /* if that's all the pages, no need to interleave */ + if (rem_pages <= weight) { + me->il_weight -= rem_pages; + return total_allocated; + } + /* Otherwise we adjust remaining pages, continue from there */ + rem_pages -= weight; + } + /* clear active weight in case of an allocation failure */ + me->il_weight = 0; + prev_node = node; + + /* create a local copy of node weights to operate on outside rcu */ + weights = kzalloc(nr_node_ids, GFP_KERNEL); + if (!weights) + return total_allocated; + + rcu_read_lock(); + state = rcu_dereference(wi_state); + if (state) { + memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8)); + rcu_read_unlock(); + } else { + rcu_read_unlock(); + for (i = 0; i < nr_node_ids; i++) + weights[i] = 1; + } + + /* calculate total, detect system default usage */ + for_each_node_mask(node, nodes) + weight_total += weights[node]; + + /* + * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. + * Track which node weighted interleave should resume from. + * + * if (rounds > 0) and (delta == 0), resume_node will always be + * the node following prev_node and its weight. + */ + rounds = rem_pages / weight_total; + delta = rem_pages % weight_total; + resume_node = next_node_in(prev_node, nodes); + resume_weight = weights[resume_node]; + for (i = 0; i < nnodes; i++) { + node = next_node_in(prev_node, nodes); + weight = weights[node]; + node_pages = weight * rounds; + /* If a delta exists, add this node's portion of the delta */ + if (delta > weight) { + node_pages += weight; + delta -= weight; + } else if (delta) { + /* when delta is depleted, resume from that node */ + node_pages += delta; + resume_node = node; + resume_weight = weight - delta; + delta = 0; + } + /* node_pages can be 0 if an allocation fails and rounds == 0 */ + if (!node_pages) + break; + nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, + page_array); + page_array += nr_allocated; + total_allocated += nr_allocated; + if (total_allocated == nr_pages) + break; + prev_node = node; + } + me->il_prev = resume_node; + me->il_weight = resume_weight; + kfree(weights); + return total_allocated; +} + +static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { @@ -2346,12 +2751,12 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes, - nr_pages, NULL, page_array); + nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, + nr_pages, page_array); if (nr_allocated < nr_pages) - nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL, - nr_pages - nr_allocated, NULL, + nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, + nr_pages - nr_allocated, page_array + nr_allocated); return nr_allocated; } @@ -2362,30 +2767,37 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, * It can accelerate memory allocation especially interleaving * allocate memory. 
*/ -unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, +unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; + nodemask_t *nodemask; + int nid; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE) - return alloc_pages_bulk_array_interleave(gfp, pol, + return alloc_pages_bulk_interleave(gfp, pol, nr_pages, page_array); + if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) + return alloc_pages_bulk_weighted_interleave( + gfp, pol, nr_pages, page_array); + if (pol->mode == MPOL_PREFERRED_MANY) - return alloc_pages_bulk_array_preferred_many(gfp, + return alloc_pages_bulk_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); - return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()), - policy_nodemask(gfp, pol), nr_pages, NULL, - page_array); + nid = numa_node_id(); + nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); + return alloc_pages_bulk_noprof(gfp, nid, nodemask, + nr_pages, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { - struct mempolicy *pol = mpol_dup(vma_policy(src)); + struct mempolicy *pol = mpol_dup(src->vm_policy); if (IS_ERR(pol)) return PTR_ERR(pol); @@ -2448,6 +2860,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: + case MPOL_WEIGHTED_INTERLEAVE: return !!nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; @@ -2470,8 +2883,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) * lookup first element intersecting start-end. Caller holds sp->lock for * reading or for writing */ -static struct sp_node * -sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) +static struct sp_node *sp_lookup(struct shared_policy *sp, + pgoff_t start, pgoff_t end) { struct rb_node *n = sp->root.rb_node; @@ -2522,13 +2935,11 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new) } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); - pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, - new->policy ? new->policy->mode : 0); } /* Find shared policy intersecting idx */ -struct mempolicy * -mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, + pgoff_t idx) { struct mempolicy *pol = NULL; struct sp_node *sn; @@ -2544,6 +2955,7 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) read_unlock(&sp->lock); return pol; } +EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_lookup, "kvm"); static void sp_free(struct sp_node *n) { @@ -2552,39 +2964,48 @@ static void sp_free(struct sp_node *n) } /** - * mpol_misplaced - check whether current page node is valid in policy + * mpol_misplaced - check whether current folio node is valid in policy * - * @page: page to be checked - * @vma: vm area where page mapped - * @addr: virtual address where page mapped + * @folio: folio to be checked + * @vmf: structure describing the fault + * @addr: virtual address in @vma for shared policy lookup and interleave policy * - * Lookup current policy node id for vma,addr and "compare to" page's + * Lookup current policy node id for vma,addr and "compare to" folio's * node id. Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. 
* * Return: NUMA_NO_NODE if the page is in a node that is valid for this - * policy, or a suitable node ID to allocate a replacement page from. + * policy, or a suitable node ID to allocate a replacement folio from. */ -int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) +int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, + unsigned long addr) { struct mempolicy *pol; + pgoff_t ilx; struct zoneref *z; - int curnid = page_to_nid(page); - unsigned long pgoff; + int curnid = folio_nid(folio); + struct vm_area_struct *vma = vmf->vma; int thiscpu = raw_smp_processor_id(); - int thisnid = cpu_to_node(thiscpu); + int thisnid = numa_node_id(); int polnid = NUMA_NO_NODE; int ret = NUMA_NO_NODE; - pol = get_vma_policy(vma, addr); + /* + * Make sure ptl is held so that we don't preempt and we + * have a stable smp processor id + */ + lockdep_assert_held(vmf->ptl); + pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: - pgoff = vma->vm_pgoff; - pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; - polnid = offset_il_node(pol, pgoff); + polnid = interleave_nid(pol, ilx); + break; + + case MPOL_WEIGHTED_INTERLEAVE: + polnid = weighted_interleave_nid(pol, ilx); break; case MPOL_PREFERRED: @@ -2598,15 +3019,26 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; case MPOL_BIND: - /* Optimize placement among multiple nodes via NUMA balancing */ + case MPOL_PREFERRED_MANY: + /* + * Even though MPOL_PREFERRED_MANY can allocate pages outside + * policy nodemask we don't allow numa migration to nodes + * outside policy nodemask for now. This is done so that if we + * want demotion to slow memory to happen, before allocating + * from some DRAM node say 'x', we will end up using a + * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario + * we should not promote to node 'x' from slow memory node. + */ if (pol->flags & MPOL_F_MORON) { + /* + * Optimize placement among multiple nodes + * via NUMA balancing + */ if (node_isset(thisnid, pol->nodes)) break; goto out; } - fallthrough; - case MPOL_PREFERRED_MANY: /* * use current page if in policy nodemask, * else select nearest allowed node, if any. @@ -2615,21 +3047,22 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long if (node_isset(curnid, pol->nodes)) goto out; z = first_zones_zonelist( - node_zonelist(numa_node_id(), GFP_HIGHUSER), + node_zonelist(thisnid, GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->nodes); - polnid = zone_to_nid(z->zone); + polnid = zonelist_node_idx(z); break; default: BUG(); } - /* Migrate the page towards the node whose CPU is referencing it */ + /* Migrate the folio towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { polnid = thisnid; - if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) + if (!should_numa_migrate_memory(current, folio, curnid, + thiscpu)) goto out; } @@ -2660,7 +3093,6 @@ void mpol_put_task_policy(struct task_struct *task) static void sp_delete(struct shared_policy *sp, struct sp_node *n) { - pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); sp_free(n); } @@ -2695,8 +3127,8 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, } /* Replace a policy range. 
*/ -static int shared_policy_replace(struct shared_policy *sp, unsigned long start, - unsigned long end, struct sp_node *new) +static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, + pgoff_t end, struct sp_node *new) { struct sp_node *n; struct sp_node *n_new = NULL; @@ -2779,78 +3211,75 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) rwlock_init(&sp->lock); if (mpol) { - struct vm_area_struct pvma; - struct mempolicy *new; + struct sp_node *sn; + struct mempolicy *npol; NODEMASK_SCRATCH(scratch); if (!scratch) goto put_mpol; - /* contextualize the tmpfs mount point mempolicy */ - new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); - if (IS_ERR(new)) + + /* contextualize the tmpfs mount point mempolicy to this file */ + npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); + if (IS_ERR(npol)) goto free_scratch; /* no valid nodemask intersection */ task_lock(current); - ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); + ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); task_unlock(current); if (ret) - goto put_new; - - /* Create pseudo-vma that contains just the policy */ - vma_init(&pvma, NULL); - pvma.vm_end = TASK_SIZE; /* policy covers entire file */ - mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ - -put_new: - mpol_put(new); /* drop initial ref */ + goto put_npol; + + /* alloc node covering entire file; adds ref to file's npol */ + sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); + if (sn) + sp_insert(sp, sn); +put_npol: + mpol_put(npol); /* drop initial ref on file's npol */ free_scratch: NODEMASK_SCRATCH_FREE(scratch); put_mpol: mpol_put(mpol); /* drop our incoming ref on sb mpol */ } } +EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_init, "kvm"); -int mpol_set_shared_policy(struct shared_policy *info, - struct vm_area_struct *vma, struct mempolicy *npol) +int mpol_set_shared_policy(struct shared_policy *sp, + struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); - pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", - vma->vm_pgoff, - sz, npol ? npol->mode : -1, - npol ? npol->flags : -1, - npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE); - - if (npol) { - new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); + if (pol) { + new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); if (!new) return -ENOMEM; } - err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); + err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); if (err && new) sp_free(new); return err; } +EXPORT_SYMBOL_FOR_MODULES(mpol_set_shared_policy, "kvm"); /* Free a backing policy store on inode delete. 
*/ -void mpol_free_shared_policy(struct shared_policy *p) +void mpol_free_shared_policy(struct shared_policy *sp) { struct sp_node *n; struct rb_node *next; - if (!p->root.rb_node) + if (!sp->root.rb_node) return; - write_lock(&p->lock); - next = rb_first(&p->root); + write_lock(&sp->lock); + next = rb_first(&sp->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); - sp_delete(p, n); + sp_delete(sp, n); } - write_unlock(&p->lock); + write_unlock(&sp->lock); } +EXPORT_SYMBOL_FOR_MODULES(mpol_free_shared_policy, "kvm"); #ifdef CONFIG_NUMA_BALANCING static int __initdata numabalancing_override; @@ -2899,7 +3328,6 @@ static inline void __init check_numabalancing_enable(void) } #endif /* CONFIG_NUMA_BALANCING */ -/* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { nodemask_t interleave_nodes; @@ -2962,18 +3390,17 @@ void numa_default_policy(void) /* * Parse and format mempolicy from/to strings */ - static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", + [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave", [MPOL_LOCAL] = "local", [MPOL_PREFERRED_MANY] = "prefer (many)", }; - #ifdef CONFIG_TMPFS /** * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. @@ -3029,6 +3456,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) } break; case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: /* * Default to online nodes with memory if no nodelist */ @@ -3116,8 +3544,9 @@ out: * @pol: pointer to mempolicy to be formatted * * Convert @pol into a string. If @buffer is too short, truncate the string. - * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the - * longest flag, "relative", and to display at least a few node ids. + * Recommend a @maxlen of at least 51 for the longest mode, "weighted + * interleave", plus the longest flag flags, "relative|balancing", and to + * display at least a few node ids. */ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { @@ -3126,7 +3555,10 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) unsigned short mode = MPOL_DEFAULT; unsigned short flags = 0; - if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { + if (pol && + pol != &default_policy && + !(pol >= &preferred_node_policy[0] && + pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) { mode = pol->mode; flags = pol->flags; } @@ -3139,6 +3571,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: nodes = pol->nodes; break; default: @@ -3153,15 +3586,359 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) p += snprintf(p, buffer + maxlen - p, "="); /* - * Currently, the only defined flags are mutually exclusive + * Static and relative are mutually exclusive. 
*/ if (flags & MPOL_F_STATIC_NODES) p += snprintf(p, buffer + maxlen - p, "static"); else if (flags & MPOL_F_RELATIVE_NODES) p += snprintf(p, buffer + maxlen - p, "relative"); + + if (flags & MPOL_F_NUMA_BALANCING) { + if (!is_power_of_2(flags & MPOL_MODE_FLAGS)) + p += snprintf(p, buffer + maxlen - p, "|"); + p += snprintf(p, buffer + maxlen - p, "balancing"); + } } if (!nodes_empty(nodes)) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); } + +#ifdef CONFIG_SYSFS +struct iw_node_attr { + struct kobj_attribute kobj_attr; + int nid; +}; + +struct sysfs_wi_group { + struct kobject wi_kobj; + struct mutex kobj_lock; + struct iw_node_attr *nattrs[]; +}; + +static struct sysfs_wi_group *wi_group; + +static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct iw_node_attr *node_attr; + u8 weight; + + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); + weight = get_il_weight(node_attr->nid); + return sysfs_emit(buf, "%d\n", weight); +} + +static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; + struct iw_node_attr *node_attr; + u8 weight = 0; + int i; + + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); + if (count == 0 || sysfs_streq(buf, "") || + kstrtou8(buf, 0, &weight) || weight == 0) + return -EINVAL; + + new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), + GFP_KERNEL); + if (!new_wi_state) + return -ENOMEM; + + mutex_lock(&wi_state_lock); + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + if (old_wi_state) { + memcpy(new_wi_state->iw_table, old_wi_state->iw_table, + nr_node_ids * sizeof(u8)); + } else { + for (i = 0; i < nr_node_ids; i++) + new_wi_state->iw_table[i] = 1; + } + new_wi_state->iw_table[node_attr->nid] = weight; + new_wi_state->mode_auto = false; + + rcu_assign_pointer(wi_state, new_wi_state); + mutex_unlock(&wi_state_lock); + if (old_wi_state) { + synchronize_rcu(); + kfree(old_wi_state); + } + return count; +} + +static ssize_t weighted_interleave_auto_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct weighted_interleave_state *state; + bool wi_auto = true; + + rcu_read_lock(); + state = rcu_dereference(wi_state); + if (state) + wi_auto = state->mode_auto; + rcu_read_unlock(); + + return sysfs_emit(buf, "%s\n", str_true_false(wi_auto)); +} + +static ssize_t weighted_interleave_auto_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; + unsigned int *bw; + bool input; + int i; + + if (kstrtobool(buf, &input)) + return -EINVAL; + + new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), + GFP_KERNEL); + if (!new_wi_state) + return -ENOMEM; + for (i = 0; i < nr_node_ids; i++) + new_wi_state->iw_table[i] = 1; + + mutex_lock(&wi_state_lock); + if (!input) { + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + if (!old_wi_state) + goto update_wi_state; + if (input == old_wi_state->mode_auto) { + mutex_unlock(&wi_state_lock); + return count; + } + + memcpy(new_wi_state->iw_table, old_wi_state->iw_table, + nr_node_ids * sizeof(u8)); + goto update_wi_state; + } + + bw = node_bw_table; + if (!bw) { + mutex_unlock(&wi_state_lock); + kfree(new_wi_state); + return -ENODEV; + } + + new_wi_state->mode_auto = 
true; + reduce_interleave_weights(bw, new_wi_state->iw_table); + +update_wi_state: + rcu_assign_pointer(wi_state, new_wi_state); + mutex_unlock(&wi_state_lock); + if (old_wi_state) { + synchronize_rcu(); + kfree(old_wi_state); + } + return count; +} + +static void sysfs_wi_node_delete(int nid) +{ + struct iw_node_attr *attr; + + if (nid < 0 || nid >= nr_node_ids) + return; + + mutex_lock(&wi_group->kobj_lock); + attr = wi_group->nattrs[nid]; + if (!attr) { + mutex_unlock(&wi_group->kobj_lock); + return; + } + + wi_group->nattrs[nid] = NULL; + mutex_unlock(&wi_group->kobj_lock); + + sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr); + kfree(attr->kobj_attr.attr.name); + kfree(attr); +} + +static void sysfs_wi_node_delete_all(void) +{ + int nid; + + for (nid = 0; nid < nr_node_ids; nid++) + sysfs_wi_node_delete(nid); +} + +static void wi_state_free(void) +{ + struct weighted_interleave_state *old_wi_state; + + mutex_lock(&wi_state_lock); + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + rcu_assign_pointer(wi_state, NULL); + mutex_unlock(&wi_state_lock); + + if (old_wi_state) { + synchronize_rcu(); + kfree(old_wi_state); + } +} + +static struct kobj_attribute wi_auto_attr = + __ATTR(auto, 0664, weighted_interleave_auto_show, + weighted_interleave_auto_store); + +static void wi_cleanup(void) { + sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr); + sysfs_wi_node_delete_all(); + wi_state_free(); +} + +static void wi_kobj_release(struct kobject *wi_kobj) +{ + kfree(wi_group); +} + +static const struct kobj_type wi_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = wi_kobj_release, +}; + +static int sysfs_wi_node_add(int nid) +{ + int ret; + char *name; + struct iw_node_attr *new_attr; + + if (nid < 0 || nid >= nr_node_ids) { + pr_err("invalid node id: %d\n", nid); + return -EINVAL; + } + + new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL); + if (!new_attr) + return -ENOMEM; + + name = kasprintf(GFP_KERNEL, "node%d", nid); + if (!name) { + kfree(new_attr); + return -ENOMEM; + } + + sysfs_attr_init(&new_attr->kobj_attr.attr); + new_attr->kobj_attr.attr.name = name; + new_attr->kobj_attr.attr.mode = 0644; + new_attr->kobj_attr.show = node_show; + new_attr->kobj_attr.store = node_store; + new_attr->nid = nid; + + mutex_lock(&wi_group->kobj_lock); + if (wi_group->nattrs[nid]) { + mutex_unlock(&wi_group->kobj_lock); + ret = -EEXIST; + goto out; + } + + ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr); + if (ret) { + mutex_unlock(&wi_group->kobj_lock); + goto out; + } + wi_group->nattrs[nid] = new_attr; + mutex_unlock(&wi_group->kobj_lock); + return 0; + +out: + kfree(new_attr->kobj_attr.attr.name); + kfree(new_attr); + return ret; +} + +static int wi_node_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + int err; + struct node_notify *nn = data; + int nid = nn->nid; + + switch (action) { + case NODE_ADDED_FIRST_MEMORY: + err = sysfs_wi_node_add(nid); + if (err) + pr_err("failed to add sysfs for node%d during hotplug: %d\n", + nid, err); + break; + case NODE_REMOVED_LAST_MEMORY: + sysfs_wi_node_delete(nid); + break; + } + + return NOTIFY_OK; +} + +static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj) +{ + int nid, err; + + wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids), + GFP_KERNEL); + if (!wi_group) + return -ENOMEM; + mutex_init(&wi_group->kobj_lock); + + err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj, + 
"weighted_interleave"); + if (err) + goto err_put_kobj; + + err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr); + if (err) + goto err_put_kobj; + + for_each_online_node(nid) { + if (!node_state(nid, N_MEMORY)) + continue; + + err = sysfs_wi_node_add(nid); + if (err) { + pr_err("failed to add sysfs for node%d during init: %d\n", + nid, err); + goto err_cleanup_kobj; + } + } + + hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI); + return 0; + +err_cleanup_kobj: + wi_cleanup(); + kobject_del(&wi_group->wi_kobj); +err_put_kobj: + kobject_put(&wi_group->wi_kobj); + return err; +} + +static int __init mempolicy_sysfs_init(void) +{ + int err; + static struct kobject *mempolicy_kobj; + + mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj); + if (!mempolicy_kobj) + return -ENOMEM; + + err = add_weighted_interleave_group(mempolicy_kobj); + if (err) + goto err_kobj; + + return 0; + +err_kobj: + kobject_del(mempolicy_kobj); + kobject_put(mempolicy_kobj); + return err; +} + +late_initcall(mempolicy_sysfs_init); +#endif /* CONFIG_SYSFS */ |
