Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--	mm/mempolicy.c	3482
1 file changed, 2285 insertions(+), 1197 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 74310017296e..68a98ba57882 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1,14 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Simple NUMA memory policy for the Linux kernel.
  *
  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
- * Subject to the GNU Public License, version 2.
  *
  * NUMA policy allows the user to give hints in which node(s) memory should
  * be allocated.
  *
- * Support four policies per VMA and per process:
+ * Support six policies per VMA and per process:
  *
  * The VMA policy has priority over the process policy for a page fault.
  *
@@ -19,18 +19,28 @@
  * for anonymous memory. For process policy an process counter
  * is used.
  *
+ * weighted interleave
+ *                Allocate memory interleaved over a set of nodes based on
+ *                a set of weights (per-node), with normal fallback if it
+ *                fails.  Otherwise operates the same as interleave.
+ *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
+ *                on node 0 for every 1 page allocated on node 1.
+ *
  * bind           Only allocate memory on a specific set of nodes,
  *                no fallback.
  *                FIXME: memory is allocated starting with the first node
  *                to the last. It would be better if bind would truly restrict
  *                the allocation to memory nodes instead
  *
- * preferred      Try a specific node first before normal fallback.
+ * preferred      Try a specific node first before normal fallback.
  *                As a special case NUMA_NO_NODE here means do the allocation
  *                on the local CPU. This is normally identical to default,
  *                but useful to set in a VMA when you have a non default
  *                process policy.
  *
+ * preferred many Try a set of nodes first before normal fallback. This is
+ *                similar to preferred without the special case.
+ *
  * default        Allocate on the local node first, or when on a VMA
  *                use the process policy. This is what Linux always did
  *                in a NUMA aware kernel and still does by, ahem, default.
@@ -49,7 +59,7 @@
  * on systems with highmem kernel lowmem allocation don't get policied.
  * Same with GFP_DMA allocations.
  *
- * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
+ * For shmem/tmpfs shared memory the policy is shared between
  * all users and remembered even when nobody has memory mapped.
  */
@@ -65,12 +75,18 @@ kernel is not always grateful with that.
 */
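The weighted-interleave description in the header maps directly onto the uapi. A minimal userspace sketch, assuming a kernel that carries this mode (MPOL_WEIGHTED_INTERLEAVE, uapi value 6) and that the per-node weights have been set to 2 and 1, e.g. through the /sys/kernel/mm/mempolicy/weighted_interleave/nodeN files where available:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#ifndef MPOL_WEIGHTED_INTERLEAVE
	#define MPOL_WEIGHTED_INTERLEAVE 6	/* uapi value on kernels with this mode */
	#endif

	int main(void)
	{
		unsigned long nodemask = (1UL << 0) | (1UL << 1);	/* nodes 0 and 1 */

		/* maxnode counts bits in the mask; 64 covers nodes 0..62 here */
		if (syscall(SYS_set_mempolicy, MPOL_WEIGHTED_INTERLEAVE,
			    &nodemask, sizeof(nodemask) * 8))
			perror("set_mempolicy");
		return 0;
	}

With weights 2 and 1 configured, subsequent page faults in this task allocate two pages on node 0 for every page on node 1, matching the nodeset(0,1) & weights (2,1) example above.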
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/mempolicy.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/numa_balancing.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/task.h>
 #include <linux/nodemask.h>
 #include <linux/cpuset.h>
 #include <linux/slab.h>
@@ -80,9 +96,11 @@
 #include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/compat.h>
+#include <linux/ptrace.h>
 #include <linux/swap.h>
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
+#include <linux/memory-tiers.h>
 #include <linux/migrate.h>
 #include <linux/ksm.h>
 #include <linux/rmap.h>
@@ -91,16 +109,21 @@
 #include <linux/ctype.h>
 #include <linux/mm_inline.h>
 #include <linux/mmu_notifier.h>
+#include <linux/printk.h>
+#include <linux/leafops.h>
+#include <linux/gcd.h>
 
 #include <asm/tlbflush.h>
-#include <asm/uaccess.h>
-#include <linux/random.h>
+#include <asm/tlb.h>
+#include <linux/uaccess.h>
+#include <linux/memory.h>
 
 #include "internal.h"
 
 /* Internal flags */
 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
-#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
+#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
+#define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */
 
 static struct kmem_cache *policy_cache;
 static struct kmem_cache *sn_cache;
@@ -114,56 +137,232 @@ enum zone_type policy_zone = 0;
  */
 static struct mempolicy default_policy = {
 	.refcnt = ATOMIC_INIT(1), /* never free it */
-	.mode = MPOL_PREFERRED,
-	.flags = MPOL_F_LOCAL,
+	.mode = MPOL_LOCAL,
 };
 
 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 
-static struct mempolicy *get_task_policy(struct task_struct *p)
+/*
+ * weightiness balances the tradeoff between small weights (cycles through nodes
+ * faster, more fair/even distribution) and large weights (smaller errors
+ * between actual bandwidth ratios and weight ratios). 32 is a number that has
+ * been found to perform at a reasonable compromise between the two goals.
+ */
+static const int weightiness = 32;
+
+/*
+ * A null weighted_interleave_state is interpreted as having .mode="auto",
+ * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
+ */
+struct weighted_interleave_state {
+	bool mode_auto;
+	u8 iw_table[];
+};
+static struct weighted_interleave_state __rcu *wi_state;
+static unsigned int *node_bw_table;
+
+/*
+ * wi_state_lock protects both wi_state and node_bw_table.
+ * node_bw_table is only used by writers to update wi_state.
+ */
+static DEFINE_MUTEX(wi_state_lock);
+
+static u8 get_il_weight(int node)
+{
+	struct weighted_interleave_state *state;
+	u8 weight = 1;
+
+	rcu_read_lock();
+	state = rcu_dereference(wi_state);
+	if (state)
+		weight = state->iw_table[node];
+	rcu_read_unlock();
+	return weight;
+}
+
+/*
+ * Convert bandwidth values into weighted interleave weights.
+ * Call with wi_state_lock.
+ */
+static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
+{
+	u64 sum_bw = 0;
+	unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
+	int nid;
+
+	for_each_node_state(nid, N_MEMORY)
+		sum_bw += bw[nid];
+
+	/* Scale bandwidths to whole numbers in the range [1, weightiness] */
+	for_each_node_state(nid, N_MEMORY) {
+		/*
+		 * Try not to perform 64-bit division.
+		 * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
+		 * If sum_bw > scaling_factor, then round the weight up to 1.
+		 */
+		scaling_factor = weightiness * bw[nid];
+		if (bw[nid] && sum_bw < scaling_factor) {
+			cast_sum_bw = (unsigned int)sum_bw;
+			new_iw[nid] = scaling_factor / cast_sum_bw;
+		} else {
+			new_iw[nid] = 1;
+		}
+		if (!iw_gcd)
+			iw_gcd = new_iw[nid];
+		iw_gcd = gcd(iw_gcd, new_iw[nid]);
+	}
+
+	/* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
+	for_each_node_state(nid, N_MEMORY)
+		new_iw[nid] /= iw_gcd;
+}
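A worked example of the scaling above, replayed as a standalone userspace sketch (illustrative only; gcd_u() is a local stand-in for the kernel's gcd()): with measured bandwidths 100 and 50 and weightiness = 32, node 0 gets 3200/150 = 21, node 1 gets 1600/150 = 10, and gcd(21, 10) = 1 leaves the weights at 21:10, within rounding error of the 2:1 bandwidth ratio.

	#include <stdio.h>
	#include <stdint.h>

	static unsigned int gcd_u(unsigned int a, unsigned int b)
	{
		while (b) { unsigned int t = a % b; a = b; b = t; }
		return a;
	}

	int main(void)
	{
		unsigned int bw[2] = { 100, 50 }, new_iw[2], iw_gcd = 0;
		uint64_t sum_bw = bw[0] + bw[1];	/* 150 */

		for (int nid = 0; nid < 2; nid++) {
			unsigned int scaling_factor = 32 * bw[nid];

			if (bw[nid] && sum_bw < scaling_factor)
				new_iw[nid] = scaling_factor / (unsigned int)sum_bw;
			else
				new_iw[nid] = 1;
			iw_gcd = iw_gcd ? gcd_u(iw_gcd, new_iw[nid]) : new_iw[nid];
		}
		/* node 0: 3200/150 = 21, node 1: 1600/150 = 10, gcd = 1 */
		for (int nid = 0; nid < 2; nid++)
			printf("node %d weight %u\n", nid, new_iw[nid] / iw_gcd);
		return 0;
	}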
+
+int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
+{
+	struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+	unsigned int *old_bw, *new_bw;
+	unsigned int bw_val;
+	int i;
+
+	bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
+	new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
+	if (!new_bw)
+		return -ENOMEM;
+
+	new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+			       GFP_KERNEL);
+	if (!new_wi_state) {
+		kfree(new_bw);
+		return -ENOMEM;
+	}
+	new_wi_state->mode_auto = true;
+	for (i = 0; i < nr_node_ids; i++)
+		new_wi_state->iw_table[i] = 1;
+
+	/*
+	 * Update bandwidth info, even in manual mode. That way, when switching
+	 * to auto mode in the future, iw_table can be overwritten using
+	 * accurate bw data.
+	 */
+	mutex_lock(&wi_state_lock);
+
+	old_bw = node_bw_table;
+	if (old_bw)
+		memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
+	new_bw[node] = bw_val;
+	node_bw_table = new_bw;
+
+	old_wi_state = rcu_dereference_protected(wi_state,
+				lockdep_is_held(&wi_state_lock));
+	if (old_wi_state && !old_wi_state->mode_auto) {
+		/* Manual mode; skip reducing weights and updating wi_state */
+		mutex_unlock(&wi_state_lock);
+		kfree(new_wi_state);
+		goto out;
+	}
+
+	/* NULL wi_state assumes auto=true; reduce weights and update wi_state */
+	reduce_interleave_weights(new_bw, new_wi_state->iw_table);
+	rcu_assign_pointer(wi_state, new_wi_state);
+
+	mutex_unlock(&wi_state_lock);
+	if (old_wi_state) {
+		synchronize_rcu();
+		kfree(old_wi_state);
+	}
+out:
+	kfree(old_bw);
+	return 0;
+}
+
+/**
+ * numa_nearest_node - Find nearest node by state
+ * @node: Node id to start the search
+ * @state: State to filter the search
+ *
+ * Lookup the closest node by distance if @nid is not in state.
+ *
+ * Return: this @node if it is in state, otherwise the closest node by distance
+ */
+int numa_nearest_node(int node, unsigned int state)
+{
+	int min_dist = INT_MAX, dist, n, min_node;
+
+	if (state >= NR_NODE_STATES)
+		return -EINVAL;
+
+	if (node == NUMA_NO_NODE || node_state(node, state))
+		return node;
+
+	min_node = node;
+	for_each_node_state(n, state) {
+		dist = node_distance(node, n);
+		if (dist < min_dist) {
+			min_dist = dist;
+			min_node = n;
+		}
+	}
+
+	return min_node;
+}
+EXPORT_SYMBOL_GPL(numa_nearest_node);
+
+/**
+ * nearest_node_nodemask - Find the node in @mask at the nearest distance
+ *			   from @node.
+ *
+ * @node: a valid node ID to start the search from.
+ * @mask: a pointer to a nodemask representing the allowed nodes.
+ *
+ * This function iterates over all nodes in @mask and calculates the
+ * distance from the starting @node, then it returns the node ID that is
+ * the closest to @node, or MAX_NUMNODES if no node is found.
+ *
+ * Note that @node must be a valid node ID usable with node_distance(),
+ * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
+ * or unexpected behavior.
+ */
+int nearest_node_nodemask(int node, nodemask_t *mask)
+{
+	int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
+
+	for_each_node_mask(n, *mask) {
+		dist = node_distance(node, n);
+		if (dist < min_dist) {
+			min_dist = dist;
+			min_node = n;
+		}
+	}
+
+	return min_node;
+}
+EXPORT_SYMBOL_GPL(nearest_node_nodemask);
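A hedged kernel-context usage sketch for the two helpers just exported (pick_target_node() is hypothetical, not part of this patch): a caller on a possibly memoryless node can fall back to the closest node with memory, or pick the closest member of an explicitly allowed set.

	static int pick_target_node(int nid, nodemask_t *allowed)
	{
		/* closest node with memory, or nid itself if it qualifies */
		int mem_nid = numa_nearest_node(nid, N_MEMORY);

		/* closest member of *allowed; MAX_NUMNODES if the mask is empty */
		int masked_nid = nearest_node_nodemask(nid, allowed);

		return masked_nid < MAX_NUMNODES ? masked_nid : mem_nid;
	}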
+
+struct mempolicy *get_task_policy(struct task_struct *p)
 {
 	struct mempolicy *pol = p->mempolicy;
 	int node;
 
-	if (!pol) {
-		node = numa_node_id();
-		if (node != NUMA_NO_NODE)
-			pol = &preferred_node_policy[node];
+	if (pol)
+		return pol;
 
+	node = numa_node_id();
+	if (node != NUMA_NO_NODE) {
+		pol = &preferred_node_policy[node];
 		/* preferred_node_policy is not initialised early in boot */
-		if (!pol->mode)
-			pol = NULL;
+		if (pol->mode)
+			return pol;
 	}
 
-	return pol;
+	return &default_policy;
 }
+EXPORT_SYMBOL_FOR_MODULES(get_task_policy, "kvm");
 
 static const struct mempolicy_operations {
 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
-	/*
-	 * If read-side task has no lock to protect task->mempolicy, write-side
-	 * task will rebind the task->mempolicy by two step. The first step is
-	 * setting all the newly nodes, and the second step is cleaning all the
-	 * disallowed nodes. In this way, we can avoid finding no node to alloc
-	 * page.
-	 * If we have a lock to protect task->mempolicy in read-side, we do
-	 * rebind directly.
-	 *
-	 * step:
-	 *	MPOL_REBIND_ONCE  - do rebind work at once
-	 *	MPOL_REBIND_STEP1 - set all the newly nodes
-	 *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
-	 */
-	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
-			enum mpol_rebind_step step);
+	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 } mpol_ops[MPOL_MAX];
 
-/* Check that the nodemask contains at least one populated zone */
-static int is_valid_nodemask(const nodemask_t *nodemask)
-{
-	return nodes_intersects(*nodemask, node_states[N_MEMORY]);
-}
-
 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 {
 	return pol->flags & MPOL_MODE_FLAGS;
@@ -177,74 +376,62 @@ static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 	nodes_onto(*ret, tmp, *rel);
 }
 
-static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
+static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 {
 	if (nodes_empty(*nodes))
 		return -EINVAL;
-	pol->v.nodes = *nodes;
+	pol->nodes = *nodes;
 	return 0;
 }
 
 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 {
-	if (!nodes)
-		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
-	else if (nodes_empty(*nodes))
-		return -EINVAL;			/* no allowed nodes */
-	else
-		pol->v.preferred_node = first_node(*nodes);
-	return 0;
-}
-
-static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
-{
-	if (!is_valid_nodemask(nodes))
+	if (nodes_empty(*nodes))
 		return -EINVAL;
-	pol->v.nodes = *nodes;
+
+	nodes_clear(pol->nodes);
+	node_set(first_node(*nodes), pol->nodes);
 	return 0;
 }
 
 /*
  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
  * any, for the new policy.  mpol_new() has already validated the nodes
- * parameter with respect to the policy mode and flags.  But, we need to
- * handle an empty nodemask with MPOL_PREFERRED here.
+ * parameter with respect to the policy mode and flags.
  *
  * Must be called holding task's alloc_lock to protect task's mems_allowed
- * and mempolicy.  May also be called holding the mmap_semaphore for write.
+ * and mempolicy.  May also be called holding the mmap_lock for write.
  */
 static int mpol_set_nodemask(struct mempolicy *pol,
 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 {
 	int ret;
 
-	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
-	if (pol == NULL)
+	/*
+	 * Default (pol==NULL) resp. local memory policies are not a
+	 * subject of any remapping. They also do not need any special
+	 * constructor.
+	 */
+	if (!pol || pol->mode == MPOL_LOCAL)
 		return 0;
+
 	/* Check N_MEMORY */
 	nodes_and(nsc->mask1,
 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
 
 	VM_BUG_ON(!nodes);
-	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
-		nodes = NULL;	/* explicit local allocation */
-	else {
-		if (pol->flags & MPOL_F_RELATIVE_NODES)
-			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
-		else
-			nodes_and(nsc->mask2, *nodes, nsc->mask1);
-		if (mpol_store_user_nodemask(pol))
-			pol->w.user_nodemask = *nodes;
-		else
-			pol->w.cpuset_mems_allowed =
-						cpuset_current_mems_allowed;
-	}
 
-	if (nodes)
-		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
+	if (pol->flags & MPOL_F_RELATIVE_NODES)
+		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 	else
-		ret = mpol_ops[pol->mode].create(pol, NULL);
+		nodes_and(nsc->mask2, *nodes, nsc->mask1);
+
+	if (mpol_store_user_nodemask(pol))
+		pol->w.user_nodemask = *nodes;
+	else
+		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
+
+	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 	return ret;
 }
 
@@ -257,9 +444,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 {
 	struct mempolicy *policy;
 
-	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
-		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
-
 	if (mode == MPOL_DEFAULT) {
 		if (nodes && !nodes_empty(*nodes))
 			return ERR_PTR(-EINVAL);
@@ -277,44 +461,42 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 		if (((flags & MPOL_F_STATIC_NODES) ||
 		     (flags & MPOL_F_RELATIVE_NODES)))
 			return ERR_PTR(-EINVAL);
+
+		mode = MPOL_LOCAL;
 	} else if (mode == MPOL_LOCAL) {
-		if (!nodes_empty(*nodes))
+		if (!nodes_empty(*nodes) ||
+		    (flags & MPOL_F_STATIC_NODES) ||
+		    (flags & MPOL_F_RELATIVE_NODES))
 			return ERR_PTR(-EINVAL);
-		mode = MPOL_PREFERRED;
 	} else if (nodes_empty(*nodes))
 		return ERR_PTR(-EINVAL);
+
 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 	if (!policy)
 		return ERR_PTR(-ENOMEM);
 	atomic_set(&policy->refcnt, 1);
 	policy->mode = mode;
 	policy->flags = flags;
+	policy->home_node = NUMA_NO_NODE;
 
 	return policy;
 }
 
 /* Slow path of a mpol destructor. */
-void __mpol_put(struct mempolicy *p)
+void __mpol_put(struct mempolicy *pol)
 {
-	if (!atomic_dec_and_test(&p->refcnt))
+	if (!atomic_dec_and_test(&pol->refcnt))
 		return;
-	kmem_cache_free(policy_cache, p);
+	kmem_cache_free(policy_cache, pol);
 }
+EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm");
 
-static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
-				enum mpol_rebind_step step)
+static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 {
 }
 
-/*
- * step:
- *	MPOL_REBIND_ONCE  - do rebind work at once
- *	MPOL_REBIND_STEP1 - set all the newly nodes
- *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
- */
-static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
-				 enum mpol_rebind_step step)
+static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 {
 	nodemask_t tmp;
 
@@ -323,104 +505,39 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 	else {
-		/*
-		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
-		 * result
-		 */
-		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
-			nodes_remap(tmp, pol->v.nodes,
-					pol->w.cpuset_mems_allowed, *nodes);
-			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
-		} else if (step == MPOL_REBIND_STEP2) {
-			tmp = pol->w.cpuset_mems_allowed;
-			pol->w.cpuset_mems_allowed = *nodes;
-		} else
-			BUG();
+		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
+			    *nodes);
+		pol->w.cpuset_mems_allowed = *nodes;
 	}
 
 	if (nodes_empty(tmp))
 		tmp = *nodes;
 
-	if (step == MPOL_REBIND_STEP1)
-		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
-	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
-		pol->v.nodes = tmp;
-	else
-		BUG();
-
-	if (!node_isset(current->il_next, tmp)) {
-		current->il_next = next_node(current->il_next, tmp);
-		if (current->il_next >= MAX_NUMNODES)
-			current->il_next = first_node(tmp);
-		if (current->il_next >= MAX_NUMNODES)
-			current->il_next = numa_node_id();
-	}
+	pol->nodes = tmp;
 }
 
 static void mpol_rebind_preferred(struct mempolicy *pol,
-				  const nodemask_t *nodes,
-				  enum mpol_rebind_step step)
+				  const nodemask_t *nodes)
 {
-	nodemask_t tmp;
-
-	if (pol->flags & MPOL_F_STATIC_NODES) {
-		int node = first_node(pol->w.user_nodemask);
-
-		if (node_isset(node, *nodes)) {
-			pol->v.preferred_node = node;
-			pol->flags &= ~MPOL_F_LOCAL;
-		} else
-			pol->flags |= MPOL_F_LOCAL;
-	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
-		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
-		pol->v.preferred_node = first_node(tmp);
-	} else if (!(pol->flags & MPOL_F_LOCAL)) {
-		pol->v.preferred_node = node_remap(pol->v.preferred_node,
-						   pol->w.cpuset_mems_allowed,
-						   *nodes);
-		pol->w.cpuset_mems_allowed = *nodes;
-	}
+	pol->w.cpuset_mems_allowed = *nodes;
 }
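The default branch of mpol_rebind_nodemask() relies on the ordinal semantics of nodes_remap(). An illustrative standalone sketch of that remap for bits that lie within the old mask (remap() is a local stand-in, not the kernel implementation): a policy of {1} under old cpuset {0,1} becomes {5} under new cpuset {4,5}, because node 1 was the second allowed node and node 5 is the second node of the new set.

	#include <stdio.h>
	#include <stdint.h>

	static uint64_t remap(uint64_t src, uint64_t oldm, uint64_t newm)
	{
		uint64_t out = 0;
		int ord = 0;	/* ordinal of the current bit within oldm */

		for (int b = 0; b < 64; b++) {
			if (!(oldm & (1ULL << b)))
				continue;
			if (src & (1ULL << b)) {
				/* set the ord'th set bit of newm in the result */
				int seen = -1;
				for (int n = 0; n < 64; n++) {
					if ((newm & (1ULL << n)) && ++seen == ord) {
						out |= 1ULL << n;
						break;
					}
				}
			}
			ord++;
		}
		return out;
	}

	int main(void)
	{
		/* policy {1}, old cpuset {0,1}, new cpuset {4,5} -> prints 0x20 */
		printf("%#llx\n", (unsigned long long)remap(1ULL << 1, 0x3, 0x30));
		return 0;
	}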
 
 /*
  * mpol_rebind_policy - Migrate a policy to a different set of nodes
  *
- * If read-side task has no lock to protect task->mempolicy, write-side
- * task will rebind the task->mempolicy by two step. The first step is
- * setting all the newly nodes, and the second step is cleaning all the
- * disallowed nodes. In this way, we can avoid finding no node to alloc
- * page.
- * If we have a lock to protect task->mempolicy in read-side, we do
- * rebind directly.
- *
- * step:
- *	MPOL_REBIND_ONCE  - do rebind work at once
- *	MPOL_REBIND_STEP1 - set all the newly nodes
- *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
+ * Per-vma policies are protected by mmap_lock. Allocations using per-task
+ * policies are protected by task->mems_allowed_seq to prevent a premature
+ * OOM/allocation failure due to parallel nodemask modification.
  */
-static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
-				enum mpol_rebind_step step)
+static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 {
-	if (!pol)
+	if (!pol || pol->mode == MPOL_LOCAL)
 		return;
-	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
+	if (!mpol_store_user_nodemask(pol) &&
 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 		return;
 
-	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
-		return;
-
-	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
-		BUG();
-
-	if (step == MPOL_REBIND_STEP1)
-		pol->flags |= MPOL_F_REBINDING;
-	else if (step == MPOL_REBIND_STEP2)
-		pol->flags &= ~MPOL_F_REBINDING;
-	else if (step >= MPOL_REBIND_NSTEP)
-		BUG();
-
-	mpol_ops[pol->mode].rebind(pol, newmask, step);
+	mpol_ops[pol->mode].rebind(pol, newmask);
 }
 
 /*
@@ -429,27 +546,27 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
  *
  * Called with task's alloc_lock held.
  */
-
-void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
-			enum mpol_rebind_step step)
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 {
-	mpol_rebind_policy(tsk->mempolicy, new, step);
+	mpol_rebind_policy(tsk->mempolicy, new);
 }
 
 /*
  * Rebind each vma in mm to new nodemask.
  *
- * Call holding a reference to mm.  Takes mm->mmap_sem during call.
+ * Call holding a reference to mm.  Takes mm->mmap_lock during call.
  */
-
 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
 	struct vm_area_struct *vma;
+	VMA_ITERATOR(vmi, mm, 0);
 
-	down_write(&mm->mmap_sem);
-	for (vma = mm->mmap; vma; vma = vma->vm_next)
-		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
-	up_write(&mm->mmap_sem);
+	mmap_write_lock(mm);
+	for_each_vma(vmi, vma) {
+		vma_start_write(vma);
+		mpol_rebind_policy(vma->vm_policy, new);
+	}
+	mmap_write_unlock(mm);
 }
 
 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
@@ -457,7 +574,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 		.rebind = mpol_rebind_default,
 	},
 	[MPOL_INTERLEAVE] = {
-		.create = mpol_new_interleave,
+		.create = mpol_new_nodemask,
 		.rebind = mpol_rebind_nodemask,
 	},
 	[MPOL_PREFERRED] = {
@@ -465,115 +582,297 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 		.rebind = mpol_rebind_preferred,
 	},
 	[MPOL_BIND] = {
-		.create = mpol_new_bind,
+		.create = mpol_new_nodemask,
+		.rebind = mpol_rebind_nodemask,
+	},
+	[MPOL_LOCAL] = {
+		.rebind = mpol_rebind_default,
+	},
+	[MPOL_PREFERRED_MANY] = {
+		.create = mpol_new_nodemask,
+		.rebind = mpol_rebind_preferred,
+	},
+	[MPOL_WEIGHTED_INTERLEAVE] = {
+		.create = mpol_new_nodemask,
 		.rebind = mpol_rebind_nodemask,
 	},
 };
 
-static void migrate_page_add(struct page *page, struct list_head *pagelist,
+static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
 				unsigned long flags);
+static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
+				   pgoff_t ilx, int *nid);
+
+static bool strictly_unmovable(unsigned long flags)
+{
+	/*
+	 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
+	 * if any misplaced page is found.
+	 */
+	return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
+		MPOL_MF_STRICT;
+}
+
+struct migration_mpol {		/* for alloc_migration_target_by_mpol() */
+	struct mempolicy *pol;
+	pgoff_t ilx;
+};
+
+struct queue_pages {
+	struct list_head *pagelist;
+	unsigned long flags;
+	nodemask_t *nmask;
+	unsigned long start;
+	unsigned long end;
+	struct vm_area_struct *first;
+	struct folio *large;		/* note last large folio encountered */
+	long nr_failed;			/* could not be isolated at this time */
+};
+
+/*
+ * Check if the folio's nid is in qp->nmask.
+ *
+ * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
+ * in the invert of qp->nmask.
+ */
+static inline bool queue_folio_required(struct folio *folio,
+					struct queue_pages *qp)
+{
+	int nid = folio_nid(folio);
+	unsigned long flags = qp->flags;
+
+	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
+}
 
-/* Scan through pages checking if pages follow certain conditions. */
-static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags,
-		void *private)
+static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
 {
-	pte_t *orig_pte;
-	pte_t *pte;
+	struct folio *folio;
+	struct queue_pages *qp = walk->private;
+
+	if (unlikely(pmd_is_migration_entry(*pmd))) {
+		qp->nr_failed++;
+		return;
+	}
+	folio = pmd_folio(*pmd);
+	if (is_huge_zero_folio(folio)) {
+		walk->action = ACTION_CONTINUE;
+		return;
+	}
+	if (!queue_folio_required(folio, qp))
+		return;
+	if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
+	    !vma_migratable(walk->vma) ||
+	    !migrate_folio_add(folio, qp->pagelist, qp->flags))
+		qp->nr_failed++;
+}
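The predicate reduces to a two-variable truth table; a hedged sketch (qualifies() is hypothetical, not in this patch) spelling out why do_mbind() passes MPOL_MF_INVERT — it wants to queue the folios that sit *outside* the requested nodemask:

	static inline bool qualifies(bool nid_in_mask, bool invert)
	{
		return nid_in_mask == !invert;
		/*
		 *  nid_in_mask  invert  -> qualifies
		 *      1          0         1   (plain scan: folio in mask)
		 *      0          0         0
		 *      1          1         0
		 *      0          1         1   (mbind: misplaced, queue it)
		 */
	}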
+
+/*
+ * Scan through folios, checking if they satisfy the required conditions,
+ * moving them from LRU to local pagelist for migration if they do (or not).
+ *
+ * queue_folios_pte_range() has two possible return values:
+ * 0 - continue walking to scan for more, even if an existing folio on the
+ *     wrong node could not be isolated and queued for migration.
+ * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
+ *        and an existing folio was on a node that does not follow the policy.
+ */
+static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
+				  unsigned long end, struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+	struct folio *folio;
+	struct queue_pages *qp = walk->private;
+	unsigned long flags = qp->flags;
+	pte_t *pte, *mapped_pte;
+	pte_t ptent;
 	spinlock_t *ptl;
+	int max_nr, nr;
 
-	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
-	do {
-		struct page *page;
-		int nid;
+	ptl = pmd_trans_huge_lock(pmd, vma);
+	if (ptl) {
+		queue_folios_pmd(pmd, walk);
+		spin_unlock(ptl);
+		goto out;
+	}
+
+	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+	if (!pte) {
+		walk->action = ACTION_AGAIN;
+		return 0;
+	}
+	for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
+		max_nr = (end - addr) >> PAGE_SHIFT;
+		nr = 1;
+		ptent = ptep_get(pte);
+		if (pte_none(ptent))
+			continue;
+		if (!pte_present(ptent)) {
+			const softleaf_t entry = softleaf_from_pte(ptent);
 
-		if (!pte_present(*pte))
+			if (softleaf_is_migration(entry))
+				qp->nr_failed++;
 			continue;
-		page = vm_normal_page(vma, addr, *pte);
-		if (!page)
+		}
+		folio = vm_normal_folio(vma, addr, ptent);
+		if (!folio || folio_is_zone_device(folio))
 			continue;
+		if (folio_test_large(folio) && max_nr != 1)
+			nr = folio_pte_batch(folio, pte, ptent, max_nr);
 		/*
-		 * vm_normal_page() filters out zero pages, but there might
-		 * still be PageReserved pages to skip, perhaps in a VDSO.
+		 * vm_normal_folio() filters out zero pages, but there might
+		 * still be reserved folios to skip, perhaps in a VDSO.
 		 */
-		if (PageReserved(page))
+		if (folio_test_reserved(folio))
 			continue;
-		nid = page_to_nid(page);
-		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+		if (!queue_folio_required(folio, qp))
 			continue;
-
-		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
-			migrate_page_add(page, private, flags);
-		else
-			break;
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap_unlock(orig_pte, ptl);
-	return addr != end;
+		if (folio_test_large(folio)) {
+			/*
+			 * A large folio can only be isolated from LRU once,
+			 * but may be mapped by many PTEs (and Copy-On-Write may
+			 * intersperse PTEs of other, order 0, folios).  This is
+			 * a common case, so don't mistake it for failure (but
+			 * there can be other cases of multi-mapped pages which
+			 * this quick check does not help to filter out - and a
+			 * search of the pagelist might grow to be prohibitive).
+			 *
+			 * migrate_pages(&pagelist) returns nr_failed folios, so
+			 * check "large" now so that queue_pages_range() returns
+			 * a comparable nr_failed folios.  This does imply that
+			 * if folio could not be isolated for some racy reason
+			 * at its first PTE, later PTEs will not give it another
+			 * chance of isolation; but keeps the accounting simple.
+			 */
+			if (folio == qp->large)
+				continue;
+			qp->large = folio;
+		}
+		if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
+		    !vma_migratable(vma) ||
+		    !migrate_folio_add(folio, qp->pagelist, flags)) {
+			qp->nr_failed += nr;
+			if (strictly_unmovable(flags))
+				break;
+		}
+	}
+	pte_unmap_unlock(mapped_pte, ptl);
+	cond_resched();
+out:
+	if (qp->nr_failed && strictly_unmovable(flags))
+		return -EIO;
+	return 0;
 }
 
-static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-		unsigned long addr, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags,
-		void *private)
+static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
+				unsigned long addr, unsigned long end,
+				struct mm_walk *walk)
 {
-	pmd_t *pmd;
-	unsigned long next;
+#ifdef CONFIG_HUGETLB_PAGE
+	struct queue_pages *qp = walk->private;
+	unsigned long flags = qp->flags;
+	struct folio *folio;
+	spinlock_t *ptl;
+	pte_t ptep;
 
-	pmd = pmd_offset(pud, addr);
-	do {
-		next = pmd_addr_end(addr, end);
-		split_huge_page_pmd(vma, addr, pmd);
-		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
-			continue;
-		if (check_pte_range(vma, pmd, addr, next, nodes,
-				    flags, private))
-			return -EIO;
-	} while (pmd++, addr = next, addr != end);
-	return 0;
-}
+	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+	ptep = huge_ptep_get(walk->mm, addr, pte);
+	if (!pte_present(ptep)) {
+		if (!huge_pte_none(ptep)) {
+			const softleaf_t entry = softleaf_from_pte(ptep);
 
-static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-		unsigned long addr, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags,
-		void *private)
-{
-	pud_t *pud;
-	unsigned long next;
+			if (unlikely(softleaf_is_migration(entry)))
+				qp->nr_failed++;
+		}
 
-	pud = pud_offset(pgd, addr);
-	do {
-		next = pud_addr_end(addr, end);
-		if (pud_none_or_clear_bad(pud))
-			continue;
-		if (check_pmd_range(vma, pud, addr, next, nodes,
-				    flags, private))
-			return -EIO;
-	} while (pud++, addr = next, addr != end);
+		goto unlock;
+	}
+	folio = pfn_folio(pte_pfn(ptep));
+	if (!queue_folio_required(folio, qp))
+		goto unlock;
+	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
+	    !vma_migratable(walk->vma)) {
+		qp->nr_failed++;
+		goto unlock;
+	}
+	/*
+	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
+	 * Choosing not to migrate a shared folio is not counted as a failure.
+	 *
+	 * See folio_maybe_mapped_shared() on possible imprecision when we
+	 * cannot easily detect if a folio is shared.
+	 */
+	if ((flags & MPOL_MF_MOVE_ALL) ||
+	    (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
+		if (!folio_isolate_hugetlb(folio, qp->pagelist))
+			qp->nr_failed++;
+unlock:
+	spin_unlock(ptl);
+	if (qp->nr_failed && strictly_unmovable(flags))
+		return -EIO;
+#endif
 	return 0;
 }
 
-static inline int check_pgd_range(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags,
-		void *private)
+#ifdef CONFIG_NUMA_BALANCING
+/**
+ * folio_can_map_prot_numa() - check whether the folio can map prot numa
+ * @folio: The folio whose mapping considered for being made NUMA hintable
+ * @vma: The VMA that the folio belongs to.
+ * @is_private_single_threaded: Is this a single-threaded private VMA or not
+ *
+ * This function checks to see if the folio actually indicates that
+ * we need to make the mapping one which causes a NUMA hinting fault,
+ * as there are cases where it's simply unnecessary, and the folio's
+ * access time is adjusted for memory tiering if prot numa needed.
+ *
+ * Return: True if the mapping of the folio needs to be changed, false otherwise.
+ */
+bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma,
+			     bool is_private_single_threaded)
 {
-	pgd_t *pgd;
-	unsigned long next;
+	int nid;
 
-	pgd = pgd_offset(vma->vm_mm, addr);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (pgd_none_or_clear_bad(pgd))
-			continue;
-		if (check_pud_range(vma, pgd, addr, next, nodes,
-				    flags, private))
-			return -EIO;
-	} while (pgd++, addr = next, addr != end);
-	return 0;
+	if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio))
+		return false;
+
+	/* Also skip shared copy-on-write folios */
+	if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio))
+		return false;
+
+	/* Folios are pinned and can't be migrated */
+	if (folio_maybe_dma_pinned(folio))
+		return false;
+
+	/*
+	 * While migration can move some dirty folios,
+	 * it cannot move them all from MIGRATE_ASYNC
+	 * context.
+	 */
+	if (folio_is_file_lru(folio) && folio_test_dirty(folio))
+		return false;
+
+	/*
+	 * Don't mess with PTEs if folio is already on the node
+	 * a single-threaded process is running on.
+	 */
+	nid = folio_nid(folio);
+	if (is_private_single_threaded && (nid == numa_node_id()))
+		return false;
+
+	/*
+	 * Skip scanning top tier node if normal numa
+	 * balancing is disabled
+	 */
+	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
+	    node_is_toptier(nid))
+		return false;
+
+	if (folio_use_access_time(folio))
+		folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
+
+	return true;
 }
 
-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
 /*
  * This is used to mark a range of virtual addresses to be inaccessible.
  * These are later cleared by a NUMA hinting fault. Depending on these
@@ -586,95 +885,130 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
 unsigned long change_prot_numa(struct vm_area_struct *vma,
 			unsigned long addr, unsigned long end)
 {
-	int nr_updated;
-	BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
+	struct mmu_gather tlb;
+	long nr_updated;
+
+	tlb_gather_mmu(&tlb, vma->vm_mm);
 
-	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
-	if (nr_updated)
+	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
+	if (nr_updated > 0) {
 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
+		count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
+	}
+
+	tlb_finish_mmu(&tlb);
 
 	return nr_updated;
 }
-#else
-static unsigned long change_prot_numa(struct vm_area_struct *vma,
-			unsigned long addr, unsigned long end)
+#endif /* CONFIG_NUMA_BALANCING */
+
+static int queue_pages_test_walk(unsigned long start, unsigned long end,
+				struct mm_walk *walk)
 {
-	return 0;
+	struct vm_area_struct *next, *vma = walk->vma;
+	struct queue_pages *qp = walk->private;
+	unsigned long flags = qp->flags;
+
+	/* range check first */
+	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
+
+	if (!qp->first) {
+		qp->first = vma;
+		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+		    (qp->start < vma->vm_start))
+			/* hole at head side of range */
+			return -EFAULT;
+	}
+	next = find_vma(vma->vm_mm, vma->vm_end);
+	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+	    ((vma->vm_end < qp->end) &&
+	     (!next || vma->vm_end < next->vm_start)))
+		/* hole at middle or tail of range */
+		return -EFAULT;
+
+	/*
+	 * Need check MPOL_MF_STRICT to return -EIO if possible
+	 * regardless of vma_migratable
+	 */
+	if (!vma_migratable(vma) &&
+	    !(flags & MPOL_MF_STRICT))
+		return 1;
+
+	/*
+	 * Check page nodes, and queue pages to move, in the current vma.
+	 * But if no moving, and no strict checking, the scan can be skipped.
+	 */
+	if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		return 0;
+	return 1;
 }
-#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+
+static const struct mm_walk_ops queue_pages_walk_ops = {
+	.hugetlb_entry		= queue_folios_hugetlb,
+	.pmd_entry		= queue_folios_pte_range,
+	.test_walk		= queue_pages_test_walk,
+	.walk_lock		= PGWALK_RDLOCK,
+};
+
+static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
+	.hugetlb_entry		= queue_folios_hugetlb,
+	.pmd_entry		= queue_folios_pte_range,
+	.test_walk		= queue_pages_test_walk,
+	.walk_lock		= PGWALK_WRLOCK,
+};
 
 /*
- * Check if all pages in a range are on a set of nodes.
- * If pagelist != NULL then isolate pages from the LRU and
- * put them on the pagelist.
+ * Walk through page tables and collect pages to be migrated.
+ *
+ * If pages found in a given range are not on the required set of @nodes,
+ * and migration is allowed, they are isolated and queued to @pagelist.
+ *
+ * queue_pages_range() may return:
+ * 0 - all pages already on the right node, or successfully queued for moving
+ *     (or neither strict checking nor moving requested: only range checking).
+ * >0 - this number of misplaced folios could not be queued for moving
+ *      (a hugetlbfs page or a transparent huge page being counted as 1).
+ * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
+ * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
 */
-static struct vm_area_struct *
-check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags, void *private)
+static long
+queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
+		nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
 {
 	int err;
-	struct vm_area_struct *first, *vma, *prev;
-
-
-	first = find_vma(mm, start);
-	if (!first)
-		return ERR_PTR(-EFAULT);
-	prev = NULL;
-	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
-		unsigned long endvma = vma->vm_end;
-
-		if (endvma > end)
-			endvma = end;
-		if (vma->vm_start > start)
-			start = vma->vm_start;
-
-		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
-			if (!vma->vm_next && vma->vm_end < end)
-				return ERR_PTR(-EFAULT);
-			if (prev && prev->vm_end < vma->vm_start)
-				return ERR_PTR(-EFAULT);
-		}
-
-		if (is_vm_hugetlb_page(vma))
-			goto next;
+	struct queue_pages qp = {
+		.pagelist = pagelist,
+		.flags = flags,
+		.nmask = nodes,
+		.start = start,
+		.end = end,
+		.first = NULL,
+	};
+	const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
+			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
 
-		if (flags & MPOL_MF_LAZY) {
-			change_prot_numa(vma, start, endvma);
-			goto next;
-		}
+	err = walk_page_range(mm, start, end, ops, &qp);
 
-		if ((flags & MPOL_MF_STRICT) ||
-		    ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
-		     vma_migratable(vma))) {
+	if (!qp.first)
+		/* whole range in hole */
+		err = -EFAULT;
 
-			err = check_pgd_range(vma, start, endvma, nodes,
-						flags, private);
-			if (err) {
-				first = ERR_PTR(err);
-				break;
-			}
-		}
-next:
-		prev = vma;
-	}
-	return first;
+	return err ? : qp.nr_failed;
 }
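A hedged caller sketch (scan_and_report() is hypothetical, not in this patch) showing how the documented return contract is meant to be consumed; do_mbind() below follows the same pattern:

	static long scan_and_report(struct mm_struct *mm, unsigned long start,
				    unsigned long end, nodemask_t *nodes,
				    unsigned long flags, struct list_head *list)
	{
		long nr_failed = queue_pages_range(mm, start, end, nodes, flags, list);

		if (nr_failed < 0)
			return nr_failed;	/* -EFAULT hole, or strict -EIO */
		if (nr_failed && (flags & MPOL_MF_STRICT))
			return -EIO;		/* misplaced but could not be queued */
		return 0;
	}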
 
 /*
  * Apply policy to a single VMA
- * This must be called with the mmap_sem held for writing.
+ * This must be called with the mmap_lock held for writing.
  */
 static int vma_replace_policy(struct vm_area_struct *vma,
-						struct mempolicy *pol)
+				struct mempolicy *pol)
 {
 	int err;
 	struct mempolicy *old;
 	struct mempolicy *new;
 
-	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
-		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
-		 vma->vm_ops, vma->vm_file,
-		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
+	vma_assert_write_locked(vma);
 
 	new = mpol_dup(pol);
 	if (IS_ERR(new))
@@ -687,7 +1021,7 @@ static int vma_replace_policy(struct vm_area_struct *vma,
 	}
 
 	old = vma->vm_policy;
-	vma->vm_policy = new; /* protected by mmap_sem */
+	vma->vm_policy = new; /* protected by mmap_lock */
 	mpol_put(old);
 
 	return 0;
@@ -696,91 +1030,32 @@ static int vma_replace_policy(struct vm_area_struct *vma,
 	return err;
 }
 
-/* Step 2: apply policy to a range and do splits. */
-static int mbind_range(struct mm_struct *mm, unsigned long start,
-		       unsigned long end, struct mempolicy *new_pol)
+/* Split or merge the VMA (if required) and apply the new policy */
+static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
+		       struct vm_area_struct **prev, unsigned long start,
+		       unsigned long end, struct mempolicy *new_pol)
 {
-	struct vm_area_struct *next;
-	struct vm_area_struct *prev;
-	struct vm_area_struct *vma;
-	int err = 0;
-	pgoff_t pgoff;
-	unsigned long vmstart;
-	unsigned long vmend;
-
-	vma = find_vma(mm, start);
-	if (!vma || vma->vm_start > start)
-		return -EFAULT;
-
-	prev = vma->vm_prev;
-	if (start > vma->vm_start)
-		prev = vma;
+	unsigned long vmstart, vmend;
 
-	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
-		next = vma->vm_next;
-		vmstart = max(start, vma->vm_start);
-		vmend   = min(end, vma->vm_end);
-
-		if (mpol_equal(vma_policy(vma), new_pol))
-			continue;
-
-		pgoff = vma->vm_pgoff +
-			((vmstart - vma->vm_start) >> PAGE_SHIFT);
-		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
-				  vma->anon_vma, vma->vm_file, pgoff,
-				  new_pol);
-		if (prev) {
-			vma = prev;
-			next = vma->vm_next;
-			continue;
-		}
-		if (vma->vm_start != vmstart) {
-			err = split_vma(vma->vm_mm, vma, vmstart, 1);
-			if (err)
-				goto out;
-		}
-		if (vma->vm_end != vmend) {
-			err = split_vma(vma->vm_mm, vma, vmend, 0);
-			if (err)
-				goto out;
-		}
-		err = vma_replace_policy(vma, new_pol);
-		if (err)
-			goto out;
+	vmend = min(end, vma->vm_end);
+	if (start > vma->vm_start) {
+		*prev = vma;
+		vmstart = start;
+	} else {
+		vmstart = vma->vm_start;
 	}
 
- out:
-	return err;
-}
-
-/*
- * Update task->flags PF_MEMPOLICY bit: set iff non-default
- * mempolicy.  Allows more rapid checking of this (combined perhaps
- * with other PF_* flag bits) on memory allocation hot code paths.
- *
- * If called from outside this file, the task 'p' should -only- be
- * a newly forked child not yet visible on the task list, because
- * manipulating the task flags of a visible task is not safe.
- *
- * The above limitation is why this routine has the funny name
- * mpol_fix_fork_child_flag().
- *
- * It is also safe to call this with a task pointer of current,
- * which the static wrapper mpol_set_task_struct_flag() does,
- * for use within this file.
- */
+	if (mpol_equal(vma->vm_policy, new_pol)) {
+		*prev = vma;
+		return 0;
+	}
 
-void mpol_fix_fork_child_flag(struct task_struct *p)
-{
-	if (p->mempolicy)
-		p->flags |= PF_MEMPOLICY;
-	else
-		p->flags &= ~PF_MEMPOLICY;
-}
+	vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
+	if (IS_ERR(vma))
+		return PTR_ERR(vma);
 
-static void mpol_set_task_struct_flag(void)
-{
-	mpol_fix_fork_child_flag(current);
+	*prev = vma;
+	return vma_replace_policy(vma, new_pol);
 }
 
 /* Set the process memory policy */
@@ -788,7 +1063,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
 {
 	struct mempolicy *new, *old;
-	struct mm_struct *mm = current->mm;
 	NODEMASK_SCRATCH(scratch);
 	int ret;
 
@@ -800,33 +1074,23 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 		ret = PTR_ERR(new);
 		goto out;
 	}
-	/*
-	 * prevent changing our mempolicy while show_numa_maps()
-	 * is using it.
-	 * Note:  do_set_mempolicy() can be called at init time
-	 * with no 'mm'.
-	 */
-	if (mm)
-		down_write(&mm->mmap_sem);
+
 	task_lock(current);
 	ret = mpol_set_nodemask(new, nodes, scratch);
 	if (ret) {
 		task_unlock(current);
-		if (mm)
-			up_write(&mm->mmap_sem);
 		mpol_put(new);
 		goto out;
 	}
+
 	old = current->mempolicy;
 	current->mempolicy = new;
-	mpol_set_task_struct_flag();
-	if (new && new->mode == MPOL_INTERLEAVE &&
-	    nodes_weight(new->v.nodes))
-		current->il_next = first_node(new->v.nodes);
+	if (new && (new->mode == MPOL_INTERLEAVE ||
+		    new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
+		current->il_prev = MAX_NUMNODES-1;
+		current->il_weight = 0;
+	}
 	task_unlock(current);
-	if (mm)
-		up_write(&mm->mmap_sem);
-
 	mpol_put(old);
 	ret = 0;
 out:
@@ -839,22 +1103,22 @@ out:
  *
  * Called with task's alloc_lock held
  */
-static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
+static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
 {
 	nodes_clear(*nodes);
-	if (p == &default_policy)
+	if (pol == &default_policy)
 		return;
 
-	switch (p->mode) {
+	switch (pol->mode) {
 	case MPOL_BIND:
-		/* Fall through */
 	case MPOL_INTERLEAVE:
-		*nodes = p->v.nodes;
-		break;
 	case MPOL_PREFERRED:
-		if (!(p->flags & MPOL_F_LOCAL))
-			node_set(p->v.preferred_node, *nodes);
-		/* else return empty node mask for local allocation */
+	case MPOL_PREFERRED_MANY:
+	case MPOL_WEIGHTED_INTERLEAVE:
+		*nodes = pol->nodes;
+		break;
+	case MPOL_LOCAL:
+		/* return empty node mask for local allocation */
 		break;
 	default:
 		BUG();
@@ -863,15 +1127,15 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 
 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 {
-	struct page *p;
-	int err;
+	struct page *p = NULL;
+	int ret;
 
-	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
-	if (err >= 0) {
-		err = page_to_nid(p);
+	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
+	if (ret > 0) {
+		ret = page_to_nid(p);
 		put_page(p);
 	}
-	return err;
+	return ret;
 }
 
 /* Retrieve NUMA policy */
@@ -881,7 +1145,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	int err;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = NULL;
-	struct mempolicy *pol = current->mempolicy;
+	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
 
 	if (flags &
 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
@@ -898,21 +1162,19 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	}
 
 	if (flags & MPOL_F_ADDR) {
+		pgoff_t ilx;		/* ignored here */
 		/*
 		 * Do NOT fall back to task policy if the
 		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
-		down_read(&mm->mmap_sem);
-		vma = find_vma_intersection(mm, addr, addr+1);
+		mmap_read_lock(mm);
+		vma = vma_lookup(mm, addr);
 		if (!vma) {
-			up_read(&mm->mmap_sem);
+			mmap_read_unlock(mm);
 			return -EFAULT;
 		}
-		if (vma->vm_ops && vma->vm_ops->get_policy)
-			pol = vma->vm_ops->get_policy(vma, addr);
-		else
-			pol = vma->vm_policy;
+		pol = __get_vma_policy(vma, addr, &ilx);
 	} else if (addr)
 		return -EINVAL;
 
@@ -921,13 +1183,29 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 
 	if (flags & MPOL_F_NODE) {
 		if (flags & MPOL_F_ADDR) {
+			/*
+			 * Take a refcount on the mpol, because we are about to
+			 * drop the mmap_lock, after which only "pol" remains
+			 * valid, "vma" is stale.
+			 */
+			pol_refcount = pol;
+			vma = NULL;
+			mpol_get(pol);
+			mmap_read_unlock(mm);
 			err = lookup_node(mm, addr);
 			if (err < 0)
 				goto out;
 			*policy = err;
 		} else if (pol == current->mempolicy &&
 				pol->mode == MPOL_INTERLEAVE) {
-			*policy = current->il_next;
+			*policy = next_node_in(current->il_prev, pol->nodes);
+		} else if (pol == current->mempolicy &&
+				pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
+			if (current->il_weight)
+				*policy = current->il_prev;
+			else
+				*policy = next_node_in(current->il_prev,
						       pol->nodes);
 		} else {
 			err = -EINVAL;
 			goto out;
@@ -942,11 +1220,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
 	}
 
-	if (vma) {
-		up_read(&current->mm->mmap_sem);
-		vma = NULL;
-	}
-
 	err = 0;
 	if (nmask) {
 		if (mpol_store_user_nodemask(pol)) {
@@ -961,64 +1234,91 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 out:
 	mpol_cond_put(pol);
 	if (vma)
-		up_read(&current->mm->mmap_sem);
+		mmap_read_unlock(mm);
+	if (pol_refcount)
+		mpol_put(pol_refcount);
 	return err;
 }
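From userspace, the MPOL_F_NODE | MPOL_F_ADDR path above is reachable through get_mempolicy(2); an illustrative sketch (not from this patch; link with -lnuma for the numaif.h wrapper):

	#include <numaif.h>	/* get_mempolicy(), MPOL_F_NODE, MPOL_F_ADDR */
	#include <stdio.h>

	void report_backing_node(void *addr)
	{
		int node = -1;

		/* the page must have been faulted in; lookup_node() pins it briefly */
		if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR))
			perror("get_mempolicy");
		else
			printf("page at %p resides on node %d\n", addr, node);
	}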
 
 #ifdef CONFIG_MIGRATION
-/*
- * page migration
- */
-static void migrate_page_add(struct page *page, struct list_head *pagelist,
+static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
 {
 	/*
-	 * Avoid migrating a page that is shared with others.
+	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
+	 * Choosing not to migrate a shared folio is not counted as a failure.
+	 *
+	 * See folio_maybe_mapped_shared() on possible imprecision when we
+	 * cannot easily detect if a folio is shared.
 	 */
-	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
-		if (!isolate_lru_page(page)) {
-			list_add_tail(&page->lru, pagelist);
-			inc_zone_page_state(page, NR_ISOLATED_ANON +
-					    page_is_file_cache(page));
+	if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
+		if (folio_isolate_lru(folio)) {
+			list_add_tail(&folio->lru, foliolist);
+			node_stat_mod_folio(folio,
+				NR_ISOLATED_ANON + folio_is_file_lru(folio),
+				folio_nr_pages(folio));
+		} else {
+			/*
+			 * Non-movable folio may reach here.  And, there may be
+			 * temporary off LRU folios or non-LRU movable folios.
+			 * Treat them as unmovable folios since they can't be
+			 * isolated, so they can't be moved at the moment.
+			 */
+			return false;
 		}
 	}
-}
-
-static struct page *new_node_page(struct page *page, unsigned long node, int **x)
-{
-	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
+	return true;
 }
 
 /*
  * Migrate pages from one node to a target node.
  * Returns error or the number of pages not migrated.
  */
-static int migrate_to_node(struct mm_struct *mm, int source, int dest,
-			   int flags)
+static long migrate_to_node(struct mm_struct *mm, int source, int dest,
+			    int flags)
 {
 	nodemask_t nmask;
+	struct vm_area_struct *vma;
 	LIST_HEAD(pagelist);
-	int err = 0;
+	long nr_failed;
+	long err = 0;
+	struct migration_target_control mtc = {
+		.nid = dest,
+		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+		.reason = MR_SYSCALL,
+	};
 
 	nodes_clear(nmask);
 	node_set(source, nmask);
 
+	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
+
+	mmap_read_lock(mm);
+	vma = find_vma(mm, 0);
+	if (unlikely(!vma)) {
+		mmap_read_unlock(mm);
+		return 0;
+	}
+
 	/*
-	 * This does not "check" the range but isolates all pages that
+	 * This does not migrate the range, but isolates all pages that
 	 * need migration.  Between passing in the full user address
-	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
+	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
+	 * but passes back the count of pages which could not be isolated.
 	 */
-	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
-	check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
-			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
+				      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+	mmap_read_unlock(mm);
 
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_node_page, dest,
-					MIGRATE_SYNC, MR_SYSCALL);
+		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
+			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
 		if (err)
-			putback_lru_pages(&pagelist);
+			putback_movable_pages(&pagelist);
 	}
 
+	if (err >= 0)
+		err += nr_failed;
 	return err;
 }
 
@@ -1031,19 +1331,11 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
 {
-	int busy = 0;
-	int err;
+	long nr_failed = 0;
+	long err = 0;
 	nodemask_t tmp;
 
-	err = migrate_prep();
-	if (err)
-		return err;
-
-	down_read(&mm->mmap_sem);
-
-	err = migrate_vmas(mm, from, to, flags);
-	if (err)
-		goto out;
+	lru_cache_disable();
 
 	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
@@ -1078,8 +1370,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 
 	tmp = *from;
 	while (!nodes_empty(tmp)) {
-		int s,d;
-		int source = -1;
+		int s, d;
+		int source = NUMA_NO_NODE;
 		int dest = 0;
 
 		for_each_node_mask(s, tmp) {
@@ -1114,53 +1406,63 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 			if (!node_isset(dest, tmp))
 				break;
 		}
-		if (source == -1)
+		if (source == NUMA_NO_NODE)
 			break;
 
 		node_clear(source, tmp);
 		err = migrate_to_node(mm, source, dest, flags);
 		if (err > 0)
-			busy += err;
+			nr_failed += err;
 		if (err < 0)
 			break;
 	}
-out:
-	up_read(&mm->mmap_sem);
+
+	lru_cache_enable();
 	if (err < 0)
 		return err;
-	return busy;
-
+	return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
 }
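do_migrate_pages() backs the migrate_pages(2) syscall; an illustrative userspace sketch (not from this patch; link with -lnuma), moving a task's pages from node 0 to node 1:

	#include <numaif.h>	/* migrate_pages() */
	#include <stdio.h>

	int move_task_pages(int pid)
	{
		unsigned long from = 1UL << 0;	/* source: node 0 */
		unsigned long to   = 1UL << 1;	/* target: node 1 */
		long nr_failed;

		/* maxnode counts the bits in the masks */
		nr_failed = migrate_pages(pid, sizeof(from) * 8, &from, &to);
		if (nr_failed < 0)
			perror("migrate_pages");
		else if (nr_failed > 0)
			fprintf(stderr, "%ld pages could not be moved\n", nr_failed);
		return nr_failed ? -1 : 0;
	}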
 
 /*
- * Allocate a new page for page migration based on vma policy.
- * Start assuming that page is mapped by vma pointed to by @private.
- * Search forward from there, if not.  N.B., this assumes that the
- * list of pages handed to migrate_pages()--which is how we get here--
- * is in virtual address order.
+ * Allocate a new folio for page migration, according to NUMA mempolicy.
 */
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct folio *alloc_migration_target_by_mpol(struct folio *src,
+						    unsigned long private)
 {
-	struct vm_area_struct *vma = (struct vm_area_struct *)private;
-	unsigned long uninitialized_var(address);
+	struct migration_mpol *mmpol = (struct migration_mpol *)private;
+	struct mempolicy *pol = mmpol->pol;
+	pgoff_t ilx = mmpol->ilx;
+	unsigned int order;
+	int nid = numa_node_id();
+	gfp_t gfp;
 
-	while (vma) {
-		address = page_address_in_vma(page, vma);
-		if (address != -EFAULT)
-			break;
-		vma = vma->vm_next;
+	order = folio_order(src);
+	ilx += src->index >> order;
+
+	if (folio_test_hugetlb(src)) {
+		nodemask_t *nodemask;
+		struct hstate *h;
+
+		h = folio_hstate(src);
+		gfp = htlb_alloc_mask(h);
+		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
+		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
				htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
 	}
 
-	/*
-	 * if !vma, alloc_page_vma() will use task or system default policy
-	 */
-	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+	if (folio_test_large(src))
+		gfp = GFP_TRANSHUGE;
+	else
+		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
+
+	return folio_alloc_mpol(gfp, order, pol, ilx, nid);
 }
 #else
 
-static void migrate_page_add(struct page *page, struct list_head *pagelist,
+static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
 {
+	return false;
 }
 
 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
@@ -1169,7 +1471,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 	return -ENOSYS;
 }
 
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct folio *alloc_migration_target_by_mpol(struct folio *src,
+						    unsigned long private)
 {
 	return NULL;
 }
@@ -1179,11 +1482,14 @@ static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
 {
-	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma, *prev;
+	struct vma_iterator vmi;
+	struct migration_mpol mmpol;
 	struct mempolicy *new;
 	unsigned long end;
-	int err;
+	long err;
+	long nr_failed;
 	LIST_HEAD(pagelist);
 
 	if (flags & ~(unsigned long)MPOL_MF_VALID)
@@ -1197,7 +1503,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (mode == MPOL_DEFAULT)
 		flags &= ~MPOL_MF_STRICT;
 
-	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+	len = PAGE_ALIGN(len);
 	end = start + len;
 
 	if (end < start)
@@ -1209,9 +1515,6 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
-	if (flags & MPOL_MF_LAZY)
-		new->flags |= MPOL_F_MOF;
-
 	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
@@ -1219,25 +1522,15 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (!new)
 		flags |= MPOL_MF_DISCONTIG_OK;
 
-	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
-		 start, start + len, mode, mode_flags,
-		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
-
-	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-
-		err = migrate_prep();
-		if (err)
-			goto mpol_out;
-	}
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		lru_cache_disable();
 	{
 		NODEMASK_SCRATCH(scratch);
 		if (scratch) {
-			down_write(&mm->mmap_sem);
-			task_lock(current);
+			mmap_write_lock(mm);
 			err = mpol_set_nodemask(new, nmask, scratch);
-			task_unlock(current);
 			if (err)
-				up_write(&mm->mmap_sem);
+				mmap_write_unlock(mm);
 		} else
 			err = -ENOMEM;
 		NODEMASK_SCRATCH_FREE(scratch);
 	}
@@ -1245,48 +1538,121 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (err)
 		goto mpol_out;
 
-	vma = check_range(mm, start, end, nmask,
-			  flags | MPOL_MF_INVERT, &pagelist);
+	/*
+	 * Lock the VMAs before scanning for pages to migrate,
+	 * to ensure we don't miss a concurrently inserted page.
+	 */
+	nr_failed = queue_pages_range(mm, start, end, nmask,
+			flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);
 
-	err = PTR_ERR(vma);	/* maybe ... */
-	if (!IS_ERR(vma))
-		err = mbind_range(mm, start, end, new);
+	if (nr_failed < 0) {
+		err = nr_failed;
+		nr_failed = 0;
+	} else {
+		vma_iter_init(&vmi, mm, start);
+		prev = vma_prev(&vmi);
+		for_each_vma_range(vmi, vma, end) {
+			err = mbind_range(&vmi, vma, &prev, start, end, new);
+			if (err)
+				break;
+		}
+	}
 
-	if (!err) {
-		int nr_failed = 0;
+	if (!err && !list_empty(&pagelist)) {
+		/* Convert MPOL_DEFAULT's NULL to task or default policy */
+		if (!new) {
+			new = get_task_policy(current);
+			mpol_get(new);
+		}
+		mmpol.pol = new;
+		mmpol.ilx = 0;
 
-		if (!list_empty(&pagelist)) {
-			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
-			nr_failed = migrate_pages(&pagelist, new_vma_page,
-						(unsigned long)vma,
-						MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
-			if (nr_failed)
-				putback_lru_pages(&pagelist);
+		/*
+		 * In the interleaved case, attempt to allocate on exactly the
+		 * targeted nodes, for the first VMA to be migrated; for later
+		 * VMAs, the nodes will still be interleaved from the targeted
+		 * nodemask, but one by one may be selected differently.
+		 */
+		if (new->mode == MPOL_INTERLEAVE ||
+		    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
+			struct folio *folio;
+			unsigned int order;
+			unsigned long addr = -EFAULT;
+
+			list_for_each_entry(folio, &pagelist, lru) {
+				if (!folio_test_ksm(folio))
+					break;
+			}
+			if (!list_entry_is_head(folio, &pagelist, lru)) {
+				vma_iter_init(&vmi, mm, start);
+				for_each_vma_range(vmi, vma, end) {
+					addr = page_address_in_vma(folio,
						folio_page(folio, 0), vma);
+					if (addr != -EFAULT)
+						break;
+				}
+			}
+			if (addr != -EFAULT) {
+				order = folio_order(folio);
+				/* We already know the pol, but not the ilx */
+				mpol_cond_put(get_vma_policy(vma, addr, order,
							     &mmpol.ilx));
+				/* Set base from which to increment by index */
+				mmpol.ilx -= folio->index >> order;
+			}
 		}
+	}
 
-		if (nr_failed && (flags & MPOL_MF_STRICT))
-			err = -EIO;
-	} else
-		putback_lru_pages(&pagelist);
+	mmap_write_unlock(mm);
 
-	up_write(&mm->mmap_sem);
- mpol_out:
+	if (!err && !list_empty(&pagelist)) {
+		nr_failed |= migrate_pages(&pagelist,
				alloc_migration_target_by_mpol, NULL,
				(unsigned long)&mmpol, MIGRATE_SYNC,
				MR_MEMPOLICY_MBIND, NULL);
+	}
 
+	if (nr_failed && (flags & MPOL_MF_STRICT))
+		err = -EIO;
+	if (!list_empty(&pagelist))
+		putback_movable_pages(&pagelist);
+mpol_out:
 	mpol_put(new);
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		lru_cache_enable();
 	return err;
 }
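do_mbind() backs mbind(2); an illustrative userspace sketch (not from this patch; link with -lnuma) that binds an anonymous region to node 0 and asks for already-faulted pages in the range to be migrated:

	#include <numaif.h>	/* mbind(), MPOL_BIND, MPOL_MF_MOVE */
	#include <sys/mman.h>
	#include <stdio.h>

	void *bind_region(size_t len)
	{
		unsigned long nodemask = 1UL << 0;	/* node 0 only */
		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED)
			return NULL;
		/* MPOL_MF_MOVE migrates misplaced pages already present */
		if (mbind(p, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
			  MPOL_MF_MOVE)) {
			perror("mbind");
			munmap(p, len);
			return NULL;
		}
		return p;
	}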
*/ +static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, + unsigned long maxnode) +{ + unsigned long nlongs = BITS_TO_LONGS(maxnode); + int ret; + + if (in_compat_syscall()) + ret = compat_get_bitmap(mask, + (const compat_ulong_t __user *)nmask, + maxnode); + else + ret = copy_from_user(mask, nmask, + nlongs * sizeof(unsigned long)); + + if (ret) + return -EFAULT; + + if (maxnode % BITS_PER_LONG) + mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1; + + return 0; +} /* Copy a node mask from user space. */ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { - unsigned long k; - unsigned long nlongs; - unsigned long endmask; - --maxnode; nodes_clear(*nodes); if (maxnode == 0 || !nmask) @@ -1294,35 +1660,29 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, if (maxnode > PAGE_SIZE*BITS_PER_BYTE) return -EINVAL; - nlongs = BITS_TO_LONGS(maxnode); - if ((maxnode % BITS_PER_LONG) == 0) - endmask = ~0UL; - else - endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; + /* + * When the user specified more nodes than supported just check + * if the non supported part is all zero, one word at a time, + * starting at the end. + */ + while (maxnode > MAX_NUMNODES) { + unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); + unsigned long t; - /* When the user specified more nodes than supported just check - if the non supported part is all zero. */ - if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { - if (nlongs > PAGE_SIZE/sizeof(long)) - return -EINVAL; - for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { - unsigned long t; - if (get_user(t, nmask + k)) - return -EFAULT; - if (k == nlongs - 1) { - if (t & endmask) - return -EINVAL; - } else if (t) - return -EINVAL; + if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits)) + return -EFAULT; + + if (maxnode - bits >= MAX_NUMNODES) { + maxnode -= bits; + } else { + maxnode = MAX_NUMNODES; + t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); } - nlongs = BITS_TO_LONGS(MAX_NUMNODES); - endmask = ~0UL; + if (t) + return -EINVAL; } - if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) - return -EFAULT; - nodes_addr(*nodes)[nlongs-1] &= endmask; - return 0; + return get_bitmap(nodes_addr(*nodes), nmask, maxnode); } /* Copy a kernel node mask to user space */ @@ -1330,7 +1690,11 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, nodemask_t *nodes) { unsigned long copy = ALIGN(maxnode-1, 64) / 8; - const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); + unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); + bool compat = in_compat_syscall(); + + if (compat) + nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t); if (copy > nbytes) { if (copy > PAGE_SIZE) @@ -1338,56 +1702,160 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, if (clear_user((char __user *)mask + nbytes, copy - nbytes)) return -EFAULT; copy = nbytes; + maxnode = nr_node_ids; } + + if (compat) + return compat_put_bitmap((compat_ulong_t __user *)mask, + nodes_addr(*nodes), maxnode); + return copy_to_user(mask, nodes_addr(*nodes), copy) ? 
-EFAULT : 0; } -SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, - unsigned long, mode, unsigned long __user *, nmask, - unsigned long, maxnode, unsigned, flags) +/* Basic parameter sanity check used by both mbind() and set_mempolicy() */ +static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) { - nodemask_t nodes; - int err; - unsigned short mode_flags; + *flags = *mode & MPOL_MODE_FLAGS; + *mode &= ~MPOL_MODE_FLAGS; - mode_flags = mode & MPOL_MODE_FLAGS; - mode &= ~MPOL_MODE_FLAGS; - if (mode >= MPOL_MAX) + if ((unsigned int)(*mode) >= MPOL_MAX) return -EINVAL; - if ((mode_flags & MPOL_F_STATIC_NODES) && - (mode_flags & MPOL_F_RELATIVE_NODES)) + if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; + if (*flags & MPOL_F_NUMA_BALANCING) { + if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY) + *flags |= (MPOL_F_MOF | MPOL_F_MORON); + else + return -EINVAL; + } + return 0; +} + +static long kernel_mbind(unsigned long start, unsigned long len, + unsigned long mode, const unsigned long __user *nmask, + unsigned long maxnode, unsigned int flags) +{ + unsigned short mode_flags; + nodemask_t nodes; + int lmode = mode; + int err; + + start = untagged_addr(start); + err = sanitize_mpol_flags(&lmode, &mode_flags); + if (err) + return err; + err = get_nodes(&nodes, nmask, maxnode); if (err) return err; - return do_mbind(start, len, mode, mode_flags, &nodes, flags); + + return do_mbind(start, len, lmode, mode_flags, &nodes, flags); } -/* Set the process memory policy */ -SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, - unsigned long, maxnode) +SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, + unsigned long, home_node, unsigned long, flags) { - int err; - nodemask_t nodes; - unsigned short flags; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; + struct mempolicy *new, *old; + unsigned long end; + int err = -ENOENT; + VMA_ITERATOR(vmi, mm, start); + + start = untagged_addr(start); + if (start & ~PAGE_MASK) + return -EINVAL; + /* + * flags is used for future extension if any. + */ + if (flags != 0) + return -EINVAL; - flags = mode & MPOL_MODE_FLAGS; - mode &= ~MPOL_MODE_FLAGS; - if ((unsigned int)mode >= MPOL_MAX) + /* + * Check home_node is online to avoid accessing uninitialized + * NODE_DATA. + */ + if (home_node >= MAX_NUMNODES || !node_online(home_node)) return -EINVAL; - if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES)) + + len = PAGE_ALIGN(len); + end = start + len; + + if (end < start) return -EINVAL; + if (end == start) + return 0; + mmap_write_lock(mm); + prev = vma_prev(&vmi); + for_each_vma_range(vmi, vma, end) { + /* + * If any vma in the range got policy other than MPOL_BIND + * or MPOL_PREFERRED_MANY we return error. We don't reset + * the home node for vmas we already updated before. 
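+ * + * Illustrative example: if the first vma in the range uses MPOL_BIND + * and the second MPOL_INTERLEAVE, the first keeps its newly set home + * node and the call stops there with -EOPNOTSUPP. 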
+ */ + old = vma_policy(vma); + if (!old) { + prev = vma; + continue; + } + if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) { + err = -EOPNOTSUPP; + break; + } + new = mpol_dup(old); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } + + vma_start_write(vma); + new->home_node = home_node; + err = mbind_range(&vmi, vma, &prev, start, end, new); + mpol_put(new); + if (err) + break; + } + mmap_write_unlock(mm); + return err; +} + +SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, + unsigned long, mode, const unsigned long __user *, nmask, + unsigned long, maxnode, unsigned int, flags) +{ + return kernel_mbind(start, len, mode, nmask, maxnode, flags); +} + +/* Set the process memory policy */ +static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, + unsigned long maxnode) +{ + unsigned short mode_flags; + nodemask_t nodes; + int lmode = mode; + int err; + + err = sanitize_mpol_flags(&lmode, &mode_flags); + if (err) + return err; + err = get_nodes(&nodes, nmask, maxnode); if (err) return err; - return do_set_mempolicy(mode, flags, &nodes); + + return do_set_mempolicy(lmode, mode_flags, &nodes); } -SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, - const unsigned long __user *, old_nodes, - const unsigned long __user *, new_nodes) +SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, + unsigned long, maxnode) +{ + return kernel_set_mempolicy(mode, nmask, maxnode); +} + +static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, + const unsigned long __user *old_nodes, + const unsigned long __user *new_nodes) { - const struct cred *cred = current_cred(), *tcred; struct mm_struct *mm = NULL; struct task_struct *task; nodemask_t task_nodes; @@ -1423,15 +1891,10 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, err = -EINVAL; /* - * Check if this process has the right to modify the specified - * process. The right exists if the process has administrative - * capabilities, superuser privileges or the same - * userid as the target process. + * Check if this process has the right to modify the specified process. + * Use the regular "ptrace_may_access()" checks. 
*/ - tcred = __task_cred(task); - if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && - !capable(CAP_SYS_NICE)) { + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { rcu_read_unlock(); err = -EPERM; goto out_put; @@ -1445,10 +1908,10 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, goto out_put; } - if (!nodes_subset(*new, node_states[N_MEMORY])) { - err = -EINVAL; + task_nodes = cpuset_mems_allowed(current); + nodes_and(*new, *new, task_nodes); + if (nodes_empty(*new)) goto out_put; - } err = security_task_movememory(task); if (err) @@ -1474,22 +1937,31 @@ out: out_put: put_task_struct(task); goto out; - } +SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, + const unsigned long __user *, old_nodes, + const unsigned long __user *, new_nodes) +{ + return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); +} /* Retrieve NUMA policy */ -SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, - unsigned long __user *, nmask, unsigned long, maxnode, - unsigned long, addr, unsigned long, flags) +static int kernel_get_mempolicy(int __user *policy, + unsigned long __user *nmask, + unsigned long maxnode, + unsigned long addr, + unsigned long flags) { int err; - int uninitialized_var(pval); + int pval; nodemask_t nodes; - if (nmask != NULL && maxnode < MAX_NUMNODES) + if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; + addr = untagged_addr(addr); + err = do_get_mempolicy(&pval, &nodes, addr, flags); if (err) @@ -1504,347 +1976,360 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, return err; } -#ifdef CONFIG_COMPAT - -asmlinkage long compat_sys_get_mempolicy(int __user *policy, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, - compat_ulong_t addr, compat_ulong_t flags) +SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, + unsigned long __user *, nmask, unsigned long, maxnode, + unsigned long, addr, unsigned long, flags) { - long err; - unsigned long __user *nm = NULL; - unsigned long nr_bits, alloc_size; - DECLARE_BITMAP(bm, MAX_NUMNODES); - - nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); - alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - - if (nmask) - nm = compat_alloc_user_space(alloc_size); - - err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); - - if (!err && nmask) { - unsigned long copy_size; - copy_size = min_t(unsigned long, sizeof(bm), alloc_size); - err = copy_from_user(bm, nm, copy_size); - /* ensure entire bitmap is zeroed */ - err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); - err |= compat_put_bitmap(nmask, bm, nr_bits); - } - - return err; + return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); } -asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode) +bool vma_migratable(struct vm_area_struct *vma) { - long err = 0; - unsigned long __user *nm = NULL; - unsigned long nr_bits, alloc_size; - DECLARE_BITMAP(bm, MAX_NUMNODES); - - nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); - alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + return false; - if (nmask) { - err = compat_get_bitmap(bm, nmask, nr_bits); - nm = compat_alloc_user_space(alloc_size); - err |= copy_to_user(nm, bm, alloc_size); - } + /* + * DAX device mappings require predictable access latency, so avoid + * incurring periodic faults. 
+ */ + if (vma_is_dax(vma)) + return false; - if (err) - return -EFAULT; + if (is_vm_hugetlb_page(vma) && + !hugepage_migration_supported(hstate_vma(vma))) + return false; - return sys_set_mempolicy(mode, nm, nr_bits+1); + /* + * Migration allocates pages in the highest zone. If we cannot + * do so then migration (at least from node to node) is not + * possible. + */ + if (vma->vm_file && + gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) + < policy_zone) + return false; + return true; } -asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, - compat_ulong_t mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode, compat_ulong_t flags) +struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, + unsigned long addr, pgoff_t *ilx) { - long err = 0; - unsigned long __user *nm = NULL; - unsigned long nr_bits, alloc_size; - nodemask_t bm; - - nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); - alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - - if (nmask) { - err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); - nm = compat_alloc_user_space(alloc_size); - err |= copy_to_user(nm, nodes_addr(bm), alloc_size); - } - - if (err) - return -EFAULT; - - return sys_mbind(start, len, mode, nm, nr_bits+1, flags); + *ilx = 0; + return (vma->vm_ops && vma->vm_ops->get_policy) ? + vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; } -#endif - /* - * get_vma_policy(@task, @vma, @addr) - * @task - task for fallback if vma policy == default - * @vma - virtual memory area whose policy is sought - * @addr - address in @vma for shared policy lookup + * get_vma_policy(@vma, @addr, @order, @ilx) + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup + * @order: 0, or appropriate huge_page_order for interleaving + * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or + * MPOL_WEIGHTED_INTERLEAVE * * Returns effective policy for a VMA at specified address. - * Falls back to @task or system default policy, as necessary. - * Current or other task's task mempolicy and non-shared vma policies must be - * protected by task_lock(task) by the caller. + * Falls back to current->mempolicy or system default policy, as necessary. * Shared policies [those marked as MPOL_F_SHARED] require an extra reference * count--added by the get_policy() vm_op, as appropriate--to protect against * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ -struct mempolicy *get_vma_policy(struct task_struct *task, - struct vm_area_struct *vma, unsigned long addr) +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr, int order, pgoff_t *ilx) { - struct mempolicy *pol = get_task_policy(task); + struct mempolicy *pol; - if (vma) { - if (vma->vm_ops && vma->vm_ops->get_policy) { - struct mempolicy *vpol = vma->vm_ops->get_policy(vma, - addr); - if (vpol) - pol = vpol; - } else if (vma->vm_policy) { - pol = vma->vm_policy; + pol = __get_vma_policy(vma, addr, ilx); + if (!pol) + pol = get_task_policy(current); + if (pol->mode == MPOL_INTERLEAVE || + pol->mode == MPOL_WEIGHTED_INTERLEAVE) { + *ilx += vma->vm_pgoff >> order; + *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); + } + return pol; +} - /* - * shmem_alloc_page() passes MPOL_F_SHARED policy with - * a pseudo vma whose vma->vm_ops=NULL. 
Take a reference - * count on these policies which will be dropped by - * mpol_cond_put() later - */ - if (mpol_needs_cond_ref(pol)) - mpol_get(pol); - } +bool vma_policy_mof(struct vm_area_struct *vma) +{ + struct mempolicy *pol; + + if (vma->vm_ops && vma->vm_ops->get_policy) { + bool ret = false; + pgoff_t ilx; /* ignored here */ + + pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); + if (pol && (pol->flags & MPOL_F_MOF)) + ret = true; + mpol_cond_put(pol); + + return ret; } + + pol = vma->vm_policy; if (!pol) - pol = &default_policy; - return pol; + pol = get_task_policy(current); + + return pol->flags & MPOL_F_MOF; } -static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) +bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); /* - * if policy->v.nodes has movable memory only, + * if policy->nodes has movable memory only, * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. * - * policy->v.nodes is intersect with node_states[N_MEMORY]. - * so if the following test faile, it implies - * policy->v.nodes has movable memory only. + * policy->nodes intersects with node_states[N_MEMORY], + * so if the following test fails, it implies + * policy->nodes has movable memory only. */ - if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY])) + if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY])) dynamic_policy_zone = ZONE_MOVABLE; return zone >= dynamic_policy_zone; } -/* - * Return a nodemask representing a mempolicy for filtering nodes for - * page allocation - */ -static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) +static unsigned int weighted_interleave_nodes(struct mempolicy *policy) { - /* Lower zones don't get a nodemask applied for MPOL_BIND */ - if (unlikely(policy->mode == MPOL_BIND) && - apply_policy_zone(policy, gfp_zone(gfp)) && - cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) - return &policy->v.nodes; - - return NULL; -} + unsigned int node; + unsigned int cpuset_mems_cookie; -/* Return a zonelist indicated by gfp for node representing a mempolicy */ -static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, - int nd) -{ - switch (policy->mode) { - case MPOL_PREFERRED: - if (!(policy->flags & MPOL_F_LOCAL)) - nd = policy->v.preferred_node; - break; - case MPOL_BIND: - /* - * Normally, MPOL_BIND allocations are node-local within the - * allowed nodemask. However, if __GFP_THISNODE is set and the - * current node isn't part of the mask, we use the zonelist for - * the first node in the mask instead. 
- */ - if (unlikely(gfp & __GFP_THISNODE) && - unlikely(!node_isset(nd, policy->v.nodes))) - nd = first_node(policy->v.nodes); - break; - default: - BUG(); +retry: + /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */ + cpuset_mems_cookie = read_mems_allowed_begin(); + node = current->il_prev; + if (!current->il_weight || !node_isset(node, policy->nodes)) { + node = next_node_in(node, policy->nodes); + if (read_mems_allowed_retry(cpuset_mems_cookie)) + goto retry; + if (node == MAX_NUMNODES) + return node; + current->il_prev = node; + current->il_weight = get_il_weight(node); } - return node_zonelist(nd, gfp); + current->il_weight--; + return node; } /* Do dynamic interleaving for a process */ -static unsigned interleave_nodes(struct mempolicy *policy) +static unsigned int interleave_nodes(struct mempolicy *policy) { - unsigned nid, next; - struct task_struct *me = current; + unsigned int nid; + unsigned int cpuset_mems_cookie; + + /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */ + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + nid = next_node_in(current->il_prev, policy->nodes); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); - nid = me->il_next; - next = next_node(nid, policy->v.nodes); - if (next >= MAX_NUMNODES) - next = first_node(policy->v.nodes); - if (next < MAX_NUMNODES) - me->il_next = next; + if (nid < MAX_NUMNODES) + current->il_prev = nid; return nid; } /* * Depending on the memory policy provide a node from which to allocate the * next slab entry. - * @policy must be protected by freeing by the caller. If @policy is - * the current task's mempolicy, this protection is implicit, as only the - * task can change it's policy. The system default policy requires no - * such protection. */ -unsigned slab_node(void) +unsigned int mempolicy_slab_node(void) { struct mempolicy *policy; + int node = numa_mem_id(); - if (in_interrupt()) - return numa_node_id(); + if (!in_task()) + return node; policy = current->mempolicy; - if (!policy || policy->flags & MPOL_F_LOCAL) - return numa_node_id(); + if (!policy) + return node; switch (policy->mode) { case MPOL_PREFERRED: - /* - * handled MPOL_F_LOCAL above - */ - return policy->v.preferred_node; + return first_node(policy->nodes); case MPOL_INTERLEAVE: return interleave_nodes(policy); - case MPOL_BIND: { + case MPOL_WEIGHTED_INTERLEAVE: + return weighted_interleave_nodes(policy); + + case MPOL_BIND: + case MPOL_PREFERRED_MANY: + { + struct zoneref *z; + /* * Follow bind policy behavior and start allocation at the * first node. */ struct zonelist *zonelist; - struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); - zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; - (void)first_zones_zonelist(zonelist, highest_zoneidx, - &policy->v.nodes, - &zone); - return zone ? zone->node : numa_node_id(); + zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; + z = first_zones_zonelist(zonelist, highest_zoneidx, + &policy->nodes); + return zonelist_zone(z) ? zonelist_node_idx(z) : node; } + case MPOL_LOCAL: + return node; default: BUG(); } } -/* Do static interleaving for a VMA with known offset. 
*/ -static unsigned offset_il_node(struct mempolicy *pol, - struct vm_area_struct *vma, unsigned long off) +static unsigned int read_once_policy_nodemask(struct mempolicy *pol, + nodemask_t *mask) { - unsigned nnodes = nodes_weight(pol->v.nodes); - unsigned target; - int c; - int nid = -1; + /* + * barrier stabilizes the nodemask locally so that it can be iterated + * over safely without concern for changes. Allocators validate node + * selection does not violate mems_allowed, so this is safe. + */ + barrier(); + memcpy(mask, &pol->nodes, sizeof(nodemask_t)); + barrier(); + return nodes_weight(*mask); +} - if (!nnodes) +static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) +{ + struct weighted_interleave_state *state; + nodemask_t nodemask; + unsigned int target, nr_nodes; + u8 *table = NULL; + unsigned int weight_total = 0; + u8 weight; + int nid = 0; + + nr_nodes = read_once_policy_nodemask(pol, &nodemask); + if (!nr_nodes) return numa_node_id(); - target = (unsigned int)off % nnodes; - c = 0; - do { - nid = next_node(nid, pol->v.nodes); - c++; - } while (c <= target); + + rcu_read_lock(); + + state = rcu_dereference(wi_state); + /* Uninitialized wi_state means we should assume all weights are 1 */ + if (state) + table = state->iw_table; + + /* calculate the total weight */ + for_each_node_mask(nid, nodemask) + weight_total += table ? table[nid] : 1; + + /* Calculate the node offset based on totals */ + target = ilx % weight_total; + nid = first_node(nodemask); + while (target) { + /* detect system default usage */ + weight = table ? table[nid] : 1; + if (target < weight) + break; + target -= weight; + nid = next_node_in(nid, nodemask); + } + rcu_read_unlock(); return nid; } -/* Determine a node number for interleave */ -static inline unsigned interleave_nid(struct mempolicy *pol, - struct vm_area_struct *vma, unsigned long addr, int shift) +/* + * Do static interleaving for interleave index @ilx. Returns the ilx'th + * node in pol->nodes (starting from ilx=0), wrapping around if ilx + * exceeds the number of present nodes. + */ +static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { - if (vma) { - unsigned long off; + nodemask_t nodemask; + unsigned int target, nnodes; + int i; + int nid; - /* - * for small pages, there is no difference between - * shift and PAGE_SHIFT, so the bit-shift is safe. - * for huge pages, since vm_pgoff is in units of small - * pages, we need to shift off the always 0 bits to get - * a useful offset. - */ - BUG_ON(shift < PAGE_SHIFT); - off = vma->vm_pgoff >> (shift - PAGE_SHIFT); - off += (addr - vma->vm_start) >> shift; - return offset_il_node(pol, vma, off); - } else - return interleave_nodes(pol); + nnodes = read_once_policy_nodemask(pol, &nodemask); + if (!nnodes) + return numa_node_id(); + target = ilx % nnodes; + nid = first_node(nodemask); + for (i = 0; i < target; i++) + nid = next_node(nid, nodemask); + return nid; } /* - * Return the bit number of a random bit set in the nodemask. - * (returns -1 if nodemask is empty) + * Return a nodemask representing a mempolicy for filtering nodes for + * page allocation, together with preferred node id (or the input node id). 
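+ * + * Illustrative examples: for MPOL_PREFERRED_MANY over nodes 1-2 with a + * home node of 2, *nid becomes 2 and the policy nodemask is returned; + * for MPOL_INTERLEAVE, NULL is returned and *nid is set to the node + * chosen by the interleave index. 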
*/ -int node_random(const nodemask_t *maskp) +static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, + pgoff_t ilx, int *nid) { - int w, bit = -1; + nodemask_t *nodemask = NULL; + + switch (pol->mode) { + case MPOL_PREFERRED: + /* Override input node id */ + *nid = first_node(pol->nodes); + break; + case MPOL_PREFERRED_MANY: + nodemask = &pol->nodes; + if (pol->home_node != NUMA_NO_NODE) + *nid = pol->home_node; + break; + case MPOL_BIND: + /* Restrict to nodemask (but not on lower zones) */ + if (apply_policy_zone(pol, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&pol->nodes)) + nodemask = &pol->nodes; + if (pol->home_node != NUMA_NO_NODE) + *nid = pol->home_node; + /* + * __GFP_THISNODE shouldn't even be used with the bind policy + * because we might easily break the expectation to stay on the + * requested node and not break the policy. + */ + WARN_ON_ONCE(gfp & __GFP_THISNODE); + break; + case MPOL_INTERLEAVE: + /* Override input node id */ + *nid = (ilx == NO_INTERLEAVE_INDEX) ? + interleave_nodes(pol) : interleave_nid(pol, ilx); + break; + case MPOL_WEIGHTED_INTERLEAVE: + *nid = (ilx == NO_INTERLEAVE_INDEX) ? + weighted_interleave_nodes(pol) : + weighted_interleave_nid(pol, ilx); + break; + } - w = nodes_weight(*maskp); - if (w) - bit = bitmap_ord_to_pos(maskp->bits, - get_random_int() % w, MAX_NUMNODES); - return bit; + return nodemask; } #ifdef CONFIG_HUGETLBFS /* - * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) - * @vma = virtual memory area whose policy is sought - * @addr = address in @vma for shared policy lookup and interleave policy - * @gfp_flags = for requested zone - * @mpol = pointer to mempolicy pointer for reference counted mempolicy - * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask + * huge_node(@vma, @addr, @gfp_flags, @mpol) + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup and interleave policy + * @gfp_flags: for requested zone + * @mpol: pointer to mempolicy pointer for reference counted mempolicy + * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy * - * Returns a zonelist suitable for a huge page allocation and a pointer + * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. - * If the effective policy is 'BIND, returns a pointer to the mempolicy's - * @nodemask for filtering the zonelist. - * - * Must be protected by get_mems_allowed() + * If the effective policy is 'bind' or 'prefer-many', returns a pointer + * to the mempolicy's @nodemask for filtering the zonelist. 
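+ * + * Typical caller pattern (sketch only): + * + * nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); + * folio = alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp_mask, ...); + * mpol_cond_put(mpol); 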
*/ -struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, - gfp_t gfp_flags, struct mempolicy **mpol, - nodemask_t **nodemask) +int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, + struct mempolicy **mpol, nodemask_t **nodemask) { - struct zonelist *zl; - - *mpol = get_vma_policy(current, vma, addr); - *nodemask = NULL; /* assume !MPOL_BIND */ + pgoff_t ilx; + int nid; - if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { - zl = node_zonelist(interleave_nid(*mpol, vma, addr, - huge_page_shift(hstate_vma(vma))), gfp_flags); - } else { - zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); - if ((*mpol)->mode == MPOL_BIND) - *nodemask = &(*mpol)->v.nodes; - } - return zl; + nid = numa_node_id(); + *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); + *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); + return nid; } /* @@ -1866,7 +2351,6 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, bool init_nodemask_of_mempolicy(nodemask_t *mask) { struct mempolicy *mempolicy; - int nid; if (!(mask && current->mempolicy)) return false; @@ -1875,17 +2359,15 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: - if (mempolicy->flags & MPOL_F_LOCAL) - nid = numa_node_id(); - else - nid = mempolicy->v.preferred_node; - init_nodemask_of_node(mask, nid); - break; - + case MPOL_PREFERRED_MANY: case MPOL_BIND: - /* Fall through */ case MPOL_INTERLEAVE: - *mask = mempolicy->v.nodes; + case MPOL_WEIGHTED_INTERLEAVE: + *mask = mempolicy->nodes; + break; + + case MPOL_LOCAL: + init_nodemask_of_node(mask, numa_node_id()); break; default: @@ -1898,16 +2380,16 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) #endif /* - * mempolicy_nodemask_intersects + * mempolicy_in_oom_domain * - * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default - * policy. Otherwise, check for intersection between mask and the policy - * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local' - * policy, always return true since it may allocate elsewhere on fallback. + * If tsk's mempolicy is "bind", check for intersection between mask and + * the policy nodemask. Otherwise, return true for all other policies + * including "interleave", as a tsk with "interleave" policy may have + * memory allocated from all nodes in system. * * Takes task_lock(tsk) to prevent freeing of its mempolicy. */ -bool mempolicy_nodemask_intersects(struct task_struct *tsk, +bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask) { struct mempolicy *mempolicy; @@ -1915,151 +2397,413 @@ bool mempolicy_nodemask_intersects(struct task_struct *tsk, if (!mask) return ret; + task_lock(tsk); mempolicy = tsk->mempolicy; - if (!mempolicy) - goto out; - - switch (mempolicy->mode) { - case MPOL_PREFERRED: - /* - * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to - * allocate from, they may fallback to other nodes when oom. - * Thus, it's possible for tsk to have allocated memory from - * nodes in mask. - */ - break; - case MPOL_BIND: - case MPOL_INTERLEAVE: - ret = nodes_intersects(mempolicy->v.nodes, *mask); - break; - default: - BUG(); - } -out: + if (mempolicy && mempolicy->mode == MPOL_BIND) + ret = nodes_intersects(mempolicy->nodes, *mask); task_unlock(tsk); + return ret; } -/* Allocate a page in interleaved policy. - Own path because it needs to do special accounting. 
*/ -static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, - unsigned nid) +static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, + int nid, nodemask_t *nodemask) { - struct zonelist *zl; struct page *page; + gfp_t preferred_gfp; + + /* + * This is a two pass approach. The first pass will only try the + * preferred nodes but skip the direct reclaim and allow the + * allocation to fail, while the second pass will try all the + * nodes in system. + */ + preferred_gfp = gfp | __GFP_NOWARN; + preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask); + if (!page) + page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL); - zl = node_zonelist(nid, gfp); - page = __alloc_pages(gfp, order, zl); - if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) - inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); return page; } /** - * alloc_pages_vma - Allocate a page for a VMA. - * - * @gfp: - * %GFP_USER user allocation. - * %GFP_KERNEL kernel allocations, - * %GFP_HIGHMEM highmem/user allocations, - * %GFP_FS allocation should not call back into a file system. - * %GFP_ATOMIC don't sleep. - * - * @order:Order of the GFP allocation. - * @vma: Pointer to VMA or NULL if not available. - * @addr: Virtual Address of the allocation. Must be inside the VMA. + * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. + * @gfp: GFP flags. + * @order: Order of the page allocation. + * @pol: Pointer to the NUMA mempolicy. + * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). + * @nid: Preferred node (usually numa_node_id() but @mpol may override it). * - * This function allocates a page from the kernel page pool and applies - * a NUMA policy associated with the VMA or the current process. - * When VMA is not NULL caller must hold down_read on the mmap_sem of the - * mm_struct of the VMA to prevent it from going away. Should be used for - * all allocations for pages that will be mapped into - * user space. Returns NULL when no page can be allocated. - * - * Should be called with the mm_sem of the vma hold. + * Return: The page on success or NULL if allocation fails. */ -struct page * -alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, int node) +static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, + struct mempolicy *pol, pgoff_t ilx, int nid) { - struct mempolicy *pol; + nodemask_t *nodemask; struct page *page; - unsigned int cpuset_mems_cookie; -retry_cpuset: - pol = get_vma_policy(current, vma, addr); - cpuset_mems_cookie = get_mems_allowed(); + nodemask = policy_nodemask(gfp, pol, ilx, &nid); - if (unlikely(pol->mode == MPOL_INTERLEAVE)) { - unsigned nid; + if (pol->mode == MPOL_PREFERRED_MANY) + return alloc_pages_preferred_many(gfp, order, nid, nodemask); + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + /* filter "hugepage" allocation, unless from alloc_pages() */ + order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { + /* + * For hugepage allocation and non-interleave policy which + * allows the current node (or other explicitly preferred + * node) we only try to allocate from the current/preferred + * node and don't fall back to other nodes, as the cost of + * remote accesses would likely offset THP benefits. + * + * If the policy is interleave or does not allow the current + * node in its nodemask, we allocate the standard way. 
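+ * + * Example (illustrative): a PMD-order THP fault under MPOL_PREFERRED + * first tries only the preferred node with __GFP_THISNODE | + * __GFP_NORETRY; if that fails and the request may enter direct + * reclaim, it falls through to the ordinary allocation below. 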
+ */ + if (pol->mode != MPOL_INTERLEAVE && + pol->mode != MPOL_WEIGHTED_INTERLEAVE && + (!nodemask || node_isset(nid, *nodemask))) { + /* + * First, try to allocate THP only on local node, but + * don't reclaim unnecessarily, just compact. + */ + page = __alloc_frozen_pages_noprof( + gfp | __GFP_THISNODE | __GFP_NORETRY, order, + nid, NULL); + if (page || !(gfp & __GFP_DIRECT_RECLAIM)) + return page; + /* + * If hugepage allocations are configured to always + * synchronous compact or the vma has been madvised + * to prefer hugepage backing, retry allowing remote + * memory with both reclaim and compact as well. + */ + } + } + + page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask); + + if (unlikely(pol->mode == MPOL_INTERLEAVE || + pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) { + /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ + if (static_branch_likely(&vm_numa_stat_key) && + page_to_nid(page) == nid) { + preempt_disable(); + __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); + preempt_enable(); + } + } - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); - mpol_cond_put(pol); - page = alloc_page_interleave(gfp, order, nid); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) - goto retry_cpuset; - - return page; - } - page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol, node), - policy_nodemask(gfp, pol)); - if (unlikely(mpol_needs_cond_ref(pol))) - __mpol_put(pol); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) - goto retry_cpuset; return page; } +struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *pol, pgoff_t ilx, int nid) +{ + struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, + ilx, nid); + if (!page) + return NULL; + + set_page_refcounted(page); + return page_rmappable_folio(page); +} + /** - * alloc_pages_current - Allocate pages. + * vma_alloc_folio - Allocate a folio for a VMA. + * @gfp: GFP flags. + * @order: Order of the folio. + * @vma: Pointer to VMA. + * @addr: Virtual address of the allocation. Must be inside @vma. * - * @gfp: - * %GFP_USER user allocation, - * %GFP_KERNEL kernel allocation, - * %GFP_HIGHMEM highmem allocation, - * %GFP_FS don't call back into a file system. - * %GFP_ATOMIC don't sleep. - * @order: Power of two of allocation size in pages. 0 is a single page. + * Allocate a folio for a specific address in @vma, using the appropriate + * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the + * VMA to prevent it from going away. Should be used for all allocations + * for folios that will be mapped into user space, excepting hugetlbfs, and + * excepting where direct use of folio_alloc_mpol() is more appropriate. * - * Allocate a page from the kernel page pool. When not in - * interrupt context and apply the current process NUMA policy. - * Returns NULL when no page can be allocated. - * - * Don't call cpuset_update_task_memory_state() unless - * 1) it's ok to take cpuset_sem (can WAIT), and - * 2) allocating for current task (not interrupt). + * Return: The folio on success or NULL if allocation fails. 
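+ * + * Typical use (sketch, from a fault handler where vmf is the fault + * descriptor): + * folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address); 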
*/ -struct page *alloc_pages_current(gfp_t gfp, unsigned order) +struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr) { - struct mempolicy *pol = get_task_policy(current); - struct page *page; - unsigned int cpuset_mems_cookie; + struct mempolicy *pol; + pgoff_t ilx; + struct folio *folio; - if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) - pol = &default_policy; + if (vma->vm_flags & VM_DROPPABLE) + gfp |= __GFP_NOWARN; -retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + pol = get_vma_policy(vma, addr, order, &ilx); + folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id()); + mpol_cond_put(pol); + return folio; +} +EXPORT_SYMBOL(vma_alloc_folio_noprof); + +struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order) +{ + struct mempolicy *pol = &default_policy; /* * No reference counting needed for current->mempolicy * nor system default_policy */ - if (pol->mode == MPOL_INTERLEAVE) - page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); - else - page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol, numa_node_id()), - policy_nodemask(gfp, pol)); + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) + pol = get_task_policy(current); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) - goto retry_cpuset; + return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, + numa_node_id()); +} +/** + * alloc_pages - Allocate pages. + * @gfp: GFP flags. + * @order: Power of two of number of pages to allocate. + * + * Allocate 1 << @order contiguous pages. The physical address of the + * first page is naturally aligned (eg an order-3 allocation will be aligned + * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current + * process is honoured when in process context. + * + * Context: Can be called from any context, providing the appropriate GFP + * flags are used. + * Return: The page on success or NULL if allocation fails. 
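+ * + * Example (illustrative): alloc_pages(GFP_KERNEL, 2) returns four + * physically contiguous pages aligned to a multiple of 4 * PAGE_SIZE, + * or NULL if the allocation fails. 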
+ */ +struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) +{ + struct page *page = alloc_frozen_pages_noprof(gfp, order); + + if (page) + set_page_refcounted(page); return page; } -EXPORT_SYMBOL(alloc_pages_current); +EXPORT_SYMBOL(alloc_pages_noprof); + +struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) +{ + return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order)); +} +EXPORT_SYMBOL(folio_alloc_noprof); + +static unsigned long alloc_pages_bulk_interleave(gfp_t gfp, + struct mempolicy *pol, unsigned long nr_pages, + struct page **page_array) +{ + int nodes; + unsigned long nr_pages_per_node; + int delta; + int i; + unsigned long nr_allocated; + unsigned long total_allocated = 0; + + nodes = nodes_weight(pol->nodes); + nr_pages_per_node = nr_pages / nodes; + delta = nr_pages - nodes * nr_pages_per_node; + + for (i = 0; i < nodes; i++) { + if (delta) { + nr_allocated = alloc_pages_bulk_noprof(gfp, + interleave_nodes(pol), NULL, + nr_pages_per_node + 1, + page_array); + delta--; + } else { + nr_allocated = alloc_pages_bulk_noprof(gfp, + interleave_nodes(pol), NULL, + nr_pages_per_node, page_array); + } + + page_array += nr_allocated; + total_allocated += nr_allocated; + } + + return total_allocated; +} + +static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, + struct mempolicy *pol, unsigned long nr_pages, + struct page **page_array) +{ + struct weighted_interleave_state *state; + struct task_struct *me = current; + unsigned int cpuset_mems_cookie; + unsigned long total_allocated = 0; + unsigned long nr_allocated = 0; + unsigned long rounds; + unsigned long node_pages, delta; + u8 *weights, weight; + unsigned int weight_total = 0; + unsigned long rem_pages = nr_pages; + nodemask_t nodes; + int nnodes, node; + int resume_node = MAX_NUMNODES - 1; + u8 resume_weight = 0; + int prev_node; + int i; + + if (!nr_pages) + return 0; + + /* read the nodes onto the stack, retry if done during rebind */ + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + nnodes = read_once_policy_nodemask(pol, &nodes); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); + + /* if the nodemask has become invalid, we cannot do anything */ + if (!nnodes) + return 0; + + /* Continue allocating from most recent node and adjust the nr_pages */ + node = me->il_prev; + weight = me->il_weight; + if (weight && node_isset(node, nodes)) { + node_pages = min(rem_pages, weight); + nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, + page_array); + page_array += nr_allocated; + total_allocated += nr_allocated; + /* if that's all the pages, no need to interleave */ + if (rem_pages <= weight) { + me->il_weight -= rem_pages; + return total_allocated; + } + /* Otherwise we adjust remaining pages, continue from there */ + rem_pages -= weight; + } + /* clear active weight in case of an allocation failure */ + me->il_weight = 0; + prev_node = node; + + /* create a local copy of node weights to operate on outside rcu */ + weights = kzalloc(nr_node_ids, GFP_KERNEL); + if (!weights) + return total_allocated; + + rcu_read_lock(); + state = rcu_dereference(wi_state); + if (state) { + memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8)); + rcu_read_unlock(); + } else { + rcu_read_unlock(); + for (i = 0; i < nr_node_ids; i++) + weights[i] = 1; + } + + /* calculate total, detect system default usage */ + for_each_node_mask(node, nodes) + weight_total += weights[node]; + + /* + * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. 
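+ * + * Worked example (illustrative): with nodes 0-1, weights {3, 1}, + * rem_pages = 10 and the scan starting after node 1: weight_total = 4, + * rounds = 2, delta = 2, so node 0 gets 3 * 2 + 2 = 8 pages, node 1 + * gets 1 * 2 = 2 pages, and the next allocation resumes at node 0 with + * a remaining weight of 1. + * 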
+ * Track which node weighted interleave should resume from. + * + * If (rounds > 0) and (delta == 0), resume_node will always be + * the node following prev_node, and resume_weight its full weight. + */ + rounds = rem_pages / weight_total; + delta = rem_pages % weight_total; + resume_node = next_node_in(prev_node, nodes); + resume_weight = weights[resume_node]; + for (i = 0; i < nnodes; i++) { + node = next_node_in(prev_node, nodes); + weight = weights[node]; + node_pages = weight * rounds; + /* If a delta exists, add this node's portion of the delta */ + if (delta > weight) { + node_pages += weight; + delta -= weight; + } else if (delta) { + /* when delta is depleted, resume from that node */ + node_pages += delta; + resume_node = node; + resume_weight = weight - delta; + delta = 0; + } + /* node_pages can be 0 if an allocation fails and rounds == 0 */ + if (!node_pages) + break; + nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, + page_array); + page_array += nr_allocated; + total_allocated += nr_allocated; + if (total_allocated == nr_pages) + break; + prev_node = node; + } + me->il_prev = resume_node; + me->il_weight = resume_weight; + kfree(weights); + return total_allocated; +} + +static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid, + struct mempolicy *pol, unsigned long nr_pages, + struct page **page_array) +{ + gfp_t preferred_gfp; + unsigned long nr_allocated = 0; + + preferred_gfp = gfp | __GFP_NOWARN; + preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + + nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, + nr_pages, page_array); + + if (nr_allocated < nr_pages) + nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, + nr_pages - nr_allocated, + page_array + nr_allocated); + return nr_allocated; +} + +/* + * Bulk page allocation should take the current task's mempolicy into + * account in some situations, such as vmalloc. + * + * Doing so can speed up allocation considerably, especially when the + * policy interleaves memory across nodes. 
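+ * + * Caller sketch (illustrative; assumes the usual alloc_hooks() wrapper + * named alloc_pages_bulk_mempolicy): + * nr = alloc_pages_bulk_mempolicy(GFP_KERNEL, nr_pages, page_array); + * fills page_array with up to nr_pages pages placed per the policy. 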
+ */ +unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, + unsigned long nr_pages, struct page **page_array) +{ + struct mempolicy *pol = &default_policy; + nodemask_t *nodemask; + int nid; + + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) + pol = get_task_policy(current); + + if (pol->mode == MPOL_INTERLEAVE) + return alloc_pages_bulk_interleave(gfp, pol, + nr_pages, page_array); + + if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) + return alloc_pages_bulk_weighted_interleave( + gfp, pol, nr_pages, page_array); + + if (pol->mode == MPOL_PREFERRED_MANY) + return alloc_pages_bulk_preferred_many(gfp, + numa_node_id(), pol, nr_pages, page_array); + + nid = numa_node_id(); + nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); + return alloc_pages_bulk_noprof(gfp, nid, nodemask, + nr_pages, page_array); +} + +int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) +{ + struct mempolicy *pol = mpol_dup(src->vm_policy); + + if (IS_ERR(pol)) + return PTR_ERR(pol); + dst->vm_policy = pol; + return 0; +} /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it @@ -2088,15 +2832,10 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) } else *new = *old; - rcu_read_lock(); if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); - if (new->flags & MPOL_F_REBINDING) - mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); - else - mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); + mpol_rebind_policy(new, &mems); } - rcu_read_unlock(); atomic_set(&new->refcnt, 1); return new; } @@ -2110,17 +2849,21 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) return false; if (a->flags != b->flags) return false; + if (a->home_node != b->home_node) + return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) return false; switch (a->mode) { case MPOL_BIND: - /* Fall through */ case MPOL_INTERLEAVE: - return !!nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: - return a->v.preferred_node == b->v.preferred_node; + case MPOL_PREFERRED_MANY: + case MPOL_WEIGHTED_INTERLEAVE: + return !!nodes_equal(a->nodes, b->nodes); + case MPOL_LOCAL: + return true; default: BUG(); return false; @@ -2132,14 +2875,16 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) * * Remember policies even when nobody has shared memory mapped. * The policies are kept in Red-Black tree linked from the inode. - * They are protected by the sp->lock spinlock, which should be held + * They are protected by the sp->lock rwlock, which should be held * for any accesses to the tree. */ -/* lookup first element intersecting start-end */ -/* Caller holds sp->lock */ -static struct sp_node * -sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) +/* + * lookup first element intersecting start-end. Caller holds sp->lock for + * reading or for writing + */ +static struct sp_node *sp_lookup(struct shared_policy *sp, + pgoff_t start, pgoff_t end) { struct rb_node *n = sp->root.rb_node; @@ -2168,8 +2913,10 @@ sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) return rb_entry(n, struct sp_node, nd); } -/* Insert a new shared policy into the list. */ -/* Caller holds sp->lock */ +/* + * Insert a new shared policy into the list. Caller holds sp->lock for + * writing. 
+ */ static void sp_insert(struct shared_policy *sp, struct sp_node *new) { struct rb_node **p = &sp->root.rb_node; @@ -2188,28 +2935,27 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new) } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); - pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, - new->policy ? new->policy->mode : 0); } /* Find shared policy intersecting idx */ -struct mempolicy * -mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, + pgoff_t idx) { struct mempolicy *pol = NULL; struct sp_node *sn; if (!sp->root.rb_node) return NULL; - spin_lock(&sp->lock); + read_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } - spin_unlock(&sp->lock); + read_unlock(&sp->lock); return pol; } +EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_lookup, "kvm"); static void sp_free(struct sp_node *n) { @@ -2218,103 +2964,105 @@ static void sp_free(struct sp_node *n) } /** - * mpol_misplaced - check whether current page node is valid in policy - * - * @page - page to be checked - * @vma - vm area where page mapped - * @addr - virtual address where page mapped + * mpol_misplaced - check whether current folio node is valid in policy * - * Lookup current policy node id for vma,addr and "compare to" page's - * node id. + * @folio: folio to be checked + * @vmf: structure describing the fault + * @addr: virtual address in @vma for shared policy lookup and interleave policy * - * Returns: - * -1 - not misplaced, page is in the right node - * node - node id where the page should be - * - * Policy determination "mimics" alloc_page_vma(). + * Lookup current policy node id for vma,addr and "compare to" folio's + * node id. Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. + * + * Return: NUMA_NO_NODE if the page is in a node that is valid for this + * policy, or a suitable node ID to allocate a replacement folio from. 
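+ * + * Example (illustrative): a folio resident on node 3 in a vma bound to + * nodes 0-1 yields a node in 0-1; a folio already on an allowed node + * yields NUMA_NO_NODE, unless MPOL_F_MORON moves it towards the + * referencing CPU's node. 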
*/ -int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) +int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, + unsigned long addr) { struct mempolicy *pol; - struct zone *zone; - int curnid = page_to_nid(page); - unsigned long pgoff; - int polnid = -1; - int ret = -1; + pgoff_t ilx; + struct zoneref *z; + int curnid = folio_nid(folio); + struct vm_area_struct *vma = vmf->vma; + int thiscpu = raw_smp_processor_id(); + int thisnid = numa_node_id(); + int polnid = NUMA_NO_NODE; + int ret = NUMA_NO_NODE; - BUG_ON(!vma); - - pol = get_vma_policy(current, vma, addr); + /* + * Make sure ptl is held so that we don't preempt and we + * have a stable smp processor id + */ + lockdep_assert_held(vmf->ptl); + pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: - BUG_ON(addr >= vma->vm_end); - BUG_ON(addr < vma->vm_start); + polnid = interleave_nid(pol, ilx); + break; - pgoff = vma->vm_pgoff; - pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; - polnid = offset_il_node(pol, vma, pgoff); + case MPOL_WEIGHTED_INTERLEAVE: + polnid = weighted_interleave_nid(pol, ilx); break; case MPOL_PREFERRED: - if (pol->flags & MPOL_F_LOCAL) - polnid = numa_node_id(); - else - polnid = pol->v.preferred_node; + if (node_isset(curnid, pol->nodes)) + goto out; + polnid = first_node(pol->nodes); + break; + + case MPOL_LOCAL: + polnid = numa_node_id(); break; case MPOL_BIND: + case MPOL_PREFERRED_MANY: + /* + * Even though MPOL_PREFERRED_MANY can allocate pages outside + * policy nodemask we don't allow numa migration to nodes + * outside policy nodemask for now. This is done so that if we + * want demotion to slow memory to happen, before allocating + * from some DRAM node say 'x', we will end up using a + * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario + * we should not promote to node 'x' from slow memory node. + */ + if (pol->flags & MPOL_F_MORON) { + /* + * Optimize placement among multiple nodes + * via NUMA balancing + */ + if (node_isset(thisnid, pol->nodes)) + break; + goto out; + } + /* - * allows binding to multiple nodes. * use current page if in policy nodemask, * else select nearest allowed node, if any. * If no allowed nodes, use current [!misplaced]. */ - if (node_isset(curnid, pol->v.nodes)) + if (node_isset(curnid, pol->nodes)) goto out; - (void)first_zones_zonelist( - node_zonelist(numa_node_id(), GFP_HIGHUSER), + z = first_zones_zonelist( + node_zonelist(thisnid, GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), - &pol->v.nodes, &zone); - polnid = zone->node; + &pol->nodes); + polnid = zonelist_node_idx(z); break; default: BUG(); } - /* Migrate the page towards the node whose CPU is referencing it */ + /* Migrate the folio towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { - int last_nid; - - polnid = numa_node_id(); + polnid = thisnid; - /* - * Multi-stage node selection is used in conjunction - * with a periodic migration fault to build a temporal - * task<->page relation. By using a two-stage filter we - * remove short/unlikely relations. - * - * Using P(p) ~ n_p / n_t as per frequentist - * probability, we can equate a task's usage of a - * particular page (n_p) per total usage of this - * page (n_t) (in a given time-span) to a probability. 
- * - * Our periodic faults will sample this probability and - * getting the same result twice in a row, given these - * samples are fully independent, is then given by - * P(n)^2, provided our sample period is sufficiently - * short compared to the usage pattern. - * - * This quadric squishes small probabilities, making - * it less likely we act on an unlikely task<->page - * relation. - */ - last_nid = page_nid_xchg_last(page, polnid); - if (last_nid != polnid) + if (!should_numa_migrate_memory(current, folio, curnid, + thiscpu)) goto out; } @@ -2326,9 +3074,25 @@ out: return ret; } +/* + * Drop the (possibly final) reference to task->mempolicy. It needs to be + * dropped after task->mempolicy is set to NULL so that any allocation done as + * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed + * policy. + */ +void mpol_put_task_policy(struct task_struct *task) +{ + struct mempolicy *pol; + + task_lock(task); + pol = task->mempolicy; + task->mempolicy = NULL; + task_unlock(task); + mpol_put(pol); +} + static void sp_delete(struct shared_policy *sp, struct sp_node *n) { - pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); sp_free(n); } @@ -2363,8 +3127,8 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, } /* Replace a policy range. */ -static int shared_policy_replace(struct shared_policy *sp, unsigned long start, - unsigned long end, struct sp_node *new) +static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, + pgoff_t end, struct sp_node *new) { struct sp_node *n; struct sp_node *n_new = NULL; @@ -2372,7 +3136,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, int ret = 0; restart: - spin_lock(&sp->lock); + write_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. 
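 * Illustrative example: replacing [2, 4) inside an existing node spanning * [0, 8) truncates that node to [0, 2), inserts the new node, and re-adds * [4, 8) using the preallocated n_new/mpol_new pair. 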
*/ while (n && n->start < end) { @@ -2405,7 +3169,7 @@ restart: } if (new) sp_insert(sp, new); - spin_unlock(&sp->lock); + write_unlock(&sp->lock); ret = 0; err_out: @@ -2417,7 +3181,7 @@ err_out: return ret; alloc_new: - spin_unlock(&sp->lock); + write_unlock(&sp->lock); ret = -ENOMEM; n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n_new) @@ -2425,6 +3189,7 @@ alloc_new: mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!mpol_new) goto err_out; + atomic_set(&mpol_new->refcnt, 1); goto restart; } @@ -2443,84 +3208,81 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ - spin_lock_init(&sp->lock); + rwlock_init(&sp->lock); if (mpol) { - struct vm_area_struct pvma; - struct mempolicy *new; + struct sp_node *sn; + struct mempolicy *npol; NODEMASK_SCRATCH(scratch); if (!scratch) goto put_mpol; - /* contextualize the tmpfs mount point mempolicy */ - new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); - if (IS_ERR(new)) + + /* contextualize the tmpfs mount point mempolicy to this file */ + npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); + if (IS_ERR(npol)) goto free_scratch; /* no valid nodemask intersection */ task_lock(current); - ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); + ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); task_unlock(current); if (ret) - goto put_new; - - /* Create pseudo-vma that contains just the policy */ - memset(&pvma, 0, sizeof(struct vm_area_struct)); - pvma.vm_end = TASK_SIZE; /* policy covers entire file */ - mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ - -put_new: - mpol_put(new); /* drop initial ref */ + goto put_npol; + + /* alloc node covering entire file; adds ref to file's npol */ + sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); + if (sn) + sp_insert(sp, sn); +put_npol: + mpol_put(npol); /* drop initial ref on file's npol */ free_scratch: NODEMASK_SCRATCH_FREE(scratch); put_mpol: mpol_put(mpol); /* drop our incoming ref on sb mpol */ } } +EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_init, "kvm"); -int mpol_set_shared_policy(struct shared_policy *info, - struct vm_area_struct *vma, struct mempolicy *npol) +int mpol_set_shared_policy(struct shared_policy *sp, + struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); - pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", - vma->vm_pgoff, - sz, npol ? npol->mode : -1, - npol ? npol->flags : -1, - npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE); - - if (npol) { - new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); + if (pol) { + new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); if (!new) return -ENOMEM; } - err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); + err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); if (err && new) sp_free(new); return err; } +EXPORT_SYMBOL_FOR_MODULES(mpol_set_shared_policy, "kvm"); /* Free a backing policy store on inode delete. 
*/ -void mpol_free_shared_policy(struct shared_policy *p) +void mpol_free_shared_policy(struct shared_policy *sp) { struct sp_node *n; struct rb_node *next; - if (!p->root.rb_node) + if (!sp->root.rb_node) return; - spin_lock(&p->lock); - next = rb_first(&p->root); + write_lock(&sp->lock); + next = rb_first(&sp->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); - sp_delete(p, n); + sp_delete(sp, n); } - spin_unlock(&p->lock); + write_unlock(&sp->lock); } +EXPORT_SYMBOL_FOR_MODULES(mpol_free_shared_policy, "kvm"); #ifdef CONFIG_NUMA_BALANCING -static bool __initdata numabalancing_override; +static int __initdata numabalancing_override; static void __init check_numabalancing_enable(void) { @@ -2529,9 +3291,13 @@ static void __init check_numabalancing_enable(void) if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) numabalancing_default = true; - if (nr_node_ids > 1 && !numabalancing_override) { - printk(KERN_INFO "Enabling automatic NUMA balancing. " - "Configure with numa_balancing= or sysctl"); + /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ + if (numabalancing_override) + set_numabalancing_state(numabalancing_override == 1); + + if (num_online_nodes() > 1 && !numabalancing_override) { + pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", + numabalancing_default ? "Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } } @@ -2541,18 +3307,17 @@ static int __init setup_numabalancing(char *str) int ret = 0; if (!str) goto out; - numabalancing_override = true; if (!strcmp(str, "enable")) { - set_numabalancing_state(true); + numabalancing_override = 1; ret = 1; } else if (!strcmp(str, "disable")) { - set_numabalancing_state(false); + numabalancing_override = -1; ret = 1; } out: if (!ret) - printk(KERN_WARNING "Unable to parse numa_balancing=\n"); + pr_warn("Unable to parse numa_balancing=\n"); return ret; } @@ -2563,7 +3328,6 @@ static inline void __init check_numabalancing_enable(void) } #endif /* CONFIG_NUMA_BALANCING */ -/* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { nodemask_t interleave_nodes; @@ -2583,7 +3347,7 @@ void __init numa_policy_init(void) .refcnt = ATOMIC_INIT(1), .mode = MPOL_PREFERRED, .flags = MPOL_F_MOF | MPOL_F_MORON, - .v = { .preferred_node = nid, }, + .nodes = nodemask_of_node(nid), }; } @@ -2612,7 +3376,7 @@ void __init numa_policy_init(void) node_set(prefer, interleave_nodes); if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) - printk("numa_policy_init: interleaving failed\n"); + pr_err("%s: interleaving failed\n", __func__); check_numabalancing_enable(); } @@ -2626,20 +3390,17 @@ void numa_default_policy(void) /* * Parse and format mempolicy from/to strings */ - -/* - * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag. - */ static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", + [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave", [MPOL_LOCAL] = "local", + [MPOL_PREFERRED_MANY] = "prefer (many)", }; - #ifdef CONFIG_TMPFS /** * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. 
@@ -2649,17 +3410,19 @@ static const char * const policy_modes[] = * Format of input: * <mode>[=<flags>][:<nodelist>] * - * On success, returns 0, else 1 + * Return: %0 on success, else %1 */ int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; - unsigned short mode; unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); - int err = 1; + int err = 1, mode; + + if (flags) + *flags++ = '\0'; /* terminate mode string */ if (nodelist) { /* NUL-terminate mode or flags string */ @@ -2671,21 +3434,16 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) } else nodes_clear(nodes); - if (flags) - *flags++ = '\0'; /* terminate mode string */ - - for (mode = 0; mode < MPOL_MAX; mode++) { - if (!strcmp(str, policy_modes[mode])) { - break; - } - } - if (mode >= MPOL_MAX) + mode = match_string(policy_modes, MPOL_MAX, str); + if (mode < 0) goto out; switch (mode) { case MPOL_PREFERRED: /* - * Insist on a nodelist of one node only + * Insist on a nodelist of one node only, although later + * we use first_node(nodes) to grab a single node, so here + * nodelist (or nodes) cannot be empty. */ if (nodelist) { char *rest = nodelist; @@ -2693,9 +3451,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) rest++; if (*rest) goto out; + if (nodes_empty(nodes)) + goto out; } break; case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: /* * Default to online nodes with memory if no nodelist */ @@ -2708,7 +3469,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) */ if (nodelist) goto out; - mode = MPOL_PREFERRED; break; case MPOL_DEFAULT: /* @@ -2717,6 +3477,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) if (!nodelist) err = 0; goto out; + case MPOL_PREFERRED_MANY: case MPOL_BIND: /* * Insist on a nodelist @@ -2747,12 +3508,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) * Save nodes for mpol_to_str() to show the tmpfs mount options * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. */ - if (mode != MPOL_PREFERRED) - new->v.nodes = nodes; - else if (nodelist) - new->v.preferred_node = first_node(nodes); - else - new->flags |= MPOL_F_LOCAL; + if (mode != MPOL_PREFERRED) { + new->nodes = nodes; + } else if (nodelist) { + nodes_clear(new->nodes); + node_set(first_node(nodes), new->nodes); + } else { + new->mode = MPOL_LOCAL; + } /* * Save nodes for contextualization: this will be used to "clone" @@ -2780,77 +3543,402 @@ out: * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted * - * Convert a mempolicy into a string. - * Returns the number of characters in buffer (if positive) - * or an error (negative) + * Convert @pol into a string. If @buffer is too short, truncate the string. + * Recommend a @maxlen of at least 51 for the longest mode, "weighted + * interleave", plus the longest flag flags, "relative|balancing", and to + * display at least a few node ids. */ -int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) +void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; - int l; - nodemask_t nodes; - unsigned short mode; - unsigned short flags = pol ? 
pol->flags : 0; - - /* - * Sanity check: room for longest mode, flag and some nodes - */ - VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16); - - if (!pol || pol == &default_policy) - mode = MPOL_DEFAULT; - else + nodemask_t nodes = NODE_MASK_NONE; + unsigned short mode = MPOL_DEFAULT; + unsigned short flags = 0; + + if (pol && + pol != &default_policy && + !(pol >= &preferred_node_policy[0] && + pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) { mode = pol->mode; + flags = pol->flags; + } switch (mode) { case MPOL_DEFAULT: - nodes_clear(nodes); + case MPOL_LOCAL: break; - case MPOL_PREFERRED: - nodes_clear(nodes); - if (flags & MPOL_F_LOCAL) - mode = MPOL_LOCAL; - else - node_set(pol->v.preferred_node, nodes); - break; - + case MPOL_PREFERRED_MANY: case MPOL_BIND: - /* Fall through */ case MPOL_INTERLEAVE: - nodes = pol->v.nodes; + case MPOL_WEIGHTED_INTERLEAVE: + nodes = pol->nodes; break; - default: - return -EINVAL; + WARN_ON_ONCE(1); + snprintf(p, maxlen, "unknown"); + return; } - l = strlen(policy_modes[mode]); - if (buffer + maxlen < p + l + 1) - return -ENOSPC; - - strcpy(p, policy_modes[mode]); - p += l; + p += snprintf(p, maxlen, "%s", policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { - if (buffer + maxlen < p + 2) - return -ENOSPC; - *p++ = '='; + p += snprintf(p, buffer + maxlen - p, "="); /* - * Currently, the only defined flags are mutually exclusive + * Static and relative are mutually exclusive. */ if (flags & MPOL_F_STATIC_NODES) p += snprintf(p, buffer + maxlen - p, "static"); else if (flags & MPOL_F_RELATIVE_NODES) p += snprintf(p, buffer + maxlen - p, "relative"); + + if (flags & MPOL_F_NUMA_BALANCING) { + if (!is_power_of_2(flags & MPOL_MODE_FLAGS)) + p += snprintf(p, buffer + maxlen - p, "|"); + p += snprintf(p, buffer + maxlen - p, "balancing"); + } + } + + if (!nodes_empty(nodes)) + p += scnprintf(p, buffer + maxlen - p, ":%*pbl", + nodemask_pr_args(&nodes)); +} + +#ifdef CONFIG_SYSFS +struct iw_node_attr { + struct kobj_attribute kobj_attr; + int nid; +}; + +struct sysfs_wi_group { + struct kobject wi_kobj; + struct mutex kobj_lock; + struct iw_node_attr *nattrs[]; +}; + +static struct sysfs_wi_group *wi_group; + +static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct iw_node_attr *node_attr; + u8 weight; + + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); + weight = get_il_weight(node_attr->nid); + return sysfs_emit(buf, "%d\n", weight); +} + +static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; + struct iw_node_attr *node_attr; + u8 weight = 0; + int i; + + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); + if (count == 0 || sysfs_streq(buf, "") || + kstrtou8(buf, 0, &weight) || weight == 0) + return -EINVAL; + + new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), + GFP_KERNEL); + if (!new_wi_state) + return -ENOMEM; + + mutex_lock(&wi_state_lock); + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + if (old_wi_state) { + memcpy(new_wi_state->iw_table, old_wi_state->iw_table, + nr_node_ids * sizeof(u8)); + } else { + for (i = 0; i < nr_node_ids; i++) + new_wi_state->iw_table[i] = 1; + } + new_wi_state->iw_table[node_attr->nid] = weight; + new_wi_state->mode_auto = false; + + rcu_assign_pointer(wi_state, new_wi_state); + mutex_unlock(&wi_state_lock); + if 
(old_wi_state) { + synchronize_rcu(); + kfree(old_wi_state); + } + return count; +} + +static ssize_t weighted_interleave_auto_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct weighted_interleave_state *state; + bool wi_auto = true; + + rcu_read_lock(); + state = rcu_dereference(wi_state); + if (state) + wi_auto = state->mode_auto; + rcu_read_unlock(); + + return sysfs_emit(buf, "%s\n", str_true_false(wi_auto)); +} + +static ssize_t weighted_interleave_auto_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL; + unsigned int *bw; + bool input; + int i; + + if (kstrtobool(buf, &input)) + return -EINVAL; + + new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids), + GFP_KERNEL); + if (!new_wi_state) + return -ENOMEM; + for (i = 0; i < nr_node_ids; i++) + new_wi_state->iw_table[i] = 1; + + mutex_lock(&wi_state_lock); + if (!input) { + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + if (!old_wi_state) + goto update_wi_state; + if (input == old_wi_state->mode_auto) { + mutex_unlock(&wi_state_lock); + return count; + } + + memcpy(new_wi_state->iw_table, old_wi_state->iw_table, + nr_node_ids * sizeof(u8)); + goto update_wi_state; + } + + bw = node_bw_table; + if (!bw) { + mutex_unlock(&wi_state_lock); + kfree(new_wi_state); + return -ENODEV; + } + + new_wi_state->mode_auto = true; + reduce_interleave_weights(bw, new_wi_state->iw_table); + +update_wi_state: + rcu_assign_pointer(wi_state, new_wi_state); + mutex_unlock(&wi_state_lock); + if (old_wi_state) { + synchronize_rcu(); + kfree(old_wi_state); + } + return count; +} + +static void sysfs_wi_node_delete(int nid) +{ + struct iw_node_attr *attr; + + if (nid < 0 || nid >= nr_node_ids) + return; + + mutex_lock(&wi_group->kobj_lock); + attr = wi_group->nattrs[nid]; + if (!attr) { + mutex_unlock(&wi_group->kobj_lock); + return; + } + + wi_group->nattrs[nid] = NULL; + mutex_unlock(&wi_group->kobj_lock); + + sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr); + kfree(attr->kobj_attr.attr.name); + kfree(attr); +} + +static void sysfs_wi_node_delete_all(void) +{ + int nid; + + for (nid = 0; nid < nr_node_ids; nid++) + sysfs_wi_node_delete(nid); +} + +static void wi_state_free(void) +{ + struct weighted_interleave_state *old_wi_state; + + mutex_lock(&wi_state_lock); + old_wi_state = rcu_dereference_protected(wi_state, + lockdep_is_held(&wi_state_lock)); + rcu_assign_pointer(wi_state, NULL); + mutex_unlock(&wi_state_lock); + + if (old_wi_state) { + synchronize_rcu(); + kfree(old_wi_state); + } +} + +static struct kobj_attribute wi_auto_attr = + __ATTR(auto, 0664, weighted_interleave_auto_show, + weighted_interleave_auto_store); + +static void wi_cleanup(void) { + sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr); + sysfs_wi_node_delete_all(); + wi_state_free(); +} + +static void wi_kobj_release(struct kobject *wi_kobj) +{ + kfree(wi_group); +} + +static const struct kobj_type wi_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = wi_kobj_release, +}; + +static int sysfs_wi_node_add(int nid) +{ + int ret; + char *name; + struct iw_node_attr *new_attr; + + if (nid < 0 || nid >= nr_node_ids) { + pr_err("invalid node id: %d\n", nid); + return -EINVAL; + } + + new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL); + if (!new_attr) + return -ENOMEM; + + name = kasprintf(GFP_KERNEL, "node%d", nid); + if (!name) { + kfree(new_attr); + return -ENOMEM; 
+ } + + sysfs_attr_init(&new_attr->kobj_attr.attr); + new_attr->kobj_attr.attr.name = name; + new_attr->kobj_attr.attr.mode = 0644; + new_attr->kobj_attr.show = node_show; + new_attr->kobj_attr.store = node_store; + new_attr->nid = nid; + + mutex_lock(&wi_group->kobj_lock); + if (wi_group->nattrs[nid]) { + mutex_unlock(&wi_group->kobj_lock); + ret = -EEXIST; + goto out; + } + + ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr); + if (ret) { + mutex_unlock(&wi_group->kobj_lock); + goto out; + } + wi_group->nattrs[nid] = new_attr; + mutex_unlock(&wi_group->kobj_lock); + return 0; + +out: + kfree(new_attr->kobj_attr.attr.name); + kfree(new_attr); + return ret; +} + +static int wi_node_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + int err; + struct node_notify *nn = data; + int nid = nn->nid; + + switch (action) { + case NODE_ADDED_FIRST_MEMORY: + err = sysfs_wi_node_add(nid); + if (err) + pr_err("failed to add sysfs for node%d during hotplug: %d\n", + nid, err); + break; + case NODE_REMOVED_LAST_MEMORY: + sysfs_wi_node_delete(nid); + break; } - if (!nodes_empty(nodes)) { - if (buffer + maxlen < p + 2) - return -ENOSPC; - *p++ = ':'; - p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); + return NOTIFY_OK; +} + +static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj) +{ + int nid, err; + + wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids), + GFP_KERNEL); + if (!wi_group) + return -ENOMEM; + mutex_init(&wi_group->kobj_lock); + + err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj, + "weighted_interleave"); + if (err) + goto err_put_kobj; + + err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr); + if (err) + goto err_put_kobj; + + for_each_online_node(nid) { + if (!node_state(nid, N_MEMORY)) + continue; + + err = sysfs_wi_node_add(nid); + if (err) { + pr_err("failed to add sysfs for node%d during init: %d\n", + nid, err); + goto err_cleanup_kobj; + } } - return p - buffer; + + hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI); + return 0; + +err_cleanup_kobj: + wi_cleanup(); + kobject_del(&wi_group->wi_kobj); +err_put_kobj: + kobject_put(&wi_group->wi_kobj); + return err; } + +static int __init mempolicy_sysfs_init(void) +{ + int err; + static struct kobject *mempolicy_kobj; + + mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj); + if (!mempolicy_kobj) + return -ENOMEM; + + err = add_weighted_interleave_group(mempolicy_kobj); + if (err) + goto err_kobj; + + return 0; + +err_kobj: + kobject_del(mempolicy_kobj); + kobject_put(mempolicy_kobj); + return err; +} + +late_initcall(mempolicy_sysfs_init); +#endif /* CONFIG_SYSFS */
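
A note on the mount-option grammar: mpol_parse_str() above keeps the historical tmpfs format, <mode>[=<flags>][:<nodelist>], while switching the mode lookup to match_string(). The following user-space sketch mirrors only the strchr()-based splitting step (split_mpol() is a made-up name, not a kernel helper):

    /* Sketch: split "prefer=static:2" into mode / flags / nodelist the
     * way mpol_parse_str() does before validating each part. */
    #include <stdio.h>
    #include <string.h>

    static void split_mpol(char *str)
    {
            char *nodelist = strchr(str, ':');      /* start of node list */
            char *flags = strchr(str, '=');         /* start of mode flags */

            if (flags)
                    *flags++ = '\0';        /* terminate mode string */
            if (nodelist)
                    *nodelist++ = '\0';     /* terminate mode or flags string */

            printf("mode='%s' flags='%s' nodes='%s'\n", str,
                   flags ? flags : "", nodelist ? nodelist : "");
    }

    int main(void)
    {
            char a[] = "interleave:0-3";
            char b[] = "prefer=static:2";
            char c[] = "weighted interleave:0,2";

            split_mpol(a);
            split_mpol(b);
            split_mpol(c);
            return 0;
    }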
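On buffer sizing for mpol_to_str(): the recommended @maxlen of 51 is just the sum of the longest mode ("weighted interleave", 19 bytes), the longest flags ("=relative|balancing", 19 bytes), the ':' separator and a NUL, leaving about 11 characters for a short node list. A stand-alone check of that arithmetic (illustrative only):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            size_t fixed = strlen("weighted interleave")    /* 19 */
                         + strlen("=relative|balancing")    /* 19 */
                         + strlen(":") + 1;                 /* separator + NUL */

            /* 40 bytes are fixed; 51 leaves 11 for node ids like "0-3,8" */
            printf("fixed=%zu spare=%zu\n", fixed, (size_t)51 - fixed);
            return 0;
    }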
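Both node_store() and weighted_interleave_auto_store() update wi_state with the same RCU copy-update shape: build a complete replacement iw_table, publish it with rcu_assign_pointer() under wi_state_lock, and kfree() the old state only after synchronize_rcu(), so lockless readers such as get_il_weight() can never dereference freed memory. A condensed kernel-style sketch of that update side (update_weight() is hypothetical and not buildable on its own):

    /* Sketch of the wi_state publish pattern used by the stores above. */
    static int update_weight(int nid, u8 weight)
    {
            struct weighted_interleave_state *new, *old;
            int i;

            new = kzalloc(struct_size(new, iw_table, nr_node_ids), GFP_KERNEL);
            if (!new)
                    return -ENOMEM;

            mutex_lock(&wi_state_lock);
            old = rcu_dereference_protected(wi_state,
                                            lockdep_is_held(&wi_state_lock));
            if (old)                /* start from the current table... */
                    memcpy(new->iw_table, old->iw_table,
                           nr_node_ids * sizeof(u8));
            else                    /* ...or from the implicit all-ones table */
                    for (i = 0; i < nr_node_ids; i++)
                            new->iw_table[i] = 1;
            new->iw_table[nid] = weight;

            rcu_assign_pointer(wi_state, new);      /* publish atomically */
            mutex_unlock(&wi_state_lock);

            if (old) {
                    synchronize_rcu();      /* wait out all current readers */
                    kfree(old);
            }
            return 0;
    }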
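When auto mode is switched on, reduce_interleave_weights() converts the cached per-node bandwidths into small interleave weights: each bandwidth is scaled into the range [1, weightiness] (weightiness is 32) and the results are divided by their collective GCD so the interleave cycle stays as short as possible. A self-contained illustration of that scale-then-reduce idea (the bandwidth numbers are made up, and the kernel version avoids 64-bit division and rounds differently):

    #include <stdio.h>

    static unsigned int gcd(unsigned int a, unsigned int b)
    {
            while (b) {
                    unsigned int t = a % b;
                    a = b;
                    b = t;
            }
            return a;
    }

    int main(void)
    {
            unsigned int bw[] = { 200, 100, 100 };  /* hypothetical GB/s */
            unsigned int n = sizeof(bw) / sizeof(bw[0]);
            unsigned int weightiness = 32, w[3], g = 0, i;
            unsigned long long sum = 0;

            for (i = 0; i < n; i++)
                    sum += bw[i];

            /* scale each bandwidth into [1, weightiness] */
            for (i = 0; i < n; i++) {
                    w[i] = (bw[i] * (unsigned long long)weightiness +
                            sum / 2) / sum;
                    if (!w[i])
                            w[i] = 1;
                    g = gcd(g, w[i]);
            }

            /* divide by the collective GCD: prints weights 2, 1, 1 */
            for (i = 0; i < n; i++)
                    printf("node%u: weight %u\n", i, w[i] / g);
            return 0;
    }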
