Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--  mm/mempolicy.c  3482
1 file changed, 2285 insertions(+), 1197 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 74310017296e..68a98ba57882 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1,14 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Simple NUMA memory policy for the Linux kernel.
*
* Copyright 2003,2004 Andi Kleen, SuSE Labs.
* (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
- * Subject to the GNU Public License, version 2.
*
* NUMA policy allows the user to give hints in which node(s) memory should
* be allocated.
*
- * Support four policies per VMA and per process:
+ * Support six policies per VMA and per process:
*
* The VMA policy has priority over the process policy for a page fault.
*
@@ -19,18 +19,28 @@
 * for anonymous memory. For process policy a process counter
* is used.
*
+ * weighted interleave
+ * Allocate memory interleaved over a set of nodes based on
+ * a set of weights (per-node), with normal fallback if it
+ * fails. Otherwise operates the same as interleave.
+ * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
+ * on node 0 for every 1 page allocated on node 1.
+ *
* bind Only allocate memory on a specific set of nodes,
* no fallback.
* FIXME: memory is allocated starting with the first node
* to the last. It would be better if bind would truly restrict
* the allocation to memory nodes instead
*
- * preferred Try a specific node first before normal fallback.
+ * preferred Try a specific node first before normal fallback.
* As a special case NUMA_NO_NODE here means do the allocation
* on the local CPU. This is normally identical to default,
* but useful to set in a VMA when you have a non default
* process policy.
*
+ * preferred many Try a set of nodes first before normal fallback. This is
+ * similar to preferred without the special case.
+ *
* default Allocate on the local node first, or when on a VMA
* use the process policy. This is what Linux always did
* in a NUMA aware kernel and still does by, ahem, default.
@@ -49,7 +59,7 @@
 * on systems with highmem, kernel lowmem allocations don't get policied.
* Same with GFP_DMA allocations.
*
- * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
+ * For shmem/tmpfs shared memory the policy is shared between
* all users and remembered even when nobody has memory mapped.
*/
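As an illustration of the weighted interleave policy described above, the following is a minimal userspace sketch (not part of this patch). It assumes uapi headers new enough to define MPOL_WEIGHTED_INTERLEAVE and a machine with memory on nodes 0 and 1; the per-node weights come from the kernel's auto mode or, when set manually, from /sys/kernel/mm/mempolicy/weighted_interleave/nodeN.

#define _GNU_SOURCE
#include <linux/mempolicy.h>	/* MPOL_WEIGHTED_INTERLEAVE */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* Interleave this task's future allocations over nodes 0 and 1,
	 * in proportion to the per-node interleave weights. */
	unsigned long nodemask = (1UL << 0) | (1UL << 1);

	if (syscall(SYS_set_mempolicy, MPOL_WEIGHTED_INTERLEAVE,
		    &nodemask, 8 * sizeof(nodemask) + 1)) {
		perror("set_mempolicy(MPOL_WEIGHTED_INTERLEAVE)");
		return 1;
	}

	/* Touch a large anonymous buffer so pages are actually allocated. */
	size_t len = 64UL << 20;
	char *buf = malloc(len);

	if (buf)
		memset(buf, 0, len);
	return 0;
}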
@@ -65,12 +75,18 @@
kernel is not always grateful with that.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/mempolicy.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/numa_balancing.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
@@ -80,9 +96,11 @@
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
+#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
+#include <linux/memory-tiers.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
@@ -91,16 +109,21 @@
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
+#include <linux/printk.h>
+#include <linux/leafops.h>
+#include <linux/gcd.h>
#include <asm/tlbflush.h>
-#include <asm/uaccess.h>
-#include <linux/random.h>
+#include <asm/tlb.h>
+#include <linux/uaccess.h>
+#include <linux/memory.h>
#include "internal.h"
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
-#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
+#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
+#define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;
@@ -114,56 +137,232 @@ enum zone_type policy_zone = 0;
*/
static struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
- .mode = MPOL_PREFERRED,
- .flags = MPOL_F_LOCAL,
+ .mode = MPOL_LOCAL,
};
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
-static struct mempolicy *get_task_policy(struct task_struct *p)
+/*
+ * weightiness balances the tradeoff between small weights (cycles through nodes
+ * faster, more fair/even distribution) and large weights (smaller errors
+ * between actual bandwidth ratios and weight ratios). 32 has been found to
+ * strike a reasonable compromise between the two goals.
+ */
+static const int weightiness = 32;
+
+/*
+ * A null weighted_interleave_state is interpreted as having .mode="auto",
+ * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
+ */
+struct weighted_interleave_state {
+ bool mode_auto;
+ u8 iw_table[];
+};
+static struct weighted_interleave_state __rcu *wi_state;
+static unsigned int *node_bw_table;
+
+/*
+ * wi_state_lock protects both wi_state and node_bw_table.
+ * node_bw_table is only used by writers to update wi_state.
+ */
+static DEFINE_MUTEX(wi_state_lock);
+
+static u8 get_il_weight(int node)
+{
+ struct weighted_interleave_state *state;
+ u8 weight = 1;
+
+ rcu_read_lock();
+ state = rcu_dereference(wi_state);
+ if (state)
+ weight = state->iw_table[node];
+ rcu_read_unlock();
+ return weight;
+}
+
+/*
+ * Convert bandwidth values into weighted interleave weights.
+ * Call with wi_state_lock held.
+ */
+static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
+{
+ u64 sum_bw = 0;
+ unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY)
+ sum_bw += bw[nid];
+
+ /* Scale bandwidths to whole numbers in the range [1, weightiness] */
+ for_each_node_state(nid, N_MEMORY) {
+ /*
+ * Try not to perform 64-bit division.
+ * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
+ * If sum_bw > scaling_factor, then round the weight up to 1.
+ */
+ scaling_factor = weightiness * bw[nid];
+ if (bw[nid] && sum_bw < scaling_factor) {
+ cast_sum_bw = (unsigned int)sum_bw;
+ new_iw[nid] = scaling_factor / cast_sum_bw;
+ } else {
+ new_iw[nid] = 1;
+ }
+ if (!iw_gcd)
+ iw_gcd = new_iw[nid];
+ iw_gcd = gcd(iw_gcd, new_iw[nid]);
+ }
+
+ /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
+ for_each_node_state(nid, N_MEMORY)
+ new_iw[nid] /= iw_gcd;
+}
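To make the arithmetic in reduce_interleave_weights() concrete, here is a small userspace mock of the same computation (illustration only; the two-node setup and bandwidth figures are invented, and the kernel's 64-bit-division avoidance is skipped since the result is identical). With bandwidths of 96 and 32, the raw weights are 24 and 8, which reduce by their GCD to 3:1.

#include <stdio.h>

static unsigned int gcd_u(unsigned int a, unsigned int b)
{
	while (b) {
		unsigned int t = a % b;

		a = b;
		b = t;
	}
	return a;
}

int main(void)
{
	/* Hypothetical bandwidths: node 0 = 96 (DRAM), node 1 = 32 (CXL). */
	unsigned int bw[2] = { 96, 32 }, iw[2], iw_gcd = 0;
	const unsigned int weightiness = 32;
	unsigned long long sum_bw = bw[0] + bw[1];
	int nid;

	for (nid = 0; nid < 2; nid++) {
		unsigned long long scaling_factor =
			(unsigned long long)weightiness * bw[nid];

		/* Scale to [1, weightiness], rounding tiny shares up to 1. */
		iw[nid] = (bw[nid] && sum_bw < scaling_factor) ?
			  (unsigned int)(scaling_factor / sum_bw) : 1;
		iw_gcd = iw_gcd ? gcd_u(iw_gcd, iw[nid]) : iw[nid];
	}
	for (nid = 0; nid < 2; nid++)	/* prints 3 and 1 */
		printf("node%d weight = %u\n", nid, iw[nid] / iw_gcd);
	return 0;
}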
+
+int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
+{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+ unsigned int *old_bw, *new_bw;
+ unsigned int bw_val;
+ int i;
+
+ bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
+ new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
+ if (!new_bw)
+ return -ENOMEM;
+
+ new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state) {
+ kfree(new_bw);
+ return -ENOMEM;
+ }
+ new_wi_state->mode_auto = true;
+ for (i = 0; i < nr_node_ids; i++)
+ new_wi_state->iw_table[i] = 1;
+
+ /*
+ * Update bandwidth info, even in manual mode. That way, when switching
+ * to auto mode in the future, iw_table can be overwritten using
+ * accurate bw data.
+ */
+ mutex_lock(&wi_state_lock);
+
+ old_bw = node_bw_table;
+ if (old_bw)
+ memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
+ new_bw[node] = bw_val;
+ node_bw_table = new_bw;
+
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (old_wi_state && !old_wi_state->mode_auto) {
+ /* Manual mode; skip reducing weights and updating wi_state */
+ mutex_unlock(&wi_state_lock);
+ kfree(new_wi_state);
+ goto out;
+ }
+
+	/* NULL wi_state assumes auto=true; reduce weights and update wi_state */
+ reduce_interleave_weights(new_bw, new_wi_state->iw_table);
+ rcu_assign_pointer(wi_state, new_wi_state);
+
+ mutex_unlock(&wi_state_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+out:
+ kfree(old_bw);
+ return 0;
+}
+
+/**
+ * numa_nearest_node - Find nearest node by state
+ * @node: Node id to start the search
+ * @state: State to filter the search
+ *
+ * Lookup the closest node by distance if @node is not in state.
+ *
+ * Return: this @node if it is in state, otherwise the closest node by distance
+ */
+int numa_nearest_node(int node, unsigned int state)
+{
+ int min_dist = INT_MAX, dist, n, min_node;
+
+ if (state >= NR_NODE_STATES)
+ return -EINVAL;
+
+ if (node == NUMA_NO_NODE || node_state(node, state))
+ return node;
+
+ min_node = node;
+ for_each_node_state(n, state) {
+ dist = node_distance(node, n);
+ if (dist < min_dist) {
+ min_dist = dist;
+ min_node = n;
+ }
+ }
+
+ return min_node;
+}
+EXPORT_SYMBOL_GPL(numa_nearest_node);
+
+/**
+ * nearest_node_nodemask - Find the node in @mask at the nearest distance
+ * from @node.
+ *
+ * @node: a valid node ID to start the search from.
+ * @mask: a pointer to a nodemask representing the allowed nodes.
+ *
+ * This function iterates over all nodes in @mask and calculates the
+ * distance from the starting @node, then it returns the node ID that is
+ * the closest to @node, or MAX_NUMNODES if no node is found.
+ *
+ * Note that @node must be a valid node ID usable with node_distance();
+ * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
+ * or unexpected behavior.
+ */
+int nearest_node_nodemask(int node, nodemask_t *mask)
+{
+ int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
+
+ for_each_node_mask(n, *mask) {
+ dist = node_distance(node, n);
+ if (dist < min_dist) {
+ min_dist = dist;
+ min_node = n;
+ }
+ }
+
+ return min_node;
+}
+EXPORT_SYMBOL_GPL(nearest_node_nodemask);
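A hypothetical in-kernel caller of the helper above could look like the sketch below (not part of this patch); it picks the allowed node closest to the local node and falls back to NUMA_NO_NODE when the mask turns out to be empty.

static int pick_closest_allowed_node(void)
{
	nodemask_t allowed = NODE_MASK_NONE;
	int nid;

	/* Suppose only nodes 2 and 3 are acceptable for this allocation. */
	node_set(2, allowed);
	node_set(3, allowed);

	nid = nearest_node_nodemask(numa_node_id(), &allowed);
	if (nid == MAX_NUMNODES)	/* empty mask: nothing to choose from */
		nid = NUMA_NO_NODE;
	return nid;
}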
+
+struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
int node;
- if (!pol) {
- node = numa_node_id();
- if (node != NUMA_NO_NODE)
- pol = &preferred_node_policy[node];
+ if (pol)
+ return pol;
+ node = numa_node_id();
+ if (node != NUMA_NO_NODE) {
+ pol = &preferred_node_policy[node];
/* preferred_node_policy is not initialised early in boot */
- if (!pol->mode)
- pol = NULL;
+ if (pol->mode)
+ return pol;
}
- return pol;
+ return &default_policy;
}
+EXPORT_SYMBOL_FOR_MODULES(get_task_policy, "kvm");
static const struct mempolicy_operations {
int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
- /*
- * If read-side task has no lock to protect task->mempolicy, write-side
- * task will rebind the task->mempolicy by two step. The first step is
- * setting all the newly nodes, and the second step is cleaning all the
- * disallowed nodes. In this way, we can avoid finding no node to alloc
- * page.
- * If we have a lock to protect task->mempolicy in read-side, we do
- * rebind directly.
- *
- * step:
- * MPOL_REBIND_ONCE - do rebind work at once
- * MPOL_REBIND_STEP1 - set all the newly nodes
- * MPOL_REBIND_STEP2 - clean all the disallowed nodes
- */
- void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
- enum mpol_rebind_step step);
+ void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];
-/* Check that the nodemask contains at least one populated zone */
-static int is_valid_nodemask(const nodemask_t *nodemask)
-{
- return nodes_intersects(*nodemask, node_states[N_MEMORY]);
-}
-
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
return pol->flags & MPOL_MODE_FLAGS;
@@ -177,74 +376,62 @@ static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
nodes_onto(*ret, tmp, *rel);
}
-static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
+static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
if (nodes_empty(*nodes))
return -EINVAL;
- pol->v.nodes = *nodes;
+ pol->nodes = *nodes;
return 0;
}
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
- if (!nodes)
- pol->flags |= MPOL_F_LOCAL; /* local allocation */
- else if (nodes_empty(*nodes))
- return -EINVAL; /* no allowed nodes */
- else
- pol->v.preferred_node = first_node(*nodes);
- return 0;
-}
-
-static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
-{
- if (!is_valid_nodemask(nodes))
+ if (nodes_empty(*nodes))
return -EINVAL;
- pol->v.nodes = *nodes;
+
+ nodes_clear(pol->nodes);
+ node_set(first_node(*nodes), pol->nodes);
return 0;
}
/*
* mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
* any, for the new policy. mpol_new() has already validated the nodes
- * parameter with respect to the policy mode and flags. But, we need to
- * handle an empty nodemask with MPOL_PREFERRED here.
+ * parameter with respect to the policy mode and flags.
*
* Must be called holding task's alloc_lock to protect task's mems_allowed
- * and mempolicy. May also be called holding the mmap_semaphore for write.
+ * and mempolicy. May also be called holding the mmap_lock for write.
*/
static int mpol_set_nodemask(struct mempolicy *pol,
const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
int ret;
- /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
- if (pol == NULL)
+ /*
+ * Default (pol==NULL) resp. local memory policies are not a
+ * subject of any remapping. They also do not need any special
+ * constructor.
+ */
+ if (!pol || pol->mode == MPOL_LOCAL)
return 0;
+
/* Check N_MEMORY */
nodes_and(nsc->mask1,
cpuset_current_mems_allowed, node_states[N_MEMORY]);
VM_BUG_ON(!nodes);
- if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
- nodes = NULL; /* explicit local allocation */
- else {
- if (pol->flags & MPOL_F_RELATIVE_NODES)
- mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
- else
- nodes_and(nsc->mask2, *nodes, nsc->mask1);
- if (mpol_store_user_nodemask(pol))
- pol->w.user_nodemask = *nodes;
- else
- pol->w.cpuset_mems_allowed =
- cpuset_current_mems_allowed;
- }
+ if (pol->flags & MPOL_F_RELATIVE_NODES)
+ mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
+ else
+ nodes_and(nsc->mask2, *nodes, nsc->mask1);
- if (nodes)
- ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
+ if (mpol_store_user_nodemask(pol))
+ pol->w.user_nodemask = *nodes;
else
- ret = mpol_ops[pol->mode].create(pol, NULL);
+ pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
+
+ ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
return ret;
}
@@ -257,9 +444,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
{
struct mempolicy *policy;
- pr_debug("setting mode %d flags %d nodes[0] %lx\n",
- mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
-
if (mode == MPOL_DEFAULT) {
if (nodes && !nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
@@ -277,44 +461,42 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
if (((flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES)))
return ERR_PTR(-EINVAL);
+
+ mode = MPOL_LOCAL;
}
} else if (mode == MPOL_LOCAL) {
- if (!nodes_empty(*nodes))
+ if (!nodes_empty(*nodes) ||
+ (flags & MPOL_F_STATIC_NODES) ||
+ (flags & MPOL_F_RELATIVE_NODES))
return ERR_PTR(-EINVAL);
- mode = MPOL_PREFERRED;
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
+
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!policy)
return ERR_PTR(-ENOMEM);
atomic_set(&policy->refcnt, 1);
policy->mode = mode;
policy->flags = flags;
+ policy->home_node = NUMA_NO_NODE;
return policy;
}
/* Slow path of a mpol destructor. */
-void __mpol_put(struct mempolicy *p)
+void __mpol_put(struct mempolicy *pol)
{
- if (!atomic_dec_and_test(&p->refcnt))
+ if (!atomic_dec_and_test(&pol->refcnt))
return;
- kmem_cache_free(policy_cache, p);
+ kmem_cache_free(policy_cache, pol);
}
+EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm");
-static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
- enum mpol_rebind_step step)
+static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
-/*
- * step:
- * MPOL_REBIND_ONCE - do rebind work at once
- * MPOL_REBIND_STEP1 - set all the newly nodes
- * MPOL_REBIND_STEP2 - clean all the disallowed nodes
- */
-static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
- enum mpol_rebind_step step)
+static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
nodemask_t tmp;
@@ -323,104 +505,39 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
else if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
else {
- /*
- * if step == 1, we use ->w.cpuset_mems_allowed to cache the
- * result
- */
- if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
- nodes_remap(tmp, pol->v.nodes,
- pol->w.cpuset_mems_allowed, *nodes);
- pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
- } else if (step == MPOL_REBIND_STEP2) {
- tmp = pol->w.cpuset_mems_allowed;
- pol->w.cpuset_mems_allowed = *nodes;
- } else
- BUG();
+ nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
+ *nodes);
+ pol->w.cpuset_mems_allowed = *nodes;
}
if (nodes_empty(tmp))
tmp = *nodes;
- if (step == MPOL_REBIND_STEP1)
- nodes_or(pol->v.nodes, pol->v.nodes, tmp);
- else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
- pol->v.nodes = tmp;
- else
- BUG();
-
- if (!node_isset(current->il_next, tmp)) {
- current->il_next = next_node(current->il_next, tmp);
- if (current->il_next >= MAX_NUMNODES)
- current->il_next = first_node(tmp);
- if (current->il_next >= MAX_NUMNODES)
- current->il_next = numa_node_id();
- }
+ pol->nodes = tmp;
}
static void mpol_rebind_preferred(struct mempolicy *pol,
- const nodemask_t *nodes,
- enum mpol_rebind_step step)
+ const nodemask_t *nodes)
{
- nodemask_t tmp;
-
- if (pol->flags & MPOL_F_STATIC_NODES) {
- int node = first_node(pol->w.user_nodemask);
-
- if (node_isset(node, *nodes)) {
- pol->v.preferred_node = node;
- pol->flags &= ~MPOL_F_LOCAL;
- } else
- pol->flags |= MPOL_F_LOCAL;
- } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
- mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
- pol->v.preferred_node = first_node(tmp);
- } else if (!(pol->flags & MPOL_F_LOCAL)) {
- pol->v.preferred_node = node_remap(pol->v.preferred_node,
- pol->w.cpuset_mems_allowed,
- *nodes);
- pol->w.cpuset_mems_allowed = *nodes;
- }
+ pol->w.cpuset_mems_allowed = *nodes;
}
/*
* mpol_rebind_policy - Migrate a policy to a different set of nodes
*
- * If read-side task has no lock to protect task->mempolicy, write-side
- * task will rebind the task->mempolicy by two step. The first step is
- * setting all the newly nodes, and the second step is cleaning all the
- * disallowed nodes. In this way, we can avoid finding no node to alloc
- * page.
- * If we have a lock to protect task->mempolicy in read-side, we do
- * rebind directly.
- *
- * step:
- * MPOL_REBIND_ONCE - do rebind work at once
- * MPOL_REBIND_STEP1 - set all the newly nodes
- * MPOL_REBIND_STEP2 - clean all the disallowed nodes
+ * Per-vma policies are protected by mmap_lock. Allocations using per-task
+ * policies are protected by task->mems_allowed_seq to prevent a premature
+ * OOM/allocation failure due to parallel nodemask modification.
*/
-static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
- enum mpol_rebind_step step)
+static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
- if (!pol)
+ if (!pol || pol->mode == MPOL_LOCAL)
return;
- if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
+ if (!mpol_store_user_nodemask(pol) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
- if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
- return;
-
- if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
- BUG();
-
- if (step == MPOL_REBIND_STEP1)
- pol->flags |= MPOL_F_REBINDING;
- else if (step == MPOL_REBIND_STEP2)
- pol->flags &= ~MPOL_F_REBINDING;
- else if (step >= MPOL_REBIND_NSTEP)
- BUG();
-
- mpol_ops[pol->mode].rebind(pol, newmask, step);
+ mpol_ops[pol->mode].rebind(pol, newmask);
}
/*
@@ -429,27 +546,27 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
*
* Called with task's alloc_lock held.
*/
-
-void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
- enum mpol_rebind_step step)
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
- mpol_rebind_policy(tsk->mempolicy, new, step);
+ mpol_rebind_policy(tsk->mempolicy, new);
}
/*
* Rebind each vma in mm to new nodemask.
*
- * Call holding a reference to mm. Takes mm->mmap_sem during call.
+ * Call holding a reference to mm. Takes mm->mmap_lock during call.
*/
-
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
- down_write(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
- mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
- up_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
+ for_each_vma(vmi, vma) {
+ vma_start_write(vma);
+ mpol_rebind_policy(vma->vm_policy, new);
+ }
+ mmap_write_unlock(mm);
}
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
@@ -457,7 +574,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.rebind = mpol_rebind_default,
},
[MPOL_INTERLEAVE] = {
- .create = mpol_new_interleave,
+ .create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
[MPOL_PREFERRED] = {
@@ -465,115 +582,297 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.rebind = mpol_rebind_preferred,
},
[MPOL_BIND] = {
- .create = mpol_new_bind,
+ .create = mpol_new_nodemask,
+ .rebind = mpol_rebind_nodemask,
+ },
+ [MPOL_LOCAL] = {
+ .rebind = mpol_rebind_default,
+ },
+ [MPOL_PREFERRED_MANY] = {
+ .create = mpol_new_nodemask,
+ .rebind = mpol_rebind_preferred,
+ },
+ [MPOL_WEIGHTED_INTERLEAVE] = {
+ .create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
};
-static void migrate_page_add(struct page *page, struct list_head *pagelist,
+static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
unsigned long flags);
+static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
+ pgoff_t ilx, int *nid);
+
+static bool strictly_unmovable(unsigned long flags)
+{
+ /*
+ * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
+ * if any misplaced page is found.
+ */
+ return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
+ MPOL_MF_STRICT;
+}
+
+struct migration_mpol { /* for alloc_migration_target_by_mpol() */
+ struct mempolicy *pol;
+ pgoff_t ilx;
+};
+
+struct queue_pages {
+ struct list_head *pagelist;
+ unsigned long flags;
+ nodemask_t *nmask;
+ unsigned long start;
+ unsigned long end;
+ struct vm_area_struct *first;
+ struct folio *large; /* note last large folio encountered */
+ long nr_failed; /* could not be isolated at this time */
+};
+
+/*
+ * Check if the folio's nid is in qp->nmask.
+ *
+ * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
+ * in the inverse of qp->nmask.
+ */
+static inline bool queue_folio_required(struct folio *folio,
+ struct queue_pages *qp)
+{
+ int nid = folio_nid(folio);
+ unsigned long flags = qp->flags;
+
+ return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
+}
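The predicate above is easiest to read as a truth table. The tiny userspace demo below (illustration only, with a stand-in flag value rather than the real MPOL_MF_INVERT bit) reproduces it: without the invert flag a folio is queued when its node is in the mask, with the invert flag it is queued when its node is not.

#include <stdbool.h>
#include <stdio.h>

#define DEMO_MF_INVERT 0x1	/* stand-in flag bit, for illustration only */

static bool queue_required(bool nid_in_mask, unsigned long flags)
{
	/* Same shape as queue_folio_required(): isset == !INVERT */
	return nid_in_mask == !(flags & DEMO_MF_INVERT);
}

int main(void)
{
	printf("%d %d %d %d\n",
	       queue_required(true, 0),			/* 1 */
	       queue_required(false, 0),		/* 0 */
	       queue_required(true, DEMO_MF_INVERT),	/* 0 */
	       queue_required(false, DEMO_MF_INVERT));	/* 1 */
	return 0;
}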
-/* Scan through pages checking if pages follow certain conditions. */
-static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
+static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
{
- pte_t *orig_pte;
- pte_t *pte;
+ struct folio *folio;
+ struct queue_pages *qp = walk->private;
+
+ if (unlikely(pmd_is_migration_entry(*pmd))) {
+ qp->nr_failed++;
+ return;
+ }
+ folio = pmd_folio(*pmd);
+ if (is_huge_zero_folio(folio)) {
+ walk->action = ACTION_CONTINUE;
+ return;
+ }
+ if (!queue_folio_required(folio, qp))
+ return;
+ if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
+ !vma_migratable(walk->vma) ||
+ !migrate_folio_add(folio, qp->pagelist, qp->flags))
+ qp->nr_failed++;
+}
+
+/*
+ * Scan through folios, checking if they satisfy the required conditions,
+ * moving them from LRU to local pagelist for migration if they do (or not).
+ *
+ * queue_folios_pte_range() has two possible return values:
+ * 0 - continue walking to scan for more, even if an existing folio on the
+ * wrong node could not be isolated and queued for migration.
+ * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
+ * and an existing folio was on a node that does not follow the policy.
+ */
+static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+ struct folio *folio;
+ struct queue_pages *qp = walk->private;
+ unsigned long flags = qp->flags;
+ pte_t *pte, *mapped_pte;
+ pte_t ptent;
spinlock_t *ptl;
+ int max_nr, nr;
- orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- do {
- struct page *page;
- int nid;
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ queue_folios_pmd(pmd, walk);
+ spin_unlock(ptl);
+ goto out;
+ }
+
+ mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+ for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
+ max_nr = (end - addr) >> PAGE_SHIFT;
+ nr = 1;
+ ptent = ptep_get(pte);
+ if (pte_none(ptent))
+ continue;
+ if (!pte_present(ptent)) {
+ const softleaf_t entry = softleaf_from_pte(ptent);
- if (!pte_present(*pte))
+ if (softleaf_is_migration(entry))
+ qp->nr_failed++;
continue;
- page = vm_normal_page(vma, addr, *pte);
- if (!page)
+ }
+ folio = vm_normal_folio(vma, addr, ptent);
+ if (!folio || folio_is_zone_device(folio))
continue;
+ if (folio_test_large(folio) && max_nr != 1)
+ nr = folio_pte_batch(folio, pte, ptent, max_nr);
/*
- * vm_normal_page() filters out zero pages, but there might
- * still be PageReserved pages to skip, perhaps in a VDSO.
+ * vm_normal_folio() filters out zero pages, but there might
+ * still be reserved folios to skip, perhaps in a VDSO.
*/
- if (PageReserved(page))
+ if (folio_test_reserved(folio))
continue;
- nid = page_to_nid(page);
- if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+ if (!queue_folio_required(folio, qp))
continue;
-
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
- migrate_page_add(page, private, flags);
- else
- break;
- } while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap_unlock(orig_pte, ptl);
- return addr != end;
+ if (folio_test_large(folio)) {
+ /*
+ * A large folio can only be isolated from LRU once,
+ * but may be mapped by many PTEs (and Copy-On-Write may
+ * intersperse PTEs of other, order 0, folios). This is
+ * a common case, so don't mistake it for failure (but
+ * there can be other cases of multi-mapped pages which
+ * this quick check does not help to filter out - and a
+ * search of the pagelist might grow to be prohibitive).
+ *
+ * migrate_pages(&pagelist) returns nr_failed folios, so
+ * check "large" now so that queue_pages_range() returns
+ * a comparable nr_failed folios. This does imply that
+ * if folio could not be isolated for some racy reason
+ * at its first PTE, later PTEs will not give it another
+ * chance of isolation; but keeps the accounting simple.
+ */
+ if (folio == qp->large)
+ continue;
+ qp->large = folio;
+ }
+ if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
+ !vma_migratable(vma) ||
+ !migrate_folio_add(folio, qp->pagelist, flags)) {
+ qp->nr_failed += nr;
+ if (strictly_unmovable(flags))
+ break;
+ }
+ }
+ pte_unmap_unlock(mapped_pte, ptl);
+ cond_resched();
+out:
+ if (qp->nr_failed && strictly_unmovable(flags))
+ return -EIO;
+ return 0;
}
-static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
+static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
{
- pmd_t *pmd;
- unsigned long next;
+#ifdef CONFIG_HUGETLB_PAGE
+ struct queue_pages *qp = walk->private;
+ unsigned long flags = qp->flags;
+ struct folio *folio;
+ spinlock_t *ptl;
+ pte_t ptep;
- pmd = pmd_offset(pud, addr);
- do {
- next = pmd_addr_end(addr, end);
- split_huge_page_pmd(vma, addr, pmd);
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- continue;
- if (check_pte_range(vma, pmd, addr, next, nodes,
- flags, private))
- return -EIO;
- } while (pmd++, addr = next, addr != end);
- return 0;
-}
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+ ptep = huge_ptep_get(walk->mm, addr, pte);
+ if (!pte_present(ptep)) {
+ if (!huge_pte_none(ptep)) {
+ const softleaf_t entry = softleaf_from_pte(ptep);
-static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
-{
- pud_t *pud;
- unsigned long next;
+ if (unlikely(softleaf_is_migration(entry)))
+ qp->nr_failed++;
+ }
- pud = pud_offset(pgd, addr);
- do {
- next = pud_addr_end(addr, end);
- if (pud_none_or_clear_bad(pud))
- continue;
- if (check_pmd_range(vma, pud, addr, next, nodes,
- flags, private))
- return -EIO;
- } while (pud++, addr = next, addr != end);
+ goto unlock;
+ }
+ folio = pfn_folio(pte_pfn(ptep));
+ if (!queue_folio_required(folio, qp))
+ goto unlock;
+ if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
+ !vma_migratable(walk->vma)) {
+ qp->nr_failed++;
+ goto unlock;
+ }
+ /*
+ * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
+ * Choosing not to migrate a shared folio is not counted as a failure.
+ *
+ * See folio_maybe_mapped_shared() on possible imprecision when we
+ * cannot easily detect if a folio is shared.
+ */
+ if ((flags & MPOL_MF_MOVE_ALL) ||
+ (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
+ if (!folio_isolate_hugetlb(folio, qp->pagelist))
+ qp->nr_failed++;
+unlock:
+ spin_unlock(ptl);
+ if (qp->nr_failed && strictly_unmovable(flags))
+ return -EIO;
+#endif
return 0;
}
-static inline int check_pgd_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
+#ifdef CONFIG_NUMA_BALANCING
+/**
+ * folio_can_map_prot_numa() - check whether the folio can map prot numa
+ * @folio: The folio whose mapping considered for being made NUMA hintable
+ * @vma: The VMA that the folio belongs to.
+ * @is_private_single_threaded: Is this a single-threaded private VMA or not
+ *
+ * This function checks whether the folio's mapping should be changed to
+ * one that causes a NUMA hinting fault, as there are cases where that is
+ * simply unnecessary. If prot numa is needed, the folio's access time is
+ * also adjusted for memory tiering.
+ *
+ * Return: True if the mapping of the folio needs to be changed, false otherwise.
+ */
+bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma,
+ bool is_private_single_threaded)
{
- pgd_t *pgd;
- unsigned long next;
+ int nid;
- pgd = pgd_offset(vma->vm_mm, addr);
- do {
- next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
- continue;
- if (check_pud_range(vma, pgd, addr, next, nodes,
- flags, private))
- return -EIO;
- } while (pgd++, addr = next, addr != end);
- return 0;
+ if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio))
+ return false;
+
+ /* Also skip shared copy-on-write folios */
+ if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio))
+ return false;
+
+ /* Folios are pinned and can't be migrated */
+ if (folio_maybe_dma_pinned(folio))
+ return false;
+
+ /*
+ * While migration can move some dirty folios,
+ * it cannot move them all from MIGRATE_ASYNC
+ * context.
+ */
+ if (folio_is_file_lru(folio) && folio_test_dirty(folio))
+ return false;
+
+ /*
+ * Don't mess with PTEs if folio is already on the node
+ * a single-threaded process is running on.
+ */
+ nid = folio_nid(folio);
+ if (is_private_single_threaded && (nid == numa_node_id()))
+ return false;
+
+ /*
+ * Skip scanning top tier node if normal numa
+ * balancing is disabled
+ */
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
+ node_is_toptier(nid))
+ return false;
+
+ if (folio_use_access_time(folio))
+ folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
+
+ return true;
}
-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
/*
* This is used to mark a range of virtual addresses to be inaccessible.
* These are later cleared by a NUMA hinting fault. Depending on these
@@ -586,95 +885,130 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
- int nr_updated;
- BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
+ struct mmu_gather tlb;
+ long nr_updated;
+
+ tlb_gather_mmu(&tlb, vma->vm_mm);
- nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
- if (nr_updated)
+ nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
+ if (nr_updated > 0) {
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
+ count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
+ }
+
+ tlb_finish_mmu(&tlb);
return nr_updated;
}
-#else
-static unsigned long change_prot_numa(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
+#endif /* CONFIG_NUMA_BALANCING */
+
+static int queue_pages_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
{
- return 0;
+ struct vm_area_struct *next, *vma = walk->vma;
+ struct queue_pages *qp = walk->private;
+ unsigned long flags = qp->flags;
+
+ /* range check first */
+ VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
+
+ if (!qp->first) {
+ qp->first = vma;
+ if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+ (qp->start < vma->vm_start))
+ /* hole at head side of range */
+ return -EFAULT;
+ }
+ next = find_vma(vma->vm_mm, vma->vm_end);
+ if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+ ((vma->vm_end < qp->end) &&
+ (!next || vma->vm_end < next->vm_start)))
+ /* hole at middle or tail of range */
+ return -EFAULT;
+
+ /*
+	 * Need to check MPOL_MF_STRICT to return -EIO if possible
+ * regardless of vma_migratable
+ */
+ if (!vma_migratable(vma) &&
+ !(flags & MPOL_MF_STRICT))
+ return 1;
+
+ /*
+ * Check page nodes, and queue pages to move, in the current vma.
+ * But if no moving, and no strict checking, the scan can be skipped.
+ */
+ if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ return 0;
+ return 1;
}
-#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+
+static const struct mm_walk_ops queue_pages_walk_ops = {
+ .hugetlb_entry = queue_folios_hugetlb,
+ .pmd_entry = queue_folios_pte_range,
+ .test_walk = queue_pages_test_walk,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
+ .hugetlb_entry = queue_folios_hugetlb,
+ .pmd_entry = queue_folios_pte_range,
+ .test_walk = queue_pages_test_walk,
+ .walk_lock = PGWALK_WRLOCK,
+};
/*
- * Check if all pages in a range are on a set of nodes.
- * If pagelist != NULL then isolate pages from the LRU and
- * put them on the pagelist.
+ * Walk through page tables and collect pages to be migrated.
+ *
+ * If pages found in a given range are not on the required set of @nodes,
+ * and migration is allowed, they are isolated and queued to @pagelist.
+ *
+ * queue_pages_range() may return:
+ * 0 - all pages already on the right node, or successfully queued for moving
+ * (or neither strict checking nor moving requested: only range checking).
+ * >0 - this number of misplaced folios could not be queued for moving
+ * (a hugetlbfs page or a transparent huge page being counted as 1).
+ * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
+ * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
*/
-static struct vm_area_struct *
-check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
- const nodemask_t *nodes, unsigned long flags, void *private)
+static long
+queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
+ nodemask_t *nodes, unsigned long flags,
+ struct list_head *pagelist)
{
int err;
- struct vm_area_struct *first, *vma, *prev;
-
-
- first = find_vma(mm, start);
- if (!first)
- return ERR_PTR(-EFAULT);
- prev = NULL;
- for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
- unsigned long endvma = vma->vm_end;
-
- if (endvma > end)
- endvma = end;
- if (vma->vm_start > start)
- start = vma->vm_start;
-
- if (!(flags & MPOL_MF_DISCONTIG_OK)) {
- if (!vma->vm_next && vma->vm_end < end)
- return ERR_PTR(-EFAULT);
- if (prev && prev->vm_end < vma->vm_start)
- return ERR_PTR(-EFAULT);
- }
-
- if (is_vm_hugetlb_page(vma))
- goto next;
+ struct queue_pages qp = {
+ .pagelist = pagelist,
+ .flags = flags,
+ .nmask = nodes,
+ .start = start,
+ .end = end,
+ .first = NULL,
+ };
+ const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
+ &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
- if (flags & MPOL_MF_LAZY) {
- change_prot_numa(vma, start, endvma);
- goto next;
- }
+ err = walk_page_range(mm, start, end, ops, &qp);
- if ((flags & MPOL_MF_STRICT) ||
- ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
- vma_migratable(vma))) {
+ if (!qp.first)
+ /* whole range in hole */
+ err = -EFAULT;
- err = check_pgd_range(vma, start, endvma, nodes,
- flags, private);
- if (err) {
- first = ERR_PTR(err);
- break;
- }
- }
-next:
- prev = vma;
- }
- return first;
+ return err ? : qp.nr_failed;
}
/*
* Apply policy to a single VMA
- * This must be called with the mmap_sem held for writing.
+ * This must be called with the mmap_lock held for writing.
*/
static int vma_replace_policy(struct vm_area_struct *vma,
- struct mempolicy *pol)
+ struct mempolicy *pol)
{
int err;
struct mempolicy *old;
struct mempolicy *new;
- pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
- vma->vm_start, vma->vm_end, vma->vm_pgoff,
- vma->vm_ops, vma->vm_file,
- vma->vm_ops ? vma->vm_ops->set_policy : NULL);
+ vma_assert_write_locked(vma);
new = mpol_dup(pol);
if (IS_ERR(new))
@@ -687,7 +1021,7 @@ static int vma_replace_policy(struct vm_area_struct *vma,
}
old = vma->vm_policy;
- vma->vm_policy = new; /* protected by mmap_sem */
+ vma->vm_policy = new; /* protected by mmap_lock */
mpol_put(old);
return 0;
@@ -696,91 +1030,32 @@ static int vma_replace_policy(struct vm_area_struct *vma,
return err;
}
-/* Step 2: apply policy to a range and do splits. */
-static int mbind_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, struct mempolicy *new_pol)
+/* Split or merge the VMA (if required) and apply the new policy */
+static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ struct vm_area_struct **prev, unsigned long start,
+ unsigned long end, struct mempolicy *new_pol)
{
- struct vm_area_struct *next;
- struct vm_area_struct *prev;
- struct vm_area_struct *vma;
- int err = 0;
- pgoff_t pgoff;
- unsigned long vmstart;
- unsigned long vmend;
-
- vma = find_vma(mm, start);
- if (!vma || vma->vm_start > start)
- return -EFAULT;
-
- prev = vma->vm_prev;
- if (start > vma->vm_start)
- prev = vma;
+ unsigned long vmstart, vmend;
- for (; vma && vma->vm_start < end; prev = vma, vma = next) {
- next = vma->vm_next;
- vmstart = max(start, vma->vm_start);
- vmend = min(end, vma->vm_end);
-
- if (mpol_equal(vma_policy(vma), new_pol))
- continue;
-
- pgoff = vma->vm_pgoff +
- ((vmstart - vma->vm_start) >> PAGE_SHIFT);
- prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff,
- new_pol);
- if (prev) {
- vma = prev;
- next = vma->vm_next;
- continue;
- }
- if (vma->vm_start != vmstart) {
- err = split_vma(vma->vm_mm, vma, vmstart, 1);
- if (err)
- goto out;
- }
- if (vma->vm_end != vmend) {
- err = split_vma(vma->vm_mm, vma, vmend, 0);
- if (err)
- goto out;
- }
- err = vma_replace_policy(vma, new_pol);
- if (err)
- goto out;
+ vmend = min(end, vma->vm_end);
+ if (start > vma->vm_start) {
+ *prev = vma;
+ vmstart = start;
+ } else {
+ vmstart = vma->vm_start;
}
- out:
- return err;
-}
-
-/*
- * Update task->flags PF_MEMPOLICY bit: set iff non-default
- * mempolicy. Allows more rapid checking of this (combined perhaps
- * with other PF_* flag bits) on memory allocation hot code paths.
- *
- * If called from outside this file, the task 'p' should -only- be
- * a newly forked child not yet visible on the task list, because
- * manipulating the task flags of a visible task is not safe.
- *
- * The above limitation is why this routine has the funny name
- * mpol_fix_fork_child_flag().
- *
- * It is also safe to call this with a task pointer of current,
- * which the static wrapper mpol_set_task_struct_flag() does,
- * for use within this file.
- */
+ if (mpol_equal(vma->vm_policy, new_pol)) {
+ *prev = vma;
+ return 0;
+ }
-void mpol_fix_fork_child_flag(struct task_struct *p)
-{
- if (p->mempolicy)
- p->flags |= PF_MEMPOLICY;
- else
- p->flags &= ~PF_MEMPOLICY;
-}
+ vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
-static void mpol_set_task_struct_flag(void)
-{
- mpol_fix_fork_child_flag(current);
+ *prev = vma;
+ return vma_replace_policy(vma, new_pol);
}
/* Set the process memory policy */
@@ -788,7 +1063,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
struct mempolicy *new, *old;
- struct mm_struct *mm = current->mm;
NODEMASK_SCRATCH(scratch);
int ret;
@@ -800,33 +1074,23 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
ret = PTR_ERR(new);
goto out;
}
- /*
- * prevent changing our mempolicy while show_numa_maps()
- * is using it.
- * Note: do_set_mempolicy() can be called at init time
- * with no 'mm'.
- */
- if (mm)
- down_write(&mm->mmap_sem);
+
task_lock(current);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
task_unlock(current);
- if (mm)
- up_write(&mm->mmap_sem);
mpol_put(new);
goto out;
}
+
old = current->mempolicy;
current->mempolicy = new;
- mpol_set_task_struct_flag();
- if (new && new->mode == MPOL_INTERLEAVE &&
- nodes_weight(new->v.nodes))
- current->il_next = first_node(new->v.nodes);
+ if (new && (new->mode == MPOL_INTERLEAVE ||
+ new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
+ current->il_prev = MAX_NUMNODES-1;
+ current->il_weight = 0;
+ }
task_unlock(current);
- if (mm)
- up_write(&mm->mmap_sem);
-
mpol_put(old);
ret = 0;
out:
@@ -839,22 +1103,22 @@ out:
*
* Called with task's alloc_lock held
*/
-static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
+static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
{
nodes_clear(*nodes);
- if (p == &default_policy)
+ if (pol == &default_policy)
return;
- switch (p->mode) {
+ switch (pol->mode) {
case MPOL_BIND:
- /* Fall through */
case MPOL_INTERLEAVE:
- *nodes = p->v.nodes;
- break;
case MPOL_PREFERRED:
- if (!(p->flags & MPOL_F_LOCAL))
- node_set(p->v.preferred_node, *nodes);
- /* else return empty node mask for local allocation */
+ case MPOL_PREFERRED_MANY:
+ case MPOL_WEIGHTED_INTERLEAVE:
+ *nodes = pol->nodes;
+ break;
+ case MPOL_LOCAL:
+ /* return empty node mask for local allocation */
break;
default:
BUG();
@@ -863,15 +1127,15 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
- struct page *p;
- int err;
+ struct page *p = NULL;
+ int ret;
- err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
- if (err >= 0) {
- err = page_to_nid(p);
+ ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
+ if (ret > 0) {
+ ret = page_to_nid(p);
put_page(p);
}
- return err;
+ return ret;
}
/* Retrieve NUMA policy */
@@ -881,7 +1145,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
int err;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
- struct mempolicy *pol = current->mempolicy;
+ struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
@@ -898,21 +1162,19 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
}
if (flags & MPOL_F_ADDR) {
+ pgoff_t ilx; /* ignored here */
/*
* Do NOT fall back to task policy if the
* vma/shared policy at addr is NULL. We
* want to return MPOL_DEFAULT in this case.
*/
- down_read(&mm->mmap_sem);
- vma = find_vma_intersection(mm, addr, addr+1);
+ mmap_read_lock(mm);
+ vma = vma_lookup(mm, addr);
if (!vma) {
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return -EFAULT;
}
- if (vma->vm_ops && vma->vm_ops->get_policy)
- pol = vma->vm_ops->get_policy(vma, addr);
- else
- pol = vma->vm_policy;
+ pol = __get_vma_policy(vma, addr, &ilx);
} else if (addr)
return -EINVAL;
@@ -921,13 +1183,29 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
+ /*
+ * Take a refcount on the mpol, because we are about to
+ * drop the mmap_lock, after which only "pol" remains
+ * valid, "vma" is stale.
+ */
+ pol_refcount = pol;
+ vma = NULL;
+ mpol_get(pol);
+ mmap_read_unlock(mm);
err = lookup_node(mm, addr);
if (err < 0)
goto out;
*policy = err;
} else if (pol == current->mempolicy &&
pol->mode == MPOL_INTERLEAVE) {
- *policy = current->il_next;
+ *policy = next_node_in(current->il_prev, pol->nodes);
+ } else if (pol == current->mempolicy &&
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
+ if (current->il_weight)
+ *policy = current->il_prev;
+ else
+ *policy = next_node_in(current->il_prev,
+ pol->nodes);
} else {
err = -EINVAL;
goto out;
@@ -942,11 +1220,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
*policy |= (pol->flags & MPOL_MODE_FLAGS);
}
- if (vma) {
- up_read(&current->mm->mmap_sem);
- vma = NULL;
- }
-
err = 0;
if (nmask) {
if (mpol_store_user_nodemask(pol)) {
@@ -961,64 +1234,91 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
out:
mpol_cond_put(pol);
if (vma)
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(mm);
+ if (pol_refcount)
+ mpol_put(pol_refcount);
return err;
}
#ifdef CONFIG_MIGRATION
-/*
- * page migration
- */
-static void migrate_page_add(struct page *page, struct list_head *pagelist,
+static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
unsigned long flags)
{
/*
- * Avoid migrating a page that is shared with others.
+ * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
+ * Choosing not to migrate a shared folio is not counted as a failure.
+ *
+ * See folio_maybe_mapped_shared() on possible imprecision when we
+ * cannot easily detect if a folio is shared.
*/
- if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
- if (!isolate_lru_page(page)) {
- list_add_tail(&page->lru, pagelist);
- inc_zone_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
+ if (folio_isolate_lru(folio)) {
+ list_add_tail(&folio->lru, foliolist);
+ node_stat_mod_folio(folio,
+ NR_ISOLATED_ANON + folio_is_file_lru(folio),
+ folio_nr_pages(folio));
+ } else {
+ /*
+ * Non-movable folio may reach here. And, there may be
+ * temporary off LRU folios or non-LRU movable folios.
+ * Treat them as unmovable folios since they can't be
+ * isolated, so they can't be moved at the moment.
+ */
+ return false;
}
}
-}
-
-static struct page *new_node_page(struct page *page, unsigned long node, int **x)
-{
- return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
+ return true;
}
/*
* Migrate pages from one node to a target node.
* Returns error or the number of pages not migrated.
*/
-static int migrate_to_node(struct mm_struct *mm, int source, int dest,
- int flags)
+static long migrate_to_node(struct mm_struct *mm, int source, int dest,
+ int flags)
{
nodemask_t nmask;
+ struct vm_area_struct *vma;
LIST_HEAD(pagelist);
- int err = 0;
+ long nr_failed;
+ long err = 0;
+ struct migration_target_control mtc = {
+ .nid = dest,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ .reason = MR_SYSCALL,
+ };
nodes_clear(nmask);
node_set(source, nmask);
+ VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
+
+ mmap_read_lock(mm);
+ vma = find_vma(mm, 0);
+ if (unlikely(!vma)) {
+ mmap_read_unlock(mm);
+ return 0;
+ }
+
/*
- * This does not "check" the range but isolates all pages that
+ * This does not migrate the range, but isolates all pages that
* need migration. Between passing in the full user address
- * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
+ * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
+ * but passes back the count of pages which could not be isolated.
*/
- VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
- check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
- flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+ nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
+ flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+ mmap_read_unlock(mm);
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, new_node_page, dest,
- MIGRATE_SYNC, MR_SYSCALL);
+ err = migrate_pages(&pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
if (err)
- putback_lru_pages(&pagelist);
+ putback_movable_pages(&pagelist);
}
+ if (err >= 0)
+ err += nr_failed;
return err;
}
@@ -1031,19 +1331,11 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags)
{
- int busy = 0;
- int err;
+ long nr_failed = 0;
+ long err = 0;
nodemask_t tmp;
- err = migrate_prep();
- if (err)
- return err;
-
- down_read(&mm->mmap_sem);
-
- err = migrate_vmas(mm, from, to, flags);
- if (err)
- goto out;
+ lru_cache_disable();
/*
* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
@@ -1078,8 +1370,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
tmp = *from;
while (!nodes_empty(tmp)) {
- int s,d;
- int source = -1;
+ int s, d;
+ int source = NUMA_NO_NODE;
int dest = 0;
for_each_node_mask(s, tmp) {
@@ -1114,53 +1406,63 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
if (!node_isset(dest, tmp))
break;
}
- if (source == -1)
+ if (source == NUMA_NO_NODE)
break;
node_clear(source, tmp);
err = migrate_to_node(mm, source, dest, flags);
if (err > 0)
- busy += err;
+ nr_failed += err;
if (err < 0)
break;
}
-out:
- up_read(&mm->mmap_sem);
+
+ lru_cache_enable();
if (err < 0)
return err;
- return busy;
-
+ return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
}
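do_migrate_pages() backs the migrate_pages(2) syscall; the userspace sketch below (not part of this patch; compile with -lnuma for the numaif.h wrapper) moves the calling process's pages from node 0 to node 1 and, mirroring the return convention above, treats a positive return value as the number of pages that could not be moved.

#define _GNU_SOURCE
#include <numaif.h>	/* migrate_pages() wrapper from libnuma */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned long from = 1UL << 0;	/* source: node 0 */
	unsigned long to   = 1UL << 1;	/* destination: node 1 */
	long ret;

	ret = migrate_pages(getpid(), 8 * sizeof(from) + 1, &from, &to);
	if (ret < 0)
		perror("migrate_pages");
	else if (ret > 0)
		fprintf(stderr, "%ld pages could not be migrated\n", ret);
	return 0;
}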
/*
- * Allocate a new page for page migration based on vma policy.
- * Start assuming that page is mapped by vma pointed to by @private.
- * Search forward from there, if not. N.B., this assumes that the
- * list of pages handed to migrate_pages()--which is how we get here--
- * is in virtual address order.
+ * Allocate a new folio for page migration, according to NUMA mempolicy.
*/
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct folio *alloc_migration_target_by_mpol(struct folio *src,
+ unsigned long private)
{
- struct vm_area_struct *vma = (struct vm_area_struct *)private;
- unsigned long uninitialized_var(address);
+ struct migration_mpol *mmpol = (struct migration_mpol *)private;
+ struct mempolicy *pol = mmpol->pol;
+ pgoff_t ilx = mmpol->ilx;
+ unsigned int order;
+ int nid = numa_node_id();
+ gfp_t gfp;
- while (vma) {
- address = page_address_in_vma(page, vma);
- if (address != -EFAULT)
- break;
- vma = vma->vm_next;
+ order = folio_order(src);
+ ilx += src->index >> order;
+
+ if (folio_test_hugetlb(src)) {
+ nodemask_t *nodemask;
+ struct hstate *h;
+
+ h = folio_hstate(src);
+ gfp = htlb_alloc_mask(h);
+ nodemask = policy_nodemask(gfp, pol, ilx, &nid);
+ return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
+ htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
}
- /*
- * if !vma, alloc_page_vma() will use task or system default policy
- */
- return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (folio_test_large(src))
+ gfp = GFP_TRANSHUGE;
+ else
+ gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
+
+ return folio_alloc_mpol(gfp, order, pol, ilx, nid);
}
#else
-static void migrate_page_add(struct page *page, struct list_head *pagelist,
+static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
unsigned long flags)
{
+ return false;
}
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
@@ -1169,7 +1471,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
return -ENOSYS;
}
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct folio *alloc_migration_target_by_mpol(struct folio *src,
+ unsigned long private)
{
return NULL;
}
@@ -1179,11 +1482,14 @@ static long do_mbind(unsigned long start, unsigned long len,
unsigned short mode, unsigned short mode_flags,
nodemask_t *nmask, unsigned long flags)
{
- struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
+ struct vma_iterator vmi;
+ struct migration_mpol mmpol;
struct mempolicy *new;
unsigned long end;
- int err;
+ long err;
+ long nr_failed;
LIST_HEAD(pagelist);
if (flags & ~(unsigned long)MPOL_MF_VALID)
@@ -1197,7 +1503,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (mode == MPOL_DEFAULT)
flags &= ~MPOL_MF_STRICT;
- len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+ len = PAGE_ALIGN(len);
end = start + len;
if (end < start)
@@ -1209,9 +1515,6 @@ static long do_mbind(unsigned long start, unsigned long len,
if (IS_ERR(new))
return PTR_ERR(new);
- if (flags & MPOL_MF_LAZY)
- new->flags |= MPOL_F_MOF;
-
/*
* If we are using the default policy then operation
* on discontinuous address spaces is okay after all
@@ -1219,25 +1522,15 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!new)
flags |= MPOL_MF_DISCONTIG_OK;
- pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
- start, start + len, mode, mode_flags,
- nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
-
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-
- err = migrate_prep();
- if (err)
- goto mpol_out;
- }
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ lru_cache_disable();
{
NODEMASK_SCRATCH(scratch);
if (scratch) {
- down_write(&mm->mmap_sem);
- task_lock(current);
+ mmap_write_lock(mm);
err = mpol_set_nodemask(new, nmask, scratch);
- task_unlock(current);
if (err)
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
} else
err = -ENOMEM;
NODEMASK_SCRATCH_FREE(scratch);
@@ -1245,48 +1538,121 @@ static long do_mbind(unsigned long start, unsigned long len,
if (err)
goto mpol_out;
- vma = check_range(mm, start, end, nmask,
- flags | MPOL_MF_INVERT, &pagelist);
+ /*
+ * Lock the VMAs before scanning for pages to migrate,
+ * to ensure we don't miss a concurrently inserted page.
+ */
+ nr_failed = queue_pages_range(mm, start, end, nmask,
+ flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);
- err = PTR_ERR(vma); /* maybe ... */
- if (!IS_ERR(vma))
- err = mbind_range(mm, start, end, new);
+ if (nr_failed < 0) {
+ err = nr_failed;
+ nr_failed = 0;
+ } else {
+ vma_iter_init(&vmi, mm, start);
+ prev = vma_prev(&vmi);
+ for_each_vma_range(vmi, vma, end) {
+ err = mbind_range(&vmi, vma, &prev, start, end, new);
+ if (err)
+ break;
+ }
+ }
- if (!err) {
- int nr_failed = 0;
+ if (!err && !list_empty(&pagelist)) {
+ /* Convert MPOL_DEFAULT's NULL to task or default policy */
+ if (!new) {
+ new = get_task_policy(current);
+ mpol_get(new);
+ }
+ mmpol.pol = new;
+ mmpol.ilx = 0;
- if (!list_empty(&pagelist)) {
- WARN_ON_ONCE(flags & MPOL_MF_LAZY);
- nr_failed = migrate_pages(&pagelist, new_vma_page,
- (unsigned long)vma,
- MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
- if (nr_failed)
- putback_lru_pages(&pagelist);
+ /*
+ * In the interleaved case, attempt to allocate on exactly the
+ * targeted nodes, for the first VMA to be migrated; for later
+ * VMAs, the nodes will still be interleaved from the targeted
+ * nodemask, but one by one may be selected differently.
+ */
+ if (new->mode == MPOL_INTERLEAVE ||
+ new->mode == MPOL_WEIGHTED_INTERLEAVE) {
+ struct folio *folio;
+ unsigned int order;
+ unsigned long addr = -EFAULT;
+
+ list_for_each_entry(folio, &pagelist, lru) {
+ if (!folio_test_ksm(folio))
+ break;
+ }
+ if (!list_entry_is_head(folio, &pagelist, lru)) {
+ vma_iter_init(&vmi, mm, start);
+ for_each_vma_range(vmi, vma, end) {
+ addr = page_address_in_vma(folio,
+ folio_page(folio, 0), vma);
+ if (addr != -EFAULT)
+ break;
+ }
+ }
+ if (addr != -EFAULT) {
+ order = folio_order(folio);
+ /* We already know the pol, but not the ilx */
+ mpol_cond_put(get_vma_policy(vma, addr, order,
+ &mmpol.ilx));
+ /* Set base from which to increment by index */
+ mmpol.ilx -= folio->index >> order;
+ }
}
+ }
- if (nr_failed && (flags & MPOL_MF_STRICT))
- err = -EIO;
- } else
- putback_lru_pages(&pagelist);
+ mmap_write_unlock(mm);
+
+ if (!err && !list_empty(&pagelist)) {
+ nr_failed |= migrate_pages(&pagelist,
+ alloc_migration_target_by_mpol, NULL,
+ (unsigned long)&mmpol, MIGRATE_SYNC,
+ MR_MEMPOLICY_MBIND, NULL);
+ }
- up_write(&mm->mmap_sem);
- mpol_out:
+ if (nr_failed && (flags & MPOL_MF_STRICT))
+ err = -EIO;
+ if (!list_empty(&pagelist))
+ putback_movable_pages(&pagelist);
+mpol_out:
mpol_put(new);
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ lru_cache_enable();
return err;
}
/*
* User space interface with variable sized bitmaps for nodelists.
*/
+static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
+ unsigned long maxnode)
+{
+ unsigned long nlongs = BITS_TO_LONGS(maxnode);
+ int ret;
+
+ if (in_compat_syscall())
+ ret = compat_get_bitmap(mask,
+ (const compat_ulong_t __user *)nmask,
+ maxnode);
+ else
+ ret = copy_from_user(mask, nmask,
+ nlongs * sizeof(unsigned long));
+
+ if (ret)
+ return -EFAULT;
+
+ if (maxnode % BITS_PER_LONG)
+ mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
+
+ return 0;
+}
/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
unsigned long maxnode)
{
- unsigned long k;
- unsigned long nlongs;
- unsigned long endmask;
-
--maxnode;
nodes_clear(*nodes);
if (maxnode == 0 || !nmask)
@@ -1294,35 +1660,29 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
return -EINVAL;
- nlongs = BITS_TO_LONGS(maxnode);
- if ((maxnode % BITS_PER_LONG) == 0)
- endmask = ~0UL;
- else
- endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
+ /*
+ * When the user specifies more nodes than supported, just check
+ * that the unsupported part is all zero, one word at a time,
+ * starting at the end.
+ */
+ while (maxnode > MAX_NUMNODES) {
+ unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
+ unsigned long t;
- /* When the user specified more nodes than supported just check
- if the non supported part is all zero. */
- if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
- if (nlongs > PAGE_SIZE/sizeof(long))
- return -EINVAL;
- for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
- unsigned long t;
- if (get_user(t, nmask + k))
- return -EFAULT;
- if (k == nlongs - 1) {
- if (t & endmask)
- return -EINVAL;
- } else if (t)
- return -EINVAL;
+ if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
+ return -EFAULT;
+
+ if (maxnode - bits >= MAX_NUMNODES) {
+ maxnode -= bits;
+ } else {
+ maxnode = MAX_NUMNODES;
+ t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
}
- nlongs = BITS_TO_LONGS(MAX_NUMNODES);
- endmask = ~0UL;
+ if (t)
+ return -EINVAL;
}
- if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
- return -EFAULT;
- nodes_addr(*nodes)[nlongs-1] &= endmask;
- return 0;
+ return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}
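
For reference, a minimal user-space sketch (an illustration, not part of this patch) of the nodemask/maxnode convention that get_nodes() parses: the caller passes an array of unsigned longs plus a bit count one larger than the highest bit of interest (hence the --maxnode above). It assumes the uapi <linux/mempolicy.h> constants and the raw SYS_mbind syscall on a NUMA-capable kernel.

    /* Illustration only: bind a mapping to node 0 via mbind(2). */
    #include <linux/mempolicy.h>   /* MPOL_BIND */
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdio.h>

    int main(void)
    {
        size_t len = 4UL << 20;
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        /* one word of node bits: node 0 only */
        unsigned long nodemask[1] = { 1UL << 0 };
        /* maxnode counts one past the bits in the mask (see --maxnode) */
        unsigned long maxnode = 8 * sizeof(nodemask) + 1;

        if (buf == MAP_FAILED)
            return 1;
        if (syscall(SYS_mbind, buf, len, MPOL_BIND, nodemask, maxnode, 0))
            perror("mbind");
        return 0;
    }
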
/* Copy a kernel node mask to user space */
@@ -1330,7 +1690,11 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
nodemask_t *nodes)
{
unsigned long copy = ALIGN(maxnode-1, 64) / 8;
- const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+ unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
+ bool compat = in_compat_syscall();
+
+ if (compat)
+ nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
if (copy > nbytes) {
if (copy > PAGE_SIZE)
@@ -1338,56 +1702,160 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
if (clear_user((char __user *)mask + nbytes, copy - nbytes))
return -EFAULT;
copy = nbytes;
+ maxnode = nr_node_ids;
}
+
+ if (compat)
+ return compat_put_bitmap((compat_ulong_t __user *)mask,
+ nodes_addr(*nodes), maxnode);
+
return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
-SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
- unsigned long, mode, unsigned long __user *, nmask,
- unsigned long, maxnode, unsigned, flags)
+/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
+static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
- nodemask_t nodes;
- int err;
- unsigned short mode_flags;
+ *flags = *mode & MPOL_MODE_FLAGS;
+ *mode &= ~MPOL_MODE_FLAGS;
- mode_flags = mode & MPOL_MODE_FLAGS;
- mode &= ~MPOL_MODE_FLAGS;
- if (mode >= MPOL_MAX)
+ if ((unsigned int)(*mode) >= MPOL_MAX)
return -EINVAL;
- if ((mode_flags & MPOL_F_STATIC_NODES) &&
- (mode_flags & MPOL_F_RELATIVE_NODES))
+ if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
return -EINVAL;
+ if (*flags & MPOL_F_NUMA_BALANCING) {
+ if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
+ *flags |= (MPOL_F_MOF | MPOL_F_MORON);
+ else
+ return -EINVAL;
+ }
+ return 0;
+}
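
As an illustration of what sanitize_mpol_flags() unpacks, a hedged user-space sketch: the mode argument of set_mempolicy(2)/mbind(2) carries the policy mode in its low bits with optional MPOL_F_* mode flags ORed in. Constants come from the uapi <linux/mempolicy.h>; the call is made via the raw syscall for clarity.

    /* Illustration only: interleave over nodes 0 and 1 with static nodes. */
    #include <linux/mempolicy.h>   /* MPOL_INTERLEAVE, MPOL_F_STATIC_NODES */
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned long nodemask[1] = { (1UL << 0) | (1UL << 1) };
        int mode = MPOL_INTERLEAVE | MPOL_F_STATIC_NODES;

        if (syscall(SYS_set_mempolicy, mode, nodemask,
                    8 * sizeof(nodemask) + 1))
            perror("set_mempolicy");
        return 0;
    }
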
+
+static long kernel_mbind(unsigned long start, unsigned long len,
+ unsigned long mode, const unsigned long __user *nmask,
+ unsigned long maxnode, unsigned int flags)
+{
+ unsigned short mode_flags;
+ nodemask_t nodes;
+ int lmode = mode;
+ int err;
+
+ start = untagged_addr(start);
+ err = sanitize_mpol_flags(&lmode, &mode_flags);
+ if (err)
+ return err;
+
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
- return do_mbind(start, len, mode, mode_flags, &nodes, flags);
+
+ return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}
-/* Set the process memory policy */
-SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
- unsigned long, maxnode)
+SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
+ unsigned long, home_node, unsigned long, flags)
{
- int err;
- nodemask_t nodes;
- unsigned short flags;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
+ struct mempolicy *new, *old;
+ unsigned long end;
+ int err = -ENOENT;
+ VMA_ITERATOR(vmi, mm, start);
+
+ start = untagged_addr(start);
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ /*
+ * flags is reserved for future extensions.
+ */
+ if (flags != 0)
+ return -EINVAL;
- flags = mode & MPOL_MODE_FLAGS;
- mode &= ~MPOL_MODE_FLAGS;
- if ((unsigned int)mode >= MPOL_MAX)
+ /*
+ * Check home_node is online to avoid accessing uninitialized
+ * NODE_DATA.
+ */
+ if (home_node >= MAX_NUMNODES || !node_online(home_node))
return -EINVAL;
- if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
+
+ len = PAGE_ALIGN(len);
+ end = start + len;
+
+ if (end < start)
return -EINVAL;
+ if (end == start)
+ return 0;
+ mmap_write_lock(mm);
+ prev = vma_prev(&vmi);
+ for_each_vma_range(vmi, vma, end) {
+ /*
+ * If any vma in the range has a policy other than MPOL_BIND
+ * or MPOL_PREFERRED_MANY, we return an error. We don't reset
+ * the home node for vmas we already updated before.
+ */
+ old = vma_policy(vma);
+ if (!old) {
+ prev = vma;
+ continue;
+ }
+ if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
+ err = -EOPNOTSUPP;
+ break;
+ }
+ new = mpol_dup(old);
+ if (IS_ERR(new)) {
+ err = PTR_ERR(new);
+ break;
+ }
+
+ vma_start_write(vma);
+ new->home_node = home_node;
+ err = mbind_range(&vmi, vma, &prev, start, end, new);
+ mpol_put(new);
+ if (err)
+ break;
+ }
+ mmap_write_unlock(mm);
+ return err;
+}
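
A hedged user-space sketch of the new syscall above (illustration only; assumes kernel headers new enough to define SYS_set_mempolicy_home_node and MPOL_PREFERRED_MANY, roughly v5.17+, and a machine with at least two online NUMA nodes):

    /* Illustration only: prefer nodes 0-1, with node 1 as the home node. */
    #include <linux/mempolicy.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdio.h>

    int main(void)
    {
        size_t len = 4UL << 20;
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        unsigned long nodemask[1] = { (1UL << 0) | (1UL << 1) };

        if (buf == MAP_FAILED)
            return 1;
        /* The range must already carry MPOL_BIND or MPOL_PREFERRED_MANY. */
        if (syscall(SYS_mbind, buf, len, MPOL_PREFERRED_MANY,
                    nodemask, 8 * sizeof(nodemask) + 1, 0))
            perror("mbind");
        /* Then pick which node in the set should be tried first. */
        if (syscall(SYS_set_mempolicy_home_node, buf, len, 1UL, 0UL))
            perror("set_mempolicy_home_node");
        return 0;
    }
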
+
+SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
+ unsigned long, mode, const unsigned long __user *, nmask,
+ unsigned long, maxnode, unsigned int, flags)
+{
+ return kernel_mbind(start, len, mode, nmask, maxnode, flags);
+}
+
+/* Set the process memory policy */
+static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
+ unsigned long maxnode)
+{
+ unsigned short mode_flags;
+ nodemask_t nodes;
+ int lmode = mode;
+ int err;
+
+ err = sanitize_mpol_flags(&lmode, &mode_flags);
+ if (err)
+ return err;
+
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
- return do_set_mempolicy(mode, flags, &nodes);
+
+ return do_set_mempolicy(lmode, mode_flags, &nodes);
}
-SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
- const unsigned long __user *, old_nodes,
- const unsigned long __user *, new_nodes)
+SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
+ unsigned long, maxnode)
+{
+ return kernel_set_mempolicy(mode, nmask, maxnode);
+}
+
+static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
+ const unsigned long __user *old_nodes,
+ const unsigned long __user *new_nodes)
{
- const struct cred *cred = current_cred(), *tcred;
struct mm_struct *mm = NULL;
struct task_struct *task;
nodemask_t task_nodes;
@@ -1423,15 +1891,10 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
err = -EINVAL;
/*
- * Check if this process has the right to modify the specified
- * process. The right exists if the process has administrative
- * capabilities, superuser privileges or the same
- * userid as the target process.
+ * Check if this process has the right to modify the specified process.
+ * Use the regular "ptrace_may_access()" checks.
*/
- tcred = __task_cred(task);
- if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
- !capable(CAP_SYS_NICE)) {
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
rcu_read_unlock();
err = -EPERM;
goto out_put;
@@ -1445,10 +1908,10 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
goto out_put;
}
- if (!nodes_subset(*new, node_states[N_MEMORY])) {
- err = -EINVAL;
+ task_nodes = cpuset_mems_allowed(current);
+ nodes_and(*new, *new, task_nodes);
+ if (nodes_empty(*new))
goto out_put;
- }
err = security_task_movememory(task);
if (err)
@@ -1474,22 +1937,31 @@ out:
out_put:
put_task_struct(task);
goto out;
-
}
+SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
+ const unsigned long __user *, old_nodes,
+ const unsigned long __user *, new_nodes)
+{
+ return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
+}
/* Retrieve NUMA policy */
-SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
- unsigned long __user *, nmask, unsigned long, maxnode,
- unsigned long, addr, unsigned long, flags)
+static int kernel_get_mempolicy(int __user *policy,
+ unsigned long __user *nmask,
+ unsigned long maxnode,
+ unsigned long addr,
+ unsigned long flags)
{
int err;
- int uninitialized_var(pval);
+ int pval;
nodemask_t nodes;
- if (nmask != NULL && maxnode < MAX_NUMNODES)
+ if (nmask != NULL && maxnode < nr_node_ids)
return -EINVAL;
+ addr = untagged_addr(addr);
+
err = do_get_mempolicy(&pval, &nodes, addr, flags);
if (err)
@@ -1504,347 +1976,360 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
return err;
}
-#ifdef CONFIG_COMPAT
-
-asmlinkage long compat_sys_get_mempolicy(int __user *policy,
- compat_ulong_t __user *nmask,
- compat_ulong_t maxnode,
- compat_ulong_t addr, compat_ulong_t flags)
+SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
+ unsigned long __user *, nmask, unsigned long, maxnode,
+ unsigned long, addr, unsigned long, flags)
{
- long err;
- unsigned long __user *nm = NULL;
- unsigned long nr_bits, alloc_size;
- DECLARE_BITMAP(bm, MAX_NUMNODES);
-
- nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
- alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
-
- if (nmask)
- nm = compat_alloc_user_space(alloc_size);
-
- err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
-
- if (!err && nmask) {
- unsigned long copy_size;
- copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
- err = copy_from_user(bm, nm, copy_size);
- /* ensure entire bitmap is zeroed */
- err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
- err |= compat_put_bitmap(nmask, bm, nr_bits);
- }
-
- return err;
+ return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
-asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
- compat_ulong_t maxnode)
+bool vma_migratable(struct vm_area_struct *vma)
{
- long err = 0;
- unsigned long __user *nm = NULL;
- unsigned long nr_bits, alloc_size;
- DECLARE_BITMAP(bm, MAX_NUMNODES);
-
- nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
- alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ return false;
- if (nmask) {
- err = compat_get_bitmap(bm, nmask, nr_bits);
- nm = compat_alloc_user_space(alloc_size);
- err |= copy_to_user(nm, bm, alloc_size);
- }
+ /*
+ * DAX device mappings require predictable access latency, so avoid
+ * incurring periodic faults.
+ */
+ if (vma_is_dax(vma))
+ return false;
- if (err)
- return -EFAULT;
+ if (is_vm_hugetlb_page(vma) &&
+ !hugepage_migration_supported(hstate_vma(vma)))
+ return false;
- return sys_set_mempolicy(mode, nm, nr_bits+1);
+ /*
+ * Migration allocates pages in the highest zone. If we cannot
+ * do so then migration (at least from node to node) is not
+ * possible.
+ */
+ if (vma->vm_file &&
+ gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
+ < policy_zone)
+ return false;
+ return true;
}
-asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
- compat_ulong_t mode, compat_ulong_t __user *nmask,
- compat_ulong_t maxnode, compat_ulong_t flags)
+struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
+ unsigned long addr, pgoff_t *ilx)
{
- long err = 0;
- unsigned long __user *nm = NULL;
- unsigned long nr_bits, alloc_size;
- nodemask_t bm;
-
- nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
- alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
-
- if (nmask) {
- err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
- nm = compat_alloc_user_space(alloc_size);
- err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
- }
-
- if (err)
- return -EFAULT;
-
- return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
+ *ilx = 0;
+ return (vma->vm_ops && vma->vm_ops->get_policy) ?
+ vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
}
-#endif
-
/*
- * get_vma_policy(@task, @vma, @addr)
- * @task - task for fallback if vma policy == default
- * @vma - virtual memory area whose policy is sought
- * @addr - address in @vma for shared policy lookup
+ * get_vma_policy(@vma, @addr, @order, @ilx)
+ * @vma: virtual memory area whose policy is sought
+ * @addr: address in @vma for shared policy lookup
+ * @order: 0, or appropriate huge_page_order for interleaving
+ * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
+ * MPOL_WEIGHTED_INTERLEAVE
*
* Returns effective policy for a VMA at specified address.
- * Falls back to @task or system default policy, as necessary.
- * Current or other task's task mempolicy and non-shared vma policies must be
- * protected by task_lock(task) by the caller.
+ * Falls back to current->mempolicy or system default policy, as necessary.
* Shared policies [those marked as MPOL_F_SHARED] require an extra reference
* count--added by the get_policy() vm_op, as appropriate--to protect against
* freeing by another task. It is the caller's responsibility to free the
* extra reference for shared policies.
*/
-struct mempolicy *get_vma_policy(struct task_struct *task,
- struct vm_area_struct *vma, unsigned long addr)
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+ unsigned long addr, int order, pgoff_t *ilx)
{
- struct mempolicy *pol = get_task_policy(task);
+ struct mempolicy *pol;
- if (vma) {
- if (vma->vm_ops && vma->vm_ops->get_policy) {
- struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
- addr);
- if (vpol)
- pol = vpol;
- } else if (vma->vm_policy) {
- pol = vma->vm_policy;
+ pol = __get_vma_policy(vma, addr, ilx);
+ if (!pol)
+ pol = get_task_policy(current);
+ if (pol->mode == MPOL_INTERLEAVE ||
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
+ *ilx += vma->vm_pgoff >> order;
+ *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
+ }
+ return pol;
+}
- /*
- * shmem_alloc_page() passes MPOL_F_SHARED policy with
- * a pseudo vma whose vma->vm_ops=NULL. Take a reference
- * count on these policies which will be dropped by
- * mpol_cond_put() later
- */
- if (mpol_needs_cond_ref(pol))
- mpol_get(pol);
- }
+bool vma_policy_mof(struct vm_area_struct *vma)
+{
+ struct mempolicy *pol;
+
+ if (vma->vm_ops && vma->vm_ops->get_policy) {
+ bool ret = false;
+ pgoff_t ilx; /* ignored here */
+
+ pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
+ if (pol && (pol->flags & MPOL_F_MOF))
+ ret = true;
+ mpol_cond_put(pol);
+
+ return ret;
}
+
+ pol = vma->vm_policy;
if (!pol)
- pol = &default_policy;
- return pol;
+ pol = get_task_policy(current);
+
+ return pol->flags & MPOL_F_MOF;
}
-static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
+bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
enum zone_type dynamic_policy_zone = policy_zone;
BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
/*
- * if policy->v.nodes has movable memory only,
+ * if policy->nodes has movable memory only,
* we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
*
- * policy->v.nodes is intersect with node_states[N_MEMORY].
- * so if the following test faile, it implies
- * policy->v.nodes has movable memory only.
+ * policy->nodes has already been intersected with node_states[N_MEMORY],
+ * so if the following test fails, it implies
+ * policy->nodes has movable memory only.
*/
- if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
+ if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
dynamic_policy_zone = ZONE_MOVABLE;
return zone >= dynamic_policy_zone;
}
-/*
- * Return a nodemask representing a mempolicy for filtering nodes for
- * page allocation
- */
-static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
+static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
{
- /* Lower zones don't get a nodemask applied for MPOL_BIND */
- if (unlikely(policy->mode == MPOL_BIND) &&
- apply_policy_zone(policy, gfp_zone(gfp)) &&
- cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
- return &policy->v.nodes;
-
- return NULL;
-}
+ unsigned int node;
+ unsigned int cpuset_mems_cookie;
-/* Return a zonelist indicated by gfp for node representing a mempolicy */
-static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
- int nd)
-{
- switch (policy->mode) {
- case MPOL_PREFERRED:
- if (!(policy->flags & MPOL_F_LOCAL))
- nd = policy->v.preferred_node;
- break;
- case MPOL_BIND:
- /*
- * Normally, MPOL_BIND allocations are node-local within the
- * allowed nodemask. However, if __GFP_THISNODE is set and the
- * current node isn't part of the mask, we use the zonelist for
- * the first node in the mask instead.
- */
- if (unlikely(gfp & __GFP_THISNODE) &&
- unlikely(!node_isset(nd, policy->v.nodes)))
- nd = first_node(policy->v.nodes);
- break;
- default:
- BUG();
+retry:
+ /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ node = current->il_prev;
+ if (!current->il_weight || !node_isset(node, policy->nodes)) {
+ node = next_node_in(node, policy->nodes);
+ if (read_mems_allowed_retry(cpuset_mems_cookie))
+ goto retry;
+ if (node == MAX_NUMNODES)
+ return node;
+ current->il_prev = node;
+ current->il_weight = get_il_weight(node);
}
- return node_zonelist(nd, gfp);
+ current->il_weight--;
+ return node;
}
/* Do dynamic interleaving for a process */
-static unsigned interleave_nodes(struct mempolicy *policy)
+static unsigned int interleave_nodes(struct mempolicy *policy)
{
- unsigned nid, next;
- struct task_struct *me = current;
+ unsigned int nid;
+ unsigned int cpuset_mems_cookie;
+
+ /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
+ do {
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ nid = next_node_in(current->il_prev, policy->nodes);
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
- nid = me->il_next;
- next = next_node(nid, policy->v.nodes);
- if (next >= MAX_NUMNODES)
- next = first_node(policy->v.nodes);
- if (next < MAX_NUMNODES)
- me->il_next = next;
+ if (nid < MAX_NUMNODES)
+ current->il_prev = nid;
return nid;
}
/*
* Depending on the memory policy provide a node from which to allocate the
* next slab entry.
- * @policy must be protected by freeing by the caller. If @policy is
- * the current task's mempolicy, this protection is implicit, as only the
- * task can change it's policy. The system default policy requires no
- * such protection.
*/
-unsigned slab_node(void)
+unsigned int mempolicy_slab_node(void)
{
struct mempolicy *policy;
+ int node = numa_mem_id();
- if (in_interrupt())
- return numa_node_id();
+ if (!in_task())
+ return node;
policy = current->mempolicy;
- if (!policy || policy->flags & MPOL_F_LOCAL)
- return numa_node_id();
+ if (!policy)
+ return node;
switch (policy->mode) {
case MPOL_PREFERRED:
- /*
- * handled MPOL_F_LOCAL above
- */
- return policy->v.preferred_node;
+ return first_node(policy->nodes);
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
- case MPOL_BIND: {
+ case MPOL_WEIGHTED_INTERLEAVE:
+ return weighted_interleave_nodes(policy);
+
+ case MPOL_BIND:
+ case MPOL_PREFERRED_MANY:
+ {
+ struct zoneref *z;
+
/*
* Follow bind policy behavior and start allocation at the
* first node.
*/
struct zonelist *zonelist;
- struct zone *zone;
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
- zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
- (void)first_zones_zonelist(zonelist, highest_zoneidx,
- &policy->v.nodes,
- &zone);
- return zone ? zone->node : numa_node_id();
+ zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
+ z = first_zones_zonelist(zonelist, highest_zoneidx,
+ &policy->nodes);
+ return zonelist_zone(z) ? zonelist_node_idx(z) : node;
}
+ case MPOL_LOCAL:
+ return node;
default:
BUG();
}
}
-/* Do static interleaving for a VMA with known offset. */
-static unsigned offset_il_node(struct mempolicy *pol,
- struct vm_area_struct *vma, unsigned long off)
+static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
+ nodemask_t *mask)
{
- unsigned nnodes = nodes_weight(pol->v.nodes);
- unsigned target;
- int c;
- int nid = -1;
+ /*
+ * barrier stabilizes the nodemask locally so that it can be iterated
+ * over safely without concern for changes. Allocators validate node
+ * selection does not violate mems_allowed, so this is safe.
+ */
+ barrier();
+ memcpy(mask, &pol->nodes, sizeof(nodemask_t));
+ barrier();
+ return nodes_weight(*mask);
+}
- if (!nnodes)
+static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
+{
+ struct weighted_interleave_state *state;
+ nodemask_t nodemask;
+ unsigned int target, nr_nodes;
+ u8 *table = NULL;
+ unsigned int weight_total = 0;
+ u8 weight;
+ int nid = 0;
+
+ nr_nodes = read_once_policy_nodemask(pol, &nodemask);
+ if (!nr_nodes)
return numa_node_id();
- target = (unsigned int)off % nnodes;
- c = 0;
- do {
- nid = next_node(nid, pol->v.nodes);
- c++;
- } while (c <= target);
+
+ rcu_read_lock();
+
+ state = rcu_dereference(wi_state);
+ /* Uninitialized wi_state means we should assume all weights are 1 */
+ if (state)
+ table = state->iw_table;
+
+ /* calculate the total weight */
+ for_each_node_mask(nid, nodemask)
+ weight_total += table ? table[nid] : 1;
+
+ /* Calculate the node offset based on totals */
+ target = ilx % weight_total;
+ nid = first_node(nodemask);
+ while (target) {
+ /* detect system default usage */
+ weight = table ? table[nid] : 1;
+ if (target < weight)
+ break;
+ target -= weight;
+ nid = next_node_in(nid, nodemask);
+ }
+ rcu_read_unlock();
return nid;
}
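
A simplified stand-alone model of the walk above (no nodemask_t, no RCU, hypothetical helper name): with nodes {0, 1} and weights {2, 1}, weight_total is 3, so successive interleave indices map to nodes 0, 0, 1, 0, 0, 1, ...

    /* Illustration only: weighted-interleave node selection by index. */
    #include <stdio.h>

    static int wi_nid(unsigned long ilx, const int *nodes,
                      const unsigned char *weights, int nr_nodes)
    {
        unsigned int weight_total = 0;
        unsigned long target;
        int i;

        for (i = 0; i < nr_nodes; i++)
            weight_total += weights[i];

        target = ilx % weight_total;
        i = 0;
        while (target >= weights[i]) {   /* kernel: break when target < weight */
            target -= weights[i];
            i = (i + 1) % nr_nodes;      /* next_node_in() wraps around */
        }
        return nodes[i];
    }

    int main(void)
    {
        const int nodes[] = { 0, 1 };
        const unsigned char weights[] = { 2, 1 };
        unsigned long ilx;

        for (ilx = 0; ilx < 6; ilx++)
            printf("ilx %lu -> node %d\n", ilx, wi_nid(ilx, nodes, weights, 2));
        return 0;
    }
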
-/* Determine a node number for interleave */
-static inline unsigned interleave_nid(struct mempolicy *pol,
- struct vm_area_struct *vma, unsigned long addr, int shift)
+/*
+ * Do static interleaving for interleave index @ilx. Returns the ilx'th
+ * node in pol->nodes (starting from ilx=0), wrapping around if ilx
+ * exceeds the number of present nodes.
+ */
+static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
- if (vma) {
- unsigned long off;
+ nodemask_t nodemask;
+ unsigned int target, nnodes;
+ int i;
+ int nid;
- /*
- * for small pages, there is no difference between
- * shift and PAGE_SHIFT, so the bit-shift is safe.
- * for huge pages, since vm_pgoff is in units of small
- * pages, we need to shift off the always 0 bits to get
- * a useful offset.
- */
- BUG_ON(shift < PAGE_SHIFT);
- off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
- off += (addr - vma->vm_start) >> shift;
- return offset_il_node(pol, vma, off);
- } else
- return interleave_nodes(pol);
+ nnodes = read_once_policy_nodemask(pol, &nodemask);
+ if (!nnodes)
+ return numa_node_id();
+ target = ilx % nnodes;
+ nid = first_node(nodemask);
+ for (i = 0; i < target; i++)
+ nid = next_node(nid, nodemask);
+ return nid;
}
/*
- * Return the bit number of a random bit set in the nodemask.
- * (returns -1 if nodemask is empty)
+ * Return a nodemask representing a mempolicy for filtering nodes for
+ * page allocation, together with preferred node id (or the input node id).
*/
-int node_random(const nodemask_t *maskp)
+static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
+ pgoff_t ilx, int *nid)
{
- int w, bit = -1;
+ nodemask_t *nodemask = NULL;
+
+ switch (pol->mode) {
+ case MPOL_PREFERRED:
+ /* Override input node id */
+ *nid = first_node(pol->nodes);
+ break;
+ case MPOL_PREFERRED_MANY:
+ nodemask = &pol->nodes;
+ if (pol->home_node != NUMA_NO_NODE)
+ *nid = pol->home_node;
+ break;
+ case MPOL_BIND:
+ /* Restrict to nodemask (but not on lower zones) */
+ if (apply_policy_zone(pol, gfp_zone(gfp)) &&
+ cpuset_nodemask_valid_mems_allowed(&pol->nodes))
+ nodemask = &pol->nodes;
+ if (pol->home_node != NUMA_NO_NODE)
+ *nid = pol->home_node;
+ /*
+ * __GFP_THISNODE shouldn't even be used with the bind policy
+ * because we might easily break the expectation to stay on the
+ * requested node and not break the policy.
+ */
+ WARN_ON_ONCE(gfp & __GFP_THISNODE);
+ break;
+ case MPOL_INTERLEAVE:
+ /* Override input node id */
+ *nid = (ilx == NO_INTERLEAVE_INDEX) ?
+ interleave_nodes(pol) : interleave_nid(pol, ilx);
+ break;
+ case MPOL_WEIGHTED_INTERLEAVE:
+ *nid = (ilx == NO_INTERLEAVE_INDEX) ?
+ weighted_interleave_nodes(pol) :
+ weighted_interleave_nid(pol, ilx);
+ break;
+ }
- w = nodes_weight(*maskp);
- if (w)
- bit = bitmap_ord_to_pos(maskp->bits,
- get_random_int() % w, MAX_NUMNODES);
- return bit;
+ return nodemask;
}
#ifdef CONFIG_HUGETLBFS
/*
- * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
- * @vma = virtual memory area whose policy is sought
- * @addr = address in @vma for shared policy lookup and interleave policy
- * @gfp_flags = for requested zone
- * @mpol = pointer to mempolicy pointer for reference counted mempolicy
- * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
+ * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
+ * @vma: virtual memory area whose policy is sought
+ * @addr: address in @vma for shared policy lookup and interleave policy
+ * @gfp_flags: for requested zone
+ * @mpol: pointer to mempolicy pointer for reference counted mempolicy
+ * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
*
- * Returns a zonelist suitable for a huge page allocation and a pointer
+ * Returns a nid suitable for a huge page allocation and a pointer
* to the struct mempolicy for conditional unref after allocation.
- * If the effective policy is 'BIND, returns a pointer to the mempolicy's
- * @nodemask for filtering the zonelist.
- *
- * Must be protected by get_mems_allowed()
+ * If the effective policy is 'bind' or 'prefer-many', returns a pointer
+ * to the mempolicy's @nodemask for filtering the zonelist.
*/
-struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
- gfp_t gfp_flags, struct mempolicy **mpol,
- nodemask_t **nodemask)
+int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
+ struct mempolicy **mpol, nodemask_t **nodemask)
{
- struct zonelist *zl;
-
- *mpol = get_vma_policy(current, vma, addr);
- *nodemask = NULL; /* assume !MPOL_BIND */
+ pgoff_t ilx;
+ int nid;
- if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
- zl = node_zonelist(interleave_nid(*mpol, vma, addr,
- huge_page_shift(hstate_vma(vma))), gfp_flags);
- } else {
- zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
- if ((*mpol)->mode == MPOL_BIND)
- *nodemask = &(*mpol)->v.nodes;
- }
- return zl;
+ nid = numa_node_id();
+ *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
+ *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
+ return nid;
}
/*
@@ -1866,7 +2351,6 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
struct mempolicy *mempolicy;
- int nid;
if (!(mask && current->mempolicy))
return false;
@@ -1875,17 +2359,15 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
mempolicy = current->mempolicy;
switch (mempolicy->mode) {
case MPOL_PREFERRED:
- if (mempolicy->flags & MPOL_F_LOCAL)
- nid = numa_node_id();
- else
- nid = mempolicy->v.preferred_node;
- init_nodemask_of_node(mask, nid);
- break;
-
+ case MPOL_PREFERRED_MANY:
case MPOL_BIND:
- /* Fall through */
case MPOL_INTERLEAVE:
- *mask = mempolicy->v.nodes;
+ case MPOL_WEIGHTED_INTERLEAVE:
+ *mask = mempolicy->nodes;
+ break;
+
+ case MPOL_LOCAL:
+ init_nodemask_of_node(mask, numa_node_id());
break;
default:
@@ -1898,16 +2380,16 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
#endif
/*
- * mempolicy_nodemask_intersects
+ * mempolicy_in_oom_domain
*
- * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
- * policy. Otherwise, check for intersection between mask and the policy
- * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local'
- * policy, always return true since it may allocate elsewhere on fallback.
+ * If tsk's mempolicy is "bind", check for intersection between mask and
+ * the policy nodemask. Otherwise, return true for all other policies
+ * including "interleave", as a tsk with "interleave" policy may have
+ * memory allocated from all nodes in system.
*
* Takes task_lock(tsk) to prevent freeing of its mempolicy.
*/
-bool mempolicy_nodemask_intersects(struct task_struct *tsk,
+bool mempolicy_in_oom_domain(struct task_struct *tsk,
const nodemask_t *mask)
{
struct mempolicy *mempolicy;
@@ -1915,151 +2397,413 @@ bool mempolicy_nodemask_intersects(struct task_struct *tsk,
if (!mask)
return ret;
+
task_lock(tsk);
mempolicy = tsk->mempolicy;
- if (!mempolicy)
- goto out;
-
- switch (mempolicy->mode) {
- case MPOL_PREFERRED:
- /*
- * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
- * allocate from, they may fallback to other nodes when oom.
- * Thus, it's possible for tsk to have allocated memory from
- * nodes in mask.
- */
- break;
- case MPOL_BIND:
- case MPOL_INTERLEAVE:
- ret = nodes_intersects(mempolicy->v.nodes, *mask);
- break;
- default:
- BUG();
- }
-out:
+ if (mempolicy && mempolicy->mode == MPOL_BIND)
+ ret = nodes_intersects(mempolicy->nodes, *mask);
task_unlock(tsk);
+
return ret;
}
-/* Allocate a page in interleaved policy.
- Own path because it needs to do special accounting. */
-static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
- unsigned nid)
+static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
+ int nid, nodemask_t *nodemask)
{
- struct zonelist *zl;
struct page *page;
+ gfp_t preferred_gfp;
+
+ /*
+ * This is a two pass approach. The first pass will only try the
+ * preferred nodes but skip the direct reclaim and allow the
+ * allocation to fail, while the second pass will try all the
+ * nodes in system.
+ */
+ preferred_gfp = gfp | __GFP_NOWARN;
+ preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
+ page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
+ if (!page)
+ page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
- zl = node_zonelist(nid, gfp);
- page = __alloc_pages(gfp, order, zl);
- if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
- inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
return page;
}
/**
- * alloc_pages_vma - Allocate a page for a VMA.
- *
- * @gfp:
- * %GFP_USER user allocation.
- * %GFP_KERNEL kernel allocations,
- * %GFP_HIGHMEM highmem/user allocations,
- * %GFP_FS allocation should not call back into a file system.
- * %GFP_ATOMIC don't sleep.
- *
- * @order:Order of the GFP allocation.
- * @vma: Pointer to VMA or NULL if not available.
- * @addr: Virtual Address of the allocation. Must be inside the VMA.
+ * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
+ * @gfp: GFP flags.
+ * @order: Order of the page allocation.
+ * @pol: Pointer to the NUMA mempolicy.
+ * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
+ * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
*
- * This function allocates a page from the kernel page pool and applies
- * a NUMA policy associated with the VMA or the current process.
- * When VMA is not NULL caller must hold down_read on the mmap_sem of the
- * mm_struct of the VMA to prevent it from going away. Should be used for
- * all allocations for pages that will be mapped into
- * user space. Returns NULL when no page can be allocated.
- *
- * Should be called with the mm_sem of the vma hold.
+ * Return: The page on success or NULL if allocation fails.
*/
-struct page *
-alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
- unsigned long addr, int node)
+static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+ struct mempolicy *pol, pgoff_t ilx, int nid)
{
- struct mempolicy *pol;
+ nodemask_t *nodemask;
struct page *page;
- unsigned int cpuset_mems_cookie;
-retry_cpuset:
- pol = get_vma_policy(current, vma, addr);
- cpuset_mems_cookie = get_mems_allowed();
+ nodemask = policy_nodemask(gfp, pol, ilx, &nid);
- if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
- unsigned nid;
+ if (pol->mode == MPOL_PREFERRED_MANY)
+ return alloc_pages_preferred_many(gfp, order, nid, nodemask);
+
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ /* filter "hugepage" allocation, unless from alloc_pages() */
+ order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
+ /*
+ * For hugepage allocation and non-interleave policy which
+ * allows the current node (or other explicitly preferred
+ * node) we only try to allocate from the current/preferred
+ * node and don't fall back to other nodes, as the cost of
+ * remote accesses would likely offset THP benefits.
+ *
+ * If the policy is interleave or does not allow the current
+ * node in its nodemask, we allocate the standard way.
+ */
+ if (pol->mode != MPOL_INTERLEAVE &&
+ pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
+ (!nodemask || node_isset(nid, *nodemask))) {
+ /*
+ * First, try to allocate THP only on local node, but
+ * don't reclaim unnecessarily, just compact.
+ */
+ page = __alloc_frozen_pages_noprof(
+ gfp | __GFP_THISNODE | __GFP_NORETRY, order,
+ nid, NULL);
+ if (page || !(gfp & __GFP_DIRECT_RECLAIM))
+ return page;
+ /*
+ * If hugepage allocations are configured to always
+ * synchronous compact or the vma has been madvised
+ * to prefer hugepage backing, retry allowing remote
+ * memory with both reclaim and compact as well.
+ */
+ }
+ }
+
+ page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
+
+ if (unlikely(pol->mode == MPOL_INTERLEAVE ||
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
+ /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
+ if (static_branch_likely(&vm_numa_stat_key) &&
+ page_to_nid(page) == nid) {
+ preempt_disable();
+ __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
+ preempt_enable();
+ }
+ }
- nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
- mpol_cond_put(pol);
- page = alloc_page_interleave(gfp, order, nid);
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
- goto retry_cpuset;
-
- return page;
- }
- page = __alloc_pages_nodemask(gfp, order,
- policy_zonelist(gfp, pol, node),
- policy_nodemask(gfp, pol));
- if (unlikely(mpol_needs_cond_ref(pol)))
- __mpol_put(pol);
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
- goto retry_cpuset;
return page;
}
+struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
+ struct mempolicy *pol, pgoff_t ilx, int nid)
+{
+ struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
+ ilx, nid);
+ if (!page)
+ return NULL;
+
+ set_page_refcounted(page);
+ return page_rmappable_folio(page);
+}
+
/**
- * alloc_pages_current - Allocate pages.
+ * vma_alloc_folio - Allocate a folio for a VMA.
+ * @gfp: GFP flags.
+ * @order: Order of the folio.
+ * @vma: Pointer to VMA.
+ * @addr: Virtual address of the allocation. Must be inside @vma.
*
- * @gfp:
- * %GFP_USER user allocation,
- * %GFP_KERNEL kernel allocation,
- * %GFP_HIGHMEM highmem allocation,
- * %GFP_FS don't call back into a file system.
- * %GFP_ATOMIC don't sleep.
- * @order: Power of two of allocation size in pages. 0 is a single page.
+ * Allocate a folio for a specific address in @vma, using the appropriate
+ * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
+ * VMA to prevent it from going away. Should be used for all allocations
+ * for folios that will be mapped into user space, excepting hugetlbfs, and
+ * excepting where direct use of folio_alloc_mpol() is more appropriate.
*
- * Allocate a page from the kernel page pool. When not in
- * interrupt context and apply the current process NUMA policy.
- * Returns NULL when no page can be allocated.
- *
- * Don't call cpuset_update_task_memory_state() unless
- * 1) it's ok to take cpuset_sem (can WAIT), and
- * 2) allocating for current task (not interrupt).
+ * Return: The folio on success or NULL if allocation fails.
*/
-struct page *alloc_pages_current(gfp_t gfp, unsigned order)
+struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
+ unsigned long addr)
{
- struct mempolicy *pol = get_task_policy(current);
- struct page *page;
- unsigned int cpuset_mems_cookie;
+ struct mempolicy *pol;
+ pgoff_t ilx;
+ struct folio *folio;
- if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
- pol = &default_policy;
+ if (vma->vm_flags & VM_DROPPABLE)
+ gfp |= __GFP_NOWARN;
-retry_cpuset:
- cpuset_mems_cookie = get_mems_allowed();
+ pol = get_vma_policy(vma, addr, order, &ilx);
+ folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
+ mpol_cond_put(pol);
+ return folio;
+}
+EXPORT_SYMBOL(vma_alloc_folio_noprof);
+
+struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
+{
+ struct mempolicy *pol = &default_policy;
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
*/
- if (pol->mode == MPOL_INTERLEAVE)
- page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
- else
- page = __alloc_pages_nodemask(gfp, order,
- policy_zonelist(gfp, pol, numa_node_id()),
- policy_nodemask(gfp, pol));
+ if (!in_interrupt() && !(gfp & __GFP_THISNODE))
+ pol = get_task_policy(current);
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
- goto retry_cpuset;
+ return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
+ numa_node_id());
+}
+/**
+ * alloc_pages - Allocate pages.
+ * @gfp: GFP flags.
+ * @order: Power of two of number of pages to allocate.
+ *
+ * Allocate 1 << @order contiguous pages. The physical address of the
+ * first page is naturally aligned (eg an order-3 allocation will be aligned
+ * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
+ * process is honoured when in process context.
+ *
+ * Context: Can be called from any context, providing the appropriate GFP
+ * flags are used.
+ * Return: The page on success or NULL if allocation fails.
+ */
+struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
+{
+ struct page *page = alloc_frozen_pages_noprof(gfp, order);
+
+ if (page)
+ set_page_refcounted(page);
return page;
}
-EXPORT_SYMBOL(alloc_pages_current);
+EXPORT_SYMBOL(alloc_pages_noprof);
+
+struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
+{
+ return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order));
+}
+EXPORT_SYMBOL(folio_alloc_noprof);
+
+static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
+ struct mempolicy *pol, unsigned long nr_pages,
+ struct page **page_array)
+{
+ int nodes;
+ unsigned long nr_pages_per_node;
+ int delta;
+ int i;
+ unsigned long nr_allocated;
+ unsigned long total_allocated = 0;
+
+ nodes = nodes_weight(pol->nodes);
+ nr_pages_per_node = nr_pages / nodes;
+ delta = nr_pages - nodes * nr_pages_per_node;
+
+ for (i = 0; i < nodes; i++) {
+ if (delta) {
+ nr_allocated = alloc_pages_bulk_noprof(gfp,
+ interleave_nodes(pol), NULL,
+ nr_pages_per_node + 1,
+ page_array);
+ delta--;
+ } else {
+ nr_allocated = alloc_pages_bulk_noprof(gfp,
+ interleave_nodes(pol), NULL,
+ nr_pages_per_node, page_array);
+ }
+
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+ }
+
+ return total_allocated;
+}
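
In plain arithmetic (illustration only), the split above hands out nr_pages / nodes pages per node, plus one extra page per node until the remainder ("delta") runs out; e.g. 10 pages over 3 nodes become 4 + 3 + 3.

    /* Illustration only: how delta tops up the even per-node split. */
    #include <stdio.h>

    int main(void)
    {
        unsigned long nr_pages = 10;
        int nodes = 3, i;
        unsigned long per_node = nr_pages / nodes;     /* 3 */
        int delta = nr_pages - nodes * per_node;       /* 1 */

        for (i = 0; i < nodes; i++) {
            unsigned long chunk = per_node + (delta ? 1 : 0);

            if (delta)
                delta--;
            printf("node %d: %lu pages\n", i, chunk);
        }
        return 0;
    }
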
+
+static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
+ struct mempolicy *pol, unsigned long nr_pages,
+ struct page **page_array)
+{
+ struct weighted_interleave_state *state;
+ struct task_struct *me = current;
+ unsigned int cpuset_mems_cookie;
+ unsigned long total_allocated = 0;
+ unsigned long nr_allocated = 0;
+ unsigned long rounds;
+ unsigned long node_pages, delta;
+ u8 *weights, weight;
+ unsigned int weight_total = 0;
+ unsigned long rem_pages = nr_pages;
+ nodemask_t nodes;
+ int nnodes, node;
+ int resume_node = MAX_NUMNODES - 1;
+ u8 resume_weight = 0;
+ int prev_node;
+ int i;
+
+ if (!nr_pages)
+ return 0;
+
+ /* read the nodes onto the stack, retry if done during rebind */
+ do {
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ nnodes = read_once_policy_nodemask(pol, &nodes);
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
+
+ /* if the nodemask has become invalid, we cannot do anything */
+ if (!nnodes)
+ return 0;
+
+ /* Continue allocating from most recent node and adjust the nr_pages */
+ node = me->il_prev;
+ weight = me->il_weight;
+ if (weight && node_isset(node, nodes)) {
+ node_pages = min(rem_pages, weight);
+ nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+ page_array);
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+ /* if that's all the pages, no need to interleave */
+ if (rem_pages <= weight) {
+ me->il_weight -= rem_pages;
+ return total_allocated;
+ }
+ /* Otherwise we adjust remaining pages, continue from there */
+ rem_pages -= weight;
+ }
+ /* clear active weight in case of an allocation failure */
+ me->il_weight = 0;
+ prev_node = node;
+
+ /* create a local copy of node weights to operate on outside rcu */
+ weights = kzalloc(nr_node_ids, GFP_KERNEL);
+ if (!weights)
+ return total_allocated;
+
+ rcu_read_lock();
+ state = rcu_dereference(wi_state);
+ if (state) {
+ memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
+ rcu_read_unlock();
+ } else {
+ rcu_read_unlock();
+ for (i = 0; i < nr_node_ids; i++)
+ weights[i] = 1;
+ }
+
+ /* calculate total, detect system default usage */
+ for_each_node_mask(node, nodes)
+ weight_total += weights[node];
+
+ /*
+ * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
+ * Track which node weighted interleave should resume from.
+ *
+ * If (rounds > 0) and (delta == 0), resume_node will always be
+ * the node following prev_node, with resume_weight set to that
+ * node's full weight.
+ */
+ rounds = rem_pages / weight_total;
+ delta = rem_pages % weight_total;
+ resume_node = next_node_in(prev_node, nodes);
+ resume_weight = weights[resume_node];
+ for (i = 0; i < nnodes; i++) {
+ node = next_node_in(prev_node, nodes);
+ weight = weights[node];
+ node_pages = weight * rounds;
+ /* If a delta exists, add this node's portion of the delta */
+ if (delta > weight) {
+ node_pages += weight;
+ delta -= weight;
+ } else if (delta) {
+ /* when delta is depleted, resume from that node */
+ node_pages += delta;
+ resume_node = node;
+ resume_weight = weight - delta;
+ delta = 0;
+ }
+ /* node_pages can be 0 if an allocation fails and rounds == 0 */
+ if (!node_pages)
+ break;
+ nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+ page_array);
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+ if (total_allocated == nr_pages)
+ break;
+ prev_node = node;
+ }
+ me->il_prev = resume_node;
+ me->il_weight = resume_weight;
+ kfree(weights);
+ return total_allocated;
+}
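
A worked example of the rounds/delta bookkeeping above, using ordinary arrays instead of nodemask_t and omitting the allocation-failure early exits (illustration only): nodes {0, 1}, weights {2, 1}, 10 pages remaining, prev_node == 1. Then weight_total = 3, rounds = 3, delta = 1, so node 0 gets 2*3 + 1 = 7 pages, node 1 gets 1*3 = 3 pages, and the next weighted-interleave allocation resumes on node 0 with a remaining weight of 1.

    /* Illustration only: per-node page counts and the resume point. */
    #include <stdio.h>

    int main(void)
    {
        const int nodes[] = { 0, 1 };
        const unsigned char weights[] = { 2, 1 };
        int nr_nodes = 2, prev = 1, i;
        unsigned long rem_pages = 10;

        unsigned int weight_total = weights[0] + weights[1];   /* 3 */
        unsigned long rounds = rem_pages / weight_total;       /* 3 */
        unsigned long delta = rem_pages % weight_total;        /* 1 */
        int resume = (prev + 1) % nr_nodes;
        unsigned char resume_weight = weights[resume];

        for (i = 0; i < nr_nodes; i++) {
            int node = (prev + 1) % nr_nodes;
            unsigned char weight = weights[node];
            unsigned long node_pages = weight * rounds;

            if (delta > weight) {
                node_pages += weight;
                delta -= weight;
            } else if (delta) {
                node_pages += delta;
                resume = node;
                resume_weight = weight - delta;
                delta = 0;
            }
            printf("node %d: %lu pages\n", nodes[node], node_pages);
            prev = node;
        }
        printf("resume on node %d, weight %u\n", nodes[resume], resume_weight);
        return 0;
    }
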
+
+static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
+ struct mempolicy *pol, unsigned long nr_pages,
+ struct page **page_array)
+{
+ gfp_t preferred_gfp;
+ unsigned long nr_allocated = 0;
+
+ preferred_gfp = gfp | __GFP_NOWARN;
+ preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
+
+ nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
+ nr_pages, page_array);
+
+ if (nr_allocated < nr_pages)
+ nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
+ nr_pages - nr_allocated,
+ page_array + nr_allocated);
+ return nr_allocated;
+}
+
+/* Bulk page allocation and mempolicy should be considered at the
+ * same time in some situations, such as vmalloc.
+ *
+ * It can accelerate memory allocation, especially for interleaved
+ * allocations.
+ */
+unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
+ unsigned long nr_pages, struct page **page_array)
+{
+ struct mempolicy *pol = &default_policy;
+ nodemask_t *nodemask;
+ int nid;
+
+ if (!in_interrupt() && !(gfp & __GFP_THISNODE))
+ pol = get_task_policy(current);
+
+ if (pol->mode == MPOL_INTERLEAVE)
+ return alloc_pages_bulk_interleave(gfp, pol,
+ nr_pages, page_array);
+
+ if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
+ return alloc_pages_bulk_weighted_interleave(
+ gfp, pol, nr_pages, page_array);
+
+ if (pol->mode == MPOL_PREFERRED_MANY)
+ return alloc_pages_bulk_preferred_many(gfp,
+ numa_node_id(), pol, nr_pages, page_array);
+
+ nid = numa_node_id();
+ nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
+ return alloc_pages_bulk_noprof(gfp, nid, nodemask,
+ nr_pages, page_array);
+}
+
+int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
+{
+ struct mempolicy *pol = mpol_dup(src->vm_policy);
+
+ if (IS_ERR(pol))
+ return PTR_ERR(pol);
+ dst->vm_policy = pol;
+ return 0;
+}
/*
* If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
@@ -2088,15 +2832,10 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
} else
*new = *old;
- rcu_read_lock();
if (current_cpuset_is_being_rebound()) {
nodemask_t mems = cpuset_mems_allowed(current);
- if (new->flags & MPOL_F_REBINDING)
- mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
- else
- mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
+ mpol_rebind_policy(new, &mems);
}
- rcu_read_unlock();
atomic_set(&new->refcnt, 1);
return new;
}
@@ -2110,17 +2849,21 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
return false;
if (a->flags != b->flags)
return false;
+ if (a->home_node != b->home_node)
+ return false;
if (mpol_store_user_nodemask(a))
if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
return false;
switch (a->mode) {
case MPOL_BIND:
- /* Fall through */
case MPOL_INTERLEAVE:
- return !!nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
- return a->v.preferred_node == b->v.preferred_node;
+ case MPOL_PREFERRED_MANY:
+ case MPOL_WEIGHTED_INTERLEAVE:
+ return !!nodes_equal(a->nodes, b->nodes);
+ case MPOL_LOCAL:
+ return true;
default:
BUG();
return false;
@@ -2132,14 +2875,16 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
*
* Remember policies even when nobody has shared memory mapped.
* The policies are kept in Red-Black tree linked from the inode.
- * They are protected by the sp->lock spinlock, which should be held
+ * They are protected by the sp->lock rwlock, which should be held
* for any accesses to the tree.
*/
-/* lookup first element intersecting start-end */
-/* Caller holds sp->lock */
-static struct sp_node *
-sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
+/*
+ * lookup first element intersecting start-end. Caller holds sp->lock for
+ * reading or for writing
+ */
+static struct sp_node *sp_lookup(struct shared_policy *sp,
+ pgoff_t start, pgoff_t end)
{
struct rb_node *n = sp->root.rb_node;
@@ -2168,8 +2913,10 @@ sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
return rb_entry(n, struct sp_node, nd);
}
-/* Insert a new shared policy into the list. */
-/* Caller holds sp->lock */
+/*
+ * Insert a new shared policy into the list. Caller holds sp->lock for
+ * writing.
+ */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
struct rb_node **p = &sp->root.rb_node;
@@ -2188,28 +2935,27 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new)
}
rb_link_node(&new->nd, parent, p);
rb_insert_color(&new->nd, &sp->root);
- pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
- new->policy ? new->policy->mode : 0);
}
/* Find shared policy intersecting idx */
-struct mempolicy *
-mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
+struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
+ pgoff_t idx)
{
struct mempolicy *pol = NULL;
struct sp_node *sn;
if (!sp->root.rb_node)
return NULL;
- spin_lock(&sp->lock);
+ read_lock(&sp->lock);
sn = sp_lookup(sp, idx, idx+1);
if (sn) {
mpol_get(sn->policy);
pol = sn->policy;
}
- spin_unlock(&sp->lock);
+ read_unlock(&sp->lock);
return pol;
}
+EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_lookup, "kvm");
static void sp_free(struct sp_node *n)
{
@@ -2218,103 +2964,105 @@ static void sp_free(struct sp_node *n)
}
/**
- * mpol_misplaced - check whether current page node is valid in policy
- *
- * @page - page to be checked
- * @vma - vm area where page mapped
- * @addr - virtual address where page mapped
+ * mpol_misplaced - check whether current folio node is valid in policy
*
- * Lookup current policy node id for vma,addr and "compare to" page's
- * node id.
+ * @folio: folio to be checked
+ * @vmf: structure describing the fault
+ * @addr: virtual address in @vma for shared policy lookup and interleave policy
*
- * Returns:
- * -1 - not misplaced, page is in the right node
- * node - node id where the page should be
- *
- * Policy determination "mimics" alloc_page_vma().
+ * Lookup current policy node id for vma,addr and "compare to" folio's
+ * node id. Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
+ *
+ * Return: NUMA_NO_NODE if the page is in a node that is valid for this
+ * policy, or a suitable node ID to allocate a replacement folio from.
*/
-int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
+ unsigned long addr)
{
struct mempolicy *pol;
- struct zone *zone;
- int curnid = page_to_nid(page);
- unsigned long pgoff;
- int polnid = -1;
- int ret = -1;
+ pgoff_t ilx;
+ struct zoneref *z;
+ int curnid = folio_nid(folio);
+ struct vm_area_struct *vma = vmf->vma;
+ int thiscpu = raw_smp_processor_id();
+ int thisnid = numa_node_id();
+ int polnid = NUMA_NO_NODE;
+ int ret = NUMA_NO_NODE;
- BUG_ON(!vma);
-
- pol = get_vma_policy(current, vma, addr);
+ /*
+ * Make sure ptl is held so that we don't preempt and we
+ * have a stable smp processor id
+ */
+ lockdep_assert_held(vmf->ptl);
+ pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
if (!(pol->flags & MPOL_F_MOF))
goto out;
switch (pol->mode) {
case MPOL_INTERLEAVE:
- BUG_ON(addr >= vma->vm_end);
- BUG_ON(addr < vma->vm_start);
+ polnid = interleave_nid(pol, ilx);
+ break;
- pgoff = vma->vm_pgoff;
- pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
- polnid = offset_il_node(pol, vma, pgoff);
+ case MPOL_WEIGHTED_INTERLEAVE:
+ polnid = weighted_interleave_nid(pol, ilx);
break;
case MPOL_PREFERRED:
- if (pol->flags & MPOL_F_LOCAL)
- polnid = numa_node_id();
- else
- polnid = pol->v.preferred_node;
+ if (node_isset(curnid, pol->nodes))
+ goto out;
+ polnid = first_node(pol->nodes);
+ break;
+
+ case MPOL_LOCAL:
+ polnid = numa_node_id();
break;
case MPOL_BIND:
+ case MPOL_PREFERRED_MANY:
+ /*
+ * Even though MPOL_PREFERRED_MANY can allocate pages outside the
+ * policy nodemask, we don't allow NUMA migration to nodes outside
+ * the policy nodemask for now. This is done so that if we want
+ * demotion to slow memory to happen, before allocating from some
+ * DRAM node, say 'x', we will end up using a MPOL_PREFERRED_MANY
+ * mask excluding node 'x'. In such a scenario we should not promote
+ * to node 'x' from a slow memory node.
+ */
+ if (pol->flags & MPOL_F_MORON) {
+ /*
+ * Optimize placement among multiple nodes
+ * via NUMA balancing
+ */
+ if (node_isset(thisnid, pol->nodes))
+ break;
+ goto out;
+ }
+
/*
- * allows binding to multiple nodes.
* use current page if in policy nodemask,
* else select nearest allowed node, if any.
* If no allowed nodes, use current [!misplaced].
*/
- if (node_isset(curnid, pol->v.nodes))
+ if (node_isset(curnid, pol->nodes))
goto out;
- (void)first_zones_zonelist(
- node_zonelist(numa_node_id(), GFP_HIGHUSER),
+ z = first_zones_zonelist(
+ node_zonelist(thisnid, GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
- &pol->v.nodes, &zone);
- polnid = zone->node;
+ &pol->nodes);
+ polnid = zonelist_node_idx(z);
break;
default:
BUG();
}
- /* Migrate the page towards the node whose CPU is referencing it */
+ /* Migrate the folio towards the node whose CPU is referencing it */
if (pol->flags & MPOL_F_MORON) {
- int last_nid;
-
- polnid = numa_node_id();
+ polnid = thisnid;
- /*
- * Multi-stage node selection is used in conjunction
- * with a periodic migration fault to build a temporal
- * task<->page relation. By using a two-stage filter we
- * remove short/unlikely relations.
- *
- * Using P(p) ~ n_p / n_t as per frequentist
- * probability, we can equate a task's usage of a
- * particular page (n_p) per total usage of this
- * page (n_t) (in a given time-span) to a probability.
- *
- * Our periodic faults will sample this probability and
- * getting the same result twice in a row, given these
- * samples are fully independent, is then given by
- * P(n)^2, provided our sample period is sufficiently
- * short compared to the usage pattern.
- *
- * This quadric squishes small probabilities, making
- * it less likely we act on an unlikely task<->page
- * relation.
- */
- last_nid = page_nid_xchg_last(page, polnid);
- if (last_nid != polnid)
+ if (!should_numa_migrate_memory(current, folio, curnid,
+ thiscpu))
goto out;
}
@@ -2326,9 +3074,25 @@ out:
return ret;
}
+/*
+ * Drop the (possibly final) reference to task->mempolicy. It needs to be
+ * dropped after task->mempolicy is set to NULL so that any allocation done as
+ * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
+ * policy.
+ */
+void mpol_put_task_policy(struct task_struct *task)
+{
+ struct mempolicy *pol;
+
+ task_lock(task);
+ pol = task->mempolicy;
+ task->mempolicy = NULL;
+ task_unlock(task);
+ mpol_put(pol);
+}
+
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
- pr_debug("deleting %lx-l%lx\n", n->start, n->end);
rb_erase(&n->nd, &sp->root);
sp_free(n);
}
@@ -2363,8 +3127,8 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
}
/* Replace a policy range. */
-static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
- unsigned long end, struct sp_node *new)
+static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
+ pgoff_t end, struct sp_node *new)
{
struct sp_node *n;
struct sp_node *n_new = NULL;
@@ -2372,7 +3136,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
int ret = 0;
restart:
- spin_lock(&sp->lock);
+ write_lock(&sp->lock);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
@@ -2405,7 +3169,7 @@ restart:
}
if (new)
sp_insert(sp, new);
- spin_unlock(&sp->lock);
+ write_unlock(&sp->lock);
ret = 0;
err_out:
@@ -2417,7 +3181,7 @@ err_out:
return ret;
alloc_new:
- spin_unlock(&sp->lock);
+ write_unlock(&sp->lock);
ret = -ENOMEM;
n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n_new)
@@ -2425,6 +3189,7 @@ alloc_new:
mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!mpol_new)
goto err_out;
+ atomic_set(&mpol_new->refcnt, 1);
goto restart;
}
@@ -2443,84 +3208,81 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
- spin_lock_init(&sp->lock);
+ rwlock_init(&sp->lock);
if (mpol) {
- struct vm_area_struct pvma;
- struct mempolicy *new;
+ struct sp_node *sn;
+ struct mempolicy *npol;
NODEMASK_SCRATCH(scratch);
if (!scratch)
goto put_mpol;
- /* contextualize the tmpfs mount point mempolicy */
- new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
- if (IS_ERR(new))
+
+ /* contextualize the tmpfs mount point mempolicy to this file */
+ npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
+ if (IS_ERR(npol))
goto free_scratch; /* no valid nodemask intersection */
task_lock(current);
- ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
+ ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
task_unlock(current);
if (ret)
- goto put_new;
-
- /* Create pseudo-vma that contains just the policy */
- memset(&pvma, 0, sizeof(struct vm_area_struct));
- pvma.vm_end = TASK_SIZE; /* policy covers entire file */
- mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
-
-put_new:
- mpol_put(new); /* drop initial ref */
+ goto put_npol;
+
+ /* alloc node covering entire file; adds ref to file's npol */
+ sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
+ if (sn)
+ sp_insert(sp, sn);
+put_npol:
+ mpol_put(npol); /* drop initial ref on file's npol */
free_scratch:
NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
mpol_put(mpol); /* drop our incoming ref on sb mpol */
}
}
+EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_init, "kvm");
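/*
 * Typical caller (assumed, outside this hunk): tmpfs initializes each
 * inode's shared_policy from the superblock's mount-time "mpol=" policy,
 * so the single sp_node allocated above covers the entire file.
 */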
-int mpol_set_shared_policy(struct shared_policy *info,
- struct vm_area_struct *vma, struct mempolicy *npol)
+int mpol_set_shared_policy(struct shared_policy *sp,
+ struct vm_area_struct *vma, struct mempolicy *pol)
{
int err;
struct sp_node *new = NULL;
unsigned long sz = vma_pages(vma);
- pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
- vma->vm_pgoff,
- sz, npol ? npol->mode : -1,
- npol ? npol->flags : -1,
- npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
-
- if (npol) {
- new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
+ if (pol) {
+ new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
if (!new)
return -ENOMEM;
}
- err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
+ err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
if (err && new)
sp_free(new);
return err;
}
+EXPORT_SYMBOL_FOR_MODULES(mpol_set_shared_policy, "kvm");
/* Free a backing policy store on inode delete. */
-void mpol_free_shared_policy(struct shared_policy *p)
+void mpol_free_shared_policy(struct shared_policy *sp)
{
struct sp_node *n;
struct rb_node *next;
- if (!p->root.rb_node)
+ if (!sp->root.rb_node)
return;
- spin_lock(&p->lock);
- next = rb_first(&p->root);
+ write_lock(&sp->lock);
+ next = rb_first(&sp->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
- sp_delete(p, n);
+ sp_delete(sp, n);
}
- spin_unlock(&p->lock);
+ write_unlock(&sp->lock);
}
+EXPORT_SYMBOL_FOR_MODULES(mpol_free_shared_policy, "kvm");
#ifdef CONFIG_NUMA_BALANCING
-static bool __initdata numabalancing_override;
+static int __initdata numabalancing_override;
static void __init check_numabalancing_enable(void)
{
@@ -2529,9 +3291,13 @@ static void __init check_numabalancing_enable(void)
if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
numabalancing_default = true;
- if (nr_node_ids > 1 && !numabalancing_override) {
- printk(KERN_INFO "Enabling automatic NUMA balancing. "
- "Configure with numa_balancing= or sysctl");
+ /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
+ if (numabalancing_override)
+ set_numabalancing_state(numabalancing_override == 1);
+
+ if (num_online_nodes() > 1 && !numabalancing_override) {
+ pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
+ numabalancing_default ? "Enabling" : "Disabling");
set_numabalancing_state(numabalancing_default);
}
}
@@ -2541,18 +3307,17 @@ static int __init setup_numabalancing(char *str)
int ret = 0;
if (!str)
goto out;
- numabalancing_override = true;
if (!strcmp(str, "enable")) {
- set_numabalancing_state(true);
+ numabalancing_override = 1;
ret = 1;
} else if (!strcmp(str, "disable")) {
- set_numabalancing_state(false);
+ numabalancing_override = -1;
ret = 1;
}
out:
if (!ret)
- printk(KERN_WARNING "Unable to parse numa_balancing=\n");
+ pr_warn("Unable to parse numa_balancing=\n");
return ret;
}
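/*
 * Boot-time usage implied by the parser above (illustrative):
 *
 *	numa_balancing=enable	-> numabalancing_override = 1
 *	numa_balancing=disable	-> numabalancing_override = -1
 *
 * check_numabalancing_enable() later applies the override, or falls back
 * to the CONFIG_NUMA_BALANCING_DEFAULT_ENABLED default on multi-node
 * systems.
 */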
@@ -2563,7 +3328,6 @@ static inline void __init check_numabalancing_enable(void)
}
#endif /* CONFIG_NUMA_BALANCING */
-/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
nodemask_t interleave_nodes;
@@ -2583,7 +3347,7 @@ void __init numa_policy_init(void)
.refcnt = ATOMIC_INIT(1),
.mode = MPOL_PREFERRED,
.flags = MPOL_F_MOF | MPOL_F_MORON,
- .v = { .preferred_node = nid, },
+ .nodes = nodemask_of_node(nid),
};
}
@@ -2612,7 +3376,7 @@ void __init numa_policy_init(void)
node_set(prefer, interleave_nodes);
if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
- printk("numa_policy_init: interleaving failed\n");
+ pr_err("%s: interleaving failed\n", __func__);
check_numabalancing_enable();
}
@@ -2626,20 +3390,17 @@ void numa_default_policy(void)
/*
* Parse and format mempolicy from/to strings
*/
-
-/*
- * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
- */
static const char * const policy_modes[] =
{
[MPOL_DEFAULT] = "default",
[MPOL_PREFERRED] = "prefer",
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
+ [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
[MPOL_LOCAL] = "local",
+ [MPOL_PREFERRED_MANY] = "prefer (many)",
};
-
#ifdef CONFIG_TMPFS
/**
* mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
@@ -2649,17 +3410,19 @@ static const char * const policy_modes[] =
* Format of input:
* <mode>[=<flags>][:<nodelist>]
*
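 * Illustrative examples (assumed from the modes and flags handled
 * below, not an exhaustive list):
 *	"interleave:0-3"
 *	"bind=static:1,3"
 *	"prefer:2"
 *	"weighted interleave:0,1"
 *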
- * On success, returns 0, else 1
+ * Return: %0 on success, else %1
*/
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
struct mempolicy *new = NULL;
- unsigned short mode;
unsigned short mode_flags;
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
- int err = 1;
+ int err = 1, mode;
+
+ if (flags)
+ *flags++ = '\0'; /* terminate mode string */
if (nodelist) {
/* NUL-terminate mode or flags string */
@@ -2671,21 +3434,16 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
} else
nodes_clear(nodes);
- if (flags)
- *flags++ = '\0'; /* terminate mode string */
-
- for (mode = 0; mode < MPOL_MAX; mode++) {
- if (!strcmp(str, policy_modes[mode])) {
- break;
- }
- }
- if (mode >= MPOL_MAX)
+ mode = match_string(policy_modes, MPOL_MAX, str);
+ if (mode < 0)
goto out;
switch (mode) {
case MPOL_PREFERRED:
/*
- * Insist on a nodelist of one node only
+		 * Insist on a nodelist of one node only; later we use
+		 * first_node(nodes) to grab that single node, so the
+		 * nodelist (or nodes) cannot be empty.
*/
if (nodelist) {
char *rest = nodelist;
@@ -2693,9 +3451,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
rest++;
if (*rest)
goto out;
+ if (nodes_empty(nodes))
+ goto out;
}
break;
case MPOL_INTERLEAVE:
+ case MPOL_WEIGHTED_INTERLEAVE:
/*
* Default to online nodes with memory if no nodelist
*/
@@ -2708,7 +3469,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
*/
if (nodelist)
goto out;
- mode = MPOL_PREFERRED;
break;
case MPOL_DEFAULT:
/*
@@ -2717,6 +3477,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
if (!nodelist)
err = 0;
goto out;
+ case MPOL_PREFERRED_MANY:
case MPOL_BIND:
/*
* Insist on a nodelist
@@ -2747,12 +3508,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
* Save nodes for mpol_to_str() to show the tmpfs mount options
* for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
*/
- if (mode != MPOL_PREFERRED)
- new->v.nodes = nodes;
- else if (nodelist)
- new->v.preferred_node = first_node(nodes);
- else
- new->flags |= MPOL_F_LOCAL;
+ if (mode != MPOL_PREFERRED) {
+ new->nodes = nodes;
+ } else if (nodelist) {
+ nodes_clear(new->nodes);
+ node_set(first_node(nodes), new->nodes);
+ } else {
+ new->mode = MPOL_LOCAL;
+ }
/*
* Save nodes for contextualization: this will be used to "clone"
@@ -2780,77 +3543,402 @@ out:
* @maxlen: length of @buffer
* @pol: pointer to mempolicy to be formatted
*
- * Convert a mempolicy into a string.
- * Returns the number of characters in buffer (if positive)
- * or an error (negative)
+ * Convert @pol into a string. If @buffer is too short, truncate the string.
+ * Recommend a @maxlen of at least 51 for the longest mode, "weighted
+ * interleave", plus the longest flag flags, "relative|balancing", and to
+ * display at least a few node ids.
*/
-int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
char *p = buffer;
- int l;
- nodemask_t nodes;
- unsigned short mode;
- unsigned short flags = pol ? pol->flags : 0;
-
- /*
- * Sanity check: room for longest mode, flag and some nodes
- */
- VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
-
- if (!pol || pol == &default_policy)
- mode = MPOL_DEFAULT;
- else
+ nodemask_t nodes = NODE_MASK_NONE;
+ unsigned short mode = MPOL_DEFAULT;
+ unsigned short flags = 0;
+
+ if (pol &&
+ pol != &default_policy &&
+ !(pol >= &preferred_node_policy[0] &&
+ pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
mode = pol->mode;
+ flags = pol->flags;
+ }
switch (mode) {
case MPOL_DEFAULT:
- nodes_clear(nodes);
+ case MPOL_LOCAL:
break;
-
case MPOL_PREFERRED:
- nodes_clear(nodes);
- if (flags & MPOL_F_LOCAL)
- mode = MPOL_LOCAL;
- else
- node_set(pol->v.preferred_node, nodes);
- break;
-
+ case MPOL_PREFERRED_MANY:
case MPOL_BIND:
- /* Fall through */
case MPOL_INTERLEAVE:
- nodes = pol->v.nodes;
+ case MPOL_WEIGHTED_INTERLEAVE:
+ nodes = pol->nodes;
break;
-
default:
- return -EINVAL;
+ WARN_ON_ONCE(1);
+ snprintf(p, maxlen, "unknown");
+ return;
}
- l = strlen(policy_modes[mode]);
- if (buffer + maxlen < p + l + 1)
- return -ENOSPC;
-
- strcpy(p, policy_modes[mode]);
- p += l;
+ p += snprintf(p, maxlen, "%s", policy_modes[mode]);
if (flags & MPOL_MODE_FLAGS) {
- if (buffer + maxlen < p + 2)
- return -ENOSPC;
- *p++ = '=';
+ p += snprintf(p, buffer + maxlen - p, "=");
/*
- * Currently, the only defined flags are mutually exclusive
+ * Static and relative are mutually exclusive.
*/
if (flags & MPOL_F_STATIC_NODES)
p += snprintf(p, buffer + maxlen - p, "static");
else if (flags & MPOL_F_RELATIVE_NODES)
p += snprintf(p, buffer + maxlen - p, "relative");
+
+ if (flags & MPOL_F_NUMA_BALANCING) {
+ if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
+ p += snprintf(p, buffer + maxlen - p, "|");
+ p += snprintf(p, buffer + maxlen - p, "balancing");
+ }
+ }
+
+ if (!nodes_empty(nodes))
+ p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
+ nodemask_pr_args(&nodes));
+}
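/*
 * Illustrative caller (not part of the patch), with the buffer sized per
 * the recommendation above; the seq_file handle "m" is assumed:
 *
 *	char buf[64];
 *
 *	mpol_to_str(buf, sizeof(buf), pol);
 *	seq_printf(m, "mempolicy=%s\n", buf);
 */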
+
+#ifdef CONFIG_SYSFS
+struct iw_node_attr {
+ struct kobj_attribute kobj_attr;
+ int nid;
+};
+
+struct sysfs_wi_group {
+ struct kobject wi_kobj;
+ struct mutex kobj_lock;
+ struct iw_node_attr *nattrs[];
+};
+
+static struct sysfs_wi_group *wi_group;
+
+static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct iw_node_attr *node_attr;
+ u8 weight;
+
+ node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+ weight = get_il_weight(node_attr->nid);
+ return sysfs_emit(buf, "%d\n", weight);
+}
+
+static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+ struct iw_node_attr *node_attr;
+ u8 weight = 0;
+ int i;
+
+ node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+ if (count == 0 || sysfs_streq(buf, "") ||
+ kstrtou8(buf, 0, &weight) || weight == 0)
+ return -EINVAL;
+
+ new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state)
+ return -ENOMEM;
+
+ mutex_lock(&wi_state_lock);
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (old_wi_state) {
+ memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+ nr_node_ids * sizeof(u8));
+ } else {
+ for (i = 0; i < nr_node_ids; i++)
+ new_wi_state->iw_table[i] = 1;
+ }
+ new_wi_state->iw_table[node_attr->nid] = weight;
+ new_wi_state->mode_auto = false;
+
+ rcu_assign_pointer(wi_state, new_wi_state);
+ mutex_unlock(&wi_state_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+ return count;
+}
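/*
 * The per-node weight files created below are exposed at
 * /sys/kernel/mm/mempolicy/weighted_interleave/nodeN (mempolicy_sysfs_init()
 * hangs the "mempolicy" kobject off mm_kobj). Writers publish a complete
 * replacement table with rcu_assign_pointer() and free the old one only
 * after synchronize_rcu(), so lockless readers never see a partially
 * updated weight table.
 */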
+
+static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct weighted_interleave_state *state;
+ bool wi_auto = true;
+
+ rcu_read_lock();
+ state = rcu_dereference(wi_state);
+ if (state)
+ wi_auto = state->mode_auto;
+ rcu_read_unlock();
+
+ return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
+}
+
+static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+ unsigned int *bw;
+ bool input;
+ int i;
+
+ if (kstrtobool(buf, &input))
+ return -EINVAL;
+
+ new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state)
+ return -ENOMEM;
+ for (i = 0; i < nr_node_ids; i++)
+ new_wi_state->iw_table[i] = 1;
+
+ mutex_lock(&wi_state_lock);
+ if (!input) {
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (!old_wi_state)
+ goto update_wi_state;
+ if (input == old_wi_state->mode_auto) {
+ mutex_unlock(&wi_state_lock);
+ return count;
+ }
+
+ memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+ nr_node_ids * sizeof(u8));
+ goto update_wi_state;
+ }
+
+ bw = node_bw_table;
+ if (!bw) {
+ mutex_unlock(&wi_state_lock);
+ kfree(new_wi_state);
+ return -ENODEV;
+ }
+
+ new_wi_state->mode_auto = true;
+ reduce_interleave_weights(bw, new_wi_state->iw_table);
+
+update_wi_state:
+ rcu_assign_pointer(wi_state, new_wi_state);
+ mutex_unlock(&wi_state_lock);
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+ return count;
+}
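/*
 * Semantics of the "auto" file, as implemented above: writing "true"
 * rebuilds the weights from the node bandwidth table provided by the
 * memory-tiering code (node_bw_table) via reduce_interleave_weights();
 * writing "false" keeps the current weights (or defaults of 1) but marks
 * the table as manually managed (mode_auto = false).
 */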
+
+static void sysfs_wi_node_delete(int nid)
+{
+ struct iw_node_attr *attr;
+
+ if (nid < 0 || nid >= nr_node_ids)
+ return;
+
+ mutex_lock(&wi_group->kobj_lock);
+ attr = wi_group->nattrs[nid];
+ if (!attr) {
+ mutex_unlock(&wi_group->kobj_lock);
+ return;
+ }
+
+ wi_group->nattrs[nid] = NULL;
+ mutex_unlock(&wi_group->kobj_lock);
+
+ sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
+ kfree(attr->kobj_attr.attr.name);
+ kfree(attr);
+}
+
+static void sysfs_wi_node_delete_all(void)
+{
+ int nid;
+
+ for (nid = 0; nid < nr_node_ids; nid++)
+ sysfs_wi_node_delete(nid);
+}
+
+static void wi_state_free(void)
+{
+ struct weighted_interleave_state *old_wi_state;
+
+ mutex_lock(&wi_state_lock);
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ rcu_assign_pointer(wi_state, NULL);
+ mutex_unlock(&wi_state_lock);
+
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+}
+
+static struct kobj_attribute wi_auto_attr =
+ __ATTR(auto, 0664, weighted_interleave_auto_show,
+ weighted_interleave_auto_store);
+
+static void wi_cleanup(void)
+{
+ sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
+ sysfs_wi_node_delete_all();
+ wi_state_free();
+}
+
+static void wi_kobj_release(struct kobject *wi_kobj)
+{
+ kfree(wi_group);
+}
+
+static const struct kobj_type wi_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = wi_kobj_release,
+};
+
+static int sysfs_wi_node_add(int nid)
+{
+ int ret;
+ char *name;
+ struct iw_node_attr *new_attr;
+
+ if (nid < 0 || nid >= nr_node_ids) {
+ pr_err("invalid node id: %d\n", nid);
+ return -EINVAL;
+ }
+
+ new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
+ if (!new_attr)
+ return -ENOMEM;
+
+ name = kasprintf(GFP_KERNEL, "node%d", nid);
+ if (!name) {
+ kfree(new_attr);
+ return -ENOMEM;
+ }
+
+ sysfs_attr_init(&new_attr->kobj_attr.attr);
+ new_attr->kobj_attr.attr.name = name;
+ new_attr->kobj_attr.attr.mode = 0644;
+ new_attr->kobj_attr.show = node_show;
+ new_attr->kobj_attr.store = node_store;
+ new_attr->nid = nid;
+
+ mutex_lock(&wi_group->kobj_lock);
+ if (wi_group->nattrs[nid]) {
+ mutex_unlock(&wi_group->kobj_lock);
+ ret = -EEXIST;
+ goto out;
+ }
+
+ ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
+ if (ret) {
+ mutex_unlock(&wi_group->kobj_lock);
+ goto out;
+ }
+ wi_group->nattrs[nid] = new_attr;
+ mutex_unlock(&wi_group->kobj_lock);
+ return 0;
+
+out:
+ kfree(new_attr->kobj_attr.attr.name);
+ kfree(new_attr);
+ return ret;
+}
+
+static int wi_node_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ int err;
+ struct node_notify *nn = data;
+ int nid = nn->nid;
+
+ switch (action) {
+ case NODE_ADDED_FIRST_MEMORY:
+ err = sysfs_wi_node_add(nid);
+ if (err)
+ pr_err("failed to add sysfs for node%d during hotplug: %d\n",
+ nid, err);
+ break;
+ case NODE_REMOVED_LAST_MEMORY:
+ sysfs_wi_node_delete(nid);
+ break;
}
- if (!nodes_empty(nodes)) {
- if (buffer + maxlen < p + 2)
- return -ENOSPC;
- *p++ = ':';
- p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
+ return NOTIFY_OK;
+}
+
+static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
+{
+ int nid, err;
+
+ wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
+ GFP_KERNEL);
+ if (!wi_group)
+ return -ENOMEM;
+ mutex_init(&wi_group->kobj_lock);
+
+ err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
+ "weighted_interleave");
+ if (err)
+ goto err_put_kobj;
+
+ err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
+ if (err)
+ goto err_put_kobj;
+
+ for_each_online_node(nid) {
+ if (!node_state(nid, N_MEMORY))
+ continue;
+
+ err = sysfs_wi_node_add(nid);
+ if (err) {
+ pr_err("failed to add sysfs for node%d during init: %d\n",
+ nid, err);
+ goto err_cleanup_kobj;
+ }
}
- return p - buffer;
+
+ hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
+ return 0;
+
+err_cleanup_kobj:
+ wi_cleanup();
+ kobject_del(&wi_group->wi_kobj);
+err_put_kobj:
+ kobject_put(&wi_group->wi_kobj);
+ return err;
}
+
+static int __init mempolicy_sysfs_init(void)
+{
+ int err;
+ static struct kobject *mempolicy_kobj;
+
+ mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
+ if (!mempolicy_kobj)
+ return -ENOMEM;
+
+ err = add_weighted_interleave_group(mempolicy_kobj);
+ if (err)
+ goto err_kobj;
+
+ return 0;
+
+err_kobj:
+ kobject_del(mempolicy_kobj);
+ kobject_put(mempolicy_kobj);
+ return err;
+}
+
+late_initcall(mempolicy_sysfs_init);
+#endif /* CONFIG_SYSFS */