summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/sysctl/kernel.txt11
-rw-r--r--include/linux/sched.h1
-rw-r--r--kernel/sched/fair.c88
3 files changed, 83 insertions, 17 deletions
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 1428c6659254..8cd7e5fc79da 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -403,15 +403,16 @@ workload pattern changes and minimises performance impact due to remote
memory accesses. These sysctls control the thresholds for scan delays and
the number of pages scanned.
-numa_balancing_scan_period_min_ms is the minimum delay in milliseconds
-between scans. It effectively controls the maximum scanning rate for
-each task.
+numa_balancing_scan_period_min_ms is the minimum time in milliseconds to
+scan a task's virtual memory. It effectively controls the maximum scanning
+rate for each task.
numa_balancing_scan_delay_ms is the starting "scan delay" used for a task
when it initially forks.
-numa_balancing_scan_period_max_ms is the maximum delay between scans. It
-effectively controls the minimum scanning rate for each task.
+numa_balancing_scan_period_max_ms is the maximum time in milliseconds to
+scan a task's virtual memory. It effectively controls the minimum scanning
+rate for each task.
numa_balancing_scan_size_mb is how many megabytes worth of pages are
scanned for a given scan.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2ac5285db434..fdcb4c855072 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1339,6 +1339,7 @@ struct task_struct {
int numa_scan_seq;
int numa_migrate_seq;
unsigned int numa_scan_period;
+ unsigned int numa_scan_period_max;
u64 node_stamp; /* migration stamp */
struct callback_head numa_work;
#endif /* CONFIG_NUMA_BALANCING */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0966f0c16f1b..e08d757720de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -818,11 +818,13 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
#ifdef CONFIG_NUMA_BALANCING
/*
- * numa task sample period in ms
+ * Approximate time to scan a full NUMA task in ms. The task scan period is
+ * calculated based on the task's virtual memory size and
+ * numa_balancing_scan_size.
*/
-unsigned int sysctl_numa_balancing_scan_period_min = 100;
-unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
-unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
+unsigned int sysctl_numa_balancing_scan_period_min = 1000;
+unsigned int sysctl_numa_balancing_scan_period_max = 60000;
+unsigned int sysctl_numa_balancing_scan_period_reset = 60000;
/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -830,6 +832,51 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+/*
+ * Number of sysctl_numa_balancing_scan_size-sized windows needed to cover
+ * the resident pages of @p's mm. The result is never 0, so callers can
+ * divide by it.
+ */
+static unsigned int task_nr_scan_windows(struct task_struct *p)
+{
+	unsigned long rss;
+	unsigned long nr_scan_pages;
+
+	/*
+	 * Calculations based on RSS as non-present and empty pages are skipped
+	 * by the PTE scanner and NUMA hinting faults should be trapped based
+	 * on resident pages
+	 */
+	/* The sysctl is in MB; convert it to a page count. */
+	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
+
+	/* A zero scan size would be used as the divisor below. */
+	if (!nr_scan_pages)
+		return 1;
+
+	rss = get_mm_rss(p->mm);
+	if (!rss)
+		rss = nr_scan_pages;
+
+	/*
+	 * Ceiling division, open-coded because round_up() is only valid for
+	 * power-of-two multiples and nothing here restricts the scan size to
+	 * a power of two. rss is non-zero at this point, so rss - 1 cannot
+	 * wrap and the sum cannot overflow the way rss + nr_scan_pages - 1
+	 * could.
+	 */
+	return 1 + (rss - 1) / nr_scan_pages;
+}
+
+/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+#define MAX_SCAN_WINDOW 2560
+
+/*
+ * Lower bound, in ms, for the scan period of @p.
+ *
+ * sysctl_numa_balancing_scan_period_min is divided across the task's scan
+ * windows, and the result is clamped from below by a floor that keeps the
+ * scan rate at or under MAX_SCAN_WINDOW MB/sec.
+ */
+static unsigned int task_scan_min(struct task_struct *p)
+{
+	/* Local copy so the range check and the division see the same name. */
+	unsigned int scan_size = sysctl_numa_balancing_scan_size;
+	unsigned int scan, floor;
+	unsigned int windows = 1;
+
+	/*
+	 * A scan size of 0 must not reach the division: 0 also compares
+	 * below MAX_SCAN_WINDOW. In that case windows stays at 1, which
+	 * yields a floor of 1000ms.
+	 */
+	if (scan_size && scan_size < MAX_SCAN_WINDOW)
+		windows = MAX_SCAN_WINDOW / scan_size;
+	floor = 1000 / windows;
+
+	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
+	return max_t(unsigned int, floor, scan);
+}
+
+/*
+ * Upper bound, in ms, for the scan period of @p:
+ * sysctl_numa_balancing_scan_period_max divided across the task's scan
+ * windows, but never less than task_scan_min(@p).
+ */
+static unsigned int task_scan_max(struct task_struct *p)
+{
+	unsigned int lower = task_scan_min(p);
+	unsigned int upper;
+
+	upper = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+
+	/*
+	 * The floor applied in task_scan_min() can lift the minimum above the
+	 * value just computed; in that case the minimum wins.
+	 */
+	return max(lower, upper);
+}
+
static void task_numa_placement(struct task_struct *p)
{
int seq;
@@ -840,6 +887,7 @@ static void task_numa_placement(struct task_struct *p)
if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;
+ p->numa_scan_period_max = task_scan_max(p);
/* FIXME: Scheduling placement policy hints go here */
}
@@ -860,9 +908,14 @@ void task_numa_fault(int node, int pages, bool migrated)
* If pages are properly placed (did not migrate) then scan slower.
* This is reset periodically in case of phase changes
*/
- if (!migrated)
- p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
- p->numa_scan_period + jiffies_to_msecs(10));
+ if (!migrated) {
+ /* Initialise if necessary */
+ if (!p->numa_scan_period_max)
+ p->numa_scan_period_max = task_scan_max(p);
+
+ p->numa_scan_period = min(p->numa_scan_period_max,
+ p->numa_scan_period + 10);
+ }
task_numa_placement(p);
}
@@ -884,6 +937,7 @@ void task_numa_work(struct callback_head *work)
struct mm_struct *mm = p->mm;
struct vm_area_struct *vma;
unsigned long start, end;
+ unsigned long nr_pte_updates = 0;
long pages;
WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -915,7 +969,7 @@ void task_numa_work(struct callback_head *work)
*/
migrate = mm->numa_next_reset;
if (time_after(now, migrate)) {
- p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+ p->numa_scan_period = task_scan_min(p);
next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
xchg(&mm->numa_next_reset, next_scan);
}
@@ -927,8 +981,10 @@ void task_numa_work(struct callback_head *work)
if (time_before(now, migrate))
return;
- if (p->numa_scan_period == 0)
- p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+ if (p->numa_scan_period == 0) {
+ p->numa_scan_period_max = task_scan_max(p);
+ p->numa_scan_period = task_scan_min(p);
+ }
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
@@ -965,7 +1021,15 @@ void task_numa_work(struct callback_head *work)
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
end = min(end, vma->vm_end);
- pages -= change_prot_numa(vma, start, end);
+ nr_pte_updates += change_prot_numa(vma, start, end);
+
+ /*
+ * Scan sysctl_numa_balancing_scan_size but ensure that
+ * at least one PTE is updated so that unused virtual
+ * address space is quickly skipped.
+ */
+ if (nr_pte_updates)
+ pages -= (end - start) >> PAGE_SHIFT;
start = end;
if (pages <= 0)
@@ -1012,7 +1076,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
if (now - curr->node_stamp > period) {
if (!curr->node_stamp)
- curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+ curr->numa_scan_period = task_scan_min(curr);
curr->node_stamp += period;
if (!time_before(jiffies, curr->mm->numa_next_scan)) {