summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/mm_types.h6
-rw-r--r--include/linux/sched/numa_balancing.h1
-rw-r--r--include/trace/events/sched.h3
-rw-r--r--kernel/sched/fair.c55
4 files changed, 61 insertions, 4 deletions
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e7571eca1131..589f31ef2e84 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -575,6 +575,12 @@ struct vma_numab_state {
* every VMA_PID_RESET_PERIOD jiffies:
*/
unsigned long pids_active[2];
+
+ /*
+ * MM scan sequence ID when the VMA was last completely scanned.
+ * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq
+ */
+ int prev_scan_seq;
};
/*
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 7dcc0bdfddbb..b69afb8630db 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -22,6 +22,7 @@ enum numa_vmaskip_reason {
NUMAB_SKIP_SCAN_DELAY,
NUMAB_SKIP_PID_INACTIVE,
NUMAB_SKIP_IGNORE_PID,
+ NUMAB_SKIP_SEQ_COMPLETED,
};
#ifdef CONFIG_NUMA_BALANCING
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index bfc07c10541a..6188ad0d9e0d 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -671,7 +671,8 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,
EM( NUMAB_SKIP_INACCESSIBLE, "inaccessible" ) \
EM( NUMAB_SKIP_SCAN_DELAY, "scan_delay" ) \
EM( NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) \
- EMe(NUMAB_SKIP_IGNORE_PID, "ignore_pid_inactive" )
+ EM( NUMAB_SKIP_IGNORE_PID, "ignore_pid_inactive" ) \
+ EMe(NUMAB_SKIP_SEQ_COMPLETED, "seq_completed" )
/* Redefine for export. */
#undef EM
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ab79013f6e91..922905194c0c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3158,6 +3158,8 @@ static void task_numa_work(struct callback_head *work)
unsigned long nr_pte_updates = 0;
long pages, virtpages;
struct vma_iterator vmi;
+ bool vma_pids_skipped;
+ bool vma_pids_forced = false;
SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
@@ -3200,7 +3202,6 @@ static void task_numa_work(struct callback_head *work)
*/
p->node_stamp += 2 * TICK_NSEC;
- start = mm->numa_scan_offset;
pages = sysctl_numa_balancing_scan_size;
pages <<= 20 - PAGE_SHIFT; /* MB in pages */
virtpages = pages * 8; /* Scan up to this much virtual space */
@@ -3210,6 +3211,16 @@ static void task_numa_work(struct callback_head *work)
if (!mmap_read_trylock(mm))
return;
+
+ /*
+ * VMAs are skipped if the current PID has not trapped a fault within
+ * the VMA recently. Allow scanning to be forced if there is no
+ * suitable VMA remaining.
+ */
+ vma_pids_skipped = false;
+
+retry_pids:
+ start = mm->numa_scan_offset;
vma_iter_init(&vmi, mm, start);
vma = vma_next(&vmi);
if (!vma) {
@@ -3260,6 +3271,13 @@ static void task_numa_work(struct callback_head *work)
/* Reset happens after 4 times scan delay of scan start */
vma->numab_state->pids_active_reset = vma->numab_state->next_scan +
msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+
+ /*
+ * Ensure prev_scan_seq does not match numa_scan_seq,
+ * to prevent VMAs being skipped prematurely on the
+ * first scan:
+ */
+ vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
}
/*
@@ -3281,8 +3299,19 @@ static void task_numa_work(struct callback_head *work)
vma->numab_state->pids_active[1] = 0;
}
- /* Do not scan the VMA if task has not accessed */
- if (!vma_is_accessed(mm, vma)) {
+ /* Do not rescan VMAs twice within the same sequence. */
+ if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
+ mm->numa_scan_offset = vma->vm_end;
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
+ continue;
+ }
+
+ /*
+ * Do not scan the VMA if task has not accessed it, unless no other
+ * VMA candidate exists.
+ */
+ if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
+ vma_pids_skipped = true;
trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
continue;
}
@@ -3311,8 +3340,28 @@ static void task_numa_work(struct callback_head *work)
cond_resched();
} while (end != vma->vm_end);
+
+ /* VMA scan is complete, do not scan until next sequence. */
+ vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
+
+ /*
+ * Only force scan within one VMA at a time, to limit the
+ * cost of scanning a potentially uninteresting VMA.
+ */
+ if (vma_pids_forced)
+ break;
} for_each_vma(vmi, vma);
+ /*
+ * If no VMAs are remaining and VMAs were skipped due to the PID
+ * not accessing the VMA previously, then force a scan to ensure
+ * forward progress:
+ */
+ if (!vma && !vma_pids_forced && vma_pids_skipped) {
+ vma_pids_forced = true;
+ goto retry_pids;
+ }
+
out:
/*
* It is possible to reach the end of the VMA list but the last few