diff options
-rw-r--r-- | Documentation/admin-guide/sysctl/kernel.rst | 11 | ||||
-rw-r--r-- | include/linux/mmzone.h | 7 | ||||
-rw-r--r-- | include/linux/sched/sysctl.h | 1 | ||||
-rw-r--r-- | kernel/sched/fair.c | 33 | ||||
-rw-r--r-- | kernel/sysctl.c | 8 | ||||
-rw-r--r-- | mm/vmstat.c | 1 |
6 files changed, 59 insertions, 2 deletions
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index ee6572b1edad..835c8844bba4 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -635,6 +635,17 @@ different types of memory (represented as different NUMA nodes) to place the hot pages in the fast memory. This is implemented based on unmapping and page fault too. +numa_balancing_promote_rate_limit_MBps +====================================== + +Too high promotion/demotion throughput between different memory types +may hurt application latency. This can be used to rate limit the +promotion throughput. The per-node max promotion throughput in MB/s +will be limited to be no more than the set value. + +A rule of thumb is to set this to less than 1/10 of the PMEM node +write bandwidth. + oops_all_cpu_backtrace ====================== diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8f571dc7c524..a0003eaa751f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -221,6 +221,7 @@ enum node_stat_item { #endif #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ + PGPROMOTE_CANDIDATE, /* candidate pages to promote */ #endif NR_VM_NODE_STAT_ITEMS }; @@ -998,6 +999,12 @@ typedef struct pglist_data { struct deferred_split deferred_split_queue; #endif +#ifdef CONFIG_NUMA_BALANCING + /* start time in ms of current promote rate limit period */ + unsigned int nbp_rl_start; + /* number of promote candidate pages at start time of current rate limit period */ + unsigned long nbp_rl_nr_cand; +#endif /* Fields commonly accessed by the page reclaim scanner */ /* diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index e650946816d0..303ee7dd0c7e 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -27,6 +27,7 @@ enum sched_tunable_scaling { #ifdef CONFIG_NUMA_BALANCING extern int sysctl_numa_balancing_mode; +extern unsigned int sysctl_numa_balancing_promote_rate_limit; #else #define sysctl_numa_balancing_mode 0 #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 06db566c7660..1d1dd88daaab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1097,6 +1097,9 @@ unsigned int sysctl_numa_balancing_scan_delay = 1000; /* The page with hint page fault latency < threshold in ms is considered hot */ unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC; +/* Restrict the NUMA promotion throughput (MB/s) for each target node. */ +unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; + struct numa_group { refcount_t refcount; @@ -1501,6 +1504,29 @@ static int numa_hint_fault_latency(struct page *page) return (time - last_time) & PAGE_ACCESS_TIME_MASK; } +/* + * For memory tiering mode, too high promotion/demotion throughput may + * hurt application latency. So we provide a mechanism to rate limit + * the number of pages that are tried to be promoted. + */ +static bool numa_promotion_rate_limit(struct pglist_data *pgdat, + unsigned long rate_limit, int nr) +{ + unsigned long nr_cand; + unsigned int now, start; + + now = jiffies_to_msecs(jiffies); + mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr); + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + start = pgdat->nbp_rl_start; + if (now - start > MSEC_PER_SEC && + cmpxchg(&pgdat->nbp_rl_start, start, now) == start) + pgdat->nbp_rl_nr_cand = nr_cand; + if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit) + return true; + return false; +} + bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int src_nid, int dst_cpu) { @@ -1515,7 +1541,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && !node_is_toptier(src_nid)) { struct pglist_data *pgdat; - unsigned long latency, th; + unsigned long rate_limit, latency, th; pgdat = NODE_DATA(dst_nid); if (pgdat_free_space_enough(pgdat)) @@ -1526,7 +1552,10 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, if (latency >= th) return false; - return true; + rate_limit = sysctl_numa_balancing_promote_rate_limit << \ + (20 - PAGE_SHIFT); + return !numa_promotion_rate_limit(pgdat, rate_limit, + thp_nr_pages(page)); } this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 205d605cacc5..f10a610aa834 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1641,6 +1641,14 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_FOUR, }, + { + .procname = "numa_balancing_promote_rate_limit_MBps", + .data = &sysctl_numa_balancing_promote_rate_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, #endif /* CONFIG_NUMA_BALANCING */ { .procname = "panic", diff --git a/mm/vmstat.c b/mm/vmstat.c index 90af9a8572f5..c109167a669c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1252,6 +1252,7 @@ const char * const vmstat_text[] = { #endif #ifdef CONFIG_NUMA_BALANCING "pgpromote_success", + "pgpromote_candidate", #endif /* enum writeback_stat_item counters */ |