From c6833e10008f976a173dd5abdf992e492cbc3bcf Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 13 Jul 2022 16:39:52 +0800 Subject: memory tiering: rate limit NUMA migration throughput In NUMA balancing memory tiering mode, if there are hot pages in slow memory node and cold pages in fast memory node, we need to promote/demote hot/cold pages between the fast and cold memory nodes. A choice is to promote/demote as fast as possible. But the CPU cycles and memory bandwidth consumed by the high promoting/demoting throughput will hurt the latency of some workload because of accessing inflating and slow memory bandwidth contention. A way to resolve this issue is to restrict the max promoting/demoting throughput. It will take longer to finish the promoting/demoting. But the workload latency will be better. This is implemented in this patch as the page promotion rate limit mechanism. The number of the candidate pages to be promoted to the fast memory node via NUMA balancing is counted, if the count exceeds the limit specified by the users, the NUMA balancing promotion will be stopped until the next second. A new sysctl knob kernel.numa_balancing_promote_rate_limit_MBps is added for the users to specify the limit. Link: https://lkml.kernel.org/r/20220713083954.34196-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Dave Hansen Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: osalvador Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Wei Xu Cc: Yang Shi Cc: Zhong Jiang Cc: Zi Yan Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 33 +++++++++++++++++++++++++++++++-- kernel/sysctl.c | 8 ++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 06db566c7660..1d1dd88daaab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1097,6 +1097,9 @@ unsigned int sysctl_numa_balancing_scan_delay = 1000; /* The page with hint page fault latency < threshold in ms is considered hot */ unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC; +/* Restrict the NUMA promotion throughput (MB/s) for each target node. */ +unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; + struct numa_group { refcount_t refcount; @@ -1501,6 +1504,29 @@ static int numa_hint_fault_latency(struct page *page) return (time - last_time) & PAGE_ACCESS_TIME_MASK; } +/* + * For memory tiering mode, too high promotion/demotion throughput may + * hurt application latency. So we provide a mechanism to rate limit + * the number of pages that are tried to be promoted. + */ +static bool numa_promotion_rate_limit(struct pglist_data *pgdat, + unsigned long rate_limit, int nr) +{ + unsigned long nr_cand; + unsigned int now, start; + + now = jiffies_to_msecs(jiffies); + mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr); + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + start = pgdat->nbp_rl_start; + if (now - start > MSEC_PER_SEC && + cmpxchg(&pgdat->nbp_rl_start, start, now) == start) + pgdat->nbp_rl_nr_cand = nr_cand; + if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit) + return true; + return false; +} + bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int src_nid, int dst_cpu) { @@ -1515,7 +1541,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && !node_is_toptier(src_nid)) { struct pglist_data *pgdat; - unsigned long latency, th; + unsigned long rate_limit, latency, th; pgdat = NODE_DATA(dst_nid); if (pgdat_free_space_enough(pgdat)) @@ -1526,7 +1552,10 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, if (latency >= th) return false; - return true; + rate_limit = sysctl_numa_balancing_promote_rate_limit << \ + (20 - PAGE_SHIFT); + return !numa_promotion_rate_limit(pgdat, rate_limit, + thp_nr_pages(page)); } this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 205d605cacc5..f10a610aa834 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1641,6 +1641,14 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_FOUR, }, + { + .procname = "numa_balancing_promote_rate_limit_MBps", + .data = &sysctl_numa_balancing_promote_rate_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, #endif /* CONFIG_NUMA_BALANCING */ { .procname = "panic", -- cgit