From 800977f6f32e452cba6b04ef21d2f5383ca29209 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:02:52 -0800 Subject: kthread: add the helper function kthread_run_on_cpu() Add a new helper function kthread_run_on_cpu(), which includes kthread_create_on_cpu/wake_up_process(). In some cases, use kthread_run_on_cpu() directly instead of kthread_create_on_node/kthread_bind/wake_up_process() or kthread_create_on_cpu/wake_up_process() or kthreadd_create/kthread_bind/wake_up_process() to simplify the code. [akpm@linux-foundation.org: export kthread_create_on_cpu to modules] Link: https://lkml.kernel.org/r/20211022025711.3673-2-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Cai Huoqing Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kthread.h | 25 +++++++++++++++++++++++++ kernel/kthread.c | 1 + 2 files changed, 26 insertions(+) diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 346b0f269161..db47aae7c481 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -56,6 +56,31 @@ bool kthread_is_per_cpu(struct task_struct *k); __k; \ }) +/** + * kthread_run_on_cpu - create and wake a cpu bound thread. + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @cpu: The cpu on which the thread should be bound, + * @namefmt: printf-style name for the thread. Format is restricted + * to "name.*%u". Code fills in cpu number. + * + * Description: Convenient wrapper for kthread_create_on_cpu() + * followed by wake_up_process(). Returns the kthread or + * ERR_PTR(-ENOMEM). + */ +static inline struct task_struct * +kthread_run_on_cpu(int (*threadfn)(void *data), void *data, + unsigned int cpu, const char *namefmt) +{ + struct task_struct *p; + + p = kthread_create_on_cpu(threadfn, data, cpu, namefmt); + if (!IS_ERR(p)) + wake_up_process(p); + + return p; +} + void free_kthread_struct(struct task_struct *k); void kthread_bind(struct task_struct *k, unsigned int cpu); void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask); diff --git a/kernel/kthread.c b/kernel/kthread.c index 7113003fab63..4ed9e7bce9e8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -523,6 +523,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), to_kthread(p)->cpu = cpu; return p; } +EXPORT_SYMBOL(kthread_create_on_cpu); void kthread_set_per_cpu(struct task_struct *k, int cpu) { -- cgit From e0850113937b843c69b50b5d9087978ae4254be7 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:02:55 -0800 Subject: RDMA/siw: make use of the helper function kthread_run_on_cpu() Replace kthread_create/kthread_bind/wake_up_process() with kthread_run_on_cpu() to simplify the code. Link: https://lkml.kernel.org/r/20211022025711.3673-3-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E . 
McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/sw/siw/siw_main.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index 9093e6a80b26..e5c586913d0b 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -98,15 +98,14 @@ static int siw_create_tx_threads(void) continue; siw_tx_thread[cpu] = - kthread_create(siw_run_sq, (unsigned long *)(long)cpu, - "siw_tx/%d", cpu); + kthread_run_on_cpu(siw_run_sq, + (unsigned long *)(long)cpu, + cpu, "siw_tx/%u"); if (IS_ERR(siw_tx_thread[cpu])) { siw_tx_thread[cpu] = NULL; continue; } - kthread_bind(siw_tx_thread[cpu], cpu); - wake_up_process(siw_tx_thread[cpu]); assigned++; } return assigned; -- cgit From 64ed3a049e3e81b801e7c5bb052416152443f585 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:02:59 -0800 Subject: ring-buffer: make use of the helper function kthread_run_on_cpu() Replace kthread_create/kthread_bind/wake_up_process() with kthread_run_on_cpu() to simplify the code. Link: https://lkml.kernel.org/r/20211022025711.3673-4-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/ring_buffer.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2699e9e562b1..05dfc7a12d3d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5898,16 +5898,13 @@ static __init int test_ringbuffer(void) rb_data[cpu].buffer = buffer; rb_data[cpu].cpu = cpu; rb_data[cpu].cnt = cpu; - rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu], - "rbtester/%d", cpu); + rb_threads[cpu] = kthread_run_on_cpu(rb_test, &rb_data[cpu], + cpu, "rbtester/%u"); if (WARN_ON(IS_ERR(rb_threads[cpu]))) { pr_cont("FAILED\n"); ret = PTR_ERR(rb_threads[cpu]); goto out_free; } - - kthread_bind(rb_threads[cpu], cpu); - wake_up_process(rb_threads[cpu]); } /* Now create the rb hammer! */ -- cgit From 3b9cb4ba4b54ecc6cf7d04ea9085d2ad2be48733 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:03:02 -0800 Subject: rcutorture: make use of the helper function kthread_run_on_cpu() Replace kthread_create_on_node/kthread_bind/wake_up_process() with kthread_run_on_cpu() to simplify the code. Link: https://lkml.kernel.org/r/20211022025711.3673-5-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E . 
McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcu/rcutorture.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8b410d982990..42bc66a2f170 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1992,9 +1992,8 @@ static int rcutorture_booster_init(unsigned int cpu) mutex_lock(&boost_mutex); rcu_torture_disable_rt_throttle(); VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task"); - boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, - cpu_to_node(cpu), - "rcu_torture_boost"); + boost_tasks[cpu] = kthread_run_on_cpu(rcu_torture_boost, NULL, + cpu, "rcu_torture_boost_%u"); if (IS_ERR(boost_tasks[cpu])) { retval = PTR_ERR(boost_tasks[cpu]); VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed"); @@ -2003,8 +2002,6 @@ static int rcutorture_booster_init(unsigned int cpu) mutex_unlock(&boost_mutex); return retval; } - kthread_bind(boost_tasks[cpu], cpu); - wake_up_process(boost_tasks[cpu]); mutex_unlock(&boost_mutex); return 0; } -- cgit From 11e4e3523da98c065a6c249013ace0d388e41c25 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:03:06 -0800 Subject: trace/osnoise: make use of the helper function kthread_run_on_cpu() Replace kthread_create_on_cpu/wake_up_process() with kthread_run_on_cpu() to simplify the code. Link: https://lkml.kernel.org/r/20211022025711.3673-6-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/trace_osnoise.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 7520d43aed55..89d6cbac6f10 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1701,7 +1701,7 @@ static int start_kthread(unsigned int cpu) snprintf(comm, 24, "osnoise/%d", cpu); } - kthread = kthread_create_on_cpu(main, NULL, cpu, comm); + kthread = kthread_run_on_cpu(main, NULL, cpu, comm); if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); @@ -1710,7 +1710,6 @@ static int start_kthread(unsigned int cpu) } per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread; - wake_up_process(kthread); return 0; } -- cgit From ff78f6679d2e223e073fcbdc8f70b6bc0abadf99 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:03:10 -0800 Subject: trace/hwlat: make use of the helper function kthread_run_on_cpu() Replace kthread_create_on_cpu/wake_up_process() with kthread_run_on_cpu() to simplify the code. Link: https://lkml.kernel.org/r/20211022025711.3673-7-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E . 
McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/trace_hwlat.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 56bb7b890578..d440ddd5fd8b 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -491,18 +491,14 @@ static void stop_per_cpu_kthreads(void) static int start_cpu_kthread(unsigned int cpu) { struct task_struct *kthread; - char comm[24]; - snprintf(comm, 24, "hwlatd/%d", cpu); - - kthread = kthread_create_on_cpu(kthread_fn, NULL, cpu, comm); + kthread = kthread_run_on_cpu(kthread_fn, NULL, cpu, "hwlatd/%u"); if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); return -ENOMEM; } per_cpu(hwlat_per_cpu_data, cpu).kthread = kthread; - wake_up_process(kthread); return 0; } -- cgit From f2fed022aa0a1bce86ca02e16f6c5832711e6424 Mon Sep 17 00:00:00 2001 From: Yang Guang Date: Fri, 14 Jan 2022 14:03:13 -0800 Subject: ia64: module: use swap() to make code cleaner Use the macro 'swap()' defined in 'include/linux/minmax.h' to avoid opencoding it. Link: https://lkml.kernel.org/r/20211104062642.1506539-1-yang.guang5@zte.com.cn Signed-off-by: Yang Guang Reported-by: Zeal Robot Cc: David Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/module.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c index 2cba53c1da82..360f36b0eb3f 100644 --- a/arch/ia64/kernel/module.c +++ b/arch/ia64/kernel/module.c @@ -848,7 +848,7 @@ register_unwind_table (struct module *mod) { struct unw_table_entry *start = (void *) mod->arch.unwind->sh_addr; struct unw_table_entry *end = start + mod->arch.unwind->sh_size / sizeof (*start); - struct unw_table_entry tmp, *e1, *e2, *core, *init; + struct unw_table_entry *e1, *e2, *core, *init; unsigned long num_init = 0, num_core = 0; /* First, count how many init and core unwind-table entries there are. */ @@ -865,9 +865,7 @@ register_unwind_table (struct module *mod) for (e1 = start; e1 < end; ++e1) { for (e2 = e1 + 1; e2 < end; ++e2) { if (e2->start_offset < e1->start_offset) { - tmp = *e1; - *e1 = *e2; - *e2 = tmp; + swap(*e1, *e2); } } } -- cgit From 6c4420b09267050bc47b3999d80457a8dabaeb89 Mon Sep 17 00:00:00 2001 From: Yang Guang Date: Fri, 14 Jan 2022 14:03:16 -0800 Subject: arch/ia64/kernel/setup.c: use swap() to make code cleaner Use the macro 'swap()' defined in 'include/linux/minmax.h' to avoid opencoding it. 
Link: https://lkml.kernel.org/r/20211104001908.695110-1-yang.guang5@zte.com.cn Reported-by: Zeal Robot Signed-off-by: Yang Guang Cc: David Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/setup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 31fb84de2d21..5010348fa21b 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -208,10 +208,7 @@ sort_regions (struct rsvd_region *rsvd_region, int max) while (max--) { for (j = 0; j < max; ++j) { if (rsvd_region[j].start > rsvd_region[j+1].start) { - struct rsvd_region tmp; - tmp = rsvd_region[j]; - rsvd_region[j] = rsvd_region[j + 1]; - rsvd_region[j + 1] = tmp; + swap(rsvd_region[j], rsvd_region[j + 1]); } } } -- cgit From c5c2135412bdb11b419e41a0c128e423d8bf4f65 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 14 Jan 2022 14:03:19 -0800 Subject: ia64: fix typo in a comment The double `the' in a comment is repeated, thus it should be removed. Link: https://lkml.kernel.org/r/20211113030316.22650-1-wangborong@cdjrlc.com Signed-off-by: Jason Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/uncached.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index 51883a66aeb5..816803636a75 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -171,7 +171,7 @@ failed: * @n_pages: number of contiguous pages to allocate * * Allocate the specified number of contiguous uncached pages on the - * the requested node. If not enough contiguous uncached pages are available + * requested node. If not enough contiguous uncached pages are available * on the requested node, roundrobin starting with the next higher node. */ unsigned long uncached_alloc_page(int starting_nid, int n_pages) -- cgit From a7eddfc92bbd2463a89dd22011c047509e6c52a8 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 14 Jan 2022 14:03:22 -0800 Subject: ia64: topology: use default_groups in kobj_type There are currently two ways to create a set of sysfs files for a kobj_type, through the default_attrs field, and the default_groups field. Move the ia64 topology sysfs code to use default_groups field which has been the preferred way since aa30f47cf666 ("kobject: Add support for default attribute groups to kobj_type") so that we can soon get rid of the obsolete default_attrs field. 
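For context, ATTRIBUTE_GROUPS(cache_default) in the hunk below roughly expands to (an illustrative expansion of the include/linux/sysfs.h macro, not literal patch content):

	static const struct attribute_group cache_default_group = {
		.attrs = cache_default_attrs,
	};
	static const struct attribute_group *cache_default_groups[] = {
		&cache_default_group,
		NULL,
	};

which is why pointing .default_groups at cache_default_groups produces the same sysfs files as the old .default_attrs array did.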
Link: https://lkml.kernel.org/r/20220104154800.1287947-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman Cc: Mike Rapoport Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/topology.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index 3639e0a7cb3b..e4992917a24b 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -264,6 +264,7 @@ static struct attribute * cache_default_attrs[] = { &shared_cpu_map.attr, NULL }; +ATTRIBUTE_GROUPS(cache_default); #define to_object(k) container_of(k, struct cache_info, kobj) #define to_attr(a) container_of(a, struct cache_attr, attr) @@ -284,7 +285,7 @@ static const struct sysfs_ops cache_sysfs_ops = { static struct kobj_type cache_ktype = { .sysfs_ops = &cache_sysfs_ops, - .default_attrs = cache_default_attrs, + .default_groups = cache_default_groups, }; static struct kobj_type cache_ktype_percpu_entry = { -- cgit From 9a69f2b0e4180dc289d0f68842f9e8b146e926f7 Mon Sep 17 00:00:00 2001 From: Drew Fustini Date: Fri, 14 Jan 2022 14:03:25 -0800 Subject: scripts/spelling.txt: add "oveflow" Add typo "oveflow" for "overflow". This typo was found and fixed in tools/testing/selftests/bpf/prog_tests/btf_dump.c Link: https://lore.kernel.org/all/20211122070528.837806-1-dfustini@baylibre.com/ Link: https://lkml.kernel.org/r/20211122072302.839102-1-dfustini@baylibre.com Signed-off-by: Drew Fustini Suggested-by: Gustavo A. R. Silva Cc: Colin Ian King Cc: Drew Fustini Cc: zuoqilin Cc: Tom Saeger Cc: Sven Eckelmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index acf6ea711299..0c8b79cfb1bb 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -1046,6 +1046,7 @@ oustanding||outstanding overaall||overall overhread||overhead overlaping||overlapping +oveflow||overflow overflw||overflow overlfow||overflow overide||override -- cgit From 7e0af97853954afd995598ac8dac670a734ade17 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 14 Jan 2022 14:03:28 -0800 Subject: fs/ntfs/attrib.c: fix one kernel-doc comment The comments for the file should not be in kernel-doc format: /** * attrib.c - NTFS attribute operations. Part of the Linux-NTFS as it causes it to be incorrectly identified for function ntfs_map_runlist_nolock(), causing some warnings found by running scripts/kernel-doc.: fs/ntfs/attrib.c:25: warning: Incorrect use of kernel-doc format: * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode fs/ntfs/attrib.c:71: warning: Function parameter or member 'ni' not described in 'ntfs_map_runlist_nolock' fs/ntfs/attrib.c:71: warning: Function parameter or member 'vcn' not described in 'ntfs_map_runlist_nolock' fs/ntfs/attrib.c:71: warning: Function parameter or member 'ctx' not described in 'ntfs_map_runlist_nolock' fs/ntfs/attrib.c:71: warning: expecting prototype for attrib.c - NTFS attribute operations. Part of the Linux(). 
Prototype was for ntfs_map_runlist_nolock() instead Link: https://lkml.kernel.org/r/20220106015145.67067-1-yang.lee@linux.alibaba.com Signed-off-by: Yang Li Reported-by: Abaci Robot Acked-by: Randy Dunlap Cc: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/attrib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index d563abc3e136..2911c04a33e0 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project. * * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. -- cgit From 9eec1d897139e5de287af5d559a02b811b844d82 Mon Sep 17 00:00:00 2001 From: Zheng Liang Date: Fri, 14 Jan 2022 14:03:31 -0800 Subject: squashfs: provide backing_dev_info in order to disable read-ahead Commit c1f6925e1091 ("mm: put readahead pages in cache earlier") causes the read performance of squashfs to deteriorate.Through testing, we find that the performance will be back by closing the readahead of squashfs. So we want to learn the way of ubifs, provides backing_dev_info and disable read-ahead We tested the following data by fio. squashfs image blocksize=128K test command: fio --name basic --bs=? --filename="/mnt/test_file" --rw=? --iodepth=1 --ioengine=psync --runtime=200 --time_based turn on squashfs readahead in 5.10 kernel bs(k) read/randread MB/s 4 randread 271 128 randread 231 1024 randread 246 4 read 310 128 read 245 1024 read 247 turn off squashfs readahead in 5.10 kernel bs(k) read/randread MB/s 4 randread 293 128 randread 330 1024 randread 363 4 read 338 128 read 360 1024 read 365 turn on squashfs readahead and revert the commit c1f6925e1091("mm: put readahead pages in cache earlier") in 5.10 kernel bs(k) read/randread MB/s 4 randread 289 128 randread 306 1024 randread 335 4 read 337 128 read 336 1024 read 338 Link: https://lkml.kernel.org/r/20211116113141.1391026-1-zhengliang6@huawei.com Signed-off-by: Zheng Liang Reviewed-by: Phillip Lougher Cc: Zhang Yi Cc: Hou Tao Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/squashfs/super.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index bb44ff4c5cc6..b1b556dbce12 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -112,6 +113,24 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem( return decompressor; } +static int squashfs_bdi_init(struct super_block *sb) +{ + int err; + unsigned int major = MAJOR(sb->s_dev); + unsigned int minor = MINOR(sb->s_dev); + + bdi_put(sb->s_bdi); + sb->s_bdi = &noop_backing_dev_info; + + err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor); + if (err) + return err; + + sb->s_bdi->ra_pages = 0; + sb->s_bdi->io_pages = 0; + + return 0; +} static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) { @@ -127,6 +146,20 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) TRACE("Entered squashfs_fill_superblock\n"); + /* + * squashfs provides 'backing_dev_info' in order to disable read-ahead. For + * squashfs, I/O is not deferred, it is done immediately in readpage, + * which means the user would always have to wait their own I/O. So the effect + * of readahead is very weak for squashfs. 
squashfs_bdi_init will set + * sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0 and close readahead for + * squashfs. + */ + err = squashfs_bdi_init(sb); + if (err) { + errorf(fc, "squashfs init bdi failed"); + return err; + } + sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); if (sb->s_fs_info == NULL) { ERROR("Failed to allocate squashfs_sb_info\n"); -- cgit From 783cc68d6143da3c8bc6322b80abd96640f6066f Mon Sep 17 00:00:00 2001 From: Zhang Mingyu Date: Fri, 14 Jan 2022 14:03:35 -0800 Subject: ocfs2: use BUG_ON instead of if condition followed by BUG. This issue was detected with the help of Coccinelle. Link: https://lkml.kernel.org/r/20211105014424.75372-1-zhang.mingyu@zte.com.cn Signed-off-by: Zhang Mingyu Reported-by: Zeal Robot Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/journal.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index dbf9b9e97d74..1887a2708709 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1669,8 +1669,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, status = jbd2_journal_load(journal); if (status < 0) { mlog_errno(status); - if (!igrab(inode)) - BUG(); + BUG_ON(!igrab(inode)); jbd2_journal_destroy(journal); goto done; } @@ -1699,8 +1698,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, if (status < 0) mlog_errno(status); - if (!igrab(inode)) - BUG(); + BUG_ON(!igrab(inode)); jbd2_journal_destroy(journal); -- cgit From e07bf00c40c6cce051ca7f95a6050d2a195b4f98 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 14 Jan 2022 14:03:38 -0800 Subject: ocfs2: clearly handle ocfs2_grab_pages_for_write() return value ocfs2_grab_pages_for_write() may return -EAGAIN if write context type is mmap and it could not lock the target page. In this case, we exit with no error and no target page. And then trigger the caller page_mkwrite() to retry. Since there are other caller types, e.g. buffer and direct io, make the return value handling more clear. Link: https://lkml.kernel.org/r/20211206065051.103353-1-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Reported-by: Dan Carpenter Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 68d11c295dd3..498da317580a 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1799,20 +1799,20 @@ try_again: */ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, cluster_of_pages, mmap_page); - if (ret && ret != -EAGAIN) { - mlog_errno(ret); - goto out_quota; - } + if (ret) { + /* + * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock + * the target page. In this case, we exit with no error and no target + * page. This will trigger the caller, page_mkwrite(), to re-try + * the operation. + */ + if (type == OCFS2_WRITE_MMAP && ret == -EAGAIN) { + BUG_ON(wc->w_target_page); + ret = 0; + goto out_quota; + } - /* - * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock - * the target page. In this case, we exit with no error and no target - * page. This will trigger the caller, page_mkwrite(), to re-try - * the operation. 
- */ - if (ret == -EAGAIN) { - BUG_ON(wc->w_target_page); - ret = 0; + mlog_errno(ret); goto out_quota; } -- cgit From 59430cc1141caf75840bd69877b59d7bf292829e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 14 Jan 2022 14:03:41 -0800 Subject: ocfs2: use default_groups in kobj_type There are currently two ways to create a set of sysfs files for a kobj_type, through the default_attrs field, and the default_groups field. Move the ocfs2 code to use default_groups field which has been the preferred way since aa30f47cf666 ("kobject: Add support for default attribute groups to kobj_type") so that we can soon get rid of the obsolete default_attrs field. Link: https://lkml.kernel.org/r/20211228144517.391660-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/filecheck.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index de56e6231af8..1ad7106741f8 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -94,6 +94,7 @@ static struct attribute *ocfs2_filecheck_attrs[] = { &ocfs2_filecheck_attr_set.attr, NULL }; +ATTRIBUTE_GROUPS(ocfs2_filecheck); static void ocfs2_filecheck_release(struct kobject *kobj) { @@ -138,7 +139,7 @@ static const struct sysfs_ops ocfs2_filecheck_ops = { }; static struct kobj_type ocfs2_ktype_filecheck = { - .default_attrs = ocfs2_filecheck_attrs, + .default_groups = ocfs2_filecheck_groups, .sysfs_ops = &ocfs2_filecheck_ops, .release = ocfs2_filecheck_release, }; -- cgit From f018844f834a2fc3bc7ba5f6915d5020e930a086 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 14 Jan 2022 14:03:45 -0800 Subject: ocfs2: remove redundant assignment to pointer root_bh The variable 'root_bh' is being initialized with a value that is not read, it is being re-assigned later on closer to its use. The early initialization is redundant and can be removed. Link: https://lkml.kernel.org/r/20211228013719.620923-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index bb247bc349e4..bf9357123bc5 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -2040,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle, int i, idx; struct ocfs2_extent_list *el, *left_el, *right_el; struct ocfs2_extent_rec *left_rec, *right_rec; - struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; + struct buffer_head *root_bh; /* * Update the counts and position values within all the -- cgit From d141b39b398460391b98b817fa6284773e842c45 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 14 Jan 2022 14:03:48 -0800 Subject: ocfs2: cluster: use default_groups in kobj_type There are currently two ways to create a set of sysfs files for a kobj_type, through the default_attrs field, and the default_groups field. Move the ocfs2 cluster sysfs code to use default_groups field which has been the preferred way since aa30f47cf666 ("kobject: Add support for default attribute groups to kobj_type") so that we can soon get rid of the obsolete default_attrs field. 
Link: https://lkml.kernel.org/r/20220106102028.3345634-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman Reviewed-by: Joseph Qi Tested-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/masklog.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 810d32815593..563881ddbf00 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -120,7 +120,8 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(KTHREAD), }; -static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; +static struct attribute *mlog_default_attrs[MLOG_MAX_BITS] = {NULL, }; +ATTRIBUTE_GROUPS(mlog_default); static ssize_t mlog_show(struct kobject *obj, struct attribute *attr, char *buf) @@ -144,8 +145,8 @@ static const struct sysfs_ops mlog_attr_ops = { }; static struct kobj_type mlog_ktype = { - .default_attrs = mlog_attr_ptrs, - .sysfs_ops = &mlog_attr_ops, + .default_groups = mlog_default_groups, + .sysfs_ops = &mlog_attr_ops, }; static struct kset mlog_kset = { @@ -157,10 +158,10 @@ int mlog_sys_init(struct kset *o2cb_kset) int i = 0; while (mlog_attrs[i].attr.mode) { - mlog_attr_ptrs[i] = &mlog_attrs[i].attr; + mlog_default_attrs[i] = &mlog_attrs[i].attr; i++; } - mlog_attr_ptrs[i] = NULL; + mlog_default_attrs[i] = NULL; kobject_set_name(&mlog_kset.kobj, "logmask"); mlog_kset.kobj.kset = o2cb_kset; -- cgit From 9a25d051502ca1f19af3fd8e196c408a4a9c9fbb Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 14 Jan 2022 14:03:51 -0800 Subject: ocfs2: remove redundant assignment to variable free_space The variable 'free_space' is being initialized with a value that is not read, it is being re-assigned later in the two paths of an if statement. The early initialization is redundant and can be removed. Link: https://lkml.kernel.org/r/20220112230411.1090761-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index bd8d534f11cb..f2cc1ff29e6d 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3343,7 +3343,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, struct ocfs2_dir_entry *de, *last_de = NULL; char *de_buf, *limit; unsigned long offset = 0; - unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize; + unsigned int rec_len, new_rec_len, free_space; /* * This calculates how many free bytes we'd have in block zero, should -- cgit From a12cf8b32ceed9c60c8bba7c46077ebffbfb9db2 Mon Sep 17 00:00:00 2001 From: Amit Daniel Kachhap Date: Fri, 14 Jan 2022 14:03:55 -0800 Subject: fs/ioctl: remove unnecessary __user annotation __user annotations are used by the checker (e.g sparse) to mark user pointers. However here __user is applied to a struct directly, without a pointer being directly involved. Although the presence of __user does not cause sparse to emit a warning, __user should be removed for consistency with other uses of offsetof(). Note: No functional changes intended. 
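For clarity, the size computation the hunk below preserves is (illustrative arithmetic, assuming info[] is the trailing flexible array member of struct file_dedupe_range):

	size = offsetof(struct file_dedupe_range, info[count]);
	/* == offsetof(struct file_dedupe_range, info)
	 *    + count * sizeof(struct file_dedupe_range_info) */

The __user qualifier never affected that value, so removing it is purely a consistency cleanup.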
Link: https://lkml.kernel.org/r/20211122101256.7875-1-amit.kachhap@arm.com Signed-off-by: Amit Daniel Kachhap Cc: Vincenzo Frascino Cc: Kevin Brodsky Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ioctl.c b/fs/ioctl.c index 504e69578112..1ed097e94af2 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -430,7 +430,7 @@ static int ioctl_file_dedupe_range(struct file *file, goto out; } - size = offsetof(struct file_dedupe_range __user, info[count]); + size = offsetof(struct file_dedupe_range, info[count]); if (size > PAGE_SIZE) { ret = -ENOMEM; goto out; -- cgit From 7302e91f39a81a9c2efcf4bc5749d18128366945 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 14 Jan 2022 14:03:58 -0800 Subject: mm/slab_common: use WARN() if cache still has objects on destroy Calling kmem_cache_destroy() while the cache still has objects allocated is a kernel bug, and will usually result in the entire cache being leaked. While the message in kmem_cache_destroy() resembles a warning, it is currently not implemented using a real WARN(). This is problematic for infrastructure testing the kernel, all of which rely on the specific format of WARN()s to pick up on bugs. Some 13 years ago this used to be a simple WARN_ON() in slub, but commit d629d8195793 ("slub: improve kmem_cache_destroy() error message") changed it into an open-coded warning to avoid confusion with a bug in slub itself. Instead, turn the open-coded warning into a real WARN() with the message preserved, so that test systems can actually identify these issues, and we get all the other benefits of using a normal WARN(). The warning message is extended with "when called from " to make it even clearer where the fault lies. For most configurations this is only a cosmetic change, however, note that WARN() here will now also respect panic_on_warn. Link: https://lkml.kernel.org/r/20211102170733.648216-1-elver@google.com Signed-off-by: Marco Elver Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index e5d080a93009..c6213f18eb3a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -489,8 +489,6 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - int err; - if (unlikely(!s)) return; @@ -501,12 +499,9 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - err = shutdown_cache(s); - if (err) { - pr_err("%s %s: Slab cache still has objects\n", - __func__, s->name); - dump_stack(); - } + WARN(shutdown_cache(s), + "%s %s: Slab cache still has objects when called from %pS", + __func__, s->name, (void *)_RET_IP_); out_unlock: mutex_unlock(&slab_mutex); cpus_read_unlock(); -- cgit From c29b5b3d33a61e122cb493917ba51c82bcac4121 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 14 Jan 2022 14:04:01 -0800 Subject: mm: slab: make slab iterator functions static There is no external users of slab_start/next/stop(), so make them static. And the memory.kmem.slabinfo is deprecated, which outputs nothing now, so move memcg_slab_show() into mm/memcontrol.c and rename it to mem_cgroup_slab_show to be consistent with other function names. 
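For reference, the only remaining consumer of these iterators is the /proc/slabinfo seq_file interface in the same file, wired up roughly as follows (sketch for context, not part of this diff):

	static const struct seq_operations slabinfo_op = {
		.start = slab_start,
		.next  = slab_next,
		.stop  = slab_stop,
		.show  = slab_show,
	};

so making slab_start/next/stop() static changes nothing outside mm/slab_common.c.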
Link: https://lkml.kernel.org/r/20211109133359.32881-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Vlastimil Babka Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 13 ++++++++++++- mm/slab.h | 5 ----- mm/slab_common.c | 17 +++-------------- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2ed5f2a0879d..3542237c833f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4845,6 +4845,17 @@ out_kfree: return ret; } +#if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) +static int mem_cgroup_slab_show(struct seq_file *m, void *p) +{ + /* + * Deprecated. + * Please, take a look at tools/cgroup/slabinfo.py . + */ + return 0; +} +#endif + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@ -4945,7 +4956,7 @@ static struct cftype mem_cgroup_legacy_files[] = { (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) { .name = "kmem.slabinfo", - .seq_show = memcg_slab_show, + .seq_show = mem_cgroup_slab_show, }, #endif { diff --git a/mm/slab.h b/mm/slab.h index 56ad7eea3ddf..053eefaf6cbd 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -575,11 +575,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) #endif -void *slab_start(struct seq_file *m, loff_t *pos); -void *slab_next(struct seq_file *m, void *p, loff_t *pos); -void slab_stop(struct seq_file *m, void *p); -int memcg_slab_show(struct seq_file *m, void *p); - #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) void dump_unreclaimable_slab(void); #else diff --git a/mm/slab_common.c b/mm/slab_common.c index c6213f18eb3a..b7c431819cdb 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1039,18 +1039,18 @@ static void print_slabinfo_header(struct seq_file *m) seq_putc(m, '\n'); } -void *slab_start(struct seq_file *m, loff_t *pos) +static void *slab_start(struct seq_file *m, loff_t *pos) { mutex_lock(&slab_mutex); return seq_list_start(&slab_caches, *pos); } -void *slab_next(struct seq_file *m, void *p, loff_t *pos) +static void *slab_next(struct seq_file *m, void *p, loff_t *pos) { return seq_list_next(p, &slab_caches, pos); } -void slab_stop(struct seq_file *m, void *p) +static void slab_stop(struct seq_file *m, void *p) { mutex_unlock(&slab_mutex); } @@ -1118,17 +1118,6 @@ void dump_unreclaimable_slab(void) mutex_unlock(&slab_mutex); } -#if defined(CONFIG_MEMCG_KMEM) -int memcg_slab_show(struct seq_file *m, void *p) -{ - /* - * Deprecated. - * Please, take a look at tools/cgroup/slabinfo.py . - */ - return 0; -} -#endif - /* * slabinfo_op - iterator that generates /proc/slabinfo * -- cgit From ad1a3e15fcd3b8ba0f5f60f6a2fe3938274fdf65 Mon Sep 17 00:00:00 2001 From: Kuan-Ying Lee Date: Fri, 14 Jan 2022 14:04:04 -0800 Subject: kmemleak: fix kmemleak false positive report with HW tag-based kasan enable With HW tag-based kasan enable, We will get the warning when we free object whose address starts with 0xFF. It is because kmemleak rbtree stores tagged object and this freeing object's tag does not match with rbtree object. In the example below, kmemleak rbtree stores the tagged object in the kmalloc(), and kfree() gets the pointer with 0xFF tag. 
Call sequence: ptr = kmalloc(size, GFP_KERNEL); page = virt_to_page(ptr); offset = offset_in_page(ptr); kfree(page_address(page) + offset); ptr = kmalloc(size, GFP_KERNEL); A sequence like that may cause the warning as following: 1) Freeing unknown object: In kfree(), we will get free unknown object warning in kmemleak_free(). Because object(0xFx) in kmemleak rbtree and pointer(0xFF) in kfree() have different tag. 2) Overlap existing: When we allocate that object with the same hw-tag again, we will find the overlap in the kmemleak rbtree and kmemleak thread will be killed. kmemleak: Freeing unknown object at 0xffff000003f88000 CPU: 5 PID: 177 Comm: cat Not tainted 5.16.0-rc1-dirty #21 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x1ac show_stack+0x1c/0x30 dump_stack_lvl+0x68/0x84 dump_stack+0x1c/0x38 kmemleak_free+0x6c/0x70 slab_free_freelist_hook+0x104/0x200 kmem_cache_free+0xa8/0x3d4 test_version_show+0x270/0x3a0 module_attr_show+0x28/0x40 sysfs_kf_seq_show+0xb0/0x130 kernfs_seq_show+0x30/0x40 seq_read_iter+0x1bc/0x4b0 seq_read_iter+0x1bc/0x4b0 kernfs_fop_read_iter+0x144/0x1c0 generic_file_splice_read+0xd0/0x184 do_splice_to+0x90/0xe0 splice_direct_to_actor+0xb8/0x250 do_splice_direct+0x88/0xd4 do_sendfile+0x2b0/0x344 __arm64_sys_sendfile64+0x164/0x16c invoke_syscall+0x48/0x114 el0_svc_common.constprop.0+0x44/0xec do_el0_svc+0x74/0x90 el0_svc+0x20/0x80 el0t_64_sync_handler+0x1a8/0x1b0 el0t_64_sync+0x1ac/0x1b0 ... kmemleak: Cannot insert 0xf2ff000003f88000 into the object search tree (overlaps existing) CPU: 5 PID: 178 Comm: cat Not tainted 5.16.0-rc1-dirty #21 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x1ac show_stack+0x1c/0x30 dump_stack_lvl+0x68/0x84 dump_stack+0x1c/0x38 create_object.isra.0+0x2d8/0x2fc kmemleak_alloc+0x34/0x40 kmem_cache_alloc+0x23c/0x2f0 test_version_show+0x1fc/0x3a0 module_attr_show+0x28/0x40 sysfs_kf_seq_show+0xb0/0x130 kernfs_seq_show+0x30/0x40 seq_read_iter+0x1bc/0x4b0 kernfs_fop_read_iter+0x144/0x1c0 generic_file_splice_read+0xd0/0x184 do_splice_to+0x90/0xe0 splice_direct_to_actor+0xb8/0x250 do_splice_direct+0x88/0xd4 do_sendfile+0x2b0/0x344 __arm64_sys_sendfile64+0x164/0x16c invoke_syscall+0x48/0x114 el0_svc_common.constprop.0+0x44/0xec do_el0_svc+0x74/0x90 el0_svc+0x20/0x80 el0t_64_sync_handler+0x1a8/0x1b0 el0t_64_sync+0x1ac/0x1b0 kmemleak: Kernel memory leak detector disabled kmemleak: Object 0xf2ff000003f88000 (size 128): kmemleak: comm "cat", pid 177, jiffies 4294921177 kmemleak: min_count = 1 kmemleak: count = 0 kmemleak: flags = 0x1 kmemleak: checksum = 0 kmemleak: backtrace: kmem_cache_alloc+0x23c/0x2f0 test_version_show+0x1fc/0x3a0 module_attr_show+0x28/0x40 sysfs_kf_seq_show+0xb0/0x130 kernfs_seq_show+0x30/0x40 seq_read_iter+0x1bc/0x4b0 kernfs_fop_read_iter+0x144/0x1c0 generic_file_splice_read+0xd0/0x184 do_splice_to+0x90/0xe0 splice_direct_to_actor+0xb8/0x250 do_splice_direct+0x88/0xd4 do_sendfile+0x2b0/0x344 __arm64_sys_sendfile64+0x164/0x16c invoke_syscall+0x48/0x114 el0_svc_common.constprop.0+0x44/0xec do_el0_svc+0x74/0x90 kmemleak: Automatic memory scanning thread ended [akpm@linux-foundation.org: whitespace tweak] Link: https://lkml.kernel.org/r/20211118054426.4123-1-Kuan-Ying.Lee@mediatek.com Signed-off-by: Kuan-Ying Lee Reviewed-by: Catalin Marinas Cc: Doug Berger Cc: Mel Gorman Cc: Peter Zijlstra Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/mm/kmemleak.c 
b/mm/kmemleak.c index b57383c17cf6..dc3758fdba68 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -381,15 +381,20 @@ static void dump_object_info(struct kmemleak_object *object) static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) { struct rb_node *rb = object_tree_root.rb_node; + unsigned long untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); while (rb) { - struct kmemleak_object *object = - rb_entry(rb, struct kmemleak_object, rb_node); - if (ptr < object->pointer) + struct kmemleak_object *object; + unsigned long untagged_objp; + + object = rb_entry(rb, struct kmemleak_object, rb_node); + untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer); + + if (untagged_ptr < untagged_objp) rb = object->rb_node.rb_left; - else if (object->pointer + object->size <= ptr) + else if (untagged_objp + object->size <= untagged_ptr) rb = object->rb_node.rb_right; - else if (object->pointer == ptr || alias) + else if (untagged_objp == untagged_ptr || alias) return object; else { kmemleak_warn("Found object by alias at 0x%08lx\n", @@ -576,6 +581,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, struct kmemleak_object *object, *parent; struct rb_node **link, *rb_parent; unsigned long untagged_ptr; + unsigned long untagged_objp; object = mem_pool_alloc(gfp); if (!object) { @@ -629,9 +635,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, while (*link) { rb_parent = *link; parent = rb_entry(rb_parent, struct kmemleak_object, rb_node); - if (ptr + size <= parent->pointer) + untagged_objp = (unsigned long)kasan_reset_tag((void *)parent->pointer); + if (untagged_ptr + size <= untagged_objp) link = &parent->rb_node.rb_left; - else if (parent->pointer + parent->size <= ptr) + else if (untagged_objp + parent->size <= untagged_ptr) link = &parent->rb_node.rb_right; else { kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n", -- cgit From 972fa3a7c17c9d60212e32ecc0205dc585b1e769 Mon Sep 17 00:00:00 2001 From: Calvin Zhang Date: Fri, 14 Jan 2022 14:04:08 -0800 Subject: mm: kmemleak: alloc gray object for reserved region with direct map Reserved regions with direct mapping may contain references to other regions. CMA region with fixed location is reserved without creating kmemleak_object for it. So add them as gray kmemleak objects. 
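For context, a "gray" kmemleak object is one registered with min_count == 0: kmemleak scans it for pointers to other objects but never reports the object itself as a leak. That is what the hunk below arranges for direct-mapped reserved regions:

	/* min_count == 0 => gray object: scanned for references, never reported as leaked */
	kmemleak_alloc_phys(base, size, 0, 0);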
Link: https://lkml.kernel.org/r/20211123090641.3654006-1-calvinzhang.cool@gmail.com Signed-off-by: Calvin Zhang Cc: Rob Herring Cc: Frank Rowand Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/of/fdt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index bdca35284ceb..116c582fea7a 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -26,6 +26,7 @@ #include #include #include +#include #include /* for COMMAND_LINE_SIZE */ #include @@ -522,9 +523,12 @@ static int __init __reserved_mem_reserve_reg(unsigned long node, size = dt_mem_next_cell(dt_root_size_cells, &prop); if (size && - early_init_dt_reserve_memory_arch(base, size, nomap) == 0) + early_init_dt_reserve_memory_arch(base, size, nomap) == 0) { pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n", uname, &base, (unsigned long)(size / SZ_1M)); + if (!nomap) + kmemleak_alloc_phys(base, size, 0, 0); + } else pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n", uname, &base, (unsigned long)(size / SZ_1M)); -- cgit From 60115fa54ad7b913b7cb5844e6b7ffeb842d55f2 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 14 Jan 2022 14:04:11 -0800 Subject: mm: defer kmemleak object creation of module_alloc() Yongqiang reports a kmemleak panic when module insmod/rmmod with KASAN enabled(without KASAN_VMALLOC) on x86[1]. When the module area allocates memory, it's kmemleak_object is created successfully, but the KASAN shadow memory of module allocation is not ready, so when kmemleak scan the module's pointer, it will panic due to no shadow memory with KASAN check. module_alloc __vmalloc_node_range kmemleak_vmalloc kmemleak_scan update_checksum kasan_module_alloc kmemleak_ignore Note, there is no problem if KASAN_VMALLOC enabled, the modules area entire shadow memory is preallocated. Thus, the bug only exits on ARCH which supports dynamic allocation of module area per module load, for now, only x86/arm64/s390 are involved. Add a VM_DEFER_KMEMLEAK flags, defer vmalloc'ed object register of kmemleak in module_alloc() to fix this issue. 
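In effect, the fixed ordering on the affected architectures becomes (illustrative flow only; the exact changes are in the hunks below):

	p = __vmalloc_node_range(..., VM_DEFER_KMEMLEAK, ...);  /* kmemleak registration deferred */
	kasan_module_alloc(p, size, gfp_mask);                  /* shadow is set up first ...     */
	    -> kmemleak_vmalloc(vm, size, gfp_mask);            /* ... then the object is created */

so the kmemleak object only exists once the module area's KASAN shadow is ready to be scanned.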
[1] https://lore.kernel.org/all/6d41e2b9-4692-5ec4-b1cd-cbe29ae89739@huawei.com/ [wangkefeng.wang@huawei.com: fix build] Link: https://lkml.kernel.org/r/20211125080307.27225-1-wangkefeng.wang@huawei.com [akpm@linux-foundation.org: simplify ifdefs, per Andrey] Link: https://lkml.kernel.org/r/CA+fCnZcnwJHUQq34VuRxpdoY6_XbJCDJ-jopksS5Eia4PijPzw@mail.gmail.com Link: https://lkml.kernel.org/r/20211124142034.192078-1-wangkefeng.wang@huawei.com Fixes: 793213a82de4 ("s390/kasan: dynamic shadow mem allocation for modules") Fixes: 39d114ddc682 ("arm64: add KASAN support") Fixes: bebf56a1b176 ("kasan: enable instrumentation of global variables") Signed-off-by: Kefeng Wang Reported-by: Yongqiang Liu Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Catalin Marinas Cc: Will Deacon Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Alexander Gordeev Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Alexander Potapenko Cc: Kefeng Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kernel/module.c | 4 ++-- arch/s390/kernel/module.c | 5 +++-- arch/x86/kernel/module.c | 7 ++++--- include/linux/kasan.h | 4 ++-- include/linux/vmalloc.h | 7 +++++++ mm/kasan/shadow.c | 9 +++++++-- mm/vmalloc.c | 3 ++- 7 files changed, 27 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index b5ec010c481f..309a27553c87 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -36,7 +36,7 @@ void *module_alloc(unsigned long size) module_alloc_end = MODULES_END; p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, - module_alloc_end, gfp_mask, PAGE_KERNEL, 0, + module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && @@ -58,7 +58,7 @@ void *module_alloc(unsigned long size) PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size) < 0)) { + if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index b01ba460b7ca..d52d85367bf7 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -37,14 +37,15 @@ void *module_alloc(unsigned long size) { + gfp_t gfp_mask = GFP_KERNEL; void *p; if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, + gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size) < 0)) { + if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 169fb6f4cd2e..95fa745e310a 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -67,6 +67,7 @@ static unsigned long int get_module_load_offset(void) void *module_alloc(unsigned long size) { + gfp_t gfp_mask = GFP_KERNEL; void *p; if (PAGE_ALIGN(size) > MODULES_LEN) @@ -74,10 +75,10 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), - MODULES_END, GFP_KERNEL, - PAGE_KERNEL, 0, NUMA_NO_NODE, + MODULES_END, gfp_mask, + PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size) < 0)) { + if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { 
vfree(p); return NULL; } diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d8783b682669..89c99e5e67de 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -474,12 +474,12 @@ static inline void kasan_populate_early_vm_area_shadow(void *start, * allocations with real shadow memory. With KASAN vmalloc, the special * case is unnecessary, as the work is handled in the generic case. */ -int kasan_module_alloc(void *addr, size_t size); +int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask); void kasan_free_shadow(const struct vm_struct *vm); #else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ -static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } +static inline int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { return 0; } static inline void kasan_free_shadow(const struct vm_struct *vm) {} #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 6e022cc712e6..880227b9f044 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -28,6 +28,13 @@ struct notifier_block; /* in notifier.h */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ #define VM_NO_HUGE_VMAP 0x00000400 /* force PAGE_SIZE pte mapping */ +#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ + !defined(CONFIG_KASAN_VMALLOC) +#define VM_DEFER_KMEMLEAK 0x00000800 /* defer kmemleak object creation */ +#else +#define VM_DEFER_KMEMLEAK 0 +#endif + /* * VM_KASAN is used slightly differently depending on CONFIG_KASAN_VMALLOC. * diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 4a4929b29a23..94136f84b449 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -498,7 +498,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, #else /* CONFIG_KASAN_VMALLOC */ -int kasan_module_alloc(void *addr, size_t size) +int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { void *ret; size_t scaled_size; @@ -520,9 +520,14 @@ int kasan_module_alloc(void *addr, size_t size) __builtin_return_address(0)); if (ret) { + struct vm_struct *vm = find_vm_area(addr); __memset(ret, KASAN_SHADOW_INIT, shadow_size); - find_vm_area(addr)->flags |= VM_KASAN; + vm->flags |= VM_KASAN; kmemleak_ignore(ret); + + if (vm->flags & VM_DEFER_KMEMLEAK) + kmemleak_vmalloc(vm, size, gfp_mask); + return 0; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d2a00ad4e1dd..bf3c2fe8f528 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3074,7 +3074,8 @@ again: clear_vm_uninitialized_flag(area); size = PAGE_ALIGN(size); - kmemleak_vmalloc(area, size, gfp_mask); + if (!(vm_flags & VM_DEFER_KMEMLEAK)) + kmemleak_vmalloc(area, size, gfp_mask); return addr; -- cgit From 5b24eeef06701cca6852f1bf768248ccc912819b Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:15 -0800 Subject: mm/page_alloc: split prep_compound_page into head and tail subparts Patch series "mm, device-dax: Introduce compound pages in devmap", v7. This series converts device-dax to use compound pages, and moves away from the 'struct page per basepage on PMD/PUD' that is done today. Doing so 1) unlocks a few noticeable improvements on unpin_user_pages() and makes device-dax+altmap case 4x times faster in pinning (numbers below and in last patch) 2) as mentioned in various other threads it's one important step towards cleaning up ZONE_DEVICE refcounting. 
I've split the compound pages on devmap part from the rest based on recent discussions on devmap pending and future work planned[5][6]. There is consensus that device-dax should be using compound pages to represent its PMD/PUDs just like HugeTLB and THP, and that leads to less specialization of the dax parts. I will pursue the rest of the work in parallel once this part is merged, particular the GUP-{slow,fast} improvements [7] and the tail struct page deduplication memory savings part[8]. To summarize what the series does: Patch 1: Prepare hwpoisoning to work with dax compound pages. Patches 2-3: Split the current utility function of prep_compound_page() into head and tail and use those two helpers where appropriate to take advantage of caches being warm after __init_single_page(). This is used when initializing zone device when we bring up device-dax namespaces. Patches 4-10: Add devmap support for compound pages in device-dax. memmap_init_zone_device() initialize its metadata as compound pages, and it introduces a new devmap property known as vmemmap_shift which outlines how the vmemmap is structured (defaults to base pages as done today). The property describe the page order of the metadata essentially. While at it do a few cleanups in device-dax in patches 5-9. Finally enable device-dax usage of devmap @vmemmap_shift to a value based on its own @align property. @vmemmap_shift returns 0 by default (which is today's case of base pages in devmap, like fsdax or the others) and the usage of compound devmap is optional. Starting with device-dax (*not* fsdax) we enable it by default. There are a few pinning improvements particular on the unpinning case and altmap, as well as unpin_user_page_range_dirty_lock() being just as effective as THP/hugetlb[0] pages. $ gup_test -f /dev/dax1.0 -m 16384 -r 10 -S -a -n 512 -w (pin_user_pages_fast 2M pages) put:~71 ms -> put:~22 ms [altmap] (pin_user_pages_fast 2M pages) get:~524ms put:~525 ms -> get: ~127ms put:~71ms $ gup_test -f /dev/dax1.0 -m 129022 -r 10 -S -a -n 512 -w (pin_user_pages_fast 2M pages) put:~513 ms -> put:~188 ms [altmap with -m 127004] (pin_user_pages_fast 2M pages) get:~4.1 secs put:~4.12 secs -> get:~1sec put:~563ms Tested on x86 with 1Tb+ of pmem (alongside registering it with RDMA with and without altmap), alongside gup_test selftests with dynamic dax regions and static dax regions. Coupled with ndctl unit tests for dynamic dax devices that exercise all of this. Note, for dynamic dax regions I had to revert commit 8aa83e6395 ("x86/setup: Call early_reserve_memory() earlier"), it is a known issue that this commit broke efi_fake_mem=. This patch (of 11): Split the utility function prep_compound_page() into head and tail counterparts, and use them accordingly. This is in preparation for sharing the storage for compound page metadata. 
Link: https://lkml.kernel.org/r/20211202204422.26777-1-joao.m.martins@oracle.com Link: https://lkml.kernel.org/r/20211202204422.26777-3-joao.m.martins@oracle.com Signed-off-by: Joao Martins Acked-by: Mike Kravetz Reviewed-by: Dan Williams Reviewed-by: Muchun Song Cc: Vishal Verma Cc: Dave Jiang Cc: Naoya Horiguchi Cc: Matthew Wilcox (Oracle) Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jane Chu Cc: Jonathan Corbet Cc: Christoph Hellwig Cc: Jason Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c5952749ad40..20b9db0cf97c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -726,23 +726,33 @@ void free_compound_page(struct page *page) free_the_page(page, compound_order(page)); } +static void prep_compound_head(struct page *page, unsigned int order) +{ + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); + set_compound_order(page, order); + atomic_set(compound_mapcount_ptr(page), -1); + if (hpage_pincount_available(page)) + atomic_set(compound_pincount_ptr(page), 0); +} + +static void prep_compound_tail(struct page *head, int tail_idx) +{ + struct page *p = head + tail_idx; + + p->mapping = TAIL_MAPPING; + set_compound_head(p, head); +} + void prep_compound_page(struct page *page, unsigned int order) { int i; int nr_pages = 1 << order; __SetPageHead(page); - for (i = 1; i < nr_pages; i++) { - struct page *p = page + i; - p->mapping = TAIL_MAPPING; - set_compound_head(p, page); - } + for (i = 1; i < nr_pages; i++) + prep_compound_tail(page, i); - set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); - set_compound_order(page, order); - atomic_set(compound_mapcount_ptr(page), -1); - if (hpage_pincount_available(page)) - atomic_set(compound_pincount_ptr(page), 0); + prep_compound_head(page, order); } #ifdef CONFIG_DEBUG_PAGEALLOC -- cgit From 46487e0095f895c25da9feae27dc06d2aa76793d Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:18 -0800 Subject: mm/page_alloc: refactor memmap_init_zone_device() page init Move struct page init to an helper function __init_zone_device_page(). This is in preparation for sharing the storage for compound page metadata. Link: https://lkml.kernel.org/r/20211202204422.26777-4-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Dan Williams Cc: Christoph Hellwig Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 74 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 20b9db0cf97c..23045a2a1339 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6572,6 +6572,46 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone } #ifdef CONFIG_ZONE_DEVICE +static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, + unsigned long zone_idx, int nid, + struct dev_pagemap *pgmap) +{ + + __init_single_page(page, pfn, zone_idx, nid); + + /* + * Mark page reserved as it will need to wait for onlining + * phase for it to be fully associated with a zone. + * + * We can use the non-atomic __set_bit operation for setting + * the flag as we are still initializing the pages. 
+ */ + __SetPageReserved(page); + + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer + * and zone_device_data. It is a bug if a ZONE_DEVICE page is + * ever freed or placed on a driver-private list. + */ + page->pgmap = pgmap; + page->zone_device_data = NULL; + + /* + * Mark the block movable so that blocks are reserved for + * movable at startup. This will force kernel allocations + * to reserve their blocks rather than leaking throughout + * the address space during boot when many long-lived + * kernel allocations are made. + * + * Please note that MEMINIT_HOTPLUG path doesn't clear memmap + * because this is done early in section_activate() + */ + if (IS_ALIGNED(pfn, pageblock_nr_pages)) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + cond_resched(); + } +} + void __ref memmap_init_zone_device(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, @@ -6600,39 +6640,7 @@ void __ref memmap_init_zone_device(struct zone *zone, for (pfn = start_pfn; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); - __init_single_page(page, pfn, zone_idx, nid); - - /* - * Mark page reserved as it will need to wait for onlining - * phase for it to be fully associated with a zone. - * - * We can use the non-atomic __set_bit operation for setting - * the flag as we are still initializing the pages. - */ - __SetPageReserved(page); - - /* - * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer - * and zone_device_data. It is a bug if a ZONE_DEVICE page is - * ever freed or placed on a driver-private list. - */ - page->pgmap = pgmap; - page->zone_device_data = NULL; - - /* - * Mark the block movable so that blocks are reserved for - * movable at startup. This will force kernel allocations - * to reserve their blocks rather than leaking throughout - * the address space during boot when many long-lived - * kernel allocations are made. - * - * Please note that MEMINIT_HOTPLUG path doesn't clear memmap - * because this is done early in section_activate() - */ - if (IS_ALIGNED(pfn, pageblock_nr_pages)) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - cond_resched(); - } + __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); } pr_info("%s initialised %lu pages in %ums\n", __func__, -- cgit From c4386bd8ee3a921c3c799b7197dc898ade76a453 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:22 -0800 Subject: mm/memremap: add ZONE_DEVICE support for compound pages Add a new @vmemmap_shift property for struct dev_pagemap which specifies that a devmap is composed of a set of compound pages of order @vmemmap_shift, instead of base pages. When a compound page devmap is requested, all but the first page are initialised as tail pages instead of order-0 pages. For certain ZONE_DEVICE users like device-dax which have a fixed page size, this creates an opportunity to optimize GUP and GUP-fast walkers, treating it the same way as THP or hugetlb pages. Additionally, commit 7118fc2906e2 ("hugetlb: address ref count racing in prep_compound_gigantic_page") removed set_page_count() because the setting of page ref count to zero was redundant. devmap pages don't come from page allocator though and only head page refcount is used for compound pages, hence initialize tail page count to zero. 
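For a rough idea of how a ZONE_DEVICE user opts in, the hedged sketch below sets the new property before mapping the range. The function name and the shift value are assumptions for illustration (9 corresponds to 2M compound pages on a 4K PAGE_SIZE kernel), and the caller is assumed to have filled in pgmap->nr_range and the ranges already; device-dax does the equivalent later in the series, deriving the shift from its @align.

#include <linux/device.h>
#include <linux/memremap.h>

static void *demo_memremap_compound(struct device *dev,
				    struct dev_pagemap *pgmap)
{
	pgmap->type = MEMORY_DEVICE_GENERIC;
	/*
	 * Illustrative value: order-9 (2M) compound pages on 4K base
	 * pages. Leaving vmemmap_shift at 0 keeps today's base-page
	 * behaviour, so existing users are unaffected.
	 */
	pgmap->vmemmap_shift = 9;

	/* each compound page now spans pgmap_vmemmap_nr(pgmap) pfns */
	return devm_memremap_pages(dev, pgmap);
}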
Link: https://lkml.kernel.org/r/20211202204422.26777-5-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Dan Williams Cc: Christoph Hellwig Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memremap.h | 11 +++++++++++ mm/memremap.c | 18 ++++++++++++------ mm/page_alloc.c | 38 +++++++++++++++++++++++++++++++++++++- 3 files changed, 60 insertions(+), 7 deletions(-) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index c0e9d35889e8..61a6a0e27359 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -99,6 +99,11 @@ struct dev_pagemap_ops { * @done: completion for @internal_ref * @type: memory type: see MEMORY_* in memory_hotplug.h * @flags: PGMAP_* flags to specify defailed behavior + * @vmemmap_shift: structural definition of how the vmemmap page metadata + * is populated, specifically the metadata page order. + * A zero value (default) uses base pages as the vmemmap metadata + * representation. A bigger value will set up compound struct pages + * of the requested order value. * @ops: method table * @owner: an opaque pointer identifying the entity that manages this * instance. Used by various helpers to make sure that no @@ -114,6 +119,7 @@ struct dev_pagemap { struct completion done; enum memory_type type; unsigned int flags; + unsigned long vmemmap_shift; const struct dev_pagemap_ops *ops; void *owner; int nr_range; @@ -130,6 +136,11 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) return NULL; } +static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) +{ + return 1 << pgmap->vmemmap_shift; +} + #ifdef CONFIG_ZONE_DEVICE void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); diff --git a/mm/memremap.c b/mm/memremap.c index 5a66a71ab591..a2869d8519a2 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -102,15 +102,22 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id) return (range->start + range_len(range)) >> PAGE_SHIFT; } -static unsigned long pfn_next(unsigned long pfn) +static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn) { - if (pfn % 1024 == 0) + if (pfn % (1024 << pgmap->vmemmap_shift)) cond_resched(); - return pfn + 1; + return pfn + pgmap_vmemmap_nr(pgmap); +} + +static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id) +{ + return (pfn_end(pgmap, range_id) - + pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift; } #define for_each_device_pfn(pfn, map, i) \ - for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn)) + for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \ + pfn = pfn_next(map, pfn)) static void dev_pagemap_kill(struct dev_pagemap *pgmap) { @@ -295,8 +302,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], PHYS_PFN(range->start), PHYS_PFN(range_len(range)), pgmap); - percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id) - - pfn_first(pgmap, range_id)); + percpu_ref_get_many(pgmap->ref, pfn_len(pgmap, range_id)); return 0; err_add_memory: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 23045a2a1339..d59023a676ed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6612,6 +6612,35 @@ static void __ref 
__init_zone_device_page(struct page *page, unsigned long pfn, } } +static void __ref memmap_init_compound(struct page *head, + unsigned long head_pfn, + unsigned long zone_idx, int nid, + struct dev_pagemap *pgmap, + unsigned long nr_pages) +{ + unsigned long pfn, end_pfn = head_pfn + nr_pages; + unsigned int order = pgmap->vmemmap_shift; + + __SetPageHead(head); + for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) { + struct page *page = pfn_to_page(pfn); + + __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); + prep_compound_tail(head, pfn - head_pfn); + set_page_count(page, 0); + + /* + * The first tail page stores compound_mapcount_ptr() and + * compound_order() and the second tail page stores + * compound_pincount_ptr(). Call prep_compound_head() after + * the first and second tail pages have been initialized to + * not have the data overwritten. + */ + if (pfn == head_pfn + 2) + prep_compound_head(head, order); + } +} + void __ref memmap_init_zone_device(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, @@ -6620,6 +6649,7 @@ void __ref memmap_init_zone_device(struct zone *zone, unsigned long pfn, end_pfn = start_pfn + nr_pages; struct pglist_data *pgdat = zone->zone_pgdat; struct vmem_altmap *altmap = pgmap_altmap(pgmap); + unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap); unsigned long zone_idx = zone_idx(zone); unsigned long start = jiffies; int nid = pgdat->node_id; @@ -6637,10 +6667,16 @@ void __ref memmap_init_zone_device(struct zone *zone, nr_pages = end_pfn - start_pfn; } - for (pfn = start_pfn; pfn < end_pfn; pfn++) { + for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) { struct page *page = pfn_to_page(pfn); __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); + + if (pfns_per_compound == 1) + continue; + + memmap_init_compound(page, pfn, zone_idx, nid, pgmap, + pfns_per_compound); } pr_info("%s initialised %lu pages in %ums\n", __func__, -- cgit From b9b5777f09be84d0de472ded2253d2f5101427f2 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:26 -0800 Subject: device-dax: use ALIGN() for determining pgoff Rather than calculating @pgoff manually, switch to ALIGN() instead. Link: https://lkml.kernel.org/r/20211202204422.26777-6-joao.m.martins@oracle.com Suggested-by: Dan Williams Signed-off-by: Joao Martins Reviewed-by: Dan Williams Cc: Christoph Hellwig Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index dd8222a42808..0b82159b3564 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -234,8 +234,8 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, * mapped. No need to consider the zero page, or racing * conflicting mappings. */ - pgoff = linear_page_index(vmf->vma, vmf->address - & ~(fault_size - 1)); + pgoff = linear_page_index(vmf->vma, + ALIGN(vmf->address, fault_size)); for (i = 0; i < fault_size / PAGE_SIZE; i++) { struct page *page; -- cgit From 09b80137033dbc5f1d197e99116527c0f8d253f2 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:29 -0800 Subject: device-dax: use struct_size() Use the struct_size() helper for the size of a struct with variable array member at the end, rather than manually calculating it. 
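For readers unfamiliar with the helper: struct_size() lives in <linux/overflow.h> and computes sizeof(*p) plus n trailing array elements, saturating instead of wrapping on overflow. A minimal sketch with a made-up struct (not struct dev_pagemap):

#include <linux/overflow.h>
#include <linux/range.h>
#include <linux/slab.h>

struct demo_map {
	int nr_range;
	struct range ranges[];	/* flexible trailing array */
};

static struct demo_map *demo_map_alloc(int nr_range)
{
	/*
	 * Equivalent to sizeof(*map) + nr_range * sizeof(struct range),
	 * except that struct_size() saturates at SIZE_MAX on overflow,
	 * so a huge @nr_range makes the allocation fail rather than
	 * silently producing an undersized buffer.
	 */
	struct demo_map *map = kzalloc(struct_size(map, ranges, nr_range),
				       GFP_KERNEL);

	if (map)
		map->nr_range = nr_range;
	return map;
}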
Link: https://lkml.kernel.org/r/20211202204422.26777-7-joao.m.martins@oracle.com Suggested-by: Dan Williams Signed-off-by: Joao Martins Cc: Christoph Hellwig Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 0b82159b3564..038816b91af6 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -404,8 +404,9 @@ int dev_dax_probe(struct dev_dax *dev_dax) return -EINVAL; if (!pgmap) { - pgmap = devm_kzalloc(dev, sizeof(*pgmap) + sizeof(struct range) - * (dev_dax->nr_range - 1), GFP_KERNEL); + pgmap = devm_kzalloc(dev, + struct_size(pgmap, ranges, dev_dax->nr_range - 1), + GFP_KERNEL); if (!pgmap) return -ENOMEM; pgmap->nr_range = dev_dax->nr_range; -- cgit From fc65c4eb0b2a27c30d35636650e3f4ddb07506cd Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:33 -0800 Subject: device-dax: ensure dev_dax->pgmap is valid for dynamic devices Right now, only static dax regions have a valid @pgmap pointer in its struct dev_dax. Dynamic dax case however, do not. In preparation for device-dax compound devmap support, make sure that dev_dax pgmap field is set after it has been allocated and initialized. dynamic dax device have the @pgmap is allocated at probe() and it's managed by devm (contrast to static dax region which a pgmap is provided and dax core kfrees it). So in addition to ensure a valid @pgmap, clear the pgmap when the dynamic dax device is released to avoid the same pgmap ranges to be re-requested across multiple region device reconfigs. Add a static_dev_dax() and use that helper in dev_dax_probe() to ensure the initialization differences between dynamic and static regions are more explicit. While at it, consolidate the ranges initialization when we allocate the @pgmap for the dynamic dax region case. Also take the opportunity to document the differences between static and dynamic da regions. Link: https://lkml.kernel.org/r/20211202204422.26777-8-joao.m.martins@oracle.com Suggested-by: Dan Williams Signed-off-by: Joao Martins Cc: Christoph Hellwig Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/bus.c | 32 ++++++++++++++++++++++++++++++++ drivers/dax/bus.h | 1 + drivers/dax/device.c | 29 +++++++++++++++++++++-------- 3 files changed, 54 insertions(+), 8 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 6cc4da4c713d..a22350e822fa 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -129,11 +129,35 @@ ATTRIBUTE_GROUPS(dax_drv); static int dax_bus_match(struct device *dev, struct device_driver *drv); +/* + * Static dax regions are regions created by an external subsystem + * nvdimm where a single range is assigned. Its boundaries are by the external + * subsystem and are usually limited to one physical memory range. For example, + * for PMEM it is usually defined by NVDIMM Namespace boundaries (i.e. a + * single contiguous range) + * + * On dynamic dax regions, the assigned region can be partitioned by dax core + * into multiple subdivisions. 
A subdivision is represented into one + * /dev/daxN.M device composed by one or more potentially discontiguous ranges. + * + * When allocating a dax region, drivers must set whether it's static + * (IORESOURCE_DAX_STATIC). On static dax devices, the @pgmap is pre-assigned + * to dax core when calling devm_create_dev_dax(), whereas in dynamic dax + * devices it is NULL but afterwards allocated by dax core on device ->probe(). + * Care is needed to make sure that dynamic dax devices are torn down with a + * cleared @pgmap field (see kill_dev_dax()). + */ static bool is_static(struct dax_region *dax_region) { return (dax_region->res.flags & IORESOURCE_DAX_STATIC) != 0; } +bool static_dev_dax(struct dev_dax *dev_dax) +{ + return is_static(dev_dax->region); +} +EXPORT_SYMBOL_GPL(static_dev_dax); + static u64 dev_dax_size(struct dev_dax *dev_dax) { u64 size = 0; @@ -363,6 +387,14 @@ void kill_dev_dax(struct dev_dax *dev_dax) kill_dax(dax_dev); unmap_mapping_range(inode->i_mapping, 0, 0, 1); + + /* + * Dynamic dax region have the pgmap allocated via dev_kzalloc() + * and thus freed by devm. Clear the pgmap to not have stale pgmap + * ranges on probe() from previous reconfigurations of region devices. + */ + if (!static_dev_dax(dev_dax)) + dev_dax->pgmap = NULL; } EXPORT_SYMBOL_GPL(kill_dev_dax); diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index 1e946ad7780a..4acdfee7dd59 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -48,6 +48,7 @@ int __dax_driver_register(struct dax_device_driver *dax_drv, __dax_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) void dax_driver_unregister(struct dax_device_driver *dax_drv); void kill_dev_dax(struct dev_dax *dev_dax); +bool static_dev_dax(struct dev_dax *dev_dax); #if IS_ENABLED(CONFIG_DEV_DAX_PMEM_COMPAT) int dev_dax_probe(struct dev_dax *dev_dax); diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 038816b91af6..630de5a795b0 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -398,18 +398,34 @@ int dev_dax_probe(struct dev_dax *dev_dax) void *addr; int rc, i; - pgmap = dev_dax->pgmap; - if (dev_WARN_ONCE(dev, pgmap && dev_dax->nr_range > 1, - "static pgmap / multi-range device conflict\n")) - return -EINVAL; + if (static_dev_dax(dev_dax)) { + if (dev_dax->nr_range > 1) { + dev_warn(dev, + "static pgmap / multi-range device conflict\n"); + return -EINVAL; + } + + pgmap = dev_dax->pgmap; + } else { + if (dev_dax->pgmap) { + dev_warn(dev, + "dynamic-dax with pre-populated page map\n"); + return -EINVAL; + } - if (!pgmap) { pgmap = devm_kzalloc(dev, struct_size(pgmap, ranges, dev_dax->nr_range - 1), GFP_KERNEL); if (!pgmap) return -ENOMEM; + pgmap->nr_range = dev_dax->nr_range; + dev_dax->pgmap = pgmap; + + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + pgmap->ranges[i] = *range; + } } for (i = 0; i < dev_dax->nr_range; i++) { @@ -421,9 +437,6 @@ int dev_dax_probe(struct dev_dax *dev_dax) i, range->start, range->end); return -EBUSY; } - /* don't update the range for static pgmap */ - if (!dev_dax->pgmap) - pgmap->ranges[i] = *range; } pgmap->type = MEMORY_DEVICE_GENERIC; -- cgit From a0fb038e50d72f8e60731dc48fb83a3a141b822e Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:36 -0800 Subject: device-dax: factor out page mapping initialization Move initialization of page->mapping into a separate helper. 
This is in preparation to move the mapping set to be prior to inserting the page table entry and also for tidying up compound page handling into one helper. Link: https://lkml.kernel.org/r/20211202204422.26777-9-joao.m.martins@oracle.com Signed-off-by: Joao Martins Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 630de5a795b0..9c87927d4bc2 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -73,6 +73,27 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, return -1; } +static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn, + unsigned long fault_size) +{ + unsigned long i, nr_pages = fault_size / PAGE_SIZE; + struct file *filp = vmf->vma->vm_file; + pgoff_t pgoff; + + pgoff = linear_page_index(vmf->vma, + ALIGN(vmf->address, fault_size)); + + for (i = 0; i < nr_pages; i++) { + struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i); + + if (page->mapping) + continue; + + page->mapping = filp->f_mapping; + page->index = pgoff + i; + } +} + static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf, pfn_t *pfn) { @@ -224,28 +245,8 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, rc = VM_FAULT_SIGBUS; } - if (rc == VM_FAULT_NOPAGE) { - unsigned long i; - pgoff_t pgoff; - - /* - * In the device-dax case the only possibility for a - * VM_FAULT_NOPAGE result is when device-dax capacity is - * mapped. No need to consider the zero page, or racing - * conflicting mappings. - */ - pgoff = linear_page_index(vmf->vma, - ALIGN(vmf->address, fault_size)); - for (i = 0; i < fault_size / PAGE_SIZE; i++) { - struct page *page; - - page = pfn_to_page(pfn_t_to_pfn(pfn) + i); - if (page->mapping) - continue; - page->mapping = filp->f_mapping; - page->index = pgoff + i; - } - } + if (rc == VM_FAULT_NOPAGE) + dax_set_mapping(vmf, pfn, fault_size); dax_read_unlock(id); return rc; -- cgit From 0e7325f03f09802d1667b8860e10fe39c25bf14c Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:40 -0800 Subject: device-dax: set mapping prior to vmf_insert_pfn{,_pmd,pud}() Normally, the @page mapping is set prior to inserting the page into a page table entry. Make device-dax adhere to the same ordering, rather than setting mapping after the PTE is inserted. The address_space never changes and it is always associated with the same inode and underlying pages. So, the page mapping is set once but cleared when the struct pages are removed/freed (i.e. after {devm_}memunmap_pages()). 
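Condensed to its essentials, the fault-path ordering this establishes looks like the sketch below; it is not a complete handler, the wrapper name is invented, and the helpers are the driver's own:

static vm_fault_t demo_pmd_fault(struct vm_fault *vmf, phys_addr_t phys)
{
	pfn_t pfn = phys_to_pfn_t(phys, PFN_DEV | PFN_MAP);

	/* 1) associate the pages with the inode's address_space ... */
	dax_set_mapping(vmf, pfn, PMD_SIZE);

	/* 2) ... and only then install the page table entry */
	return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
}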
Link: https://lkml.kernel.org/r/20211202204422.26777-10-joao.m.martins@oracle.com Suggested-by: Jason Gunthorpe Signed-off-by: Joao Martins Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 9c87927d4bc2..19a6b86486ce 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -121,6 +121,8 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + dax_set_mapping(vmf, *pfn, fault_size); + return vmf_insert_mixed(vmf->vma, vmf->address, *pfn); } @@ -161,6 +163,8 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + dax_set_mapping(vmf, *pfn, fault_size); + return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); } @@ -203,6 +207,8 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + dax_set_mapping(vmf, *pfn, fault_size); + return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); } #else @@ -217,7 +223,6 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size) { struct file *filp = vmf->vma->vm_file; - unsigned long fault_size; vm_fault_t rc = VM_FAULT_SIGBUS; int id; pfn_t pfn; @@ -230,23 +235,18 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, id = dax_read_lock(); switch (pe_size) { case PE_SIZE_PTE: - fault_size = PAGE_SIZE; rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn); break; case PE_SIZE_PMD: - fault_size = PMD_SIZE; rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn); break; case PE_SIZE_PUD: - fault_size = PUD_SIZE; rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn); break; default: rc = VM_FAULT_SIGBUS; } - if (rc == VM_FAULT_NOPAGE) - dax_set_mapping(vmf, pfn, fault_size); dax_read_unlock(id); return rc; -- cgit From 6ec228b6fef5ad3a1f19e76c29640a9161415240 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:43 -0800 Subject: device-dax: remove pfn from __dev_dax_{pte,pmd,pud}_fault() After moving the page mapping to be set prior to pte insertion, the pfn in dev_dax_huge_fault() no longer is necessary. Remove it, as well as the @pfn argument passed to the internal fault handler helpers. 
[akpm@linux-foundation.org: fix CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=n build] Link: https://lkml.kernel.org/r/20211202204422.26777-11-joao.m.martins@oracle.com Signed-off-by: Joao Martins Suggested-by: Christoph Hellwig Cc: Dan Williams Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 19a6b86486ce..60d43cb195da 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -95,10 +95,11 @@ static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn, } static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { struct device *dev = &dev_dax->dev; phys_addr_t phys; + pfn_t pfn; unsigned int fault_size = PAGE_SIZE; if (check_vma(dev_dax, vmf->vma, __func__)) @@ -119,20 +120,21 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax, return VM_FAULT_SIGBUS; } - *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); - dax_set_mapping(vmf, *pfn, fault_size); + dax_set_mapping(vmf, pfn, fault_size); - return vmf_insert_mixed(vmf->vma, vmf->address, *pfn); + return vmf_insert_mixed(vmf->vma, vmf->address, pfn); } static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { unsigned long pmd_addr = vmf->address & PMD_MASK; struct device *dev = &dev_dax->dev; phys_addr_t phys; pgoff_t pgoff; + pfn_t pfn; unsigned int fault_size = PMD_SIZE; if (check_vma(dev_dax, vmf->vma, __func__)) @@ -161,21 +163,22 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, return VM_FAULT_SIGBUS; } - *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); - dax_set_mapping(vmf, *pfn, fault_size); + dax_set_mapping(vmf, pfn, fault_size); - return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); + return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { unsigned long pud_addr = vmf->address & PUD_MASK; struct device *dev = &dev_dax->dev; phys_addr_t phys; pgoff_t pgoff; + pfn_t pfn; unsigned int fault_size = PUD_SIZE; @@ -205,15 +208,15 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, return VM_FAULT_SIGBUS; } - *pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); + pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP); - dax_set_mapping(vmf, *pfn, fault_size); + dax_set_mapping(vmf, pfn, fault_size); - return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); + return vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); } #else static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, - struct vm_fault *vmf, pfn_t *pfn) + struct vm_fault *vmf) { return VM_FAULT_FALLBACK; } @@ -225,7 +228,6 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, struct file *filp = vmf->vma->vm_file; vm_fault_t rc = VM_FAULT_SIGBUS; int id; - pfn_t pfn; struct dev_dax *dev_dax = filp->private_data; dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm, @@ -235,13 +237,13 @@ static 
vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, id = dax_read_lock(); switch (pe_size) { case PE_SIZE_PTE: - rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn); + rc = __dev_dax_pte_fault(dev_dax, vmf); break; case PE_SIZE_PMD: - rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn); + rc = __dev_dax_pmd_fault(dev_dax, vmf); break; case PE_SIZE_PUD: - rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn); + rc = __dev_dax_pud_fault(dev_dax, vmf); break; default: rc = VM_FAULT_SIGBUS; -- cgit From 14606001efb48a17be31a5bec626c13ca49d783a Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:47 -0800 Subject: device-dax: compound devmap support Use the newly added compound devmap facility which maps the assigned dax ranges as compound pages at a page size of @align. dax devices are created with a fixed @align (huge page size) which is enforced through as well at mmap() of the device. Faults, consequently happen too at the specified @align specified at the creation, and those don't change throughout dax device lifetime. MCEs unmap a whole dax huge page, as well as splits occurring at the configured page size. Performance measured by gup_test improves considerably for unpin_user_pages() and altmap with NVDIMMs: $ gup_test -f /dev/dax1.0 -m 16384 -r 10 -S -a -n 512 -w (pin_user_pages_fast 2M pages) put:~71 ms -> put:~22 ms [altmap] (pin_user_pages_fast 2M pages) get:~524ms put:~525 ms -> get: ~127ms put:~71ms $ gup_test -f /dev/dax1.0 -m 129022 -r 10 -S -a -n 512 -w (pin_user_pages_fast 2M pages) put:~513 ms -> put:~188 ms [altmap with -m 127004] (pin_user_pages_fast 2M pages) get:~4.1 secs put:~4.12 secs -> get:~1sec put:~563ms .. as well as unpin_user_page_range_dirty_lock() being just as effective as THP/hugetlb[0] pages. [0] https://lore.kernel.org/linux-mm/20210212130843.13865-5-joao.m.martins@oracle.com/ Link: https://lkml.kernel.org/r/20211202204422.26777-12-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Dan Williams Cc: Christoph Hellwig Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 60d43cb195da..591f293d326f 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -78,14 +78,20 @@ static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn, { unsigned long i, nr_pages = fault_size / PAGE_SIZE; struct file *filp = vmf->vma->vm_file; + struct dev_dax *dev_dax = filp->private_data; pgoff_t pgoff; + /* mapping is only set on the head */ + if (dev_dax->pgmap->vmemmap_shift) + nr_pages = 1; + pgoff = linear_page_index(vmf->vma, ALIGN(vmf->address, fault_size)); for (i = 0; i < nr_pages; i++) { struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i); + page = compound_head(page); if (page->mapping) continue; @@ -443,6 +449,9 @@ int dev_dax_probe(struct dev_dax *dev_dax) } pgmap->type = MEMORY_DEVICE_GENERIC; + if (dev_dax->align > PAGE_SIZE) + pgmap->vmemmap_shift = + order_base_2(dev_dax->align >> PAGE_SHIFT); addr = devm_memremap_pages(dev, pgmap); if (IS_ERR(addr)) return PTR_ERR(addr); -- cgit From e5f4728767d2ec9e3eb122c74e224242d21ee650 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 14 Jan 2022 14:04:51 -0800 Subject: kasan: test: add globals left-out-of-bounds test Add a test checking that KASAN generic can 
also detect out-of-bounds accesses to the left of globals. Unfortunately it seems that GCC doesn't catch this (tested GCC 10, 11). The main difference between GCC's globals redzoning and Clang's is that GCC relies on using increased alignment to producing padding, where Clang's redzoning implementation actually adds real data after the global and doesn't rely on alignment to produce padding. I believe this is the main reason why GCC can't reliably catch globals out-of-bounds in this case. Given this is now a known issue, to avoid failing the whole test suite, skip this test case with GCC. Link: https://lkml.kernel.org/r/20211117130714.135656-1-elver@google.com Signed-off-by: Marco Elver Reported-by: Kaiwan N Billimoria Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Kaiwan N Billimoria Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 0643573f8686..818e763b5b87 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -700,7 +700,7 @@ static void kmem_cache_bulk(struct kunit *test) static char global_array[10]; -static void kasan_global_oob(struct kunit *test) +static void kasan_global_oob_right(struct kunit *test) { /* * Deliberate out-of-bounds access. To prevent CONFIG_UBSAN_LOCAL_BOUNDS @@ -723,6 +723,20 @@ static void kasan_global_oob(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); } +static void kasan_global_oob_left(struct kunit *test) +{ + char *volatile array = global_array; + char *p = array - 3; + + /* + * GCC is known to fail this test, skip it. + * See https://bugzilla.kernel.org/show_bug.cgi?id=215051. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_CC_IS_CLANG); + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); +} + /* Check that ksize() makes the whole object accessible. */ static void ksize_unpoisons_memory(struct kunit *test) { @@ -1162,7 +1176,8 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kmem_cache_oob), KUNIT_CASE(kmem_cache_accounted), KUNIT_CASE(kmem_cache_bulk), - KUNIT_CASE(kasan_global_oob), + KUNIT_CASE(kasan_global_oob_right), + KUNIT_CASE(kasan_global_oob_left), KUNIT_CASE(kasan_stack_oob), KUNIT_CASE(kasan_alloca_oob_left), KUNIT_CASE(kasan_alloca_oob_right), -- cgit From bed0a9b591492bb285ea88cd221e0412031396ca Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 14 Jan 2022 14:04:54 -0800 Subject: kasan: add ability to detect double-kmem_cache_destroy() Because mm/slab_common.c is not instrumented with software KASAN modes, it is not possible to detect use-after-free of the kmem_cache passed into kmem_cache_destroy(). In particular, because of the s->refcount-- and subsequent early return if non-zero, KASAN would never be able to see the double-free via kmem_cache_free(kmem_cache, s). To be able to detect a double-kmem_cache_destroy(), check accessibility of the kmem_cache, and in case of failure return early. While KASAN_HW_TAGS is able to detect such bugs, by checking accessibility and returning early we fail more gracefully and also avoid corrupting reused objects (where tags mismatch). A recent case of a double-kmem_cache_destroy() was detected by KFENCE: https://lkml.kernel.org/r/0000000000003f654905c168b09d@google.com, which was not detectable by software KASAN modes. 
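In sketch form, the sequence this turns into a graceful KASAN report rather than silent corruption mirrors the test case added in the next patch:

#include <linux/slab.h>

static void demo_double_destroy(void)
{
	struct kmem_cache *cache = kmem_cache_create("demo", 200, 0, 0, NULL);

	if (!cache)
		return;

	kmem_cache_destroy(cache);
	/*
	 * Buggy second call: @cache was freed above. With this change the
	 * kasan_check_byte() probe fails, a report is printed and
	 * kmem_cache_destroy() returns early instead of dereferencing
	 * freed (and possibly reused) memory.
	 */
	kmem_cache_destroy(cache);
}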
Link: https://lkml.kernel.org/r/20211119142219.1519617-1-elver@google.com Signed-off-by: Marco Elver Acked-by: Vlastimil Babka Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Joonsoo Kim Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index b7c431819cdb..f02c32bd05ab 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -489,7 +489,7 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - if (unlikely(!s)) + if (unlikely(!s) || !kasan_check_byte(s)) return; cpus_read_lock(); -- cgit From f98f966cd75002a71caec1b6d209da5762c0efac Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 14 Jan 2022 14:04:57 -0800 Subject: kasan: test: add test case for double-kmem_cache_destroy() Add a test case for double-kmem_cache_destroy() detection. Link: https://lkml.kernel.org/r/20211119142219.1519617-2-elver@google.com Signed-off-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 818e763b5b87..847cdbefab46 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -866,6 +866,16 @@ static void kmem_cache_invalid_free(struct kunit *test) kmem_cache_destroy(cache); } +static void kmem_cache_double_destroy(struct kunit *test) +{ + struct kmem_cache *cache; + + cache = kmem_cache_create("test_cache", 200, 0, 0, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + kmem_cache_destroy(cache); + KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache)); +} + static void kasan_memchr(struct kunit *test) { char *ptr; @@ -1185,6 +1195,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(ksize_uaf), KUNIT_CASE(kmem_cache_double_free), KUNIT_CASE(kmem_cache_invalid_free), + KUNIT_CASE(kmem_cache_double_destroy), KUNIT_CASE(kasan_memchr), KUNIT_CASE(kasan_memcmp), KUNIT_CASE(kasan_strings), -- cgit From 26dca996ea7b1ac7008b6b6063fc88b849e3ac3e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 14 Jan 2022 14:05:01 -0800 Subject: kasan: fix quarantine conflicting with init_on_free KASAN's quarantine might save its metadata inside freed objects. As this happens after the memory is zeroed by the slab allocator when init_on_free is enabled, the memory coming out of quarantine is not properly zeroed. This causes lib/test_meminit.c tests to fail with Generic KASAN. Zero the metadata when the object is removed from quarantine. 
Link: https://lkml.kernel.org/r/2805da5df4b57138fdacd671f5d227d58950ba54.1640037083.git.andreyknvl@google.com Fixes: 6471384af2a6 ("mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options") Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/quarantine.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index d8ccff4c1275..47ed4fc33a29 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -132,11 +132,22 @@ static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache) static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) { void *object = qlink_to_object(qlink, cache); + struct kasan_free_meta *meta = kasan_get_free_meta(cache, object); unsigned long flags; if (IS_ENABLED(CONFIG_SLAB)) local_irq_save(flags); + /* + * If init_on_free is enabled and KASAN's free metadata is stored in + * the object, zero the metadata. Otherwise, the object's memory will + * not be properly zeroed, as KASAN saves the metadata after the slab + * allocator zeroes the object. + */ + if (slab_want_init_on_free(cache) && + cache->kasan_info.free_meta_offset == 0) + memzero_explicit(meta, sizeof(*meta)); + /* * As the object now gets freed from the quarantine, assume that its * free track is no longer valid. -- cgit From 3e9d80a891df3b1a5d77db47fa7fdf33ba71e5cb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Jan 2022 14:05:04 -0800 Subject: mm,fs: split dump_mapping() out from dump_page() dump_mapping() is a big chunk of dump_page(), and it'd be handy to be able to call it when we don't have a struct page. Split it out and move it to fs/inode.c. Take the opportunity to simplify some of the debug messages a little. Link: https://lkml.kernel.org/r/20211121121056.2870061-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Acked-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 1 + mm/debug.c | 52 ++-------------------------------------------------- 3 files changed, 52 insertions(+), 50 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 6b80a51129d5..980e7b7a5460 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -526,6 +526,55 @@ void __remove_inode_hash(struct inode *inode) } EXPORT_SYMBOL(__remove_inode_hash); +void dump_mapping(const struct address_space *mapping) +{ + struct inode *host; + const struct address_space_operations *a_ops; + struct hlist_node *dentry_first; + struct dentry *dentry_ptr; + struct dentry dentry; + unsigned long ino; + + /* + * If mapping is an invalid pointer, we don't want to crash + * accessing it, so probe everything depending on it carefully. 
+ */ + if (get_kernel_nofault(host, &mapping->host) || + get_kernel_nofault(a_ops, &mapping->a_ops)) { + pr_warn("invalid mapping:%px\n", mapping); + return; + } + + if (!host) { + pr_warn("aops:%ps\n", a_ops); + return; + } + + if (get_kernel_nofault(dentry_first, &host->i_dentry.first) || + get_kernel_nofault(ino, &host->i_ino)) { + pr_warn("aops:%ps invalid inode:%px\n", a_ops, host); + return; + } + + if (!dentry_first) { + pr_warn("aops:%ps ino:%lx\n", a_ops, ino); + return; + } + + dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); + if (get_kernel_nofault(dentry, dentry_ptr)) { + pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", + a_ops, ino, dentry_ptr); + return; + } + + /* + * if dentry is corrupted, the %pd handler may still crash, + * but it's unlikely that we reach here with a corrupt mapping + */ + pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", a_ops, ino, &dentry); +} + void clear_inode(struct inode *inode) { /* diff --git a/include/linux/fs.h b/include/linux/fs.h index bbf812ce89a8..5315fa68f751 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3152,6 +3152,7 @@ extern void unlock_new_inode(struct inode *); extern void discard_new_inode(struct inode *); extern unsigned int get_next_ino(void); extern void evict_inodes(struct super_block *sb); +void dump_mapping(const struct address_space *); /* * Userspace may rely on the the inode number being non-zero. For example, glibc diff --git a/mm/debug.c b/mm/debug.c index a05a39ff8fe4..bc9ac87f0e08 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -112,56 +112,8 @@ static void __dump_page(struct page *page) type = "ksm "; else if (PageAnon(page)) type = "anon "; - else if (mapping) { - struct inode *host; - const struct address_space_operations *a_ops; - struct hlist_node *dentry_first; - struct dentry *dentry_ptr; - struct dentry dentry; - unsigned long ino; - - /* - * mapping can be invalid pointer and we don't want to crash - * accessing it, so probe everything depending on it carefully - */ - if (get_kernel_nofault(host, &mapping->host) || - get_kernel_nofault(a_ops, &mapping->a_ops)) { - pr_warn("failed to read mapping contents, not a valid kernel address?\n"); - goto out_mapping; - } - - if (!host) { - pr_warn("aops:%ps\n", a_ops); - goto out_mapping; - } - - if (get_kernel_nofault(dentry_first, &host->i_dentry.first) || - get_kernel_nofault(ino, &host->i_ino)) { - pr_warn("aops:%ps with invalid host inode %px\n", - a_ops, host); - goto out_mapping; - } - - if (!dentry_first) { - pr_warn("aops:%ps ino:%lx\n", a_ops, ino); - goto out_mapping; - } - - dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); - if (get_kernel_nofault(dentry, dentry_ptr)) { - pr_warn("aops:%ps ino:%lx with invalid dentry %px\n", - a_ops, ino, dentry_ptr); - } else { - /* - * if dentry is corrupted, the %pd handler may still - * crash, but it's unlikely that we reach here with a - * corrupted struct page - */ - pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", - a_ops, ino, &dentry); - } - } -out_mapping: + else if (mapping) + dump_mapping(mapping); BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); pr_warn("%sflags: %pGp%s\n", type, &head->flags, -- cgit From 236476180c0f5d308fb313d5570d0b067307884c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 14 Jan 2022 14:05:07 -0800 Subject: mm/debug_vm_pgtable: update comments regarding migration swap entries Commit 4dd845b5a3e5 ("mm/swapops: rework swap entry manipulation code") had changed migtation entry related helpers. 
Just update debug_vm_pgatble() synced documentation to reflect those changes. Link: https://lkml.kernel.org/r/1641880417-24848-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/arch_pgtable_helpers.rst | 14 +++++++------- mm/debug_vm_pgtable.c | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Documentation/vm/arch_pgtable_helpers.rst b/Documentation/vm/arch_pgtable_helpers.rst index 552567d863b8..b3166c33db39 100644 --- a/Documentation/vm/arch_pgtable_helpers.rst +++ b/Documentation/vm/arch_pgtable_helpers.rst @@ -247,12 +247,12 @@ SWAP Page Table Helpers | __swp_to_pmd_entry | Creates a mapped PMD from a swapped entry (arch) | +---------------------------+--------------------------------------------------+ | is_migration_entry | Tests a migration (read or write) swapped entry | -+---------------------------+--------------------------------------------------+ -| is_write_migration_entry | Tests a write migration swapped entry | -+---------------------------+--------------------------------------------------+ -| make_migration_entry_read | Converts into read migration swapped entry | -+---------------------------+--------------------------------------------------+ -| make_migration_entry | Creates a migration swapped entry (read or write)| -+---------------------------+--------------------------------------------------+ ++-------------------------------+----------------------------------------------+ +| is_writable_migration_entry | Tests a write migration swapped entry | ++-------------------------------+----------------------------------------------+ +| make_readable_migration_entry | Creates a read migration swapped entry | ++-------------------------------+----------------------------------------------+ +| make_writable_migration_entry | Creates a write migration swapped entry | ++-------------------------------+----------------------------------------------+ [1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/ diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 228e3954b90c..2a2b24e87877 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -888,8 +888,8 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args) pr_debug("Validating swap migration\n"); /* - * make_migration_entry() expects given page to be - * locked, otherwise it stumbles upon a BUG_ON(). + * make_[readable|writable]_migration_entry() expects given page to + * be locked, otherwise it stumbles upon a BUG_ON(). */ __SetPageLocked(page); swp = make_writable_migration_entry(page_to_pfn(page)); -- cgit From 43b93121056c524e2af77d561900ea856d32029c Mon Sep 17 00:00:00 2001 From: chiminghao Date: Fri, 14 Jan 2022 14:05:10 -0800 Subject: mm/truncate.c: remove unneeded variable Return value directly instead of taking this in another redundant variable. 
Link: https://lkml.kernel.org/r/20211207083222.401594-1-chi.minghao@zte.com.cn Signed-off-by: chiminghao Reported-by: Zeal Robot Reviewed-by: David Hildenbrand Reviewed-by: Pankaj Gupta Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index cc83a3f7c1ad..41b8249b3b4a 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -205,7 +205,6 @@ static void truncate_cleanup_page(struct page *page) static int invalidate_complete_page(struct address_space *mapping, struct page *page) { - int ret; if (page->mapping != mapping) return 0; @@ -213,9 +212,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, 0)) return 0; - ret = remove_mapping(mapping, page); - - return ret; + return remove_mapping(mapping, page); } int truncate_inode_page(struct address_space *mapping, struct page *page) -- cgit From 677b2a8c1f25db5b09c1ef5bf72faa39ea81d9cf Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 14 Jan 2022 14:05:13 -0800 Subject: gup: avoid multiple user access locking/unlocking in fault_in_{read/write}able fault_in_readable() and fault_in_writeable() perform __get_user() and __put_user() in a loop, implying multiple user access locking/unlocking. To avoid that, use user access blocks. Link: https://lkml.kernel.org/r/720dcf79314acca1a78fae56d478cc851952149d.1637084492.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Reviewed-by: Andreas Gruenbacher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 2c51e9748a6a..be2a41feec7d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1672,21 +1672,22 @@ size_t fault_in_writeable(char __user *uaddr, size_t size) if (unlikely(size == 0)) return 0; + if (!user_write_access_begin(uaddr, size)) + return size; if (!PAGE_ALIGNED(uaddr)) { - if (unlikely(__put_user(0, uaddr) != 0)) - return size; + unsafe_put_user(0, uaddr, out); uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr); } end = (char __user *)PAGE_ALIGN((unsigned long)start + size); if (unlikely(end < start)) end = NULL; while (uaddr != end) { - if (unlikely(__put_user(0, uaddr) != 0)) - goto out; + unsafe_put_user(0, uaddr, out); uaddr += PAGE_SIZE; } out: + user_write_access_end(); if (size > uaddr - start) return size - (uaddr - start); return 0; @@ -1771,21 +1772,22 @@ size_t fault_in_readable(const char __user *uaddr, size_t size) if (unlikely(size == 0)) return 0; + if (!user_read_access_begin(uaddr, size)) + return size; if (!PAGE_ALIGNED(uaddr)) { - if (unlikely(__get_user(c, uaddr) != 0)) - return size; + unsafe_get_user(c, uaddr, out); uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr); } end = (const char __user *)PAGE_ALIGN((unsigned long)start + size); if (unlikely(end < start)) end = NULL; while (uaddr != end) { - if (unlikely(__get_user(c, uaddr) != 0)) - goto out; + unsafe_get_user(c, uaddr, out); uaddr += PAGE_SIZE; } out: + user_read_access_end(); (void)c; if (size > uaddr - start) return size - (uaddr - start); -- cgit From 28b0ee3fb35047bd2bac57cc5a051b26bbd9b194 Mon Sep 17 00:00:00 2001 From: Li Xinhai Date: Fri, 14 Jan 2022 14:05:16 -0800 Subject: mm/gup.c: stricter check on THP migration entry during follow_pmd_mask When BUG_ON check for THP migration entry, the existing code only check 
thp_migration_supported case, but not for !thp_migration_supported case. If !thp_migration_supported() and !pmd_present(), the original code may dead loop in theory. To make the BUG_ON check consistent, we need catch both cases. Move the BUG_ON check one step earlier, because if the bug happen we should know it instead of depend on FOLL_MIGRATION been used by caller. Because pmdval instead of *pmd is read by the is_pmd_migration_entry() check, the existing code don't help to avoid useless locking within pmd_migration_entry_wait(), so remove that check. Link: https://lkml.kernel.org/r/20211217062559.737063-1-lixinhai.lxh@gmail.com Signed-off-by: Li Xinhai Reviewed-by: "Huang, Ying" Reviewed-by: Miaohe Lin Cc: Zi Yan Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index be2a41feec7d..f0af462ac1e2 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -642,12 +642,17 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, } retry: if (!pmd_present(pmdval)) { + /* + * Should never reach here, if thp migration is not supported; + * Otherwise, it must be a thp migration entry. + */ + VM_BUG_ON(!thp_migration_supported() || + !is_pmd_migration_entry(pmdval)); + if (likely(!(flags & FOLL_MIGRATION))) return no_page_table(vma, flags); - VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(pmdval)); - if (is_pmd_migration_entry(pmdval)) - pmd_migration_entry_wait(mm, pmd); + + pmd_migration_entry_wait(mm, pmd); pmdval = READ_ONCE(*pmd); /* * MADV_DONTNEED may convert the pmd to null because -- cgit From a7605426666196c5a460dd3de6f8dac1d3c21f00 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 14 Jan 2022 14:05:19 -0800 Subject: mm: shmem: don't truncate page if memory failure happens The current behavior of memory failure is to truncate the page cache regardless of dirty or clean. If the page is dirty the later access will get the obsolete data from disk without any notification to the users. This may cause silent data loss. It is even worse for shmem since shmem is in-memory filesystem, truncating page cache means discarding data blocks. The later read would return all zero. The right approach is to keep the corrupted page in page cache, any later access would return error for syscalls or SIGBUS for page fault, until the file is truncated, hole punched or removed. The regular storage backed filesystems would be more complicated so this patch is focused on shmem. This also unblock the support for soft offlining shmem THP. [akpm@linux-foundation.org: coding style fixes] [arnd@arndb.de: fix uninitialized variable use in me_pagecache_clean()] Link: https://lkml.kernel.org/r/20211022064748.4173718-1-arnd@kernel.org [Fix invalid pointer dereference in shmem_read_mapping_page_gfp() with a slight different implementation from what Ajay Garg and Muchun Song proposed and reworked the error handling of shmem_write_begin() suggested by Linus] Link: https://lore.kernel.org/linux-mm/20211111084617.6746-1-ajaygargnsit@gmail.com/ Link: https://lkml.kernel.org/r/20211020210755.23964-6-shy828301@gmail.com Link: https://lkml.kernel.org/r/20211116193247.21102-1-shy828301@gmail.com Signed-off-by: Yang Shi Signed-off-by: Arnd Bergmann Cc: Hugh Dickins Cc: Kirill A. 
Shutemov Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Peter Xu Cc: Ajay Garg Cc: Muchun Song Cc: Andy Lavr Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 14 +++++++++++--- mm/shmem.c | 51 +++++++++++++++++++++++++++++++++++++++++++++------ mm/userfaultfd.c | 5 +++++ 3 files changed, 61 insertions(+), 9 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3a274468f193..5f8ad5527506 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -58,6 +58,7 @@ #include #include #include +#include #include "internal.h" #include "ras/ras_event.h" @@ -867,6 +868,7 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) { int ret; struct address_space *mapping; + bool extra_pins; delete_from_lru_cache(p); @@ -895,18 +897,24 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) goto out; } + /* + * The shmem page is kept in page cache instead of truncating + * so is expected to have an extra refcount after error-handling. + */ + extra_pins = shmem_mapping(mapping); + /* * Truncation is a bit tricky. Enable it per file system for now. * * Open: to take i_rwsem or not for this? Right now we don't. */ ret = truncate_error_page(p, page_to_pfn(p), mapping); + if (has_extra_refcount(ps, p, extra_pins)) + ret = MF_FAILED; + out: unlock_page(p); - if (has_extra_refcount(ps, p, false)) - ret = MF_FAILED; - return ret; } diff --git a/mm/shmem.c b/mm/shmem.c index 18f93c2d68f1..fb5152369926 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2457,6 +2457,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; + int ret = 0; /* i_rwsem is held by caller */ if (unlikely(info->seals & (F_SEAL_GROW | @@ -2467,7 +2468,19 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - return shmem_getpage(inode, index, pagep, SGP_WRITE); + ret = shmem_getpage(inode, index, pagep, SGP_WRITE); + + if (ret) + return ret; + + if (PageHWPoison(*pagep)) { + unlock_page(*pagep); + put_page(*pagep); + *pagep = NULL; + return -EIO; + } + + return 0; } static int @@ -2554,6 +2567,12 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (sgp == SGP_CACHE) set_page_dirty(page); unlock_page(page); + + if (PageHWPoison(page)) { + put_page(page); + error = -EIO; + break; + } } /* @@ -3093,7 +3112,8 @@ static const char *shmem_get_link(struct dentry *dentry, page = find_get_page(inode->i_mapping, 0); if (!page) return ERR_PTR(-ECHILD); - if (!PageUptodate(page)) { + if (PageHWPoison(page) || + !PageUptodate(page)) { put_page(page); return ERR_PTR(-ECHILD); } @@ -3101,6 +3121,13 @@ static const char *shmem_get_link(struct dentry *dentry, error = shmem_getpage(inode, 0, &page, SGP_READ); if (error) return ERR_PTR(error); + if (!page) + return ERR_PTR(-ECHILD); + if (PageHWPoison(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-ECHILD); + } unlock_page(page); } set_delayed_call(done, shmem_put_link, page); @@ -3751,6 +3778,13 @@ static void shmem_destroy_inodecache(void) kmem_cache_destroy(shmem_inode_cachep); } +/* Keep the page in page cache instead of truncating it */ +static int shmem_error_remove_page(struct address_space *mapping, + struct page *page) +{ + return 0; +} + const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_no_writeback, @@ -3761,7 +3795,7 @@ 
const struct address_space_operations shmem_aops = { #ifdef CONFIG_MIGRATION .migratepage = migrate_page, #endif - .error_remove_page = generic_error_remove_page, + .error_remove_page = shmem_error_remove_page, }; EXPORT_SYMBOL(shmem_aops); @@ -4169,9 +4203,14 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL, NULL, NULL); if (error) - page = ERR_PTR(error); - else - unlock_page(page); + return ERR_PTR(error); + + unlock_page(page); + if (PageHWPoison(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + return page; #else /* diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ac6f036298cd..0780c2a57ff1 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -232,6 +232,11 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm, goto out; } + if (PageHWPoison(page)) { + ret = -EIO; + goto out_release; + } + ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, page, false, wp_copy); if (ret) -- cgit From 62c9827cbb996c2c04f615ecd783ce28bcea894b Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 14 Jan 2022 14:05:23 -0800 Subject: shmem: fix a race between shmem_unused_huge_shrink and shmem_evict_inode Fix a data race in commit 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure"). Here are call traces causing race: Call Trace 1: shmem_unused_huge_shrink+0x3ae/0x410 ? __list_lru_walk_one.isra.5+0x33/0x160 super_cache_scan+0x17c/0x190 shrink_slab.part.55+0x1ef/0x3f0 shrink_node+0x10e/0x330 kswapd+0x380/0x740 kthread+0xfc/0x130 ? mem_cgroup_shrink_node+0x170/0x170 ? kthread_create_on_node+0x70/0x70 ret_from_fork+0x1f/0x30 Call Trace 2: shmem_evict_inode+0xd8/0x190 evict+0xbe/0x1c0 do_unlinkat+0x137/0x330 do_syscall_64+0x76/0x120 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 A simple explanation: Image there are 3 items in the local list (@list). In the first traversal, A is not deleted from @list. 1) A->B->C ^ | pos (leave) In the second traversal, B is deleted from @list. Concurrently, A is deleted from @list through shmem_evict_inode() since last reference counter of inode is dropped by other thread. Then the @list is corrupted. 2) A->B->C ^ ^ | | evict pos (drop) We should make sure the inode is either on the global list or deleted from any local list before iput(). Fixed by moving inodes back to global list before we put them. [akpm@linux-foundation.org: coding style fixes] Link: https://lkml.kernel.org/r/20211125064502.99983-1-ligang.bdlg@bytedance.com Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure") Signed-off-by: Gang Li Reviewed-by: Muchun Song Acked-by: Kirill A. Shutemov Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index fb5152369926..8f940552c182 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -554,7 +554,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shmem_inode_info *info; struct page *page; unsigned long batch = sc ? 
sc->nr_to_scan : 128; - int removed = 0, split = 0; + int split = 0; if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; @@ -569,7 +569,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, /* inode is about to be evicted */ if (!inode) { list_del_init(&info->shrinklist); - removed++; goto next; } @@ -577,12 +576,12 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, if (round_up(inode->i_size, PAGE_SIZE) == round_up(inode->i_size, HPAGE_PMD_SIZE)) { list_move(&info->shrinklist, &to_remove); - removed++; goto next; } list_move(&info->shrinklist, &list); next: + sbinfo->shrinklist_len--; if (!--batch) break; } @@ -602,7 +601,7 @@ next: inode = &info->vfs_inode; if (nr_to_split && split >= nr_to_split) - goto leave; + goto move_back; page = find_get_page(inode->i_mapping, (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT); @@ -616,38 +615,44 @@ next: } /* - * Leave the inode on the list if we failed to lock - * the page at this time. + * Move the inode on the list back to shrinklist if we failed + * to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path. */ if (!trylock_page(page)) { put_page(page); - goto leave; + goto move_back; } ret = split_huge_page(page); unlock_page(page); put_page(page); - /* If split failed leave the inode on the list */ + /* If split failed move the inode on the list back to shrinklist */ if (ret) - goto leave; + goto move_back; split++; drop: list_del_init(&info->shrinklist); - removed++; -leave: + goto put; +move_back: + /* + * Make sure the inode is either on the global list or deleted + * from any local list before iput() since it could be deleted + * in another thread once we put the inode (then the local list + * is corrupted). + */ + spin_lock(&sbinfo->shrinklist_lock); + list_move(&info->shrinklist, &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + spin_unlock(&sbinfo->shrinklist_lock); +put: iput(inode); } - spin_lock(&sbinfo->shrinklist_lock); - list_splice_tail(&list, &sbinfo->shrinklist); - sbinfo->shrinklist_len -= removed; - spin_unlock(&sbinfo->shrinklist_lock); - return split; } -- cgit From 3795f46b83c66a2e4545460dec74c80b839faafe Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 14 Jan 2022 14:05:26 -0800 Subject: mm/frontswap.c: use non-atomic '__set_bit()' when possible The 'a' and 'b' bitmaps are local to this function, so no concurrent access can occur. So the non-atomic '__set_bit()' can be used to save a few cycles. 
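For illustration only (not part of the patch): whether set_bit() or __set_bit() is required comes down to whether the read-modify-write of the word holding the bit must be atomic. Below is a minimal userspace sketch under that assumption; the helper names are made up and only mimic the semantics of the kernel primitives.

/* Illustrative sketch, not the kernel implementation. */
#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* Non-atomic flavour: plain read-modify-write.  Safe when the bitmap is
 * only touched by the current context, as with the local 'a'/'b' bitmaps
 * in frontswap_register_ops().
 */
static void sketch__set_bit(unsigned long nr, unsigned long *addr)
{
	addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

/* Atomic flavour: locked read-modify-write, needed only when other CPUs
 * may update the same word concurrently.
 */
static void sketch_set_bit(unsigned long nr, unsigned long *addr)
{
	__atomic_fetch_or(&addr[nr / BITS_PER_LONG],
			  1UL << (nr % BITS_PER_LONG), __ATOMIC_RELAXED);
}

int main(void)
{
	unsigned long map[4] = { 0 };	/* function-local bitmap, one writer */

	sketch__set_bit(3, map);	/* plain "or": sufficient here */
	sketch_set_bit(33, map);	/* atomic rmw: overkill for a local map */
	printf("map[0]=%lx map[1]=%lx\n", map[0], map[1]);
	return 0;
}

On x86 the atomic variant compiles to a lock-prefixed instruction; avoiding it for these two function-local bitmaps is the "few cycles" the changelog refers to.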
Link: https://lkml.kernel.org/r/e52476da5cee57151745c5c3c934a69798dc6fa4.1638132190.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/frontswap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/frontswap.c b/mm/frontswap.c index 130e301c5ac0..6bed12260dea 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -127,7 +127,7 @@ void frontswap_register_ops(struct frontswap_ops *ops) spin_lock(&swap_lock); plist_for_each_entry(si, &swap_active_head, list) { if (!WARN_ON(!si->frontswap_map)) - set_bit(si->type, a); + __set_bit(si->type, a); } spin_unlock(&swap_lock); @@ -149,7 +149,7 @@ void frontswap_register_ops(struct frontswap_ops *ops) spin_lock(&swap_lock); plist_for_each_entry(si, &swap_active_head, list) { if (si->frontswap_map) - set_bit(si->type, b); + __set_bit(si->type, b); } spin_unlock(&swap_lock); -- cgit From 17c17367758059930246dde937cc7da9b8f3549e Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 14 Jan 2022 14:05:29 -0800 Subject: mm: memcontrol: make cgroup_memory_nokmem static Commit 494c1dfe855e ("mm: memcg/slab: create a new set of kmalloc-cg- caches") makes cgroup_memory_nokmem global, however, it is unnecessary because there is already a function mem_cgroup_kmem_disabled() which exports it. Just make it static and replace it with mem_cgroup_kmem_disabled() in mm/slab_common.c. Link: https://lkml.kernel.org/r/20211109065418.21693-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Chris Down Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 5 ----- mm/memcontrol.c | 2 +- mm/slab_common.c | 2 +- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 3b79a5c9427a..bdac62e1eca7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -157,11 +157,6 @@ extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason */ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); -/* - * in mm/memcontrol.c: - */ -extern bool cgroup_memory_nokmem; - /* * in mm/page_alloc.c */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3542237c833f..bfe9bdec192b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -84,7 +84,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg); static bool cgroup_memory_nosocket __ro_after_init; /* Kernel memory accounting disabled? */ -bool cgroup_memory_nokmem __ro_after_init; +static bool cgroup_memory_nokmem __ro_after_init; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP diff --git a/mm/slab_common.c b/mm/slab_common.c index f02c32bd05ab..1f75bd4e95d6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -844,7 +844,7 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) if (type == KMALLOC_RECLAIM) { flags |= SLAB_RECLAIM_ACCOUNT; } else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) { - if (cgroup_memory_nokmem) { + if (mem_cgroup_kmem_disabled()) { kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx]; return; } -- cgit From 46a53371f3fd9bf873fdd9c4df75b1cd86df1098 Mon Sep 17 00:00:00 2001 From: Donghai Qiao Date: Fri, 14 Jan 2022 14:05:32 -0800 Subject: mm/page_counter: remove an incorrect call to propagate_protected_usage() propagate_protected_usage() is called to propagate the usage change in the page_counter structure. 
But there is a call to this function from page_counter_try_charge() when there is actually no usage change. Hence this call should be removed. Link: https://lkml.kernel.org/r/20211118181125.3918222-1-dqiao@redhat.com Signed-off-by: Donghai Qiao Reviewed-by: Roman Gushchin Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_counter.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/page_counter.c b/mm/page_counter.c index 7d83641eb86b..eb156ff5d603 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -120,7 +120,6 @@ bool page_counter_try_charge(struct page_counter *counter, new = atomic_long_add_return(nr_pages, &c->usage); if (new > c->max) { atomic_long_sub(nr_pages, &c->usage); - propagate_protected_usage(c, new); /* * This is racy, but we can live with some * inaccuracy in the failcnt which is only used -- cgit From b6bf9abb0aa44e53ffe9c1e6e1d32568f5b25e4a Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Fri, 14 Jan 2022 14:05:35 -0800 Subject: mm/memcg: add oom_group_kill memory event Our container agent wants to know when a container exits if it was OOM killed or not to report to the user. We use memory.oom.group = 1 to ensure that OOM kills within the container's cgroup kill everything. Existing memory.events are insufficient for knowing if this triggered: 1) Our current approach reads memory.events oom_kill and reports the container was killed if the value is non-zero. This is erroneous in some cases where containers create their children cgroups with memory.oom.group=1 as such OOM kills will get counted against the container cgroup's oom_kill counter despite not actually OOM killing the entire container. 2) Reading memory.events.local will fail to identify OOM kills in leaf cgroups (that don't set memory.oom.group) within the container cgroup. This patch adds a new oom_group_kill event when memory.oom.group triggers to allow userspace to cleanly identify when an entire cgroup is oom killed. [schatzberg.dan@gmail.com: changes from Johannes and Chris] Link: https://lkml.kernel.org/r/20211213162511.2492267-1-schatzberg.dan@gmail.com Link: https://lkml.kernel.org/r/20211203162426.3375036-1-schatzberg.dan@gmail.com Signed-off-by: Dan Schatzberg Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Chris Down Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Tejun Heo Cc: Zefan Li Cc: Jonathan Corbet Cc: Vladimir Davydov Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Alex Shi Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 3 +++ include/linux/memcontrol.h | 1 + mm/memcontrol.c | 2 ++ mm/oom_kill.c | 1 + 4 files changed, 7 insertions(+) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 2aeb7ae8b393..8269bfa240f4 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1268,6 +1268,9 @@ PAGE_SIZE multiple when read back. The number of processes belonging to this cgroup killed by any kind of OOM killer. + oom_group_kill + The number of times a group OOM has occurred. + memory.events.local Similar to memory.events but the fields in the file are local to the cgroup i.e. not hierarchical. 
The file modified event diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0c5c403f4be6..951f24f42147 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -42,6 +42,7 @@ enum memcg_memory_event { MEMCG_MAX, MEMCG_OOM, MEMCG_OOM_KILL, + MEMCG_OOM_GROUP_KILL, MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bfe9bdec192b..2d39d58baccf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6318,6 +6318,8 @@ static void __memory_events_show(struct seq_file *m, atomic_long_t *events) seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); seq_printf(m, "oom_kill %lu\n", atomic_long_read(&events[MEMCG_OOM_KILL])); + seq_printf(m, "oom_group_kill %lu\n", + atomic_long_read(&events[MEMCG_OOM_GROUP_KILL])); } static int memory_events_show(struct seq_file *m, void *v) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1ddabefcfb5a..e52ce0b1465d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -994,6 +994,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) * If necessary, kill all tasks in the selected memory cgroup. */ if (oom_group) { + memcg_memory_event(oom_group, MEMCG_OOM_GROUP_KILL); mem_cgroup_print_oom_group(oom_group); mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, (void *)message); -- cgit From 5b3be698a872c490dbed524f3e2463701ab21339 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 14 Jan 2022 14:05:39 -0800 Subject: memcg: better bounds on the memcg stats updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 11192d9c124d ("memcg: flush stats only if updated") added tracking of memcg stats updates which is used by the readers to flush only if the updates are over a certain threshold. However each individual update can correspond to a large value change for a given stat. For example adding or removing a hugepage to an LRU changes the stat by thp_nr_pages (512 on x86_64). Treating the update related to THP as one can keep the stat off, in theory, by (thp_nr_pages * nr_cpus * CHARGE_BATCH) before flush. To handle such scenarios, this patch adds consideration of the stat update value as well instead of just the update event. In addition let the asyn flusher unconditionally flush the stats to put time limit on the stats skew and hopefully a lot less readers would need to flush. 
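For illustration (not part of the patch): a single-threaded userspace sketch of the new bookkeeping. The real code keeps stats_updates per CPU and uses atomics; BATCH below stands in for MEMCG_CHARGE_BATCH, and the constant 32 is chosen only to make the arithmetic visible.

#include <stdio.h>
#include <stdlib.h>

#define BATCH 32	/* stand-in for MEMCG_CHARGE_BATCH */

static unsigned int stats_updates;	/* per-CPU counter in the kernel */
static long stats_flush_threshold;	/* atomic_t in the kernel */

/* New scheme: weight each update by the magnitude of the stat change.
 * The old scheme advanced the threshold once per BATCH calls, so a single
 * +512 update (one THP on an LRU) counted the same as a single +1.
 */
static void rstat_updated(long val)
{
	unsigned int x = stats_updates + (unsigned int)labs(val);

	if (x > BATCH) {
		stats_flush_threshold += x / BATCH;
		stats_updates = 0;
	} else {
		stats_updates = x;
	}
}

int main(void)
{
	rstat_updated(512);	/* e.g. one THP added to an LRU */
	printf("flush threshold advanced by %ld units\n", stats_flush_threshold);
	return 0;
}

Note also that the async worker (flush_memcg_stats_dwork) now calls __mem_cgroup_flush_stats() directly, i.e. it flushes unconditionally, which implements the second half of the changelog.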
Link: https://lkml.kernel.org/r/20211118065350.697046-1-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: "Michal Koutný" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2d39d58baccf..aa2a15298636 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -629,11 +629,17 @@ static DEFINE_SPINLOCK(stats_flush_lock); static DEFINE_PER_CPU(unsigned int, stats_updates); static atomic_t stats_flush_threshold = ATOMIC_INIT(0); -static inline void memcg_rstat_updated(struct mem_cgroup *memcg) +static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) { + unsigned int x; + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); - if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH)) - atomic_inc(&stats_flush_threshold); + + x = __this_cpu_add_return(stats_updates, abs(val)); + if (x > MEMCG_CHARGE_BATCH) { + atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold); + __this_cpu_write(stats_updates, 0); + } } static void __mem_cgroup_flush_stats(void) @@ -656,7 +662,7 @@ void mem_cgroup_flush_stats(void) static void flush_memcg_stats_dwork(struct work_struct *w) { - mem_cgroup_flush_stats(); + __mem_cgroup_flush_stats(); queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); } @@ -672,7 +678,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) return; __this_cpu_add(memcg->vmstats_percpu->state[idx], val); - memcg_rstat_updated(memcg); + memcg_rstat_updated(memcg, val); } /* idx can be of type enum memcg_stat_item or node_stat_item. */ @@ -705,7 +711,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); - memcg_rstat_updated(memcg); + memcg_rstat_updated(memcg, val); } /** @@ -789,7 +795,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, return; __this_cpu_add(memcg->vmstats_percpu->events[idx], count); - memcg_rstat_updated(memcg); + memcg_rstat_updated(memcg, count); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) -- cgit From 06b2c3b08ce134c9555d91a1cf15cd03646cc287 Mon Sep 17 00:00:00 2001 From: Wang Weiyang Date: Fri, 14 Jan 2022 14:05:42 -0800 Subject: mm/memcg: use struct_size() helper in kzalloc() Make use of the struct_size() helper instead of an open-coded version, in order to avoid any potential type mistakes or integer overflows that, in the worst scenario, could lead to heap overflows. 
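For illustration (not part of the patch): a userspace sketch of what the helper buys. The kernel's struct_size() lives in include/linux/overflow.h and saturates to SIZE_MAX on overflow so the subsequent allocation fails cleanly instead of being undersized; the macro, helper and struct below are simplified stand-ins, not the kernel implementation.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct mem_cgroup_like {
	long stats[4];
	void *nodeinfo[];		/* flexible array member */
};

/* Simplified stand-in for struct_size(); the real macro takes a pointer. */
#define STRUCT_SIZE(type, member, count)				\
	struct_size_helper(sizeof(type),				\
			   sizeof(((type *)0)->member[0]), count)

static size_t struct_size_helper(size_t base, size_t elem, size_t count)
{
	size_t bytes;

	if (__builtin_mul_overflow(elem, count, &bytes) ||
	    __builtin_add_overflow(bytes, base, &bytes))
		return SIZE_MAX;	/* saturate: the allocation will fail */
	return bytes;
}

int main(void)
{
	size_t nr_node_ids = 4;

	/* open-coded form: base + n * element size, with no overflow check */
	size_t open_coded = sizeof(struct mem_cgroup_like) +
			    nr_node_ids * sizeof(void *);

	size_t checked = STRUCT_SIZE(struct mem_cgroup_like, nodeinfo,
				     nr_node_ids);

	printf("open-coded=%zu helper=%zu\n", open_coded, checked);

	struct mem_cgroup_like *memcg = calloc(1, checked);
	free(memcg);
	return 0;
}

With the helper, a mistaken element type or an absurdly large count cannot silently produce an undersized buffer for the flexible-array tail.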
Link: https://github.com/KSPP/linux/issues/160 Link: https://lkml.kernel.org/r/20211216022024.127375-1-wangweiyang2@huawei.com Signed-off-by: Wang Weiyang Reviewed-by: Muchun Song Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index aa2a15298636..88e1be912aa7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5122,15 +5122,11 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; - unsigned int size; int node; int __maybe_unused i; long error = -ENOMEM; - size = sizeof(struct mem_cgroup); - size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); - - memcg = kzalloc(size, GFP_KERNEL); + memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); if (!memcg) return ERR_PTR(error); -- cgit From 4e5aa1f4c2b489bc6f3ab5ca54747b18a847289d Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 14 Jan 2022 14:05:45 -0800 Subject: memcg: add per-memcg vmalloc stat The kvmalloc* allocation functions can fallback to vmalloc allocations and more often on long running machines. In addition the kernel does have __GFP_ACCOUNT kvmalloc* calls. So, often on long running machines, the memory.stat does not tell the complete picture which type of memory is charged to the memcg. So add a per-memcg vmalloc stat. [shakeelb@google.com: page_memcg() within rcu lock, per Muchun] Link: https://lkml.kernel.org/r/20211222052457.1960701-1-shakeelb@google.com [akpm@linux-foundation.org: remove cast, per Muchun] [shakeelb@google.com: remove area->page[0] checks and move to page by page accounting per Michal] Link: https://lkml.kernel.org/r/20220104222341.3972772-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20211221215336.1922823-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Muchun Song Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 3 +++ include/linux/memcontrol.h | 21 +++++++++++++++++++++ mm/memcontrol.c | 1 + mm/vmalloc.c | 13 +++++++++++-- 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 8269bfa240f4..4f400b03dddf 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1314,6 +1314,9 @@ PAGE_SIZE multiple when read back. sock (npn) Amount of memory used in network transmission buffers + vmalloc (npn) + Amount of memory used for vmap backed memory. 
+ shmem Amount of cached filesystem data that is swap-backed, such as tmpfs, shm segments, shared anonymous mmap()s diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 951f24f42147..0131e5574c88 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -33,6 +33,7 @@ enum memcg_stat_item { MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, MEMCG_PERCPU_B, + MEMCG_VMALLOC, MEMCG_NR_STAT, }; @@ -992,6 +993,21 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, local_irq_restore(flags); } +static inline void mod_memcg_page_state(struct page *page, + int idx, int val) +{ + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return; + + rcu_read_lock(); + memcg = page_memcg(page); + if (memcg) + mod_memcg_state(memcg, idx, val); + rcu_read_unlock(); +} + static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return READ_ONCE(memcg->vmstats.state[idx]); @@ -1447,6 +1463,11 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, { } +static inline void mod_memcg_page_state(struct page *page, + int idx, int val) +{ +} + static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 88e1be912aa7..c9ddd02dc5de 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1375,6 +1375,7 @@ static const struct memory_stat memory_stats[] = { { "pagetables", NR_PAGETABLE }, { "percpu", MEMCG_PERCPU_B }, { "sock", MEMCG_SOCK }, + { "vmalloc", MEMCG_VMALLOC }, { "shmem", NR_SHMEM }, { "file_mapped", NR_FILE_MAPPED }, { "file_dirty", NR_FILE_DIRTY }, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bf3c2fe8f528..80c6de4c425f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -2623,12 +2624,13 @@ static void __vunmap(const void *addr, int deallocate_pages) if (deallocate_pages) { unsigned int page_order = vm_area_page_order(area); - int i; + int i, step = 1U << page_order; - for (i = 0; i < area->nr_pages; i += 1U << page_order) { + for (i = 0; i < area->nr_pages; i += step) { struct page *page = area->pages[i]; BUG_ON(!page); + mod_memcg_page_state(page, MEMCG_VMALLOC, -step); __free_pages(page, page_order); cond_resched(); } @@ -2955,6 +2957,13 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); + if (gfp_mask & __GFP_ACCOUNT) { + int i, step = 1U << page_order; + + for (i = 0; i < area->nr_pages; i += step) + mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, + step); + } /* * If not enough pages were obtained to accomplish an -- cgit From 2c769ed7137a75a8297936fb54e6ff1f56d3d0f1 Mon Sep 17 00:00:00 2001 From: chiminghao Date: Fri, 14 Jan 2022 14:05:48 -0800 Subject: tools/testing/selftests/vm/userfaultfd.c: use swap() to make code cleaner Fix the following coccicheck REVIEW: tools/testing/selftests/vm/userfaultfd.c:1531:21-22:use swap() to make code cleaner Link: https://lkml.kernel.org/r/20211124031632.35317-1-chi.minghao@zte.com.cn Signed-off-by: chiminghao Reported-by: Zeal Robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 9354a5e0321c..990f7aecf4a3 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ 
b/tools/testing/selftests/vm/userfaultfd.c @@ -1417,7 +1417,6 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize) static int userfaultfd_stress(void) { void *area; - char *tmp_area; unsigned long nr; struct uffdio_register uffdio_register; struct uffd_stats uffd_stats[nr_cpus]; @@ -1528,13 +1527,9 @@ static int userfaultfd_stress(void) count_verify[nr], nr); /* prepare next bounce */ - tmp_area = area_src; - area_src = area_dst; - area_dst = tmp_area; + swap(area_src, area_dst); - tmp_area = area_src_alias; - area_src_alias = area_dst_alias; - area_dst_alias = tmp_area; + swap(area_src_alias, area_dst_alias); uffd_stats_report(uffd_stats, nr_cpus); } -- cgit From 36ef159f4408b08eae7f2af6d62bedd3f4343758 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 14 Jan 2022 14:05:51 -0800 Subject: mm: remove redundant check about FAULT_FLAG_ALLOW_RETRY bit Since commit 4064b9827063 ("mm: allow VM_FAULT_RETRY for multiple times") allowed VM_FAULT_RETRY for multiple times, the FAULT_FLAG_ALLOW_RETRY bit of fault_flag will not be changed in the page fault path, so the following check is no longer needed: flags & FAULT_FLAG_ALLOW_RETRY So just remove it. [akpm@linux-foundation.org: coding style fixes] Link: https://lkml.kernel.org/r/20211110123358.36511-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Peter Zijlstra Cc: Ingo Molnar Cc: David Hildenbrand Cc: Kirill Shutemov Cc: Peter Xu Cc: Muchun Song Cc: Chengming Zhou Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/mm/fault.c | 16 +++++++--------- arch/arc/mm/fault.c | 3 +-- arch/arm/mm/fault.c | 2 +- arch/arm64/mm/fault.c | 6 ++---- arch/hexagon/mm/vm_fault.c | 8 +++----- arch/ia64/mm/fault.c | 16 +++++++--------- arch/m68k/mm/fault.c | 22 ++++++++++------------ arch/microblaze/mm/fault.c | 22 ++++++++++------------ arch/mips/mm/fault.c | 19 +++++++++---------- arch/nds32/mm/fault.c | 18 ++++++++---------- arch/nios2/mm/fault.c | 18 ++++++++---------- arch/openrisc/mm/fault.c | 18 ++++++++---------- arch/parisc/mm/fault.c | 18 ++++++++---------- arch/powerpc/mm/fault.c | 6 ++---- arch/riscv/mm/fault.c | 2 +- arch/s390/mm/fault.c | 28 ++++++++++++++-------------- arch/sh/mm/fault.c | 20 +++++++++----------- arch/sparc/mm/fault_32.c | 16 +++++++--------- arch/sparc/mm/fault_64.c | 16 +++++++--------- arch/um/kernel/trap.c | 8 +++----- arch/x86/mm/fault.c | 3 +-- arch/xtensa/mm/fault.c | 17 ++++++++--------- 22 files changed, 134 insertions(+), 168 deletions(-) diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index eee5102c3d88..6c0a277388dd 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -165,17 +165,15 @@ retry: BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED; - /* No need to mmap_read_unlock(mm) as we would - * have already released it in __lock_page_or_retry - * in mm/filemap.c. - */ + /* No need to mmap_read_unlock(mm) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. 
+ */ - goto retry; - } + goto retry; } mmap_read_unlock(mm); diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 5787c261c9a4..dad27e4d69ff 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -149,8 +149,7 @@ retry: /* * Fault retry nuances, mmap_lock already relinquished by core mm */ - if (unlikely((fault & VM_FAULT_RETRY) && - (flags & FAULT_FLAG_ALLOW_RETRY))) { + if (unlikely(fault & VM_FAULT_RETRY)) { flags |= FAULT_FLAG_TRIED; goto retry; } diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index bc8779d54a64..c7326a521a69 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -312,7 +312,7 @@ retry: return 0; } - if (!(fault & VM_FAULT_ERROR) && flags & FAULT_FLAG_ALLOW_RETRY) { + if (!(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; goto retry; diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 9ae24e3b72be..a8fb54fccde0 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -606,10 +606,8 @@ retry: } if (fault & VM_FAULT_RETRY) { - if (mm_flags & FAULT_FLAG_ALLOW_RETRY) { - mm_flags |= FAULT_FLAG_TRIED; - goto retry; - } + mm_flags |= FAULT_FLAG_TRIED; + goto retry; } mmap_read_unlock(mm); diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index ef32c5a84ff3..4fac4b9eb316 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -98,11 +98,9 @@ good_area: /* The most common case -- we are done. */ if (likely(!(fault & VM_FAULT_ERROR))) { - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; - goto retry; - } + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED; + goto retry; } mmap_read_unlock(mm); diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 02de2e70c587..32417f49ad2f 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -156,17 +156,15 @@ retry: BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED; - /* No need to mmap_read_unlock(mm) as we would - * have already released it in __lock_page_or_retry - * in mm/filemap.c. - */ + /* No need to mmap_read_unlock(mm) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. + */ - goto retry; - } + goto retry; } mmap_read_unlock(mm); diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index ef46e77e97a5..53cfb9bc1066 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -153,18 +153,16 @@ good_area: BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; - - /* - * No need to mmap_read_unlock(mm) as we would - * have already released it in __lock_page_or_retry - * in mm/filemap.c. - */ - - goto retry; - } + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED; + + /* + * No need to mmap_read_unlock(mm) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. + */ + + goto retry; } mmap_read_unlock(mm); diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index b3fed2cecf84..a9626e6a68af 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -232,18 +232,16 @@ good_area: BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; - - /* - * No need to mmap_read_unlock(mm) as we would - * have already released it in __lock_page_or_retry - * in mm/filemap.c. 
- */ - - goto retry; - } + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED; + + /* + * No need to mmap_read_unlock(mm) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. + */ + + goto retry; } mmap_read_unlock(mm); diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index e7abda9c013f..44f98100e84e 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -171,18 +171,17 @@ good_area: goto do_sigbus; BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; - /* - * No need to mmap_read_unlock(mm) as we would - * have already released it in __lock_page_or_retry - * in mm/filemap.c. - */ + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED; - goto retry; - } + /* + * No need to mmap_read_unlock(mm) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. + */ + + goto retry; } mmap_read_unlock(mm); diff --git a/arch/nds32/mm/fault.c b/arch/nds32/mm/fault.c index 1d139b117168..636977a1c8b9 100644 --- a/arch/nds32/mm/fault.c +++ b/arch/nds32/mm/fault.c @@ -230,16 +230,14 @@ good_area: goto bad_area; } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; - - /* No need to mmap_read_unlock(mm) as we would - * have already released it in __lock_page_or_retry - * in mm/filemap.c. - */ - goto retry; - } + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED; + + /* No need to mmap_read_unlock(mm) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. + */ + goto retry; } mmap_read_unlock(mm); diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index 9476feecf512..a32f14cd72f2 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c @@ -149,18 +149,16 @@ good_area: BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED; - /* - * No need to mmap_read_unlock(mm) as we would - * have already released it in __lock_page_or_retry - * in mm/filemap.c. - */ + /* + * No need to mmap_read_unlock(mm) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. + */ - goto retry; - } + goto retry; } mmap_read_unlock(mm); diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index f0fa6394a58e..80bb66ad42f6 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -177,18 +177,16 @@ good_area: BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - /*RGD modeled on Cris */ - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; + /*RGD modeled on Cris */ + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED; - /* No need to mmap_read_unlock(mm) as we would - * have already released it in __lock_page_or_retry - * in mm/filemap.c. - */ + /* No need to mmap_read_unlock(mm) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. + */ - goto retry; - } + goto retry; } mmap_read_unlock(mm); diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 4a6221b869fd..360b627645cc 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -324,16 +324,14 @@ good_area: goto bad_area; BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - /* - * No need to mmap_read_unlock(mm) as we would - * have already released it in __lock_page_or_retry - * in mm/filemap.c. 
- */ - flags |= FAULT_FLAG_TRIED; - goto retry; - } + if (fault & VM_FAULT_RETRY) { + /* + * No need to mmap_read_unlock(mm) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. + */ + flags |= FAULT_FLAG_TRIED; + goto retry; } mmap_read_unlock(mm); return; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index a8d0ce85d39a..ebcc61e47d62 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -516,10 +516,8 @@ retry: * case. */ if (unlikely(fault & VM_FAULT_RETRY)) { - if (flags & FAULT_FLAG_ALLOW_RETRY) { - flags |= FAULT_FLAG_TRIED; - goto retry; - } + flags |= FAULT_FLAG_TRIED; + goto retry; } mmap_read_unlock(current->mm); diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index aa08dd2f8fae..cae4b6363607 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -330,7 +330,7 @@ good_area: if (fault_signal_pending(fault, regs)) return; - if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) { + if (unlikely(fault & VM_FAULT_RETRY)) { flags |= FAULT_FLAG_TRIED; /* diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index d30f5986fa85..d7d6be283d94 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -452,21 +452,21 @@ retry: if (unlikely(fault & VM_FAULT_ERROR)) goto out_up; - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - if (IS_ENABLED(CONFIG_PGSTE) && gmap && - (flags & FAULT_FLAG_RETRY_NOWAIT)) { - /* FAULT_FLAG_RETRY_NOWAIT has been set, - * mmap_lock has not been released */ - current->thread.gmap_pfault = 1; - fault = VM_FAULT_PFAULT; - goto out_up; - } - flags &= ~FAULT_FLAG_RETRY_NOWAIT; - flags |= FAULT_FLAG_TRIED; - mmap_read_lock(mm); - goto retry; + if (fault & VM_FAULT_RETRY) { + if (IS_ENABLED(CONFIG_PGSTE) && gmap && + (flags & FAULT_FLAG_RETRY_NOWAIT)) { + /* + * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has + * not been released + */ + current->thread.gmap_pfault = 1; + fault = VM_FAULT_PFAULT; + goto out_up; } + flags &= ~FAULT_FLAG_RETRY_NOWAIT; + flags |= FAULT_FLAG_TRIED; + mmap_read_lock(mm); + goto retry; } if (IS_ENABLED(CONFIG_PGSTE) && gmap) { address = __gmap_link(gmap, current->thread.gmap_addr, diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 1e1aa75df3ca..e175667b1363 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -485,17 +485,15 @@ good_area: if (mm_fault_error(regs, error_code, address, fault)) return; - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; - - /* - * No need to mmap_read_unlock(mm) as we would - * have already released it in __lock_page_or_retry - * in mm/filemap.c. - */ - goto retry; - } + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED; + + /* + * No need to mmap_read_unlock(mm) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. + */ + goto retry; } mmap_read_unlock(mm); diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 90dc4ae315c8..ad569d9bd124 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -200,17 +200,15 @@ good_area: BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED; - /* No need to mmap_read_unlock(mm) as we would - * have already released it in __lock_page_or_retry - * in mm/filemap.c. - */ + /* No need to mmap_read_unlock(mm) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. 
+ */ - goto retry; - } + goto retry; } mmap_read_unlock(mm); diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 9a9652a15fed..253e07043298 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -437,17 +437,15 @@ good_area: BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED; - /* No need to mmap_read_unlock(mm) as we would - * have already released it in __lock_page_or_retry - * in mm/filemap.c. - */ + /* No need to mmap_read_unlock(mm) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. + */ - goto retry; - } + goto retry; } mmap_read_unlock(mm); diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index c32efb09db21..193503484af5 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -87,12 +87,10 @@ good_area: } BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED; - goto retry; - } + goto retry; } pmd = pmd_off(mm, address); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4bfed53e210e..d0074c6ed31a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1413,8 +1413,7 @@ good_area: * and if there is a fatal signal pending there is no guarantee * that we made any progress. Handle this case first. */ - if (unlikely((fault & VM_FAULT_RETRY) && - (flags & FAULT_FLAG_ALLOW_RETRY))) { + if (unlikely(fault & VM_FAULT_RETRY)) { flags |= FAULT_FLAG_TRIED; goto retry; } diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index fd6a70635962..06d0973a0d74 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -127,17 +127,16 @@ good_area: goto do_sigbus; BUG(); } - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - flags |= FAULT_FLAG_TRIED; - /* No need to mmap_read_unlock(mm) as we would - * have already released it in __lock_page_or_retry - * in mm/filemap.c. - */ + if (fault & VM_FAULT_RETRY) { + flags |= FAULT_FLAG_TRIED; - goto retry; - } + /* No need to mmap_read_unlock(mm) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. + */ + + goto retry; } mmap_read_unlock(mm); -- cgit From ac1e9acc5acf0b41d54de6a4c45471644f8b97ff Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Fri, 14 Jan 2022 14:05:55 -0800 Subject: mm: rearrange madvise code to allow for reuse Patch series "mm: rearrange madvise code to allow for reuse", v11. Avoid performance regression of the new anon vma name field refcounting it. I checked the image sizes with allnoconfig builds: unpatched Linus' ToT text data bss dec hex filename 1324759 32 73928 1398719 1557bf vmlinux After the first patch is applied (madvise refactoring) text data bss dec hex filename 1322346 32 73928 1396306 154e52 vmlinux >>> 2413 bytes decrease vs ToT <<< After all patches applied with CONFIG_ANON_VMA_NAME=n text data bss dec hex filename 1322337 32 73928 1396297 154e49 vmlinux >>> 2422 bytes decrease vs ToT <<< After all patches applied with CONFIG_ANON_VMA_NAME=y text data bss dec hex filename 1325228 32 73928 1399188 155994 vmlinux >>> 469 bytes increase vs ToT <<< This patch (of 3): Refactor the madvise syscall to allow for parts of it to be reused by a prctl syscall that affects vmas. Move the code that walks vmas in a virtual address range into a function that takes a function pointer as a parameter. 
The only caller for now is sys_madvise, which uses it to call madvise_vma_behavior on each vma, but the next patch will add an additional caller. Move handling all vma behaviors inside madvise_behavior, and rename it to madvise_vma_behavior. Move the code that updates the flags on a vma, including splitting or merging the vma as necessary, into a new function called madvise_update_vma. The next patch will add support for updating a new anon_name field as well. Link: https://lkml.kernel.org/r/20211019215511.3771969-1-surenb@google.com Signed-off-by: Colin Cross Signed-off-by: Suren Baghdasaryan Cc: Pekka Enberg Cc: Dave Hansen Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Jan Glauber Cc: John Stultz Cc: Rob Landley Cc: Cyrill Gorcunov Cc: Kees Cook Cc: "Serge E. Hallyn" Cc: David Rientjes Cc: Al Viro Cc: Hugh Dickins Cc: Mel Gorman Cc: Shaohua Li Cc: Johannes Weiner Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 338 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 178 insertions(+), 160 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 8c927202bbe6..4b9c5509990c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -63,76 +63,20 @@ static int madvise_need_mmap_write(int behavior) } /* - * We can potentially split a vm area into separate - * areas, each area with its own behavior. + * Update the vm_flags on region of a vma, splitting it or merging it as + * necessary. Must be called with mmap_sem held for writing; */ -static long madvise_behavior(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end, int behavior) +static int madvise_update_vma(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, unsigned long new_flags) { struct mm_struct *mm = vma->vm_mm; - int error = 0; + int error; pgoff_t pgoff; - unsigned long new_flags = vma->vm_flags; - - switch (behavior) { - case MADV_NORMAL: - new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; - break; - case MADV_SEQUENTIAL: - new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; - break; - case MADV_RANDOM: - new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; - break; - case MADV_DONTFORK: - new_flags |= VM_DONTCOPY; - break; - case MADV_DOFORK: - if (vma->vm_flags & VM_IO) { - error = -EINVAL; - goto out; - } - new_flags &= ~VM_DONTCOPY; - break; - case MADV_WIPEONFORK: - /* MADV_WIPEONFORK is only supported on anonymous memory. 
*/ - if (vma->vm_file || vma->vm_flags & VM_SHARED) { - error = -EINVAL; - goto out; - } - new_flags |= VM_WIPEONFORK; - break; - case MADV_KEEPONFORK: - new_flags &= ~VM_WIPEONFORK; - break; - case MADV_DONTDUMP: - new_flags |= VM_DONTDUMP; - break; - case MADV_DODUMP: - if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) { - error = -EINVAL; - goto out; - } - new_flags &= ~VM_DONTDUMP; - break; - case MADV_MERGEABLE: - case MADV_UNMERGEABLE: - error = ksm_madvise(vma, start, end, behavior, &new_flags); - if (error) - goto out_convert_errno; - break; - case MADV_HUGEPAGE: - case MADV_NOHUGEPAGE: - error = hugepage_madvise(vma, &new_flags, behavior); - if (error) - goto out_convert_errno; - break; - } if (new_flags == vma->vm_flags) { *prev = vma; - goto out; + return 0; } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); @@ -147,23 +91,19 @@ static long madvise_behavior(struct vm_area_struct *vma, *prev = vma; if (start != vma->vm_start) { - if (unlikely(mm->map_count >= sysctl_max_map_count)) { - error = -ENOMEM; - goto out; - } + if (unlikely(mm->map_count >= sysctl_max_map_count)) + return -ENOMEM; error = __split_vma(mm, vma, start, 1); if (error) - goto out_convert_errno; + return error; } if (end != vma->vm_end) { - if (unlikely(mm->map_count >= sysctl_max_map_count)) { - error = -ENOMEM; - goto out; - } + if (unlikely(mm->map_count >= sysctl_max_map_count)) + return -ENOMEM; error = __split_vma(mm, vma, end, 0); if (error) - goto out_convert_errno; + return error; } success: @@ -172,15 +112,7 @@ success: */ vma->vm_flags = new_flags; -out_convert_errno: - /* - * madvise() returns EAGAIN if kernel resources, such as - * slab, are temporarily unavailable. - */ - if (error == -ENOMEM) - error = -EAGAIN; -out: - return error; + return 0; } #ifdef CONFIG_SWAP @@ -930,6 +862,94 @@ static long madvise_remove(struct vm_area_struct *vma, return error; } +/* + * Apply an madvise behavior to a region of a vma. madvise_update_vma + * will handle splitting a vm area into separate areas, each area with its own + * behavior. + */ +static int madvise_vma_behavior(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + unsigned long behavior) +{ + int error; + unsigned long new_flags = vma->vm_flags; + + switch (behavior) { + case MADV_REMOVE: + return madvise_remove(vma, prev, start, end); + case MADV_WILLNEED: + return madvise_willneed(vma, prev, start, end); + case MADV_COLD: + return madvise_cold(vma, prev, start, end); + case MADV_PAGEOUT: + return madvise_pageout(vma, prev, start, end); + case MADV_FREE: + case MADV_DONTNEED: + return madvise_dontneed_free(vma, prev, start, end, behavior); + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + return madvise_populate(vma, prev, start, end, behavior); + case MADV_NORMAL: + new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; + break; + case MADV_SEQUENTIAL: + new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; + break; + case MADV_RANDOM: + new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; + break; + case MADV_DONTFORK: + new_flags |= VM_DONTCOPY; + break; + case MADV_DOFORK: + if (vma->vm_flags & VM_IO) + return -EINVAL; + new_flags &= ~VM_DONTCOPY; + break; + case MADV_WIPEONFORK: + /* MADV_WIPEONFORK is only supported on anonymous memory. 
*/ + if (vma->vm_file || vma->vm_flags & VM_SHARED) + return -EINVAL; + new_flags |= VM_WIPEONFORK; + break; + case MADV_KEEPONFORK: + new_flags &= ~VM_WIPEONFORK; + break; + case MADV_DONTDUMP: + new_flags |= VM_DONTDUMP; + break; + case MADV_DODUMP: + if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) + return -EINVAL; + new_flags &= ~VM_DONTDUMP; + break; + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: + error = ksm_madvise(vma, start, end, behavior, &new_flags); + if (error) + goto out; + break; + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: + error = hugepage_madvise(vma, &new_flags, behavior); + if (error) + goto out; + break; + } + + error = madvise_update_vma(vma, prev, start, end, new_flags); + +out: + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + #ifdef CONFIG_MEMORY_FAILURE /* * Error injection support for memory error handling. @@ -978,30 +998,6 @@ static int madvise_inject_error(int behavior, } #endif -static long -madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, int behavior) -{ - switch (behavior) { - case MADV_REMOVE: - return madvise_remove(vma, prev, start, end); - case MADV_WILLNEED: - return madvise_willneed(vma, prev, start, end); - case MADV_COLD: - return madvise_cold(vma, prev, start, end); - case MADV_PAGEOUT: - return madvise_pageout(vma, prev, start, end); - case MADV_FREE: - case MADV_DONTNEED: - return madvise_dontneed_free(vma, prev, start, end, behavior); - case MADV_POPULATE_READ: - case MADV_POPULATE_WRITE: - return madvise_populate(vma, prev, start, end, behavior); - default: - return madvise_behavior(vma, prev, start, end, behavior); - } -} - static bool madvise_behavior_valid(int behavior) { @@ -1055,6 +1051,73 @@ process_madvise_behavior_valid(int behavior) } } +/* + * Walk the vmas in range [start,end), and call the visit function on each one. + * The visit function will get start and end parameters that cover the overlap + * between the current vma and the original range. Any unmapped regions in the + * original range will result in this function returning -ENOMEM while still + * calling the visit function on all of the existing vmas in the range. + * Must be called with the mmap_lock held for reading or writing. + */ +static +int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned long arg, + int (*visit)(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, unsigned long arg)) +{ + struct vm_area_struct *vma; + struct vm_area_struct *prev; + unsigned long tmp; + int unmapped_error = 0; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + * - different from the way of handling in mlock etc. + */ + vma = find_vma_prev(mm, start, &prev); + if (vma && start > vma->vm_start) + prev = vma; + + for (;;) { + int error; + + /* Still start < end. */ + if (!vma) + return -ENOMEM; + + /* Here start < (end|vma->vm_end). */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + if (start >= end) + break; + } + + /* Here vma->vm_start <= start < (end|vma->vm_end) */ + tmp = vma->vm_end; + if (end < tmp) + tmp = end; + + /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). 
*/ + error = visit(vma, &prev, start, tmp, arg); + if (error) + return error; + start = tmp; + if (prev && start < prev->vm_end) + start = prev->vm_end; + if (start >= end) + break; + if (prev) + vma = prev->vm_next; + else /* madvise_remove dropped mmap_lock */ + vma = find_vma(mm, start); + } + + return unmapped_error; +} + /* * The madvise(2) system call. * @@ -1127,10 +1190,8 @@ process_madvise_behavior_valid(int behavior) */ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { - unsigned long end, tmp; - struct vm_area_struct *vma, *prev; - int unmapped_error = 0; - int error = -EINVAL; + unsigned long end; + int error; int write; size_t len; struct blk_plug plug; @@ -1138,23 +1199,22 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh start = untagged_addr(start); if (!madvise_behavior_valid(behavior)) - return error; + return -EINVAL; if (!PAGE_ALIGNED(start)) - return error; + return -EINVAL; len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) - return error; + return -EINVAL; end = start + len; if (end < start) - return error; + return -EINVAL; - error = 0; if (end == start) - return error; + return 0; #ifdef CONFIG_MEMORY_FAILURE if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) @@ -1169,51 +1229,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh mmap_read_lock(mm); } - /* - * If the interval [start,end) covers some unmapped address - * ranges, just ignore them, but return -ENOMEM at the end. - * - different from the way of handling in mlock etc. - */ - vma = find_vma_prev(mm, start, &prev); - if (vma && start > vma->vm_start) - prev = vma; - blk_start_plug(&plug); - for (;;) { - /* Still start < end. */ - error = -ENOMEM; - if (!vma) - goto out; - - /* Here start < (end|vma->vm_end). */ - if (start < vma->vm_start) { - unmapped_error = -ENOMEM; - start = vma->vm_start; - if (start >= end) - goto out; - } - - /* Here vma->vm_start <= start < (end|vma->vm_end) */ - tmp = vma->vm_end; - if (end < tmp) - tmp = end; - - /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ - error = madvise_vma(vma, &prev, start, tmp, behavior); - if (error) - goto out; - start = tmp; - if (prev && start < prev->vm_end) - start = prev->vm_end; - error = unmapped_error; - if (start >= end) - goto out; - if (prev) - vma = prev->vm_next; - else /* madvise_remove dropped mmap_lock */ - vma = find_vma(mm, start); - } -out: + error = madvise_walk_vmas(mm, start, end, behavior, + madvise_vma_behavior); blk_finish_plug(&plug); if (write) mmap_write_unlock(mm); -- cgit From 9a10064f5625d5572c3626c1516e0bebc6c9fe9b Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Fri, 14 Jan 2022 14:05:59 -0800 Subject: mm: add a field to store names for private anonymous memory In many userspace applications, and especially in VM based applications like Android uses heavily, there are multiple different allocators in use. At a minimum there is libc malloc and the stack, and in many cases there are libc malloc, the stack, direct syscalls to mmap anonymous memory, and multiple VM heaps (one for small objects, one for big objects, etc.). Each of these layers usually has its own tools to inspect its usage; malloc by compiling a debug version, the VM through heap inspection tools, and for direct syscalls there is usually no way to track them. 
On Android we heavily use a set of tools that use an extended version of the logic covered in Documentation/vm/pagemap.txt to walk all pages mapped in userspace and slice their usage by process, shared (COW) vs. unique mappings, backing, etc. This can account for real physical memory usage even in cases like fork without exec (which Android uses heavily to share as many private COW pages as possible between processes), Kernel SamePage Merging, and clean zero pages. It produces a measurement of the pages that only exist in that process (USS, for unique), and a measurement of the physical memory usage of that process with the cost of shared pages being evenly split between processes that share them (PSS). If all anonymous memory is indistinguishable then figuring out the real physical memory usage (PSS) of each heap requires either a pagemap walking tool that can understand the heap debugging of every layer, or for every layer's heap debugging tools to implement the pagemap walking logic, in which case it is hard to get a consistent view of memory across the whole system. Tracking the information in userspace leads to all sorts of problems. It either needs to be stored inside the process, which means every process has to have an API to export its current heap information upon request, or it has to be stored externally in a filesystem that somebody needs to clean up on crashes. It needs to be readable while the process is still running, so it has to have some sort of synchronization with every layer of userspace. Efficiently tracking the ranges requires reimplementing something like the kernel vma trees, and linking to it from every layer of userspace. It requires more memory, more syscalls, more runtime cost, and more complexity to separately track regions that the kernel is already tracking. This patch adds a field to /proc/pid/maps and /proc/pid/smaps to show a userspace-provided name for anonymous vmas. The names of named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps as [anon:]. Userspace can set the name for a region of memory by calling prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name) Setting the name to NULL clears it. The name length limit is 80 bytes including NUL-terminator and is checked to contain only printable ascii characters (including space), except '[',']','\','$' and '`'. Ascii strings are being used to have a descriptive identifiers for vmas, which can be understood by the users reading /proc/pid/maps or /proc/pid/smaps. Names can be standardized for a given system and they can include some variable parts such as the name of the allocator or a library, tid of the thread using it, etc. The name is stored in a pointer in the shared union in vm_area_struct that points to a null terminated string. Anonymous vmas with the same name (equivalent strings) and are otherwise mergeable will be merged. The name pointers are not shared between vmas even if they contain the same name. The name pointer is stored in a union with fields that are only used on file-backed mappings, so it does not increase memory usage. CONFIG_ANON_VMA_NAME kernel configuration is introduced to enable this feature. It keeps the feature disabled by default to prevent any additional memory overhead and to avoid confusing procfs parsers on systems which are not ready to support named anonymous vmas. The patch is based on the original patch developed by Colin Cross, more specifically on its latest version [1] posted upstream by Sumit Semwal. 
It used a userspace pointer to store vma names. In that design, name pointers could be shared between vmas. However during the last upstreaming attempt, Kees Cook raised concerns [2] about this approach and suggested to copy the name into kernel memory space, perform validity checks [3] and store as a string referenced from vm_area_struct. One big concern is about fork() performance which would need to strdup anonymous vma names. Dave Hansen suggested experimenting with worst-case scenario of forking a process with 64k vmas having longest possible names [4]. I ran this experiment on an ARM64 Android device and recorded a worst-case regression of almost 40% when forking such a process. This regression is addressed in the followup patch which replaces the pointer to a name with a refcounted structure that allows sharing the name pointer between vmas of the same name. Instead of duplicating the string during fork() or when splitting a vma it increments the refcount. [1] https://lore.kernel.org/linux-mm/20200901161459.11772-4-sumit.semwal@linaro.org/ [2] https://lore.kernel.org/linux-mm/202009031031.D32EF57ED@keescook/ [3] https://lore.kernel.org/linux-mm/202009031022.3834F692@keescook/ [4] https://lore.kernel.org/linux-mm/5d0358ab-8c47-2f5f-8e43-23b89d6a8e95@intel.com/ Changes for prctl(2) manual page (in the options section): PR_SET_VMA Sets an attribute specified in arg2 for virtual memory areas starting from the address specified in arg3 and spanning the size specified in arg4. arg5 specifies the value of the attribute to be set. Note that assigning an attribute to a virtual memory area might prevent it from being merged with adjacent virtual memory areas due to the difference in that attribute's value. Currently, arg2 must be one of: PR_SET_VMA_ANON_NAME Set a name for anonymous virtual memory areas. arg5 should be a pointer to a null-terminated string containing the name. The name length including null byte cannot exceed 80 bytes. If arg5 is NULL, the name of the appropriate anonymous virtual memory areas will be reset. The name can contain only printable ascii characters (including space), except '[',']','\','$' and '`'. This feature is available only if the kernel is built with the CONFIG_ANON_VMA_NAME option enabled. [surenb@google.com: docs: proc.rst: /proc/PID/maps: fix malformed table] Link: https://lkml.kernel.org/r/20211123185928.2513763-1-surenb@google.com [surenb: rebased over v5.15-rc6, replaced userpointer with a kernel copy, added input sanitization and CONFIG_ANON_VMA_NAME config. The bulk of the work here was done by Colin Cross, therefore, with his permission, keeping him as the author] Link: https://lkml.kernel.org/r/20211019215511.3771969-2-surenb@google.com Signed-off-by: Colin Cross Signed-off-by: Suren Baghdasaryan Reviewed-by: Kees Cook Cc: Stephen Rothwell Cc: Al Viro Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: David Rientjes Cc: "Eric W. Biederman" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Glauber Cc: Johannes Weiner Cc: John Stultz Cc: Mel Gorman Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Rob Landley Cc: "Serge E. 
Hallyn" Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.rst | 6 +- fs/proc/task_mmu.c | 12 +++- fs/userfaultfd.c | 7 +- include/linux/mm.h | 13 +++- include/linux/mm_types.h | 64 ++++++++++++++++-- include/uapi/linux/prctl.h | 3 + kernel/fork.c | 2 + kernel/sys.c | 63 ++++++++++++++++++ mm/Kconfig | 14 ++++ mm/madvise.c | 129 +++++++++++++++++++++++++++++++++++-- mm/mempolicy.c | 3 +- mm/mlock.c | 2 +- mm/mmap.c | 38 ++++++----- mm/mprotect.c | 2 +- 14 files changed, 324 insertions(+), 34 deletions(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 8d7f141c6fc7..061744c436d9 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -426,12 +426,14 @@ with the memory region, as the case would be with BSS (uninitialized data). The "pathname" shows the name associated file for this mapping. If the mapping is not associated with a file: - ======= ==================================== + ============= ==================================== [heap] the heap of the program [stack] the stack of the main process [vdso] the "virtual dynamic shared object", the kernel system call handler - ======= ==================================== + [anon:] an anonymous mapping that has been + named by userspace + ============= ==================================== or if empty, the mapping is anonymous. diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ad667dbc96f5..e6998652fd67 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -308,6 +308,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) name = arch_vma_name(vma); if (!name) { + const char *anon_name; + if (!mm) { name = "[vdso]"; goto done; @@ -319,8 +321,16 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - if (is_stack(vma)) + if (is_stack(vma)) { name = "[stack]"; + goto done; + } + + anon_name = vma_anon_name(vma); + if (anon_name) { + seq_pad(m, ' '); + seq_printf(m, "[anon:%s]", anon_name); + } } done: diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 22bf14ab2d16..5b2af7b82776 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -877,7 +877,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, vma_anon_name(vma)); if (prev) vma = prev; else @@ -1436,7 +1436,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - ((struct vm_userfaultfd_ctx){ ctx })); + ((struct vm_userfaultfd_ctx){ ctx }), + vma_anon_name(vma)); if (prev) { vma = prev; goto next; @@ -1613,7 +1614,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, vma_anon_name(vma)); if (prev) { vma = prev; goto next; diff --git a/include/linux/mm.h b/include/linux/mm.h index a7e4a9e7d807..7000442984b9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2658,7 +2658,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct 
vm_userfaultfd_ctx); + struct mempolicy *, struct vm_userfaultfd_ctx, const char *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); @@ -3391,5 +3391,16 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) return 0; } +#ifdef CONFIG_ANON_VMA_NAME +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name); +#else +static inline int +madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name) { + return 0; +} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c3a6e6209600..799e2ee626b2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -426,11 +426,19 @@ struct vm_area_struct { /* * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. + * + * For private anonymous mappings, a pointer to a null terminated string + * containing the name given to the vma, or NULL if unnamed. */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; + + union { + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; + /* Serialized by mmap_sem. */ + char *anon_name; + }; /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma @@ -875,4 +883,52 @@ typedef struct { unsigned long val; } swp_entry_t; +#ifdef CONFIG_ANON_VMA_NAME +/* + * mmap_lock should be read-locked when calling vma_anon_name() and while using + * the returned pointer. + */ +extern const char *vma_anon_name(struct vm_area_struct *vma); + +/* + * mmap_lock should be read-locked for orig_vma->vm_mm. + * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be + * isolated. + */ +extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma); + +/* + * mmap_lock should be write-locked or vma should have been isolated under + * write-locked mmap_lock protection. 
+ */ +extern void free_vma_anon_name(struct vm_area_struct *vma); + +/* mmap_lock should be read-locked */ +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + const char *vma_name = vma_anon_name(vma); + + /* either both NULL, or pointers to same string */ + if (vma_name == name) + return true; + + return name && vma_name && !strcmp(name, vma_name); +} +#else /* CONFIG_ANON_VMA_NAME */ +static inline const char *vma_anon_name(struct vm_area_struct *vma) +{ + return NULL; +} +static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_vma_anon_name(struct vm_area_struct *vma) {} +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + return true; +} +#endif /* CONFIG_ANON_VMA_NAME */ + #endif /* _LINUX_MM_TYPES_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index bb73e9a0b24f..e998764f0262 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -272,4 +272,7 @@ struct prctl_mm_map { # define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1 # define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2 +#define PR_SET_VMA 0x53564d41 +# define PR_SET_VMA_ANON_NAME 0 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 3244cc56b697..4cf20b5f2da3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -365,12 +365,14 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; + dup_vma_anon_name(orig, new); } return new; } void vm_area_free(struct vm_area_struct *vma) { + free_vma_anon_name(vma); kmem_cache_free(vm_area_cachep, vma); } diff --git a/kernel/sys.c b/kernel/sys.c index 8fdac0d90504..2450a9f33cb0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2261,6 +2261,66 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) +#ifdef CONFIG_ANON_VMA_NAME + +#define ANON_VMA_NAME_MAX_LEN 80 +#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" + +static inline bool is_valid_name_char(char ch) +{ + /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ + return ch > 0x1f && ch < 0x7f && + !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); +} + +static int prctl_set_vma(unsigned long opt, unsigned long addr, + unsigned long size, unsigned long arg) +{ + struct mm_struct *mm = current->mm; + const char __user *uname; + char *name, *pch; + int error; + + switch (opt) { + case PR_SET_VMA_ANON_NAME: + uname = (const char __user *)arg; + if (uname) { + name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); + + if (IS_ERR(name)) + return PTR_ERR(name); + + for (pch = name; *pch != '\0'; pch++) { + if (!is_valid_name_char(*pch)) { + kfree(name); + return -EINVAL; + } + } + } else { + /* Reset the name */ + name = NULL; + } + + mmap_write_lock(mm); + error = madvise_set_anon_name(mm, addr, size, name); + mmap_write_unlock(mm); + kfree(name); + break; + default: + error = -EINVAL; + } + + return error; +} + +#else /* CONFIG_ANON_VMA_NAME */ +static int prctl_set_vma(unsigned long opt, unsigned long start, + unsigned long size, unsigned long arg) +{ + return -EINVAL; +} +#endif /* CONFIG_ANON_VMA_NAME */ + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2530,6 +2590,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = 
sched_core_share_pid(arg2, arg3, arg4, arg5); break; #endif + case PR_SET_VMA: + error = prctl_set_vma(arg2, arg3, arg4, arg5); + break; default: error = -EINVAL; break; diff --git a/mm/Kconfig b/mm/Kconfig index 356f4f2c779e..53d7485fc38f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -900,6 +900,20 @@ config IO_MAPPING config SECRETMEM def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED +config ANON_VMA_NAME + bool "Anonymous VMA name support" + depends on PROC_FS && ADVISE_SYSCALLS && MMU + + help + Allow naming anonymous virtual memory areas. + + This feature allows assigning names to virtual memory areas. Assigned + names can be later retrieved from /proc/pid/maps and /proc/pid/smaps + and help identifying individual anonymous memory areas. + Assigning a name to anonymous virtual memory area might prevent that + area from being merged with adjacent virtual memory areas due to the + difference in their name. + source "mm/damon/Kconfig" endmenu diff --git a/mm/madvise.c b/mm/madvise.c index 4b9c5509990c..413bbc6e40a0 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -62,19 +63,84 @@ static int madvise_need_mmap_write(int behavior) } } +#ifdef CONFIG_ANON_VMA_NAME +static inline bool has_vma_anon_name(struct vm_area_struct *vma) +{ + return !vma->vm_file && vma->anon_name; +} + +const char *vma_anon_name(struct vm_area_struct *vma) +{ + if (!has_vma_anon_name(vma)) + return NULL; + + mmap_assert_locked(vma->vm_mm); + + return vma->anon_name; +} + +void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) +{ + if (!has_vma_anon_name(orig_vma)) + return; + + new_vma->anon_name = kstrdup(orig_vma->anon_name, GFP_KERNEL); +} + +void free_vma_anon_name(struct vm_area_struct *vma) +{ + if (!has_vma_anon_name(vma)) + return; + + kfree(vma->anon_name); + vma->anon_name = NULL; +} + +/* mmap_lock should be write-locked */ +static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +{ + if (!name) { + free_vma_anon_name(vma); + return 0; + } + + if (vma->anon_name) { + /* Same name, nothing to do here */ + if (!strcmp(name, vma->anon_name)) + return 0; + + free_vma_anon_name(vma); + } + vma->anon_name = kstrdup(name, GFP_KERNEL); + if (!vma->anon_name) + return -ENOMEM; + + return 0; +} +#else /* CONFIG_ANON_VMA_NAME */ +static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +{ + if (name) + return -EINVAL; + + return 0; +} +#endif /* CONFIG_ANON_VMA_NAME */ /* * Update the vm_flags on region of a vma, splitting it or merging it as * necessary. Must be called with mmap_sem held for writing; */ static int madvise_update_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, - unsigned long end, unsigned long new_flags) + unsigned long end, unsigned long new_flags, + const char *name) { struct mm_struct *mm = vma->vm_mm; int error; pgoff_t pgoff; - if (new_flags == vma->vm_flags) { + if (new_flags == vma->vm_flags && is_same_vma_anon_name(vma, name)) { *prev = vma; return 0; } @@ -82,7 +148,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, name); if (*prev) { vma = *prev; goto success; @@ -111,6 +177,11 @@ success: * vm_flags is protected by the mmap_lock held in write mode. 
*/ vma->vm_flags = new_flags; + if (!vma->vm_file) { + error = replace_vma_anon_name(vma, name); + if (error) + return error; + } return 0; } @@ -938,7 +1009,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, break; } - error = madvise_update_vma(vma, prev, start, end, new_flags); + error = madvise_update_vma(vma, prev, start, end, new_flags, + vma_anon_name(vma)); out: /* @@ -1118,6 +1190,55 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, return unmapped_error; } +#ifdef CONFIG_ANON_VMA_NAME +static int madvise_vma_anon_name(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + unsigned long name) +{ + int error; + + /* Only anonymous mappings can be named */ + if (vma->vm_file) + return -EBADF; + + error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, + (const char *)name); + + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name) +{ + unsigned long end; + unsigned long len; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len_in + ~PAGE_MASK) & PAGE_MASK; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + return -EINVAL; + + end = start + len; + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + + return madvise_walk_vmas(mm, start, end, (unsigned long)name, + madvise_vma_anon_name); +} +#endif /* CONFIG_ANON_VMA_NAME */ /* * The madvise(2) system call. * diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f6248affaf38..679f47b3a079 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -810,7 +810,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, - new_pol, vma->vm_userfaultfd_ctx); + new_pol, vma->vm_userfaultfd_ctx, + vma_anon_name(vma)); if (prev) { vma = prev; next = vma->vm_next; diff --git a/mm/mlock.c b/mm/mlock.c index e263d62ae2d0..8f584eddd305 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -512,7 +512,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_anon_name(vma)); if (*prev) { vma = *prev; goto success; diff --git a/mm/mmap.c b/mm/mmap.c index bfb0ea164a90..85edb0011453 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1029,7 +1029,8 @@ again: */ static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -1047,6 +1048,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) return 0; + if (!is_same_vma_anon_name(vma, anon_name)) + return 0; return 1; } @@ -1079,9 +1082,10 @@ static int can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + 
struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) return 1; @@ -1100,9 +1104,10 @@ static int can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = vma_pages(vma); @@ -1113,9 +1118,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, } /* - * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out - * whether that can be merged with its predecessor or its successor. - * Or both (it neatly fills a hole). + * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), + * figure out whether that can be merged with its predecessor or its + * successor. Or both (it neatly fills a hole). * * In most cases - when called for mmap, brk or mremap - [addr,end) is * certain not to be mapped by the time vma_merge is called; but when @@ -1160,7 +1165,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -1190,7 +1196,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, anon_name)) { /* * OK, it can. Can we now merge in the successor as well? */ @@ -1199,7 +1205,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx) && + vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ @@ -1222,7 +1228,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, anon_name)) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL, next); @@ -1754,7 +1760,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * Can we just expand an old mapping? */ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -1803,7 +1809,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, */ if (unlikely(vm_flags != vma->vm_flags && prev)) { merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, - NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (merge) { /* ->mmap() can change vma->vm_file and fput the original file. 
So * fput the vma->vm_file here or we would add an extra fput for file @@ -3056,7 +3062,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla /* Can we just expand an old private anonymous mapping? */ vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -3249,7 +3255,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_anon_name(vma)); if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/mprotect.c b/mm/mprotect.c index e552f5e0ccbd..0138dfcdb1d8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_anon_name(vma)); if (*pprev) { vma = *pprev; VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); -- cgit From 78db3412833dc9c479cd17412035f216cfd01a29 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 14 Jan 2022 14:06:03 -0800 Subject: mm: add anonymous vma name refcounting While forking a process with high number (64K) of named anonymous vmas the overhead caused by strdup() is noticeable. Experiments with ARM64 Android device show up to 40% performance regression when forking a process with 64k unpopulated anonymous vmas using the max name lengths vs the same process with the same number of anonymous vmas having no name. Introduce anon_vma_name refcounted structure to avoid the overhead of copying vma names during fork() and when splitting named anonymous vmas. When a vma is duplicated, instead of copying the name we increment the refcount of this structure. Multiple vmas can point to the same anon_vma_name as long as they increment the refcount. The name member of anon_vma_name structure is assigned at structure allocation time and is never changed. If vma name changes then the refcount of the original structure is dropped, a new anon_vma_name structure is allocated to hold the new name and the vma pointer is updated to point to the new structure. With this approach the fork() performance regressions is reduced 3-4x times and with usecases using more reasonable number of VMAs (a few thousand) the regressions is not measurable. Link: https://lkml.kernel.org/r/20211019215511.3771969-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Kees Cook Cc: Al Viro Cc: Colin Cross Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: David Rientjes Cc: "Eric W. Biederman" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Glauber Cc: Johannes Weiner Cc: John Stultz Cc: Mel Gorman Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Rob Landley Cc: "Serge E. 
Hallyn" Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 9 ++++++++- mm/madvise.c | 42 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 799e2ee626b2..449b6eafc695 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -386,6 +387,12 @@ struct vm_userfaultfd_ctx { struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ +struct anon_vma_name { + struct kref kref; + /* The name needs to be at the end because it is dynamically sized. */ + char name[]; +}; + /* * This struct describes a virtual memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory @@ -437,7 +444,7 @@ struct vm_area_struct { unsigned long rb_subtree_last; } shared; /* Serialized by mmap_sem. */ - char *anon_name; + struct anon_vma_name *anon_name; }; /* diff --git a/mm/madvise.c b/mm/madvise.c index 413bbc6e40a0..c63aacbbfa78 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -64,6 +64,29 @@ static int madvise_need_mmap_write(int behavior) } #ifdef CONFIG_ANON_VMA_NAME +static struct anon_vma_name *anon_vma_name_alloc(const char *name) +{ + struct anon_vma_name *anon_name; + size_t count; + + /* Add 1 for NUL terminator at the end of the anon_name->name */ + count = strlen(name) + 1; + anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); + if (anon_name) { + kref_init(&anon_name->kref); + memcpy(anon_name->name, name, count); + } + + return anon_name; +} + +static void vma_anon_name_free(struct kref *kref) +{ + struct anon_vma_name *anon_name = + container_of(kref, struct anon_vma_name, kref); + kfree(anon_name); +} + static inline bool has_vma_anon_name(struct vm_area_struct *vma) { return !vma->vm_file && vma->anon_name; @@ -76,7 +99,7 @@ const char *vma_anon_name(struct vm_area_struct *vma) mmap_assert_locked(vma->vm_mm); - return vma->anon_name; + return vma->anon_name->name; } void dup_vma_anon_name(struct vm_area_struct *orig_vma, @@ -85,34 +108,41 @@ void dup_vma_anon_name(struct vm_area_struct *orig_vma, if (!has_vma_anon_name(orig_vma)) return; - new_vma->anon_name = kstrdup(orig_vma->anon_name, GFP_KERNEL); + kref_get(&orig_vma->anon_name->kref); + new_vma->anon_name = orig_vma->anon_name; } void free_vma_anon_name(struct vm_area_struct *vma) { + struct anon_vma_name *anon_name; + if (!has_vma_anon_name(vma)) return; - kfree(vma->anon_name); + anon_name = vma->anon_name; vma->anon_name = NULL; + kref_put(&anon_name->kref, vma_anon_name_free); } /* mmap_lock should be write-locked */ static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) { + const char *anon_name; + if (!name) { free_vma_anon_name(vma); return 0; } - if (vma->anon_name) { + anon_name = vma_anon_name(vma); + if (anon_name) { /* Same name, nothing to do here */ - if (!strcmp(name, vma->anon_name)) + if (!strcmp(name, anon_name)) return 0; free_vma_anon_name(vma); } - vma->anon_name = kstrdup(name, GFP_KERNEL); + vma->anon_name = anon_vma_name_alloc(name); if (!vma->anon_name) return -ENOMEM; -- cgit From 17fca131cee21724ee953a17c185c14e9533af5b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 14 Jan 2022 14:06:07 -0800 Subject: mm: move anon_vma declarations to linux/mm_inline.h The patch to add anonymous vma names causes a build failure in some configurations: include/linux/mm_types.h: In function 
'is_same_vma_anon_name': include/linux/mm_types.h:924:37: error: implicit declaration of function 'strcmp' [-Werror=implicit-function-declaration] 924 | return name && vma_name && !strcmp(name, vma_name); | ^~~~~~ include/linux/mm_types.h:22:1: note: 'strcmp' is defined in header ''; did you forget to '#include '? This should not really be part of linux/mm_types.h in the first place, as that header is meant to only contain structure defintions and need a minimum set of indirect includes itself. While the header clearly includes more than it should at this point, let's not make it worse by including string.h as well, which would pull in the expensive (compile-speed wise) fortify-string logic. Move the new functions into a separate header that only needs to be included in a couple of locations. Link: https://lkml.kernel.org/r/20211207125710.2503446-1-arnd@kernel.org Fixes: "mm: add a field to store names for private anonymous memory" Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Colin Cross Cc: Eric Biederman Cc: Kees Cook Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Stephen Rothwell Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 1 + fs/userfaultfd.c | 1 + include/linux/mm_inline.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 48 --------------------------------------------- kernel/fork.c | 1 + mm/madvise.c | 1 + mm/mmap.c | 1 + 7 files changed, 55 insertions(+), 48 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e6998652fd67..18f8c3acbb85 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5b2af7b82776..e26b10132d47 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index e2ec68b0515c..47d96d2647ca 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -4,6 +4,7 @@ #include #include +#include /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? @@ -135,4 +136,53 @@ static __always_inline void del_page_from_lru_list(struct page *page, { lruvec_del_folio(lruvec, page_folio(page)); } + +#ifdef CONFIG_ANON_VMA_NAME +/* + * mmap_lock should be read-locked when calling vma_anon_name() and while using + * the returned pointer. + */ +extern const char *vma_anon_name(struct vm_area_struct *vma); + +/* + * mmap_lock should be read-locked for orig_vma->vm_mm. + * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be + * isolated. + */ +extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma); + +/* + * mmap_lock should be write-locked or vma should have been isolated under + * write-locked mmap_lock protection. 
+ */ +extern void free_vma_anon_name(struct vm_area_struct *vma); + +/* mmap_lock should be read-locked */ +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + const char *vma_name = vma_anon_name(vma); + + /* either both NULL, or pointers to same string */ + if (vma_name == name) + return true; + + return name && vma_name && !strcmp(name, vma_name); +} +#else /* CONFIG_ANON_VMA_NAME */ +static inline const char *vma_anon_name(struct vm_area_struct *vma) +{ + return NULL; +} +static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_vma_anon_name(struct vm_area_struct *vma) {} +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + return true; +} +#endif /* CONFIG_ANON_VMA_NAME */ + #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 449b6eafc695..4d5fb84eed5e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -890,52 +890,4 @@ typedef struct { unsigned long val; } swp_entry_t; -#ifdef CONFIG_ANON_VMA_NAME -/* - * mmap_lock should be read-locked when calling vma_anon_name() and while using - * the returned pointer. - */ -extern const char *vma_anon_name(struct vm_area_struct *vma); - -/* - * mmap_lock should be read-locked for orig_vma->vm_mm. - * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be - * isolated. - */ -extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma); - -/* - * mmap_lock should be write-locked or vma should have been isolated under - * write-locked mmap_lock protection. - */ -extern void free_vma_anon_name(struct vm_area_struct *vma); - -/* mmap_lock should be read-locked */ -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) -{ - const char *vma_name = vma_anon_name(vma); - - /* either both NULL, or pointers to same string */ - if (vma_name == name) - return true; - - return name && vma_name && !strcmp(name, vma_name); -} -#else /* CONFIG_ANON_VMA_NAME */ -static inline const char *vma_anon_name(struct vm_area_struct *vma) -{ - return NULL; -} -static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma) {} -static inline void free_vma_anon_name(struct vm_area_struct *vma) {} -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) -{ - return true; -} -#endif /* CONFIG_ANON_VMA_NAME */ - #endif /* _LINUX_MM_TYPES_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 4cf20b5f2da3..75737e566441 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/madvise.c b/mm/madvise.c index c63aacbbfa78..5604064df464 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/mmap.c b/mm/mmap.c index 85edb0011453..77733b113c40 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include -- cgit From 36090def7bad06a6346f86a7cfdbfda2d138cb64 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 14 Jan 2022 14:06:10 -0800 Subject: mm: move tlb_flush_pending inline helpers to mm_inline.h linux/mm_types.h should only define structure definitions, to make it cheap to include elsewhere. 
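As a rough sketch of the layering this aims at (the foo names below are illustrative, not from the patch): a file that merely embeds mm structures keeps including only the cheap linux/mm_types.h, while a file that actually calls the inline helpers includes linux/mm_inline.h.

    /* foo.h: only embeds mm types, so the cheap header is sufficient */
    #include <linux/mm_types.h>

    struct foo {
        struct mm_struct *mm;
    };

    /* foo.c: calls the helpers, so it pulls in mm_inline.h as well */
    #include <linux/mm_inline.h>
    #include "foo.h"

    static inline bool foo_tlb_flush_pending(struct foo *f)
    {
        return mm_tlb_flush_pending(f->mm);
    }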
The atomic_t helper function definitions are particularly large, so it's better to move the helpers using those into the existing linux/mm_inline.h and only include that where needed. As a follow-up, we may want to go through all the indirect includes in mm_types.h and reduce them as much as possible. Link: https://lkml.kernel.org/r/20211207125710.2503446-2-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Stephen Rothwell Cc: Suren Baghdasaryan Cc: Colin Cross Cc: Kees Cook Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Yu Zhao Cc: Vlastimil Babka Cc: Matthew Wilcox (Oracle) Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 2 +- include/linux/mm.h | 45 -------------- include/linux/mm_inline.h | 86 +++++++++++++++++++++++++++ include/linux/mm_types.h | 129 ++++++++++++++--------------------------- mm/ksm.c | 1 + mm/mapping_dirty_helpers.c | 1 + mm/memory.c | 1 + mm/mmu_gather.c | 1 + mm/pgtable-generic.c | 1 + 9 files changed, 137 insertions(+), 130 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 448cd01eb3ec..5196958aa6ac 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -752,7 +752,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a) return true; if ((pte_flags(a) & _PAGE_PROTNONE) && - mm_tlb_flush_pending(mm)) + atomic_read(&mm->tlb_flush_pending)) return true; return false; diff --git a/include/linux/mm.h b/include/linux/mm.h index 7000442984b9..c17e5cfc1e47 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -424,51 +424,6 @@ extern unsigned int kobjsize(const void *objp); */ extern pgprot_t protection_map[16]; -/** - * enum fault_flag - Fault flag definitions. - * @FAULT_FLAG_WRITE: Fault was a write fault. - * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. - * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. - * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. - * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. - * @FAULT_FLAG_TRIED: The fault has been tried once. - * @FAULT_FLAG_USER: The fault originated in userspace. - * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. - * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. - * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. - * - * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify - * whether we would allow page faults to retry by specifying these two - * fault flags correctly. Currently there can be three legal combinations: - * - * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and - * this is the first try - * - * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and - * we've already tried at least once - * - * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry - * - * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never - * be used. Note that page faults can be allowed to retry for multiple times, - * in which case we'll have an initial fault with flags (a) then later on - * continuous faults with flags (b). We should always try to detect pending - * signals before a retry to make sure the continuous page faults can still be - * interrupted if necessary. 
- */ -enum fault_flag { - FAULT_FLAG_WRITE = 1 << 0, - FAULT_FLAG_MKWRITE = 1 << 1, - FAULT_FLAG_ALLOW_RETRY = 1 << 2, - FAULT_FLAG_RETRY_NOWAIT = 1 << 3, - FAULT_FLAG_KILLABLE = 1 << 4, - FAULT_FLAG_TRIED = 1 << 5, - FAULT_FLAG_USER = 1 << 6, - FAULT_FLAG_REMOTE = 1 << 7, - FAULT_FLAG_INSTRUCTION = 1 << 8, - FAULT_FLAG_INTERRUPTIBLE = 1 << 9, -}; - /* * The default fault flags that should be used by most of the * arch-specific page fault handlers. diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 47d96d2647ca..b725839dfe71 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -2,6 +2,7 @@ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H +#include #include #include #include @@ -185,4 +186,89 @@ static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, } #endif /* CONFIG_ANON_VMA_NAME */ +static inline void init_tlb_flush_pending(struct mm_struct *mm) +{ + atomic_set(&mm->tlb_flush_pending, 0); +} + +static inline void inc_tlb_flush_pending(struct mm_struct *mm) +{ + atomic_inc(&mm->tlb_flush_pending); + /* + * The only time this value is relevant is when there are indeed pages + * to flush. And we'll only flush pages after changing them, which + * requires the PTL. + * + * So the ordering here is: + * + * atomic_inc(&mm->tlb_flush_pending); + * spin_lock(&ptl); + * ... + * set_pte_at(); + * spin_unlock(&ptl); + * + * spin_lock(&ptl) + * mm_tlb_flush_pending(); + * .... + * spin_unlock(&ptl); + * + * flush_tlb_range(); + * atomic_dec(&mm->tlb_flush_pending); + * + * Where the increment if constrained by the PTL unlock, it thus + * ensures that the increment is visible if the PTE modification is + * visible. After all, if there is no PTE modification, nobody cares + * about TLB flushes either. + * + * This very much relies on users (mm_tlb_flush_pending() and + * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and + * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc + * locks (PPC) the unlock of one doesn't order against the lock of + * another PTL. + * + * The decrement is ordered by the flush_tlb_range(), such that + * mm_tlb_flush_pending() will not return false unless all flushes have + * completed. + */ +} + +static inline void dec_tlb_flush_pending(struct mm_struct *mm) +{ + /* + * See inc_tlb_flush_pending(). + * + * This cannot be smp_mb__before_atomic() because smp_mb() simply does + * not order against TLB invalidate completion, which is what we need. + * + * Therefore we must rely on tlb_flush_*() to guarantee order. + */ + atomic_dec(&mm->tlb_flush_pending); +} + +static inline bool mm_tlb_flush_pending(struct mm_struct *mm) +{ + /* + * Must be called after having acquired the PTL; orders against that + * PTLs release and therefore ensures that if we observe the modified + * PTE we must also observe the increment from inc_tlb_flush_pending(). + * + * That is, it only guarantees to return true if there is a flush + * pending for _this_ PTL. + */ + return atomic_read(&mm->tlb_flush_pending); +} + +static inline bool mm_tlb_flush_nested(struct mm_struct *mm) +{ + /* + * Similar to mm_tlb_flush_pending(), we must have acquired the PTL + * for which there is a TLB flush pending in order to guarantee + * we've seen both that PTE modification and the increment. 
+ * + * (no requirement on actually still holding the PTL, that is irrelevant) + */ + return atomic_read(&mm->tlb_flush_pending) > 1; +} + + #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4d5fb84eed5e..6a89f128c990 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -692,90 +692,6 @@ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_finish_mmu(struct mmu_gather *tlb); -static inline void init_tlb_flush_pending(struct mm_struct *mm) -{ - atomic_set(&mm->tlb_flush_pending, 0); -} - -static inline void inc_tlb_flush_pending(struct mm_struct *mm) -{ - atomic_inc(&mm->tlb_flush_pending); - /* - * The only time this value is relevant is when there are indeed pages - * to flush. And we'll only flush pages after changing them, which - * requires the PTL. - * - * So the ordering here is: - * - * atomic_inc(&mm->tlb_flush_pending); - * spin_lock(&ptl); - * ... - * set_pte_at(); - * spin_unlock(&ptl); - * - * spin_lock(&ptl) - * mm_tlb_flush_pending(); - * .... - * spin_unlock(&ptl); - * - * flush_tlb_range(); - * atomic_dec(&mm->tlb_flush_pending); - * - * Where the increment if constrained by the PTL unlock, it thus - * ensures that the increment is visible if the PTE modification is - * visible. After all, if there is no PTE modification, nobody cares - * about TLB flushes either. - * - * This very much relies on users (mm_tlb_flush_pending() and - * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and - * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc - * locks (PPC) the unlock of one doesn't order against the lock of - * another PTL. - * - * The decrement is ordered by the flush_tlb_range(), such that - * mm_tlb_flush_pending() will not return false unless all flushes have - * completed. - */ -} - -static inline void dec_tlb_flush_pending(struct mm_struct *mm) -{ - /* - * See inc_tlb_flush_pending(). - * - * This cannot be smp_mb__before_atomic() because smp_mb() simply does - * not order against TLB invalidate completion, which is what we need. - * - * Therefore we must rely on tlb_flush_*() to guarantee order. - */ - atomic_dec(&mm->tlb_flush_pending); -} - -static inline bool mm_tlb_flush_pending(struct mm_struct *mm) -{ - /* - * Must be called after having acquired the PTL; orders against that - * PTLs release and therefore ensures that if we observe the modified - * PTE we must also observe the increment from inc_tlb_flush_pending(). - * - * That is, it only guarantees to return true if there is a flush - * pending for _this_ PTL. - */ - return atomic_read(&mm->tlb_flush_pending); -} - -static inline bool mm_tlb_flush_nested(struct mm_struct *mm) -{ - /* - * Similar to mm_tlb_flush_pending(), we must have acquired the PTL - * for which there is a TLB flush pending in order to guarantee - * we've seen both that PTE modification and the increment. - * - * (no requirement on actually still holding the PTL, that is irrelevant) - */ - return atomic_read(&mm->tlb_flush_pending) > 1; -} - struct vm_fault; /** @@ -890,4 +806,49 @@ typedef struct { unsigned long val; } swp_entry_t; +/** + * enum fault_flag - Fault flag definitions. + * @FAULT_FLAG_WRITE: Fault was a write fault. + * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. + * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. + * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. 
+ * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. + * @FAULT_FLAG_TRIED: The fault has been tried once. + * @FAULT_FLAG_USER: The fault originated in userspace. + * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. + * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. + * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. + * + * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify + * whether we would allow page faults to retry by specifying these two + * fault flags correctly. Currently there can be three legal combinations: + * + * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and + * this is the first try + * + * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and + * we've already tried at least once + * + * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry + * + * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never + * be used. Note that page faults can be allowed to retry for multiple times, + * in which case we'll have an initial fault with flags (a) then later on + * continuous faults with flags (b). We should always try to detect pending + * signals before a retry to make sure the continuous page faults can still be + * interrupted if necessary. + */ +enum fault_flag { + FAULT_FLAG_WRITE = 1 << 0, + FAULT_FLAG_MKWRITE = 1 << 1, + FAULT_FLAG_ALLOW_RETRY = 1 << 2, + FAULT_FLAG_RETRY_NOWAIT = 1 << 3, + FAULT_FLAG_KILLABLE = 1 << 4, + FAULT_FLAG_TRIED = 1 << 5, + FAULT_FLAG_USER = 1 << 6, + FAULT_FLAG_REMOTE = 1 << 7, + FAULT_FLAG_INSTRUCTION = 1 << 8, + FAULT_FLAG_INTERRUPTIBLE = 1 << 9, +}; + #endif /* _LINUX_MM_TYPES_H */ diff --git a/mm/ksm.c b/mm/ksm.c index 0662093237e4..f34476ac0a41 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index ea734f248fce..1b0ab8fcfd8b 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include diff --git a/mm/memory.c b/mm/memory.c index 8f1de811a1dc..bc80d4effac9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -41,6 +41,7 @@ #include #include +#include #include #include #include diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 1b9837419bf9..afb7185ffdc4 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 4e640baf9794..6523fda274e5 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -10,6 +10,7 @@ #include #include #include +#include #include /* -- cgit From 64591e8605d6e2fba2ff38e3227645f039b8893f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 14 Jan 2022 14:06:14 -0800 Subject: mm: protect free_pgtables with mmap_lock write lock in exit_mmap oom-reaper and process_mrelease system call should protect against races with exit_mmap which can destroy page tables while they walk the VMA tree. oom-reaper protects from that race by setting MMF_OOM_VICTIM and by relying on exit_mmap to set MMF_OOM_SKIP before taking and releasing mmap_write_lock. process_mrelease has to elevate mm->mm_users to prevent such race. Both oom-reaper and process_mrelease hold mmap_read_lock when walking the VMA tree. 
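For orientation, here is a condensed sketch of the locking arrangement this patch moves to, heavily simplified from the diff below (it is not the literal kernel code; error handling and the OOM-victim special cases are omitted):

    /* Reaper side (oom-reaper, process_mrelease()): the read lock suffices. */
    static void reap_side(struct mm_struct *mm)
    {
        mmap_read_lock(mm);
        if (!test_bit(MMF_OOM_SKIP, &mm->flags))
            __oom_reap_task_mm(mm);  /* walks the VMA tree; never frees page tables */
        mmap_read_unlock(mm);
    }

    /* Exit side (exit_mmap() after this patch): destructive work under the write lock. */
    static void exit_side(struct mm_struct *mm)
    {
        mmap_write_lock(mm);         /* excludes the reader above */
        if (mm->locked_vm)
            unlock_range(mm->mmap, ULONG_MAX);
        /* unmap_vmas(), free_pgtables(), remove_vma() loop ... */
        mmap_write_unlock(mm);
    }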
The locking rules and mechanisms could be simpler if exit_mmap takes mmap_write_lock while executing destructive operations such as free_pgtables. Change exit_mmap to hold the mmap_write_lock when calling unlock_range, free_pgtables and remove_vma. Note also that because oom-reaper checks VM_LOCKED flag, unlock_range() should not be allowed to race with it. Before this patch, remove_vma used to be called with no locks held, however with fput being executed asynchronously and vm_ops->close not being allowed to hold mmap_lock (it is called from __split_vma with mmap_sem held for write), changing that should be fine. In most cases this lock should be uncontended. Previously, Kirill reported ~4% regression caused by a similar change [1]. We reran the same test and although the individual results are quite noisy, the percentiles show lower regression with 1.6% being the worst case [2]. The change allows oom-reaper and process_mrelease to execute safely under mmap_read_lock without worries that exit_mmap might destroy page tables from under them. [1] https://lore.kernel.org/all/20170725141723.ivukwhddk2voyhuc@node.shutemov.name/ [2] https://lore.kernel.org/all/CAJuCfpGC9-c9P40x7oy=jy5SphMcd0o0G_6U1-+JAziGKG6dGA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20211209191325.3069345-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Cc: David Rientjes Cc: Matthew Wilcox Cc: Johannes Weiner Cc: Roman Gushchin Cc: Rik van Riel Cc: Minchan Kim Cc: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Christian Brauner Cc: Christoph Hellwig Cc: Oleg Nesterov Cc: David Hildenbrand Cc: Jann Horn Cc: Shakeel Butt Cc: Andy Lutomirski Cc: Christian Brauner Cc: Florian Weimer Cc: Jan Engelhardt Cc: Tim Murray Cc: Jason Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 77733b113c40..3f48d0928e6b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3149,25 +3149,27 @@ void exit_mmap(struct mm_struct *mm) * to mmu_notifier_release(mm) ensures mmu notifier callbacks in * __oom_reap_task_mm() will not block. * - * This needs to be done before calling munlock_vma_pages_all(), + * This needs to be done before calling unlock_range(), * which clears VM_LOCKED, otherwise the oom reaper cannot * reliably test it. */ (void)__oom_reap_task_mm(mm); set_bit(MMF_OOM_SKIP, &mm->flags); - mmap_write_lock(mm); - mmap_write_unlock(mm); } + mmap_write_lock(mm); if (mm->locked_vm) unlock_range(mm->mmap, ULONG_MAX); arch_exit_mmap(mm); vma = mm->mmap; - if (!vma) /* Can happen if dup_mmap() received an OOM */ + if (!vma) { + /* Can happen if dup_mmap() received an OOM */ + mmap_write_unlock(mm); return; + } lru_add_drain(); flush_cache_mm(mm); @@ -3178,16 +3180,14 @@ void exit_mmap(struct mm_struct *mm) free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); - /* - * Walk the list again, actually closing and freeing it, - * with preemption enabled, without holding any MM locks. - */ + /* Walk the list again, actually closing and freeing it. 
*/ while (vma) { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); vma = remove_vma(vma); cond_resched(); } + mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } -- cgit From cc6dcfee72509868271d42919a3c1081b6b0dc7e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 14 Jan 2022 14:06:18 -0800 Subject: mm: document locking restrictions for vm_operations_struct::close Add comments for vm_operations_struct::close documenting locking requirements for this callback and its callers. Link: https://lkml.kernel.org/r/20211209191325.3069345-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Christian Brauner Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Rientjes Cc: Florian Weimer Cc: Jan Engelhardt Cc: Jann Horn Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Minchan Kim Cc: Oleg Nesterov Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tim Murray Cc: Jason Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index c17e5cfc1e47..4d7245e6802a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -532,6 +532,10 @@ enum page_entry_size { */ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); + /** + * @close: Called when the VMA is being removed from the MM. + * Context: User context. May sleep. Caller holds mmap_lock. + */ void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); -- cgit From ba535c1caf3ee78aa7719e9e4b07a0dc1d153b9e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 14 Jan 2022 14:06:22 -0800 Subject: mm/oom_kill: allow process_mrelease to run under mmap_lock protection With exit_mmap holding mmap_write_lock during free_pgtables call, process_mrelease does not need to elevate mm->mm_users in order to prevent exit_mmap from destrying pagetables while __oom_reap_task_mm is walking the VMA tree. The change prevents process_mrelease from calling the last mmput, which can lead to waiting for IO completion in exit_aio. Link: https://lkml.kernel.org/r/20211209191325.3069345-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Reviewed-by: Jason Gunthorpe Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Christian Brauner Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Rientjes Cc: Florian Weimer Cc: Jan Engelhardt Cc: Jann Horn Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Matthew Wilcox Cc: Minchan Kim Cc: Oleg Nesterov Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e52ce0b1465d..3390316c8a32 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1170,15 +1170,15 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) goto put_task; } - if (mmget_not_zero(p->mm)) { - mm = p->mm; - if (task_will_free_mem(p)) - reap = true; - else { - /* Error only if the work has not been done already */ - if (!test_bit(MMF_OOM_SKIP, &mm->flags)) - ret = -EINVAL; - } + mm = p->mm; + mmgrab(mm); + + if (task_will_free_mem(p)) + reap = true; + else { + /* Error only if the work has not been done already */ + if (!test_bit(MMF_OOM_SKIP, &mm->flags)) + ret = -EINVAL; } task_unlock(p); @@ -1189,13 +1189,16 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) ret = -EINTR; goto drop_mm; } - if (!__oom_reap_task_mm(mm)) + /* + * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure + * possible change in exit_mmap is seen + */ + if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm)) ret = -EAGAIN; mmap_read_unlock(mm); drop_mm: - if (mm) - mmput(mm); + mmdrop(mm); put_task: put_task_struct(task); return ret; -- cgit From 4b8fec2867c85e081c1c9f800e0ec82eff71134f Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Fri, 14 Jan 2022 14:06:26 -0800 Subject: docs/vm: add vmalloced-kernel-stacks document Add a new document to explain Virtually Mapped Kernel Stack Support. This is a compilation of information from the code and original patch series that introduced the Virtually Mapped Kernel Stacks feature. This document summarizes the feature and provides details on allocation, free, and stack overflow handling. Provides reference to available tests. Link: https://lkml.kernel.org/r/20211215002004.47981-1-skhan@linuxfoundation.org Signed-off-by: Shuah Khan Cc: Jonathan Corbet Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/index.rst | 1 + Documentation/vm/vmalloced-kernel-stacks.rst | 153 +++++++++++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 Documentation/vm/vmalloced-kernel-stacks.rst diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst index 6f5ffef4b716..b1826ca2c576 100644 --- a/Documentation/vm/index.rst +++ b/Documentation/vm/index.rst @@ -36,5 +36,6 @@ algorithms. If you are looking for advice on simply allocating memory, see the split_page_table_lock transhuge unevictable-lru + vmalloced-kernel-stacks z3fold zsmalloc diff --git a/Documentation/vm/vmalloced-kernel-stacks.rst b/Documentation/vm/vmalloced-kernel-stacks.rst new file mode 100644 index 000000000000..fc8c67833af6 --- /dev/null +++ b/Documentation/vm/vmalloced-kernel-stacks.rst @@ -0,0 +1,153 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================== +Virtually Mapped Kernel Stack Support +===================================== + +:Author: Shuah Khan + +.. contents:: :local: + +Overview +-------- + +This is a compilation of information from the code and original patch +series that introduced the `Virtually Mapped Kernel Stacks feature +` + +Introduction +------------ + +Kernel stack overflows are often hard to debug and make the kernel +susceptible to exploits. 
Problems could show up at a later time making +it difficult to isolate and root-cause. + +Virtually-mapped kernel stacks with guard pages causes kernel stack +overflows to be caught immediately rather than causing difficult to +diagnose corruptions. + +HAVE_ARCH_VMAP_STACK and VMAP_STACK configuration options enable +support for virtually mapped stacks with guard pages. This feature +causes reliable faults when the stack overflows. The usability of +the stack trace after overflow and response to the overflow itself +is architecture dependent. + +.. note:: + As of this writing, arm64, powerpc, riscv, s390, um, and x86 have + support for VMAP_STACK. + +HAVE_ARCH_VMAP_STACK +-------------------- + +Architectures that can support Virtually Mapped Kernel Stacks should +enable this bool configuration option. The requirements are: + +- vmalloc space must be large enough to hold many kernel stacks. This + may rule out many 32-bit architectures. +- Stacks in vmalloc space need to work reliably. For example, if + vmap page tables are created on demand, either this mechanism + needs to work while the stack points to a virtual address with + unpopulated page tables or arch code (switch_to() and switch_mm(), + most likely) needs to ensure that the stack's page table entries + are populated before running on a possibly unpopulated stack. +- If the stack overflows into a guard page, something reasonable + should happen. The definition of "reasonable" is flexible, but + instantly rebooting without logging anything would be unfriendly. + +VMAP_STACK +---------- + +VMAP_STACK bool configuration option when enabled allocates virtually +mapped task stacks. This option depends on HAVE_ARCH_VMAP_STACK. + +- Enable this if you want the use virtually-mapped kernel stacks + with guard pages. This causes kernel stack overflows to be caught + immediately rather than causing difficult-to-diagnose corruption. + +.. note:: + + Using this feature with KASAN requires architecture support + for backing virtual mappings with real shadow memory, and + KASAN_VMALLOC must be enabled. + +.. note:: + + VMAP_STACK is enabled, it is not possible to run DMA on stack + allocated data. + +Kernel configuration options and dependencies keep changing. Refer to +the latest code base: + +`Kconfig ` + +Allocation +----------- + +When a new kernel thread is created, thread stack is allocated from +virtually contiguous memory pages from the page level allocator. These +pages are mapped into contiguous kernel virtual space with PAGE_KERNEL +protections. + +alloc_thread_stack_node() calls __vmalloc_node_range() to allocate stack +with PAGE_KERNEL protections. + +- Allocated stacks are cached and later reused by new threads, so memcg + accounting is performed manually on assigning/releasing stacks to tasks. + Hence, __vmalloc_node_range is called without __GFP_ACCOUNT. +- vm_struct is cached to be able to find when thread free is initiated + in interrupt context. free_thread_stack() can be called in interrupt + context. +- On arm64, all VMAP's stacks need to have the same alignment to ensure + that VMAP'd stack overflow detection works correctly. Arch specific + vmap stack allocator takes care of this detail. +- This does not address interrupt stacks - according to the original patch + +Thread stack allocation is initiated from clone(), fork(), vfork(), +kernel_thread() via kernel_clone(). Leaving a few hints for searching +the code base to understand when and how thread stack is allocated. + +Bulk of the code is in: +`kernel/fork.c `. 
The stack_vm_area pointer in task_struct keeps track of the virtually allocated +stack, and a non-null stack_vm_area pointer serves as an indication that +virtually mapped kernel stacks are enabled. + +:: + + struct vm_struct *stack_vm_area; + +Stack overflow handling +----------------------- + +Leading and trailing guard pages help detect stack overflows. When the stack +overflows into the guard pages, handlers have to be careful not to overflow +the stack again. When handlers are called, it is likely that very little +stack space is left. + +On x86, this is done by handling the page fault indicating the kernel +stack overflow on the double-fault stack. + +Testing VMAP allocation with guard pages +---------------------------------------- + +How do we ensure that VMAP_STACK is actually allocating stacks with a leading +and trailing guard page? The following lkdtm tests can help detect any +regressions. + +:: + + void lkdtm_STACK_GUARD_PAGE_LEADING() + void lkdtm_STACK_GUARD_PAGE_TRAILING() + +Conclusions +----------- + +- A percpu cache of vmalloced stacks appears to be a bit faster than a + high-order stack allocation, at least when the cache hits. +- THREAD_INFO_IN_TASK gets rid of arch-specific thread_info entirely and + simply embeds the thread_info (containing only flags) and 'int cpu' into + task_struct. +- The thread stack can be freed as soon as the task is dead (without + waiting for RCU) and then, if vmapped stacks are in use, cache the + entire stack for reuse on the same cpu. -- cgit From 1eba86c096e35e3cc83de1ad2c26f2d70470211b Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Jan 2022 14:06:29 -0800 Subject: mm: change page type prior to adding page table entry Patch series "page table check", v3. Ensure that some memory corruptions are prevented by checking at the time of insertion of entries into user page tables that there is no illegal sharing. We have recently found a problem [1] that has existed in the kernel since 4.14. The problem was caused by a broken page ref count and led to memory leaking from one process into another. The problem was accidentally detected by studying a dump of one process and noticing that one page contains memory that should not belong to this process. There are some other page->_refcount related problems that were recently fixed: [2], [3], which could potentially also lead to illegal sharing. In addition to hardening refcount [4] itself, this work is an attempt to prevent this class of memory corruption issues. It uses a simple state machine that is independent of regular MM logic to check for illegal sharing at the time pages are inserted into and removed from page tables. [1] https://lore.kernel.org/all/xr9335nxwc5y.fsf@gthelen2.svl.corp.google.com [2] https://lore.kernel.org/all/1582661774-30925-2-git-send-email-akaher@vmware.com [3] https://lore.kernel.org/all/20210622021423.154662-3-mike.kravetz@oracle.com [4] https://lore.kernel.org/all/20211221150140.988298-1-pasha.tatashin@soleen.com This patch (of 4): There are a few places where we first update the entry in the user page table, and later change the struct page to indicate that this is an anonymous or file page. In most places, however, we first configure the page metadata and then insert entries into the page table. Page table check will use the information from struct page to verify the type of entry being inserted. Change the order in all places to first update struct page, and later to update the page table.
This means that we first do calls that may change the type of page (anon or file): page_move_anon_rmap page_add_anon_rmap do_page_add_anon_rmap page_add_new_anon_rmap page_add_file_rmap hugepage_add_anon_rmap hugepage_add_new_anon_rmap And after that do calls that add entries to the page table: set_huge_pte_at set_pte_at Link: https://lkml.kernel.org/r/20211221154650.1047963-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20211221154650.1047963-2-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: David Rientjes Cc: Paul Turner Cc: Wei Xu Cc: Greg Thelen Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Will Deacon Cc: Mike Rapoport Cc: Kees Cook Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Masahiro Yamada Cc: Sami Tolvanen Cc: Dave Hansen Cc: Frederic Weisbecker Cc: "H. Peter Anvin" Cc: Aneesh Kumar K.V Cc: Jiri Slaby Cc: Muchun Song Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 +++--- mm/memory.c | 9 +++++---- mm/migrate.c | 5 ++--- mm/swapfile.c | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a1baa198519a..61895cc01d09 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4684,8 +4684,8 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr struct page *new_page) { __SetPageUptodate(new_page); - set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); hugepage_add_new_anon_rmap(new_page, vma, addr); + set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); ClearHPageRestoreReserve(new_page); SetHPageMigratable(new_page); @@ -5259,10 +5259,10 @@ retry_avoidcopy: /* Break COW */ huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); - set_huge_pte_at(mm, haddr, ptep, - make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page, true); hugepage_add_new_anon_rmap(new_page, vma, haddr); + set_huge_pte_at(mm, haddr, ptep, + make_huge_pte(vma, new_page, 1)); SetHPageMigratable(new_page); /* Make the old page be freed below */ new_page = old_page; diff --git a/mm/memory.c b/mm/memory.c index bc80d4effac9..5fea331b1560 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -720,8 +720,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, else if (is_writable_device_exclusive_entry(entry)) pte = maybe_mkwrite(pte_mkdirty(pte), vma); - set_pte_at(vma->vm_mm, address, ptep, pte); - /* * No need to take a page reference as one was already * created when the swap entry was made. 
@@ -735,6 +733,8 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, */ WARN_ON_ONCE(!PageAnon(page)); + set_pte_at(vma->vm_mm, address, ptep, pte); + if (vma->vm_flags & VM_LOCKED) mlock_vma_page(page); @@ -3640,8 +3640,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) pte = pte_mkuffd_wp(pte); pte = pte_wrprotect(pte); } - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); - arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); vmf->orig_pte = pte; /* ksm created a completely new copy */ @@ -3652,6 +3650,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) do_page_add_anon_rmap(page, vma, vmf->address, exclusive); } + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); + swap_free(entry); if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) diff --git a/mm/migrate.c b/mm/migrate.c index cf25b00f03c8..6aa4b5326784 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -236,20 +236,19 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, pte = pte_mkhuge(pte); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); - set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); if (PageAnon(new)) hugepage_add_anon_rmap(new, vma, pvmw.address); else page_dup_rmap(new, true); + set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } else #endif { - set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); - if (PageAnon(new)) page_add_anon_rmap(new, vma, pvmw.address, false); else page_add_file_rmap(new, false); + set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); diff --git a/mm/swapfile.c b/mm/swapfile.c index e59e08ef46e1..e64207e2ef1d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1917,14 +1917,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); - set_pte_at(vma->vm_mm, addr, pte, - pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr, false); lru_cache_add_inactive_or_unevictable(page, vma); } + set_pte_at(vma->vm_mm, addr, pte, + pte_mkold(mk_pte(page, vma->vm_page_prot))); swap_free(entry); out: pte_unmap_unlock(pte, ptl); -- cgit From 08d5b29eac7dd5e6c79b66d390ecbb9219e05931 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Jan 2022 14:06:33 -0800 Subject: mm: ptep_clear() page table helper We have ptep_get_and_clear() and ptep_get_and_clear_full() helpers to clear PTE from user page tables, but there is no variant for simple clear of a present PTE from user page tables without using a low level pte_clear() which can be either native or para-virtualised. Add a new ptep_clear() that can be used in common code to clear PTEs from page table. We will need this call later in order to add a hook for page table check. Link: https://lkml.kernel.org/r/20211221154650.1047963-3-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: Aneesh Kumar K.V Cc: Dave Hansen Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: "H. 
Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Kees Cook Cc: Masahiro Yamada Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Peter Zijlstra Cc: Sami Tolvanen Cc: Thomas Gleixner Cc: Wei Xu Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/arch_pgtable_helpers.rst | 6 ++++-- include/linux/pgtable.h | 8 ++++++++ mm/debug_vm_pgtable.c | 2 +- mm/khugepaged.c | 12 ++---------- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/Documentation/vm/arch_pgtable_helpers.rst b/Documentation/vm/arch_pgtable_helpers.rst index b3166c33db39..f8b225fc9190 100644 --- a/Documentation/vm/arch_pgtable_helpers.rst +++ b/Documentation/vm/arch_pgtable_helpers.rst @@ -66,9 +66,11 @@ PTE Page Table Helpers +---------------------------+--------------------------------------------------+ | pte_mknotpresent | Invalidates a mapped PTE | +---------------------------+--------------------------------------------------+ -| ptep_get_and_clear | Clears a PTE | +| ptep_clear | Clears a PTE | +---------------------------+--------------------------------------------------+ -| ptep_get_and_clear_full | Clears a PTE | +| ptep_get_and_clear | Clears and returns PTE | ++---------------------------+--------------------------------------------------+ +| ptep_get_and_clear_full | Clears and returns PTE (batched PTE unmap) | +---------------------------+--------------------------------------------------+ | ptep_test_and_clear_young | Clears young from a PTE | +---------------------------+--------------------------------------------------+ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e24d2c992b11..bc8713a76e03 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -258,6 +258,14 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef __HAVE_ARCH_PTEP_CLEAR +static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pte_clear(mm, addr, ptep); +} +#endif + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 2a2b24e87877..a7ac97c76762 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -652,7 +652,7 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args) set_pte_at(args->mm, args->vaddr, args->ptep, pte); flush_dcache_page(page); barrier(); - pte_clear(args->mm, args->vaddr, args->ptep); + ptep_clear(args->mm, args->vaddr, args->ptep); pte = ptep_get(args->ptep); WARN_ON(!pte_none(pte)); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e99101162f1a..9d40dd8890e5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -756,11 +756,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * ptl mostly unnecessary. */ spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. - */ - pte_clear(vma->vm_mm, address, _pte); + ptep_clear(vma->vm_mm, address, _pte); spin_unlock(ptl); } } else { @@ -774,11 +770,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * inside page_remove_rmap(). */ spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. 
- */ - pte_clear(vma->vm_mm, address, _pte); + ptep_clear(vma->vm_mm, address, _pte); page_remove_rmap(src_page, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); -- cgit From df4e817b710809425d899340dbfa8504a3ca4ba5 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Jan 2022 14:06:37 -0800 Subject: mm: page table check Check user page table entries at the time they are added and removed. This allows synchronously catching memory corruption issues related to double mapping. When a pte for an anonymous page is added into a page table, we verify that this pte does not already point to a file backed page; vice versa, if a file backed page is being added, we verify that this page does not have an anonymous mapping. We also enforce that read-only sharing for anonymous pages is allowed (i.e. cow after fork). All other sharing must be for file pages. Page table check allows protecting and debugging cases where "struct page" metadata became corrupted for some reason, for example when refcnt or mapcount become invalid. Link: https://lkml.kernel.org/r/20211221154650.1047963-4-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: Aneesh Kumar K.V Cc: Dave Hansen Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Kees Cook Cc: Masahiro Yamada Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Peter Zijlstra Cc: Sami Tolvanen Cc: Thomas Gleixner Cc: Wei Xu Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/index.rst | 1 + Documentation/vm/page_table_check.rst | 56 +++++++ MAINTAINERS | 9 ++ arch/Kconfig | 3 + include/linux/page_table_check.h | 147 ++++++++++++++++++ mm/Kconfig.debug | 24 +++ mm/Makefile | 1 + mm/page_alloc.c | 4 + mm/page_ext.c | 4 + mm/page_table_check.c | 270 ++++++++++++++++++++++++++++++++++ 10 files changed, 519 insertions(+) create mode 100644 Documentation/vm/page_table_check.rst create mode 100644 include/linux/page_table_check.h create mode 100644 mm/page_table_check.c diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst index b1826ca2c576..932440805453 100644 --- a/Documentation/vm/index.rst +++ b/Documentation/vm/index.rst @@ -31,6 +31,7 @@ algorithms. If you are looking for advice on simply allocating memory, see the page_migration page_frags page_owner + page_table_check remap_file_pages slub split_page_table_lock diff --git a/Documentation/vm/page_table_check.rst b/Documentation/vm/page_table_check.rst new file mode 100644 index 000000000000..81f521ff7ea7 --- /dev/null +++ b/Documentation/vm/page_table_check.rst @@ -0,0 +1,56 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. _page_table_check: + +================ +Page Table Check +================ + +Introduction +============ + +Page table check allows hardening the kernel by ensuring that some types of +memory corruption are prevented. + +Page table check performs extra verifications at the time when new pages become +accessible from userspace by getting their page table entries (PTEs, PMDs, +etc.) added into the table. + +In case of detected corruption, the kernel is crashed. There is a small +performance and memory overhead associated with the page table check. Therefore, +it is disabled by default, but can be optionally enabled on systems where the +extra hardening outweighs the performance costs.
Also, because page table check +is synchronous, it can help with debugging double map memory corruption issues, +by crashing kernel at the time wrong mapping occurs instead of later which is +often the case with memory corruptions bugs. + +Double mapping detection logic +============================== + ++-------------------+-------------------+-------------------+------------------+ +| Current Mapping | New mapping | Permissions | Rule | ++===================+===================+===================+==================+ +| Anonymous | Anonymous | Read | Allow | ++-------------------+-------------------+-------------------+------------------+ +| Anonymous | Anonymous | Read / Write | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Anonymous | Named | Any | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Named | Anonymous | Any | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Named | Named | Any | Allow | ++-------------------+-------------------+-------------------+------------------+ + +Enabling Page Table Check +========================= + +Build kernel with: + +- PAGE_TABLE_CHECK=y + Note, it can only be enabled on platforms where ARCH_SUPPORTS_PAGE_TABLE_CHECK + is available. + +- Boot with 'page_table_check=on' kernel parameter. + +Optionally, build kernel with PAGE_TABLE_CHECK_ENFORCED in order to have page +table support without extra kernel parameter. diff --git a/MAINTAINERS b/MAINTAINERS index dd36acc87ce6..fbdb860c0b8b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14387,6 +14387,15 @@ F: include/net/page_pool.h F: include/trace/events/page_pool.h F: net/core/page_pool.c +PAGE TABLE CHECK +M: Pasha Tatashin +M: Andrew Morton +L: linux-mm@kvack.org +S: Maintained +F: Documentation/vm/page_table_check.rst +F: include/linux/page_table_check.h +F: mm/page_table_check.c + PANASONIC LAPTOP ACPI EXTRAS DRIVER M: Kenneth Chan L: platform-driver-x86@vger.kernel.org diff --git a/arch/Kconfig b/arch/Kconfig index d3c4ab249e9c..4568b6b70b5d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1297,6 +1297,9 @@ config HAVE_ARCH_PFN_VALID config ARCH_SUPPORTS_DEBUG_PAGEALLOC bool +config ARCH_SUPPORTS_PAGE_TABLE_CHECK + bool + config ARCH_SPLIT_ARG64 bool help diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h new file mode 100644 index 000000000000..38cace1da7b6 --- /dev/null +++ b/include/linux/page_table_check.h @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2021, Google LLC. 
+ * Pasha Tatashin + */ +#ifndef __LINUX_PAGE_TABLE_CHECK_H +#define __LINUX_PAGE_TABLE_CHECK_H + +#ifdef CONFIG_PAGE_TABLE_CHECK +#include + +extern struct static_key_true page_table_check_disabled; +extern struct page_ext_operations page_table_check_ops; + +void __page_table_check_zero(struct page *page, unsigned int order); +void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t pte); +void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, + pmd_t pmd); +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud); +void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd); +void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud); + +static inline void page_table_check_alloc(struct page *page, unsigned int order) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_zero(page, order); +} + +static inline void page_table_check_free(struct page *page, unsigned int order) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_zero(page, order); +} + +static inline void page_table_check_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t pte) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pte_clear(mm, addr, pte); +} + +static inline void page_table_check_pmd_clear(struct mm_struct *mm, + unsigned long addr, pmd_t pmd) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pmd_clear(mm, addr, pmd); +} + +static inline void page_table_check_pud_clear(struct mm_struct *mm, + unsigned long addr, pud_t pud) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pud_clear(mm, addr, pud); +} + +static inline void page_table_check_pte_set(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + pte_t pte) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pte_set(mm, addr, ptep, pte); +} + +static inline void page_table_check_pmd_set(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp, + pmd_t pmd) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pmd_set(mm, addr, pmdp, pmd); +} + +static inline void page_table_check_pud_set(struct mm_struct *mm, + unsigned long addr, pud_t *pudp, + pud_t pud) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pud_set(mm, addr, pudp, pud); +} + +#else + +static inline void page_table_check_alloc(struct page *page, unsigned int order) +{ +} + +static inline void page_table_check_free(struct page *page, unsigned int order) +{ +} + +static inline void page_table_check_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t pte) +{ +} + +static inline void page_table_check_pmd_clear(struct mm_struct *mm, + unsigned long addr, pmd_t pmd) +{ +} + +static inline void page_table_check_pud_clear(struct mm_struct *mm, + unsigned long addr, pud_t pud) +{ +} + +static inline void page_table_check_pte_set(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + pte_t pte) +{ +} + +static inline void page_table_check_pmd_set(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp, + pmd_t pmd) +{ +} + +static inline void page_table_check_pud_set(struct mm_struct *mm, + unsigned long 
addr, pud_t *pudp, + pud_t pud) +{ +} + +#endif /* CONFIG_PAGE_TABLE_CHECK */ +#endif /* __LINUX_PAGE_TABLE_CHECK_H */ diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 1e73717802f8..5bd5bb097252 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -62,6 +62,30 @@ config PAGE_OWNER If unsure, say N. +config PAGE_TABLE_CHECK + bool "Check for invalid mappings in user page tables" + depends on ARCH_SUPPORTS_PAGE_TABLE_CHECK + select PAGE_EXTENSION + help + Check that anonymous page is not being mapped twice with read write + permissions. Check that anonymous and file pages are not being + erroneously shared. Since the checking is performed at the time + entries are added and removed to user page tables, leaking, corruption + and double mapping problems are detected synchronously. + + If unsure say "n". + +config PAGE_TABLE_CHECK_ENFORCED + bool "Enforce the page table checking by default" + depends on PAGE_TABLE_CHECK + help + Always enable page table checking. By default the page table checking + is disabled, and can be optionally enabled via page_table_check=on + kernel parameter. This config enforces that page table check is always + enabled. + + If unsure say "n". + config PAGE_POISONING bool "Poison pages after freeing" help diff --git a/mm/Makefile b/mm/Makefile index d6c0042e3aa0..5c5a3a480fa6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -112,6 +112,7 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o +obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_SECRETMEM) += secretmem.o obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d59023a676ed..806f317c2e7e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -1307,6 +1308,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); reset_page_owner(page, order); + page_table_check_free(page, order); return false; } @@ -1346,6 +1348,7 @@ static __always_inline bool free_pages_prepare(struct page *page, page_cpupid_reset_last(page); page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); + page_table_check_free(page, order); if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page), @@ -2420,6 +2423,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, } set_page_owner(page, order, gfp_flags); + page_table_check_alloc(page, order); } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, diff --git a/mm/page_ext.c b/mm/page_ext.c index 6242afb24d84..bee3240604dc 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -8,6 +8,7 @@ #include #include #include +#include /* * struct page extension @@ -75,6 +76,9 @@ static struct page_ext_operations *page_ext_ops[] = { #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif +#ifdef CONFIG_PAGE_TABLE_CHECK + &page_table_check_ops, +#endif }; unsigned long page_ext_size = sizeof(struct page_ext); diff --git a/mm/page_table_check.c b/mm/page_table_check.c new file mode 100644 index 000000000000..7504e7caa2a1 --- /dev/null +++ b/mm/page_table_check.c @@ -0,0 +1,270 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2021, Google LLC. 
+ * Pasha Tatashin + */ +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "page_table_check: " fmt + +struct page_table_check { + atomic_t anon_map_count; + atomic_t file_map_count; +}; + +static bool __page_table_check_enabled __initdata = + IS_ENABLED(CONFIG_PAGE_TABLE_CHECK_ENFORCED); + +DEFINE_STATIC_KEY_TRUE(page_table_check_disabled); +EXPORT_SYMBOL(page_table_check_disabled); + +static int __init early_page_table_check_param(char *buf) +{ + if (!buf) + return -EINVAL; + + if (strcmp(buf, "on") == 0) + __page_table_check_enabled = true; + else if (strcmp(buf, "off") == 0) + __page_table_check_enabled = false; + + return 0; +} + +early_param("page_table_check", early_page_table_check_param); + +static bool __init need_page_table_check(void) +{ + return __page_table_check_enabled; +} + +static void __init init_page_table_check(void) +{ + if (!__page_table_check_enabled) + return; + static_branch_disable(&page_table_check_disabled); +} + +struct page_ext_operations page_table_check_ops = { + .size = sizeof(struct page_table_check), + .need = need_page_table_check, + .init = init_page_table_check, +}; + +static struct page_table_check *get_page_table_check(struct page_ext *page_ext) +{ + BUG_ON(!page_ext); + return (void *)(page_ext) + page_table_check_ops.offset; +} + +static inline bool pte_user_accessible_page(pte_t pte) +{ + return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER); +} + +static inline bool pmd_user_accessible_page(pmd_t pmd) +{ + return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) && + (pmd_val(pmd) & _PAGE_USER); +} + +static inline bool pud_user_accessible_page(pud_t pud) +{ + return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) && + (pud_val(pud) & _PAGE_USER); +} + +/* + * An enty is removed from the page table, decrement the counters for that page + * verify that it is of correct type and counters do not become negative. + */ +static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, + unsigned long pfn, unsigned long pgcnt) +{ + struct page_ext *page_ext; + struct page *page; + bool anon; + int i; + + if (!pfn_valid(pfn)) + return; + + page = pfn_to_page(pfn); + page_ext = lookup_page_ext(page); + anon = PageAnon(page); + + for (i = 0; i < pgcnt; i++) { + struct page_table_check *ptc = get_page_table_check(page_ext); + + if (anon) { + BUG_ON(atomic_read(&ptc->file_map_count)); + BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0); + } else { + BUG_ON(atomic_read(&ptc->anon_map_count)); + BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0); + } + page_ext = page_ext_next(page_ext); + } +} + +/* + * A new enty is added to the page table, increment the counters for that page + * verify that it is of correct type and is not being mapped with a different + * type to a different process. 
+ */ +static void page_table_check_set(struct mm_struct *mm, unsigned long addr, + unsigned long pfn, unsigned long pgcnt, + bool rw) +{ + struct page_ext *page_ext; + struct page *page; + bool anon; + int i; + + if (!pfn_valid(pfn)) + return; + + page = pfn_to_page(pfn); + page_ext = lookup_page_ext(page); + anon = PageAnon(page); + + for (i = 0; i < pgcnt; i++) { + struct page_table_check *ptc = get_page_table_check(page_ext); + + if (anon) { + BUG_ON(atomic_read(&ptc->file_map_count)); + BUG_ON(atomic_inc_return(&ptc->anon_map_count) > 1 && rw); + } else { + BUG_ON(atomic_read(&ptc->anon_map_count)); + BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0); + } + page_ext = page_ext_next(page_ext); + } +} + +/* + * page is on free list, or is being allocated, verify that counters are zeroes + * crash if they are not. + */ +void __page_table_check_zero(struct page *page, unsigned int order) +{ + struct page_ext *page_ext = lookup_page_ext(page); + int i; + + BUG_ON(!page_ext); + for (i = 0; i < (1 << order); i++) { + struct page_table_check *ptc = get_page_table_check(page_ext); + + BUG_ON(atomic_read(&ptc->anon_map_count)); + BUG_ON(atomic_read(&ptc->file_map_count)); + page_ext = page_ext_next(page_ext); + } +} + +void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t pte) +{ + if (&init_mm == mm) + return; + + if (pte_user_accessible_page(pte)) { + page_table_check_clear(mm, addr, pte_pfn(pte), + PAGE_SIZE >> PAGE_SHIFT); + } +} +EXPORT_SYMBOL(__page_table_check_pte_clear); + +void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, + pmd_t pmd) +{ + if (&init_mm == mm) + return; + + if (pmd_user_accessible_page(pmd)) { + page_table_check_clear(mm, addr, pmd_pfn(pmd), + PMD_PAGE_SIZE >> PAGE_SHIFT); + } +} +EXPORT_SYMBOL(__page_table_check_pmd_clear); + +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud) +{ + if (&init_mm == mm) + return; + + if (pud_user_accessible_page(pud)) { + page_table_check_clear(mm, addr, pud_pfn(pud), + PUD_PAGE_SIZE >> PAGE_SHIFT); + } +} +EXPORT_SYMBOL(__page_table_check_pud_clear); + +void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + pte_t old_pte; + + if (&init_mm == mm) + return; + + old_pte = *ptep; + if (pte_user_accessible_page(old_pte)) { + page_table_check_clear(mm, addr, pte_pfn(old_pte), + PAGE_SIZE >> PAGE_SHIFT); + } + + if (pte_user_accessible_page(pte)) { + page_table_check_set(mm, addr, pte_pfn(pte), + PAGE_SIZE >> PAGE_SHIFT, + pte_write(pte)); + } +} +EXPORT_SYMBOL(__page_table_check_pte_set); + +void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ + pmd_t old_pmd; + + if (&init_mm == mm) + return; + + old_pmd = *pmdp; + if (pmd_user_accessible_page(old_pmd)) { + page_table_check_clear(mm, addr, pmd_pfn(old_pmd), + PMD_PAGE_SIZE >> PAGE_SHIFT); + } + + if (pmd_user_accessible_page(pmd)) { + page_table_check_set(mm, addr, pmd_pfn(pmd), + PMD_PAGE_SIZE >> PAGE_SHIFT, + pmd_write(pmd)); + } +} +EXPORT_SYMBOL(__page_table_check_pmd_set); + +void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ + pud_t old_pud; + + if (&init_mm == mm) + return; + + old_pud = *pudp; + if (pud_user_accessible_page(old_pud)) { + page_table_check_clear(mm, addr, pud_pfn(old_pud), + PUD_PAGE_SIZE >> PAGE_SHIFT); + } + + if (pud_user_accessible_page(pud)) { + page_table_check_set(mm, addr, pud_pfn(pud), + PUD_PAGE_SIZE >> PAGE_SHIFT, + 
pud_write(pud)); + } +} +EXPORT_SYMBOL(__page_table_check_pud_set); -- cgit From d283d422c6c4f0264fe8ecf5ae80036bf73f4594 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Jan 2022 14:06:41 -0800 Subject: x86: mm: add x86_64 support for page table check Add page table check hooks into routines that modify user page tables. Link: https://lkml.kernel.org/r/20211221154650.1047963-5-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: Aneesh Kumar K.V Cc: Dave Hansen Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Kees Cook Cc: Masahiro Yamada Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Peter Zijlstra Cc: Sami Tolvanen Cc: Thomas Gleixner Cc: Wei Xu Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/include/asm/pgtable.h | 29 +++++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5c2ccb85f2ef..d0628415b93e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -104,6 +104,7 @@ config X86 select ARCH_SUPPORTS_ACPI select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC + select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 select ARCH_SUPPORTS_LTO_CLANG diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5196958aa6ac..d7d287ac1018 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -26,6 +26,7 @@ #include #include #include +#include extern pgd_t early_top_pgt[PTRS_PER_PGD]; bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd); @@ -1006,18 +1007,21 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { + page_table_check_pte_set(mm, addr, ptep, pte); set_pte(ptep, pte); } static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { + page_table_check_pmd_set(mm, addr, pmdp, pmd); set_pmd(pmdp, pmd); } static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { + page_table_check_pud_set(mm, addr, pudp, pud); native_set_pud(pudp, pud); } @@ -1048,6 +1052,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); + page_table_check_pte_clear(mm, addr, pte); return pte; } @@ -1063,12 +1068,23 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, * care about updates and native needs no locking */ pte = native_local_ptep_get_and_clear(ptep); + page_table_check_pte_clear(mm, addr, pte); } else { pte = ptep_get_and_clear(mm, addr, ptep); } return pte; } +#define __HAVE_ARCH_PTEP_CLEAR +static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + if (IS_ENABLED(CONFIG_PAGE_TABLE_CHECK)) + ptep_get_and_clear(mm, addr, ptep); + else + pte_clear(mm, addr, ptep); +} + #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -1109,14 +1125,22 @@ static inline int pmd_write(pmd_t pmd) static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - return native_pmdp_get_and_clear(pmdp); + pmd_t pmd = 
native_pmdp_get_and_clear(pmdp); + + page_table_check_pmd_clear(mm, addr, pmd); + + return pmd; } #define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { - return native_pudp_get_and_clear(pudp); + pud_t pud = native_pudp_get_and_clear(pudp); + + page_table_check_pud_clear(mm, addr, pud); + + return pud; } #define __HAVE_ARCH_PMDP_SET_WRPROTECT @@ -1137,6 +1161,7 @@ static inline int pud_write(pud_t pud) static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { + page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pmdp, pmd); } else { -- cgit From 020e87650af9f43683546729f959fdc78422a4b7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Jan 2022 14:06:44 -0800 Subject: mm: remove last argument of reuse_swap_page() None of the callers care about the total_map_swapcount() any more. Link: https://lkml.kernel.org/r/20211220205943.456187-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Linus Torvalds Reviewed-by: William Kucharski Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 +++--- mm/huge_memory.c | 2 +- mm/khugepaged.c | 2 +- mm/memory.c | 2 +- mm/swapfile.c | 8 +------- 5 files changed, 7 insertions(+), 13 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index d1ea44b31f19..bdccbf1efa61 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -514,7 +514,7 @@ extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); -extern bool reuse_swap_page(struct page *, int *); +extern bool reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); @@ -680,8 +680,8 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -#define reuse_swap_page(page, total_map_swapcount) \ - (page_trans_huge_mapcount(page, total_map_swapcount) == 1) +#define reuse_swap_page(page) \ + (page_trans_huge_mapcount(page, NULL) == 1) static inline int try_to_free_swap(struct page *page) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e5483347291c..b61fbe95c856 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1322,7 +1322,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) * We can only reuse the page if nobody else maps the huge page or it's * part. */ - if (reuse_swap_page(page, NULL)) { + if (reuse_swap_page(page)) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 9d40dd8890e5..698ea19775ac 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -681,7 +681,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } if (!pte_write(pteval) && PageSwapCache(page) && - !reuse_swap_page(page, NULL)) { + !reuse_swap_page(page)) { /* * Page is in the swap cache and cannot be re-used. * It cannot be collapsed into a THP. 
diff --git a/mm/memory.c b/mm/memory.c index 5fea331b1560..571d02f419ba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3627,7 +3627,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; diff --git a/mm/swapfile.c b/mm/swapfile.c index e64207e2ef1d..31d13a393cf0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1668,12 +1668,8 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, * to it. And as a side-effect, free up its swap: because the old content * on disk will never be read, and seeking back there to write new content * later would only waste time away from clustering. - * - * NOTE: total_map_swapcount should not be relied upon by the caller if - * reuse_swap_page() returns false, but it may be always overwritten - * (see the other implementation for CONFIG_SWAP=n). */ -bool reuse_swap_page(struct page *page, int *total_map_swapcount) +bool reuse_swap_page(struct page *page) { int count, total_mapcount, total_swapcount; @@ -1682,8 +1678,6 @@ bool reuse_swap_page(struct page *page, int *total_map_swapcount) return false; count = page_trans_huge_map_swapcount(page, &total_mapcount, &total_swapcount); - if (total_map_swapcount) - *total_map_swapcount = total_mapcount + total_swapcount; if (count == 1 && PageSwapCache(page) && (likely(!PageTransCompound(page)) || /* The remaining swap count will be freed soon */ -- cgit From 66c7f7a6ac6624fc7e226d43913e10f1f047f579 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Jan 2022 14:06:48 -0800 Subject: mm: remove the total_mapcount argument from page_trans_huge_map_swapcount() Now that we don't report it to the caller of reuse_swap_page(), we don't need to request it from page_trans_huge_map_swapcount(). 
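For orientation, the interface change at the single call site (reuse_swap_page(), shown in the diff below) is just the dropped out-parameter, roughly:

	/* before: an out-parameter whose value the caller no longer used */
	count = page_trans_huge_map_swapcount(page, &total_mapcount, &total_swapcount);

	/* after: only the total swap count is reported back */
	count = page_trans_huge_map_swapcount(page, &total_swapcount);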
Link: https://lkml.kernel.org/r/20211220205943.456187-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Acked-by: Linus Torvalds Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 31d13a393cf0..a93f5b5fc8b6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1601,31 +1601,30 @@ static bool page_swapped(struct page *page) return false; } -static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, +static int page_trans_huge_map_swapcount(struct page *page, int *total_swapcount) { - int i, map_swapcount, _total_mapcount, _total_swapcount; + int i, map_swapcount, _total_swapcount; unsigned long offset = 0; struct swap_info_struct *si; struct swap_cluster_info *ci = NULL; unsigned char *map = NULL; - int mapcount, swapcount = 0; + int swapcount = 0; /* hugetlbfs shouldn't call it */ VM_BUG_ON_PAGE(PageHuge(page), page); if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) { - mapcount = page_trans_huge_mapcount(page, total_mapcount); if (PageSwapCache(page)) swapcount = page_swapcount(page); if (total_swapcount) *total_swapcount = swapcount; - return mapcount + swapcount; + return swapcount + page_trans_huge_mapcount(page, NULL); } page = compound_head(page); - _total_mapcount = _total_swapcount = map_swapcount = 0; + _total_swapcount = map_swapcount = 0; if (PageSwapCache(page)) { swp_entry_t entry; @@ -1639,8 +1638,7 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, if (map) ci = lock_cluster(si, offset); for (i = 0; i < HPAGE_PMD_NR; i++) { - mapcount = atomic_read(&page[i]._mapcount) + 1; - _total_mapcount += mapcount; + int mapcount = atomic_read(&page[i]._mapcount) + 1; if (map) { swapcount = swap_count(map[offset + i]); _total_swapcount += swapcount; @@ -1648,19 +1646,14 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, map_swapcount = max(map_swapcount, mapcount + swapcount); } unlock_cluster(ci); - if (PageDoubleMap(page)) { + + if (PageDoubleMap(page)) map_swapcount -= 1; - _total_mapcount -= HPAGE_PMD_NR; - } - mapcount = compound_mapcount(page); - map_swapcount += mapcount; - _total_mapcount += mapcount; - if (total_mapcount) - *total_mapcount = _total_mapcount; + if (total_swapcount) *total_swapcount = _total_swapcount; - return map_swapcount; + return map_swapcount + compound_mapcount(page); } /* @@ -1671,13 +1664,12 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, */ bool reuse_swap_page(struct page *page) { - int count, total_mapcount, total_swapcount; + int count, total_swapcount; VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) return false; - count = page_trans_huge_map_swapcount(page, &total_mapcount, - &total_swapcount); + count = page_trans_huge_map_swapcount(page, &total_swapcount); if (count == 1 && PageSwapCache(page) && (likely(!PageTransCompound(page)) || /* The remaining swap count will be freed soon */ -- cgit From d08d2b62510e2407cf939e693aefd179dc114913 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Jan 2022 14:06:51 -0800 Subject: mm: remove the total_mapcount argument from page_trans_huge_mapcount() All callers pass NULL, so we can stop calculating the value we would store in it. 
Link: https://lkml.kernel.org/r/20211220205943.456187-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Acked-by: Linus Torvalds Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 +++------- include/linux/swap.h | 2 +- mm/huge_memory.c | 30 ++++++++++-------------------- mm/swapfile.c | 2 +- 4 files changed, 15 insertions(+), 29 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 4d7245e6802a..cef65f9cbdf2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -799,19 +799,15 @@ static inline int page_mapcount(struct page *page) #ifdef CONFIG_TRANSPARENT_HUGEPAGE int total_mapcount(struct page *page); -int page_trans_huge_mapcount(struct page *page, int *total_mapcount); +int page_trans_huge_mapcount(struct page *page); #else static inline int total_mapcount(struct page *page) { return page_mapcount(page); } -static inline int page_trans_huge_mapcount(struct page *page, - int *total_mapcount) +static inline int page_trans_huge_mapcount(struct page *page) { - int mapcount = page_mapcount(page); - if (total_mapcount) - *total_mapcount = mapcount; - return mapcount; + return page_mapcount(page); } #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index bdccbf1efa61..1d38d9475c4d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -681,7 +681,7 @@ static inline int swp_swapcount(swp_entry_t entry) } #define reuse_swap_page(page) \ - (page_trans_huge_mapcount(page, NULL) == 1) + (page_trans_huge_mapcount(page) == 1) static inline int try_to_free_swap(struct page *page) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b61fbe95c856..6ed86a8f6a5b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2542,38 +2542,28 @@ int total_mapcount(struct page *page) * need full accuracy to avoid breaking page pinning, because * page_trans_huge_mapcount() is slower than page_mapcount(). 
*/ -int page_trans_huge_mapcount(struct page *page, int *total_mapcount) +int page_trans_huge_mapcount(struct page *page) { - int i, ret, _total_mapcount, mapcount; + int i, ret; /* hugetlbfs shouldn't call it */ VM_BUG_ON_PAGE(PageHuge(page), page); - if (likely(!PageTransCompound(page))) { - mapcount = atomic_read(&page->_mapcount) + 1; - if (total_mapcount) - *total_mapcount = mapcount; - return mapcount; - } + if (likely(!PageTransCompound(page))) + return atomic_read(&page->_mapcount) + 1; page = compound_head(page); - _total_mapcount = ret = 0; + ret = 0; for (i = 0; i < thp_nr_pages(page); i++) { - mapcount = atomic_read(&page[i]._mapcount) + 1; + int mapcount = atomic_read(&page[i]._mapcount) + 1; ret = max(ret, mapcount); - _total_mapcount += mapcount; } - if (PageDoubleMap(page)) { + + if (PageDoubleMap(page)) ret -= 1; - _total_mapcount -= thp_nr_pages(page); - } - mapcount = compound_mapcount(page); - ret += mapcount; - _total_mapcount += mapcount; - if (total_mapcount) - *total_mapcount = _total_mapcount; - return ret; + + return ret + compound_mapcount(page); } /* Racy check whether the huge page can be split */ diff --git a/mm/swapfile.c b/mm/swapfile.c index a93f5b5fc8b6..caa9f81a0d15 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1619,7 +1619,7 @@ static int page_trans_huge_map_swapcount(struct page *page, swapcount = page_swapcount(page); if (total_swapcount) *total_swapcount = swapcount; - return swapcount + page_trans_huge_mapcount(page, NULL); + return swapcount + page_trans_huge_mapcount(page); } page = compound_head(page); -- cgit From cc6266f0322fa9f7f4543564759e881d989ad866 Mon Sep 17 00:00:00 2001 From: Christian König Date: Fri, 14 Jan 2022 14:06:54 -0800 Subject: mm/dmapool.c: revert "make dma pool to use kmalloc_node" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 2618c60b8b5836 ("dma: make dma pool to use kmalloc_node"). While working myself into the dmapool code I've found this little odd kmalloc_node(). What basically happens here is that we allocate the housekeeping structure on the numa node where the device is attached to. Since the device is never doing DMA to or from that memory this doesn't seem to make sense at all. So while this doesn't seem to cause much harm it's probably cleaner to revert the change for consistency. Link: https://lkml.kernel.org/r/20211221110724.97664-1-christian.koenig@amd.com Signed-off-by: Christian König Cc: Yinghai Lu Cc: Andi Kleen Cc: Christoph Lameter Cc: David Rientjes Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index 64b537b3ccb0..a7eb5d0eb2da 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -152,7 +152,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, else if ((boundary < size) || (boundary & (boundary - 1))) return NULL; - retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); + retval = kmalloc(sizeof(*retval), GFP_KERNEL); if (!retval) return retval; -- cgit From 451769ebb7e792c3404db53b3c2a422990de654e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 14 Jan 2022 14:06:57 -0800 Subject: mm/vmalloc: alloc GFP_NO{FS,IO} for vmalloc Patch series "extend vmalloc support for constrained allocations", v2. Based on a recent discussion with Dave and Neil [1] I have tried to implement NOFS, NOIO, NOFAIL support for the vmalloc to make life of kvmalloc users easier. 
A requirement for NOFAIL support for kvmalloc was new to me but this seems to be really needed by the xfs code. NOFS/NOIO was a known and long-standing problem which was hoped to be handled by the scope API. Those scopes should have been used at the reclaim recursion boundaries both to document them and also to remove the necessity of NOFS/NOIO constraints for all allocations within that scope. Instead, workarounds were developed to wrap a single allocation (like ceph_kvmalloc). The first patch implements NOFS/NOIO support for vmalloc. The second one adds NOFAIL support and the third one bundles it all together into kvmalloc and drops ceph_kvmalloc, whose callers can use kvmalloc directly now. [1] http://lkml.kernel.org/r/163184741778.29351.16920832234899124642.stgit@noble.brown This patch (of 4): vmalloc historically hasn't supported GFP_NO{FS,IO} requests because page table allocations do not support an externally provided gfp mask and performed GFP_KERNEL-like allocations. For a few years now we have had scope (memalloc_no{fs,io}_{save,restore}) APIs to enforce NOFS and NOIO constraints implicitly on all allocations within the scope. There was a hope that those scopes would be defined on a higher level when the reclaim recursion boundary starts/stops (e.g. when a lock that is also required during memory reclaim is acquired, etc.). It seems that not all NOFS/NOIO users have adopted this approach and instead they have taken a workaround approach to wrap a single [k]vmalloc allocation in a scope API. These workarounds do not serve the purpose of better reclaim recursion documentation and reduction of explicit GFP_NO{FS,IO} usage, so let's just provide them with the semantics they are asking for without a need for workarounds. Add support for GFP_NOFS and GFP_NOIO to vmalloc directly. All internal allocations already comply with the given gfp_mask. The only current exception is vmap_pages_range which maps kernel page tables. Infer the proper scope API based on the given gfp mask.
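As an illustration of what this enables (a sketch, not code from this patch), the caller-side workaround that wraps a single allocation in a scope API, as e.g. ceph_kvmalloc does, becomes unnecessary:

	/* old workaround: force NOFS around one allocation via the scope API */
	unsigned int flags = memalloc_nofs_save();
	void *p = __vmalloc(size, GFP_KERNEL);
	memalloc_nofs_restore(flags);

	/* with this patch the constraint can be passed directly */
	void *q = __vmalloc(size, GFP_NOFS);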
[sfr@canb.auug.org.au: mm/vmalloc.c needs linux/sched/mm.h] Link: https://lkml.kernel.org/r/20211217232641.0148710c@canb.auug.org.au Link: https://lkml.kernel.org/r/20211122153233.9924-1-mhocko@kernel.org Link: https://lkml.kernel.org/r/20211122153233.9924-2-mhocko@kernel.org Signed-off-by: Michal Hocko Signed-off-by: Stephen Rothwell Reviewed-by: Uladzislau Rezki (Sony) Acked-by: Vlastimil Babka Cc: Neil Brown Cc: Christoph Hellwig Cc: Ilya Dryomov Cc: Jeff Layton Cc: Dave Chinner Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 80c6de4c425f..8ed93510e50e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -2928,6 +2929,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, unsigned long array_size; unsigned int nr_small_pages = size >> PAGE_SHIFT; unsigned int page_order; + unsigned int flags; + int ret; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); gfp_mask |= __GFP_NOWARN; @@ -2976,8 +2979,24 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } - if (vmap_pages_range(addr, addr + size, prot, area->pages, - page_shift) < 0) { + /* + * page tables allocations ignore external gfp mask, enforce it + * by the scope API + */ + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + flags = memalloc_nofs_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + flags = memalloc_noio_save(); + + ret = vmap_pages_range(addr, addr + size, prot, area->pages, + page_shift); + + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + memalloc_nofs_restore(flags); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + memalloc_noio_restore(flags); + + if (ret < 0) { warn_alloc(orig_gfp_mask, NULL, "vmalloc error: size %lu, failed to map pages", area->nr_pages * PAGE_SIZE); -- cgit From 9376130c390a76fac2788a5d6e1a149017b4ab50 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 14 Jan 2022 14:07:01 -0800 Subject: mm/vmalloc: add support for __GFP_NOFAIL Dave Chinner has mentioned that some of the xfs code would benefit from kvmalloc support for __GFP_NOFAIL because they have allocations that cannot fail and they do not fit into a single page. The large part of the vmalloc implementation already complies with the given gfp flags so there is no work for those to be done. The area and page table allocations are an exception to that. Implement a retry loop for those. Add a short sleep before retrying. 1 jiffy is a completely random timeout. Ideally the retry would wait for an explicit event - e.g. a change to the vmalloc space change if the failure was caused by the space fragmentation or depletion. But there are multiple different reasons to retry and this could become much more complex. Keep the retry simple for now and just sleep to prevent from hogging CPUs. 
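As a usage sketch (hypothetical caller, not taken from this patch), an allocation that must not fail can now pass the flag and rely on vmalloc retrying internally instead of returning NULL:

	/* sleeps and retries internally until the allocation succeeds */
	void *buf = __vmalloc(size, GFP_KERNEL | __GFP_NOFAIL);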
Link: https://lkml.kernel.org/r/20211122153233.9924-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Dave Chinner Cc: Ilya Dryomov Cc: Jeff Layton Cc: Neil Brown Cc: Sebastian Andrzej Siewior Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8ed93510e50e..d1bebd197c8c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2847,6 +2847,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * more permissive. */ if (!order) { + gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL; + while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; @@ -2864,12 +2866,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * but mempolcy want to alloc memory by interleaving. */ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) - nr = alloc_pages_bulk_array_mempolicy(gfp, + nr = alloc_pages_bulk_array_mempolicy(bulk_gfp, nr_pages_request, pages + nr_allocated); else - nr = alloc_pages_bulk_array_node(gfp, nid, + nr = alloc_pages_bulk_array_node(bulk_gfp, nid, nr_pages_request, pages + nr_allocated); @@ -2924,6 +2926,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; const gfp_t orig_gfp_mask = gfp_mask; + bool nofail = gfp_mask & __GFP_NOFAIL; unsigned long addr = (unsigned long)area->addr; unsigned long size = get_vm_area_size(area); unsigned long array_size; @@ -2988,8 +2991,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) flags = memalloc_noio_save(); - ret = vmap_pages_range(addr, addr + size, prot, area->pages, + do { + ret = vmap_pages_range(addr, addr + size, prot, area->pages, page_shift); + if (nofail && (ret < 0)) + schedule_timeout_uninterruptible(1); + } while (nofail && (ret < 0)); if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) memalloc_nofs_restore(flags); @@ -3084,9 +3091,14 @@ again: VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); if (!area) { + bool nofail = gfp_mask & __GFP_NOFAIL; warn_alloc(gfp_mask, NULL, - "vmalloc error: size %lu, vm_struct allocation failed", - real_size); + "vmalloc error: size %lu, vm_struct allocation failed%s", + real_size, (nofail) ? ". Retrying." : ""); + if (nofail) { + schedule_timeout_uninterruptible(1); + goto again; + } goto fail; } -- cgit From 30d3f01191d305c99e8b3f8b1b328fc852270c95 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 14 Jan 2022 14:07:04 -0800 Subject: mm/vmalloc: be more explicit about supported gfp flags. Commit b7d90e7a5ea8 ("mm/vmalloc: be more explicit about supported gfp flags") has been merged prematurely without the rest of the series and without addressed review feedback from Neil. Fix that up now. Only wording is changed slightly. 
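To make the documented rules concrete (illustrative combinations only, not part of the patch):

	p = __vmalloc(size, GFP_NOFS | __GFP_NOFAIL);    /* supported */
	p = __vmalloc(size, GFP_NOWAIT);                 /* not supported: lacks __GFP_DIRECT_RECLAIM */
	p = __vmalloc(size, GFP_KERNEL | __GFP_NORETRY); /* not supported reclaim modifier */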
Link: https://lkml.kernel.org/r/20211122153233.9924-4-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Uladzislau Rezki (Sony) Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Dave Chinner Cc: Ilya Dryomov Cc: Jeff Layton Cc: Neil Brown Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d1bebd197c8c..4165304d3547 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3031,12 +3031,14 @@ fail: * * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Please note that the full set of gfp - * flags are not supported. GFP_KERNEL would be a preferred allocation mode - * but GFP_NOFS and GFP_NOIO are supported as well. Zone modifiers are not - * supported. From the reclaim modifiers__GFP_DIRECT_RECLAIM is required (aka - * GFP_NOWAIT is not supported) and only __GFP_NOFAIL is supported (aka - * __GFP_NORETRY and __GFP_RETRY_MAYFAIL are not supported). - * __GFP_NOWARN can be used to suppress error messages about failures. + * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all + * supported. + * Zone modifiers are not supported. From the reclaim modifiers + * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported) + * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and + * __GFP_RETRY_MAYFAIL are not supported). + * + * __GFP_NOWARN can be used to suppress failures messages. * * Map them into contiguous kernel virtual space, using a pagetable * protection of @prot. -- cgit From a421ef303008b0ceee2cfc625c3246fa7654b0ca Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 14 Jan 2022 14:07:07 -0800 Subject: mm: allow !GFP_KERNEL allocations for kvmalloc Support for GFP_NO{FS,IO} and __GFP_NOFAIL has been implemented by previous patches so we can allow the support for kvmalloc. This will allow some external users to simplify or completely remove their helpers. GFP_NOWAIT semantic hasn't been supported so far but it hasn't been explicitly documented so let's add a note about that. ceph_kvmalloc is the first helper to be dropped and changed to kvmalloc. 
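A minimal sketch of the simplification this enables for external users (the helper below is hypothetical, not taken from the series): instead of a wrapper that switches to the memalloc_nofs/noio scope API by hand, a restricted gfp mask can now be handed to kvmalloc() directly.

  #include <linux/mm.h>
  #include <linux/gfp.h>

  /* Hypothetical replacement for a ceph_kvmalloc()-style wrapper. */
  static void *example_alloc_osdmap_array(size_t size)
  {
          /* GFP_NOFS (and GFP_NOIO) are now honoured all the way down. */
          return kvmalloc(size, GFP_NOFS);
  }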
Link: https://lkml.kernel.org/r/20211122153233.9924-5-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Uladzislau Rezki (Sony) Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Dave Chinner Cc: Ilya Dryomov Cc: Jeff Layton Cc: Neil Brown Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ceph/libceph.h | 1 - mm/util.c | 15 ++++----------- net/ceph/buffer.c | 4 ++-- net/ceph/ceph_common.c | 27 --------------------------- net/ceph/crypto.c | 2 +- net/ceph/messenger.c | 2 +- net/ceph/messenger_v2.c | 2 +- net/ceph/osdmap.c | 12 ++++++------ 8 files changed, 15 insertions(+), 50 deletions(-) diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 409d8c29bc4f..309acbcb5a8a 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -295,7 +295,6 @@ extern bool libceph_compatible(void *data); extern const char *ceph_msg_type_name(int type); extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); -extern void *ceph_kvmalloc(size_t size, gfp_t flags); struct fs_parameter; struct fc_log; diff --git a/mm/util.c b/mm/util.c index 741ba32a43ac..7e43369064c8 100644 --- a/mm/util.c +++ b/mm/util.c @@ -549,13 +549,10 @@ EXPORT_SYMBOL(vm_mmap); * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * - * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. + * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is * preferable to the vmalloc fallback, due to visible performance drawbacks. * - * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not - * fall back to vmalloc. - * * Return: pointer to the allocated memory of %NULL in case of failure */ void *kvmalloc_node(size_t size, gfp_t flags, int node) @@ -563,13 +560,6 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) gfp_t kmalloc_flags = flags; void *ret; - /* - * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables) - * so the given set of flags has to be compatible. 
- */ - if ((flags & GFP_KERNEL) != GFP_KERNEL) - return kmalloc_node(size, flags, node); - /* * We want to attempt a large physically contiguous block first because * it is less likely to fragment multiple larger blocks and therefore @@ -582,6 +572,9 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL)) kmalloc_flags |= __GFP_NORETRY; + + /* nofail semantic is implemented by the vmalloc fallback */ + kmalloc_flags &= ~__GFP_NOFAIL; } ret = kmalloc_node(size, kmalloc_flags, node); diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c index 5622763ad402..7e51f128045d 100644 --- a/net/ceph/buffer.c +++ b/net/ceph/buffer.c @@ -7,7 +7,7 @@ #include #include -#include /* for ceph_kvmalloc */ +#include /* for kvmalloc */ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) { @@ -17,7 +17,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) if (!b) return NULL; - b->vec.iov_base = ceph_kvmalloc(len, gfp); + b->vec.iov_base = kvmalloc(len, gfp); if (!b->vec.iov_base) { kfree(b); return NULL; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 97d6ea763e32..9441b4a4912b 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -190,33 +190,6 @@ int ceph_compare_options(struct ceph_options *new_opt, } EXPORT_SYMBOL(ceph_compare_options); -/* - * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are - * compatible with (a superset of) GFP_KERNEL. This is because while the - * actual pages are allocated with the specified flags, the page table pages - * are always allocated with GFP_KERNEL. - * - * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO. - */ -void *ceph_kvmalloc(size_t size, gfp_t flags) -{ - void *p; - - if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) { - p = kvmalloc(size, flags); - } else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) { - unsigned int nofs_flag = memalloc_nofs_save(); - p = kvmalloc(size, GFP_KERNEL); - memalloc_nofs_restore(nofs_flag); - } else { - unsigned int noio_flag = memalloc_noio_save(); - p = kvmalloc(size, GFP_KERNEL); - memalloc_noio_restore(noio_flag); - } - - return p; -} - static int parse_fsid(const char *str, struct ceph_fsid *fsid) { int i = 0; diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 92d89b331645..051d22c0e4ad 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -147,7 +147,7 @@ void ceph_crypto_key_destroy(struct ceph_crypto_key *key) static const u8 *aes_iv = (u8 *)CEPH_AES_IV; /* - * Should be used for buffers allocated with ceph_kvmalloc(). + * Should be used for buffers allocated with kvmalloc(). * Currently these are encrypt out-buffer (ceph_buffer) and decrypt * in-buffer (msg front). 
* diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 57d043b382ed..7b891be799d2 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1920,7 +1920,7 @@ struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items, /* front */ if (front_len) { - m->front.iov_base = ceph_kvmalloc(front_len, flags); + m->front.iov_base = kvmalloc(front_len, flags); if (m->front.iov_base == NULL) { dout("ceph_msg_new can't allocate %d bytes\n", front_len); diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index cc40ce4e02fb..c4099b641b38 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -308,7 +308,7 @@ static void *alloc_conn_buf(struct ceph_connection *con, int len) if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs))) return NULL; - buf = ceph_kvmalloc(len, GFP_NOIO); + buf = kvmalloc(len, GFP_NOIO); if (!buf) return NULL; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 75b738083523..2823bb3cff55 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -980,7 +980,7 @@ static struct crush_work *alloc_workspace(const struct crush_map *c) work_size = crush_work_size(c, CEPH_PG_MAX_SIZE); dout("%s work_size %zu bytes\n", __func__, work_size); - work = ceph_kvmalloc(work_size, GFP_NOIO); + work = kvmalloc(work_size, GFP_NOIO); if (!work) return NULL; @@ -1190,9 +1190,9 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) if (max == map->max_osd) return 0; - state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); - weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); - addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); + state = kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); + weight = kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); + addr = kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); if (!state || !weight || !addr) { kvfree(state); kvfree(weight); @@ -1222,7 +1222,7 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) if (map->osd_primary_affinity) { u32 *affinity; - affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)), + affinity = kvmalloc(array_size(max, sizeof(*affinity)), GFP_NOFS); if (!affinity) return -ENOMEM; @@ -1503,7 +1503,7 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) if (!map->osd_primary_affinity) { int i; - map->osd_primary_affinity = ceph_kvmalloc( + map->osd_primary_affinity = kvmalloc( array_size(map->max_osd, sizeof(*map->osd_primary_affinity)), GFP_NOFS); if (!map->osd_primary_affinity) -- cgit From 704687deaae768a818d7da0584ee021793a97684 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 14 Jan 2022 14:07:11 -0800 Subject: mm: make slab and vmalloc allocators __GFP_NOLOCKDEP aware sl?b and vmalloc allocators reduce the given gfp mask for their internal needs. For that they use GFP_RECLAIM_MASK to preserve the reclaim behavior and constrains. __GFP_NOLOCKDEP is not a part of that mask because it doesn't really control the reclaim behavior strictly speaking. On the other hand it tells the underlying page allocator to disable reclaim recursion detection so arguably it should be part of the mask. Having __GFP_NOLOCKDEP in the mask will not alter the behavior in any form so this change is safe pretty much by definition. It also adds a support for this flag to SL?B and vmalloc allocators which will in turn allow its use to kvmalloc as well. 
A lack of this support has been noticed recently in http://lkml.kernel.org/r/20211119225435.GZ449541@dread.disaster.area Link: https://lkml.kernel.org/r/YZ9XtLY4AEjVuiEI@dhcp22.suse.cz Signed-off-by: Michal Hocko Reported-by: Sebastian Andrzej Siewior Acked-by: Dave Chinner Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Dave Chinner Cc: Ilya Dryomov Cc: Jeff Layton Cc: Neil Brown Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/internal.h b/mm/internal.h index bdac62e1eca7..c5834cc28a44 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -21,7 +21,7 @@ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC) + __GFP_ATOMIC|__GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) -- cgit From 4034247a0d6ab281ba3293798ce67af494d86129 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 14 Jan 2022 14:07:14 -0800 Subject: mm: introduce memalloc_retry_wait() Various places in the kernel - largely in filesystems - respond to a memory allocation failure by looping around and re-trying. Some of these cannot conveniently use __GFP_NOFAIL, for reasons such as: - a GFP_ATOMIC allocation, which __GFP_NOFAIL doesn't work on - a need to check for the process being signalled between failures - the possibility that other recovery actions could be performed - the allocation is quite deep in support code, and passing down an extra flag to say if __GFP_NOFAIL is wanted would be clumsy. Many of these currently use congestion_wait() which (in almost all cases) simply waits the given timeout - congestion isn't tracked for most devices. It isn't clear what the best delay is for loops, but it is clear that the various filesystems shouldn't be responsible for choosing a timeout. This patch introduces memalloc_retry_wait(), which takes on that responsibility. Code that wants to retry a memory allocation can call this function, passing the GFP flags that were used. It will wait for however long is appropriate. For now, it only considers __GFP_NORETRY and whatever gfpflags_allow_blocking() tests. If blocking is allowed without __GFP_NORETRY, then alloc_page either made some reclaim progress, or waited for a while, before failing. So there is no need for much further waiting. memalloc_retry_wait() will wait until the current jiffie ends. If this condition is not met, then alloc_page() won't have waited much if at all. In that case memalloc_retry_wait() waits about 200ms. This is the delay that most current loops use. linux/sched/mm.h needs to be included in some files now, but linux/backing-dev.h does not. Link: https://lkml.kernel.org/r/163754371968.13692.1277530886009912421@noble.neil.brown.name Signed-off-by: NeilBrown Cc: Dave Chinner Cc: Michal Hocko Cc: "Theodore Ts'o" Cc: Jaegeuk Kim Cc: Chao Yu Cc: Darrick J.
Wong Cc: Chuck Lever Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/extents.c | 8 +++----- fs/ext4/inline.c | 5 ++--- fs/ext4/page-io.c | 9 +++++---- fs/f2fs/data.c | 4 ++-- fs/f2fs/gc.c | 5 ++--- fs/f2fs/inode.c | 4 ++-- fs/f2fs/node.c | 4 ++-- fs/f2fs/recovery.c | 6 +++--- fs/f2fs/segment.c | 9 +++------ fs/f2fs/super.c | 5 ++--- fs/xfs/kmem.c | 3 +-- fs/xfs/xfs_buf.c | 2 +- include/linux/sched/mm.h | 26 ++++++++++++++++++++++++++ net/sunrpc/svc_xprt.c | 3 ++- 14 files changed, 56 insertions(+), 37 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0ecf819bf189..5582fba36b44 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -27,8 +27,8 @@ #include #include #include -#include #include +#include #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" @@ -4407,8 +4407,7 @@ retry: err = ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); if (err == -ENOMEM) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_ATOMIC); goto retry; } if (err) @@ -4416,8 +4415,7 @@ retry: retry_remove_space: err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); if (err == -ENOMEM) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_ATOMIC); goto retry_remove_space; } return err; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 39a1ab129fdc..635bcf68a67e 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include "ext4_jbd2.h" #include "ext4.h" @@ -1929,8 +1929,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) retry: err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); if (err == -ENOMEM) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_ATOMIC); goto retry; } if (err) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 9cb261714991..1d370364230e 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include "ext4_jbd2.h" #include "xattr.h" @@ -523,12 +523,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io, ret = PTR_ERR(bounce_page); if (ret == -ENOMEM && (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) { - gfp_flags = GFP_NOFS; + gfp_t new_gfp_flags = GFP_NOFS; if (io->io_bio) ext4_io_submit(io); else - gfp_flags |= __GFP_NOFAIL; - congestion_wait(BLK_RW_ASYNC, HZ/50); + new_gfp_flags |= __GFP_NOFAIL; + memalloc_retry_wait(gfp_flags); + gfp_flags = new_gfp_flags; goto retry_encrypt; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9f754aaef558..aacf5e4dcc57 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -8,9 +8,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -2542,7 +2542,7 @@ retry_encrypt: /* flush pending IOs and wait for a while in the ENOMEM case */ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { f2fs_flush_merged_writes(fio->sbi); - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); gfp_flags |= __GFP_NOFAIL; goto retry_encrypt; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a946ce0ead34..374bbb5294d9 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -7,7 +7,6 @@ */ #include #include -#include #include #include #include @@ -15,6 +14,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -1375,8 +1375,7 @@ retry: if (err) { clear_page_private_gcing(page); if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - 
DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry; } if (is_dirty) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 0f8b2df3e1e0..4c11254a07d4 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -8,8 +8,8 @@ #include #include #include -#include #include +#include #include "f2fs.h" #include "node.h" @@ -562,7 +562,7 @@ retry: inode = f2fs_iget(sb, ino); if (IS_ERR(inode)) { if (PTR_ERR(inode) == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry; } } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 556fcd8457f3..219506ca9a97 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include @@ -2750,7 +2750,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) retry: ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); if (!ipage) { - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry; } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 6a1b4668d933..d1664a0567ef 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" #include "segment.h" @@ -587,7 +588,7 @@ retry_dn: err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry_dn; } goto out; @@ -670,8 +671,7 @@ retry_prev: err = check_index_in_prev_nodes(sbi, dest, &dn); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry_prev; } goto err; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index df9ed75f0b7a..40fdb4a8daeb 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -245,9 +246,7 @@ retry: LOOKUP_NODE); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); - cond_resched(); + memalloc_retry_wait(GFP_NOFS); goto retry; } err = -EAGAIN; @@ -424,9 +423,7 @@ retry: err = f2fs_do_write_data_page(&fio); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); - cond_resched(); + memalloc_retry_wait(GFP_NOFS); goto retry; } unlock_page(page); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 040b6d02e1d8..3bace24f8800 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -8,9 +8,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -2415,8 +2415,7 @@ repeat: page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS); if (IS_ERR(page)) { if (PTR_ERR(page) == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto repeat; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 6f49bf39183c..c557a030acfe 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -4,7 +4,6 @@ * All Rights Reserved. 
*/ #include "xfs.h" -#include #include "xfs_message.h" #include "xfs_trace.h" @@ -26,6 +25,6 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)", current->comm, current->pid, (unsigned int)size, __func__, lflags); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(lflags); } while (1); } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 631c5a61d89b..6c45e3fa56f4 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -394,7 +394,7 @@ xfs_buf_alloc_pages( } XFS_STATS_INC(bp->b_mount, xb_page_retries); - congestion_wait(BLK_RW_ASYNC, HZ / 50); + memalloc_retry_wait(gfp_mask); } return 0; } diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index aca874d33fe6..aa5f09ca5bcf 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -214,6 +214,32 @@ static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif +/* Any memory-allocation retry loop should use + * memalloc_retry_wait(), and pass the flags for the most + * constrained allocation attempt that might have failed. + * This provides useful documentation of where loops are, + * and a central place to fine tune the waiting as the MM + * implementation changes. + */ +static inline void memalloc_retry_wait(gfp_t gfp_flags) +{ + /* We use io_schedule_timeout because waiting for memory + * typically included waiting for dirty pages to be + * written out, which requires IO. + */ + __set_current_state(TASK_UNINTERRUPTIBLE); + gfp_flags = current_gfp_context(gfp_flags); + if (gfpflags_allow_blocking(gfp_flags) && + !(gfp_flags & __GFP_NORETRY)) + /* Probably waited already, no need for much more */ + io_schedule_timeout(1); + else + /* Probably didn't wait, and has now released a lock, + * so now is a good time to wait + */ + io_schedule_timeout(HZ/50); +} + /** * might_alloc - Mark possible allocation sites * @gfp_mask: gfp_t flags that would be used to allocate diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 1e99ba1b9d72..9cb18b822ab2 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -688,7 +689,7 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) return -EINTR; } trace_svc_alloc_arg_err(pages); - schedule_timeout(msecs_to_jiffies(500)); + memalloc_retry_wait(GFP_KERNEL); } rqstp->rq_page_end = &rqstp->rq_pages[pages]; rqstp->rq_pages[pages] = NULL; /* this might be seen in nfsd_splice_actor() */ -- cgit From 39c65a94cd9661532be150e88f8b02f4a6844a35 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 14 Jan 2022 14:07:17 -0800 Subject: mm/pagealloc: sysctl: change watermark_scale_factor max limit to 30% For embedded systems with low total memory, having to run applications with relatively large memory requirements, 10% max limitation for watermark_scale_factor poses an issue of triggering direct reclaim every time such application is started. This results in slow application startup times and bad end-user experience. By increasing watermark_scale_factor max limit we allow vendors more flexibility to choose the right level of kswapd aggressiveness for their device and workload requirements. 
Link: https://lkml.kernel.org/r/20211124193604.2758863-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Lukas Middendorf Cc: Antti Palosaari Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mel Gorman Cc: Jonathan Corbet Cc: Zhang Yi Cc: Fengfei Xi Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/vm.rst | 2 +- kernel/sysctl.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 5e795202111f..f4804ce37c58 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -948,7 +948,7 @@ how much memory needs to be free before kswapd goes back to sleep. The unit is in fractions of 10,000. The default value of 10 means the distances between watermarks are 0.1% of the available memory in the -node/system. The maximum value is 1000, or 10% of memory. +node/system. The maximum value is 3000, or 30% of memory. A high rate of threads entering direct reclaim (allocstall) or kswapd going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 083be6af29d7..2ab4edb6e450 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -122,6 +122,7 @@ static unsigned long long_max = LONG_MAX; static int one_hundred = 100; static int two_hundred = 200; static int one_thousand = 1000; +static int three_thousand = 3000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif @@ -2959,7 +2960,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = watermark_scale_factor_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = &one_thousand, + .extra2 = &three_thousand, }, { .procname = "percpu_pagelist_high_fraction", -- cgit From 1611f74a94ba2e0f2d25b75008ed8e76e122097a Mon Sep 17 00:00:00 2001 From: Changcheng Deng Date: Fri, 14 Jan 2022 14:07:21 -0800 Subject: mm: fix boolreturn.cocci warning Return statements in functions returning bool should use true/false instead of 1/0. 
Link: https://lkml.kernel.org/r/20211126073327.74815-1-deng.changcheng@zte.com.cn Signed-off-by: Changcheng Deng Reported-by: Zeal Robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b5f14d581113..18423c2157e8 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -383,7 +383,7 @@ static __always_inline int TestClearPage##uname(struct page *page) \ TESTCLEARFLAG(uname, lname, policy) #define TESTPAGEFLAG_FALSE(uname, lname) \ -static inline bool folio_test_##lname(const struct folio *folio) { return 0; } \ +static inline bool folio_test_##lname(const struct folio *folio) { return false; } \ static inline int Page##uname(const struct page *page) { return 0; } #define SETPAGEFLAG_NOOP(uname, lname) \ -- cgit From ca831f29f8f25c97182e726429b38c0802200c8f Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Fri, 14 Jan 2022 14:07:24 -0800 Subject: mm: page_alloc: fix building error on -Werror=array-compare Arthur Marsh reported we would hit the error below when building the kernel with gcc-12: CC mm/page_alloc.o mm/page_alloc.c: In function `mem_init_print_info': mm/page_alloc.c:8173:27: error: comparison between two arrays [-Werror=array-compare] 8173 | if (start <= pos && pos < end && size > adj) \ | In C++20, the comparison between arrays should be warned about. Link: https://lkml.kernel.org/r/20211125130928.32465-1-sxwjean@me.com Signed-off-by: Xiongwei Song Reported-by: Arthur Marsh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 806f317c2e7e..c4ef450ac442 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8228,7 +8228,7 @@ void __init mem_init_print_info(void) */ #define adj_init_size(start, end, size, pos, adj) \ do { \ - if (start <= pos && pos < end && size > adj) \ + if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \ size -= adj; \ } while (0) -- cgit From be1a13eb51077b2ec5f7f4306f93dfece503a3f1 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 14 Jan 2022 14:07:27 -0800 Subject: mm: drop node from alloc_pages_vma alloc_pages_vma is meant to allocate a page with a vma specific memory policy. The initial node parameter is always a local node, so it is pointless to waste a function argument for this. Drop the parameter.
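A hedged caller-side sketch (the function below is hypothetical, not from this patch): call sites simply drop the numa_node_id() argument, since the local node is now picked up inside alloc_pages_vma().

  #include <linux/gfp.h>
  #include <linux/mm.h>

  /* Hypothetical fault-path helper using the new five-argument form. */
  static struct page *example_alloc_vma_page(struct vm_area_struct *vma,
                                             unsigned long addr)
  {
          return alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
  }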
Link: https://lkml.kernel.org/r/YaSnlv4QpryEpesG@dhcp22.suse.cz Signed-off-by: Michal Hocko Cc: Aneesh Kumar K.V Cc: Ben Widawsky Cc: Dave Hansen Cc: Feng Tang Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: "Huang, Ying" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 8 ++++---- mm/mempolicy.c | 3 ++- mm/shmem.c | 3 +-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 8fcc38467af6..78b58448f796 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -598,9 +598,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned int order); struct folio *folio_alloc(gfp_t gfp, unsigned order); extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, - int node, bool hugepage); + bool hugepage); #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ - alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) + alloc_pages_vma(gfp_mask, order, vma, addr, true) #else static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { @@ -610,14 +610,14 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return __folio_alloc_node(gfp, order, numa_node_id()); } -#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ +#define alloc_pages_vma(gfp_mask, order, vma, addr, false)\ alloc_pages(gfp_mask, order) #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) + alloc_pages_vma(gfp_mask, 0, vma, addr, false) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 679f47b3a079..ed7d15acb6a2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2084,9 +2084,10 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * Return: The page on success or NULL if allocation fails. */ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, int node, bool hugepage) + unsigned long addr, bool hugepage) { struct mempolicy *pol; + int node = numa_node_id(); struct page *page; int preferred_nid; nodemask_t *nmask; diff --git a/mm/shmem.c b/mm/shmem.c index 8f940552c182..0700e9acf53b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1564,8 +1564,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, return NULL; shmem_pseudo_vma_init(&pvma, info, hindex); - page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), - true); + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); -- cgit From 04a536bfbd0f885338eecc2a4503dfca50ac94dd Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Fri, 14 Jan 2022 14:07:30 -0800 Subject: include/linux/gfp.h: further document GFP_DMA32 kmalloc(..., GFP_DMA32) does not return DMA32 memory because the DMA32 kmalloc cache array is not implemented. (Reason: there is no such user in kernel). Put a short comment about this so people can understand this by reading the comment. 
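For illustration (a hedged sketch, not part of the patch): code that genuinely needs 32-bit addressable memory should request it from the page allocator, which does honour the zone modifier, rather than from kmalloc().

  #include <linux/gfp.h>

  /* Illustrative only: kmalloc(..., GFP_DMA32) is served from the normal
   * kmalloc caches, but alloc_pages() really allocates from ZONE_DMA32. */
  static struct page *example_get_dma32_page(void)
  {
          return alloc_pages(GFP_KERNEL | GFP_DMA32, 0);
  }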
[1] https://lists.linuxfoundation.org/pipermail/iommu/2018-December/031696.html Link: https://lkml.kernel.org/r/20211207093610.6406-1-miles.chen@mediatek.com Signed-off-by: Miles Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 78b58448f796..80f63c862be5 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -302,7 +302,9 @@ struct vm_area_struct; * lowest zone as a type of emergency reserve. * * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit - * address. + * address. Note that kmalloc(..., GFP_DMA32) does not return DMA32 memory + * because the DMA32 kmalloc cache array is not implemented. + * (Reason: there is no such user in kernel). * * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace, * do not need to be directly accessible by the kernel but that cannot -- cgit From eaab8e753632b8e961701d02a5bb398c820f309c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 14 Jan 2022 14:07:33 -0800 Subject: mm/page_alloc.c: modify the comment section for alloc_contig_pages() Clarify that the alloc_contig_pages() allocated range will always be aligned to the requested nr_pages. Link: https://lkml.kernel.org/r/1639545478-12160-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c4ef450ac442..3eace0065ecc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -9272,8 +9272,8 @@ static bool zone_spans_last_pfn(const struct zone *zone, * for allocation requests which can not be fulfilled with the buddy allocator. * * The allocated memory is always aligned to a page boundary. If nr_pages is a - * power of two then the alignment is guaranteed to be to the given nr_pages - * (e.g. 1GB request would be aligned to 1GB). + * power of two, then allocated range is also guaranteed to be aligned to same + * nr_pages (e.g. 1GB request would be aligned to 1GB). * * Allocated pages can be freed with free_contig_range() or by manually calling * __free_page() on each allocated page. -- cgit From 62b3107073646e0946bd97ff926832bafb846d17 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 14 Jan 2022 14:07:37 -0800 Subject: mm_zone: add function to check if managed dma zone exists Patch series "Handle warning of allocation failure on DMA zone w/o managed pages", v4. **Problem observed: On x86_64, when crash is triggered and entering into kdump kernel, page allocation failure can always be seen. --------------------------------- DMA: preallocated 128 KiB GFP_KERNEL pool for atomic allocations swapper/0: page allocation failure: order:5, mode:0xcc1(GFP_KERNEL|GFP_DMA), nodemask=(null),cpuset=/,mems_allowed=0 CPU: 0 PID: 1 Comm: swapper/0 Call Trace: dump_stack+0x7f/0xa1 warn_alloc.cold+0x72/0xd6 ...... __alloc_pages+0x24d/0x2c0 ...... dma_atomic_pool_init+0xdb/0x176 do_one_initcall+0x67/0x320 ? rcu_read_lock_sched_held+0x3f/0x80 kernel_init_freeable+0x290/0x2dc ? rest_init+0x24f/0x24f kernel_init+0xa/0x111 ret_from_fork+0x22/0x30 Mem-Info: ------------------------------------ ***Root cause: In the current kernel, it assumes that DMA zone must have managed pages and try to request pages if CONFIG_ZONE_DMA is enabled. While this is not always true. 
E.g. in the kdump kernel of x86_64, only the low 1M is present and locked down at a very early stage of boot, so that this low 1M won't be added into the buddy allocator to become managed pages of the DMA zone. This exception will always cause page allocation failure if a page is requested from the DMA zone. ***Investigation: This failure happens since the below commits were merged into Linus's tree. 1a6a9044b967 x86/setup: Remove CONFIG_X86_RESERVE_LOW and reservelow= options 23721c8e92f7 x86/crash: Remove crash_reserve_low_1M() f1d4d47c5851 x86/setup: Always reserve the first 1M of RAM 7c321eb2b843 x86/kdump: Remove the backup region handling 6f599d84231f x86/kdump: Always reserve the low 1M when the crashkernel option is specified Before them, on x86_64, the low 640K area would be reused by the kdump kernel. So in the kdump kernel, the content of the low 640K area was copied into a backup region for dumping before jumping into kdump. Then, except for the firmware-reserved regions in [0, 640K], the remaining area was added into the buddy allocator to become available managed pages of the DMA zone. However, after the above commits were applied, in the kdump kernel of x86_64 the low 1M is reserved by memblock, but not released to the buddy allocator. So any later page allocation requested from the DMA zone will fail. At the beginning, if crashkernel is reserved, the low 1M needs to be locked down because AMD SME encrypts memory, making the old backup region mechanism impossible when switching into the kdump kernel. Later, it was also observed that there are BIOSes corrupting memory under 1M. To solve this, in commit f1d4d47c5851, the entire region of low 1M is always reserved after the real mode trampoline is allocated. Besides, recently an Intel engineer mentioned that their TDX (Trusted Domain Extensions), which is under development in the kernel, also needs to lock down the low 1M. So we can't simply revert the above commits to fix the page allocation failure from the DMA zone as someone suggested. ***Solution: Currently, only the DMA atomic pool and dma-kmalloc will initialize and request page allocation with GFP_DMA during bootup. So only initialize the DMA atomic pool when the DMA zone has available managed pages, otherwise just skip the initialization. For dma-kmalloc(), for the time being, let's mute the warning of allocation failure if requesting pages from the DMA zone while it has no managed pages. Meanwhile, change code to use the dma_alloc_xx/dma_map_xx API to replace kmalloc(GFP_DMA), or do not use GFP_DMA when calling kmalloc() if not necessary. Christoph is posting patches to fix those under drivers/scsi/. Finally, we can remove the need for dma-kmalloc() as people suggested. This patch (of 3): In some places of the current kernel, it is assumed that the DMA zone must have managed pages if CONFIG_ZONE_DMA is enabled, while this is not always true. E.g. in the kdump kernel of x86_64, only the low 1M is present and locked down at a very early stage of boot, so that there are no managed pages at all in the DMA zone. This exception will always cause page allocation failure if a page is requested from the DMA zone. Here, add the function has_managed_dma() and the relevant helper functions to check whether there is a DMA zone with managed pages. It will be used in later patches.
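The later patches in the series use it roughly like the hedged sketch below (hypothetical caller, for illustration only): a GFP_DMA request is attempted only when the DMA zone actually has managed pages.

  #include <linux/mmzone.h>
  #include <linux/gfp.h>

  /* Hypothetical caller: fall back to a normal allocation when the
   * DMA zone is empty, e.g. in an x86_64 kdump kernel. */
  static struct page *example_alloc_low_page(void)
  {
          gfp_t gfp = GFP_KERNEL;

          if (has_managed_dma())
                  gfp |= GFP_DMA;

          return alloc_page(gfp);
  }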
Link: https://lkml.kernel.org/r/20211223094435.248523-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20211223094435.248523-2-bhe@redhat.com Fixes: 6f599d84231f ("x86/kdump: Always reserve the low 1M when the crashkernel option is specified") Signed-off-by: Baoquan He Reviewed-by: David Hildenbrand Acked-by: John Donnelly Cc: Christoph Hellwig Cc: Christoph Lameter Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: David Laight Cc: Borislav Petkov Cc: Marek Szyprowski Cc: Robin Murphy Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 9 +++++++++ mm/page_alloc.c | 15 +++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 936dc0b6c226..aed44e9b5d89 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1047,6 +1047,15 @@ static inline int is_highmem_idx(enum zone_type idx) #endif } +#ifdef CONFIG_ZONE_DMA +bool has_managed_dma(void); +#else +static inline bool has_managed_dma(void) +{ + return false; +} +#endif + /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3eace0065ecc..e2ad2303f634 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -9518,3 +9518,18 @@ bool take_page_off_buddy(struct page *page) return ret; } #endif + +#ifdef CONFIG_ZONE_DMA +bool has_managed_dma(void) +{ + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) { + struct zone *zone = &pgdat->node_zones[ZONE_DMA]; + + if (managed_zone(zone)) + return true; + } + return false; +} +#endif /* CONFIG_ZONE_DMA */ -- cgit From a674e48c5443d12a8a43c3ac42367aa39505d506 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 14 Jan 2022 14:07:41 -0800 Subject: dma/pool: create dma atomic pool only if dma zone has managed pages Currently three dma atomic pools are initialized as long as the relevant kernel codes are built in. While in kdump kernel of x86_64, this is not right when trying to create atomic_pool_dma, because there's no managed pages in DMA zone. In the case, DMA zone only has low 1M memory presented and locked down by memblock allocator. So no pages are added into buddy of DMA zone. Please check commit f1d4d47c5851 ("x86/setup: Always reserve the first 1M of RAM"). Then in kdump kernel of x86_64, it always prints below failure message: DMA: preallocated 128 KiB GFP_KERNEL pool for atomic allocations swapper/0: page allocation failure: order:5, mode:0xcc1(GFP_KERNEL|GFP_DMA), nodemask=(null),cpuset=/,mems_allowed=0 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-0.rc5.20210611git929d931f2b40.42.fc35.x86_64 #1 Hardware name: Dell Inc. PowerEdge R910/0P658H, BIOS 2.12.0 06/04/2018 Call Trace: dump_stack+0x7f/0xa1 warn_alloc.cold+0x72/0xd6 __alloc_pages_slowpath.constprop.0+0xf29/0xf50 __alloc_pages+0x24d/0x2c0 alloc_page_interleave+0x13/0xb0 atomic_pool_expand+0x118/0x210 __dma_atomic_pool_init+0x45/0x93 dma_atomic_pool_init+0xdb/0x176 do_one_initcall+0x67/0x320 kernel_init_freeable+0x290/0x2dc kernel_init+0xa/0x111 ret_from_fork+0x22/0x30 Mem-Info: ...... DMA: failed to allocate 128 KiB GFP_KERNEL|GFP_DMA pool for atomic allocation DMA: preallocated 128 KiB GFP_KERNEL|GFP_DMA32 pool for atomic allocations Here, let's check if DMA zone has managed pages, then create atomic_pool_dma if yes. Otherwise just skip it. 
Link: https://lkml.kernel.org/r/20211223094435.248523-3-bhe@redhat.com Fixes: 6f599d84231f ("x86/kdump: Always reserve the low 1M when the crashkernel option is specified") Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Acked-by: John Donnelly Reviewed-by: David Hildenbrand Cc: Marek Szyprowski Cc: Robin Murphy Cc: Borislav Petkov Cc: Christoph Lameter Cc: David Laight Cc: David Rientjes Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/dma/pool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 5f84e6cdb78e..4d40dcce7604 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -203,7 +203,7 @@ static int __init dma_atomic_pool_init(void) GFP_KERNEL); if (!atomic_pool_kernel) ret = -ENOMEM; - if (IS_ENABLED(CONFIG_ZONE_DMA)) { + if (has_managed_dma()) { atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size, GFP_KERNEL | GFP_DMA); if (!atomic_pool_dma) @@ -226,7 +226,7 @@ static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp) if (prev == NULL) { if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) return atomic_pool_dma32; - if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) + if (atomic_pool_dma && (gfp & GFP_DMA)) return atomic_pool_dma; return atomic_pool_kernel; } -- cgit From c4dc63f0032c77464fbd4e7a6afc22fa6913c4a7 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 14 Jan 2022 14:07:44 -0800 Subject: mm/page_alloc.c: do not warn allocation failure on zone DMA if no managed pages In kdump kernel of x86_64, page allocation failure is observed: kworker/u2:2: page allocation failure: order:0, mode:0xcc1(GFP_KERNEL|GFP_DMA), nodemask=(null),cpuset=/,mems_allowed=0 CPU: 0 PID: 55 Comm: kworker/u2:2 Not tainted 5.16.0-rc4+ #5 Hardware name: AMD Dinar/Dinar, BIOS RDN1505B 06/05/2013 Workqueue: events_unbound async_run_entry_fn Call Trace: dump_stack_lvl+0x48/0x5e warn_alloc.cold+0x72/0xd6 __alloc_pages_slowpath.constprop.0+0xc69/0xcd0 __alloc_pages+0x1df/0x210 new_slab+0x389/0x4d0 ___slab_alloc+0x58f/0x770 __slab_alloc.constprop.0+0x4a/0x80 kmem_cache_alloc_trace+0x24b/0x2c0 sr_probe+0x1db/0x620 ...... device_add+0x405/0x920 ...... __scsi_add_device+0xe5/0x100 ata_scsi_scan_host+0x97/0x1d0 async_run_entry_fn+0x30/0x130 process_one_work+0x1e8/0x3c0 worker_thread+0x50/0x3b0 ? rescuer_thread+0x350/0x350 kthread+0x16b/0x190 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x22/0x30 Mem-Info: ...... The above failure happened when calling kmalloc() to allocate buffer with GFP_DMA. It requests to allocate slab page from DMA zone while no managed pages at all in there. sr_probe() --> get_capabilities() --> buffer = kmalloc(512, GFP_KERNEL | GFP_DMA); Because in the current kernel, dma-kmalloc will be created as long as CONFIG_ZONE_DMA is enabled. However, kdump kernel of x86_64 doesn't have managed pages on DMA zone since commit 6f599d84231f ("x86/kdump: Always reserve the low 1M when the crashkernel option is specified"). The failure can be always reproduced. For now, let's mute the warning of allocation failure if requesting pages from DMA zone while no managed pages. 
[akpm@linux-foundation.org: fix warning] Link: https://lkml.kernel.org/r/20211223094435.248523-4-bhe@redhat.com Fixes: 6f599d84231f ("x86/kdump: Always reserve the low 1M when the crashkernel option is specified") Signed-off-by: Baoquan He Acked-by: John Donnelly Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Borislav Petkov Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Laight Cc: Marek Szyprowski Cc: Robin Murphy Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e2ad2303f634..99fc65c532f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4218,7 +4218,9 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) va_list args; static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1); - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) + if ((gfp_mask & __GFP_NOWARN) || + !__ratelimit(&nopage_rs) || + ((gfp_mask & __GFP_DMA) && !has_managed_dma())) return; va_start(args, fmt); -- cgit From f47761999052b1cc987dd3e3d3adf47997358fc0 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Fri, 14 Jan 2022 14:07:48 -0800 Subject: hugetlb: add hugetlb.*.numa_stat file For hugetlb backed jobs/VMs it's critical to understand the numa information for the memory backing these jobs to deliver optimal performance. Currently this technically can be queried from /proc/self/numa_maps, but there are significant issues with that. Namely: 1. Memory can be mapped or unmapped. 2. numa_maps are per process and need to be aggregated across all processes in the cgroup. For shared memory this is more involved as the userspace needs to make sure it doesn't double count shared mappings. 3. I believe querying numa_maps needs to hold the mmap_lock which adds to the contention on this lock. For these reasons I propose simply adding hugetlb.*.numa_stat file, which shows the numa information of the cgroup similarly to memory.numa_stat. On cgroup-v2: cat /sys/fs/cgroup/unified/test/hugetlb.2MB.numa_stat total=2097152 N0=2097152 N1=0 On cgroup-v1: cat /sys/fs/cgroup/hugetlb/test/hugetlb.2MB.numa_stat total=2097152 N0=2097152 N1=0 hierarichal_total=2097152 N0=2097152 N1=0 This patch was tested manually by allocating hugetlb memory and querying the hugetlb.*.numa_stat file of the cgroup and its parents. 
[colin.i.king@googlemail.com: fix spelling mistake "hierarichal" -> "hierarchical"] Link: https://lkml.kernel.org/r/20211125090635.23508-1-colin.i.king@gmail.com [keescook@chromium.org: fix copy/paste array assignment] Link: https://lkml.kernel.org/r/20211203065647.2819707-1-keescook@chromium.org Link: https://lkml.kernel.org/r/20211123001020.4083653-1-almasrymina@google.com Signed-off-by: Mina Almasry Signed-off-by: Colin Ian King Signed-off-by: Kees Cook Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Shuah Khan Cc: Miaohe Lin Cc: Oscar Salvador Cc: Michal Hocko Cc: David Rientjes Cc: Jue Wang Cc: Yang Yao Cc: Joanna Li Cc: Cannon Matthews Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v1/hugetlb.rst | 4 + Documentation/admin-guide/cgroup-v2.rst | 5 + include/linux/hugetlb.h | 4 +- include/linux/hugetlb_cgroup.h | 7 ++ mm/hugetlb_cgroup.c | 133 ++++++++++++++++++++++-- 5 files changed, 141 insertions(+), 12 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/hugetlb.rst b/Documentation/admin-guide/cgroup-v1/hugetlb.rst index 338f2c7d7a1c..0fa724d82abb 100644 --- a/Documentation/admin-guide/cgroup-v1/hugetlb.rst +++ b/Documentation/admin-guide/cgroup-v1/hugetlb.rst @@ -29,12 +29,14 @@ Brief summary of control files:: hugetlb..max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded hugetlb..usage_in_bytes # show current usage for "hugepagesize" hugetlb hugetlb..failcnt # show the number of allocation failure due to HugeTLB usage limit + hugetlb..numa_stat # show the numa information of the hugetlb memory charged to this cgroup For a system supporting three hugepage sizes (64k, 32M and 1G), the control files include:: hugetlb.1GB.limit_in_bytes hugetlb.1GB.max_usage_in_bytes + hugetlb.1GB.numa_stat hugetlb.1GB.usage_in_bytes hugetlb.1GB.failcnt hugetlb.1GB.rsvd.limit_in_bytes @@ -43,6 +45,7 @@ files include:: hugetlb.1GB.rsvd.failcnt hugetlb.64KB.limit_in_bytes hugetlb.64KB.max_usage_in_bytes + hugetlb.64KB.numa_stat hugetlb.64KB.usage_in_bytes hugetlb.64KB.failcnt hugetlb.64KB.rsvd.limit_in_bytes @@ -51,6 +54,7 @@ files include:: hugetlb.64KB.rsvd.failcnt hugetlb.32MB.limit_in_bytes hugetlb.32MB.max_usage_in_bytes + hugetlb.32MB.numa_stat hugetlb.32MB.usage_in_bytes hugetlb.32MB.failcnt hugetlb.32MB.rsvd.limit_in_bytes diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 4f400b03dddf..5aa368d165da 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2266,6 +2266,11 @@ HugeTLB Interface Files are local to the cgroup i.e. not hierarchical. The file modified event generated on this file reflects only the local events. + hugetlb..numa_stat + Similar to memory.numa_stat, it shows the numa information of the + hugetlb pages of in this cgroup. Only active in + use hugetlb pages are included. The per-node values are in bytes. 
+ Misc ---- diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 00351ccb49a3..d1897a69c540 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -622,8 +622,8 @@ struct hstate { #endif #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ - struct cftype cgroup_files_dfl[7]; - struct cftype cgroup_files_legacy[9]; + struct cftype cgroup_files_dfl[8]; + struct cftype cgroup_files_legacy[10]; #endif char name[HSTATE_NAME_LEN]; }; diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index ba025ae27882..379344828e78 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -36,6 +36,11 @@ enum hugetlb_memory_event { HUGETLB_NR_MEMORY_EVENTS, }; +struct hugetlb_cgroup_per_node { + /* hugetlb usage in pages over all hstates. */ + unsigned long usage[HUGE_MAX_HSTATE]; +}; + struct hugetlb_cgroup { struct cgroup_subsys_state css; @@ -57,6 +62,8 @@ struct hugetlb_cgroup { /* Handle for "hugetlb.events.local" */ struct cgroup_file events_local_file[HUGE_MAX_HSTATE]; + + struct hugetlb_cgroup_per_node *nodeinfo[]; }; static inline struct hugetlb_cgroup * diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 79d93534ef1e..f9942841df18 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -123,29 +123,58 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, } } +static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup) +{ + int node; + + for_each_node(node) + kfree(h_cgroup->nodeinfo[node]); + kfree(h_cgroup); +} + static struct cgroup_subsys_state * hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); struct hugetlb_cgroup *h_cgroup; + int node; + + h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids), + GFP_KERNEL); - h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); if (!h_cgroup) return ERR_PTR(-ENOMEM); if (!parent_h_cgroup) root_h_cgroup = h_cgroup; + /* + * TODO: this routine can waste much memory for nodes which will + * never be onlined. It's better to use memory hotplug callback + * function. + */ + for_each_node(node) { + /* Set node_to_alloc to -1 for offline nodes. */ + int node_to_alloc = + node_state(node, N_NORMAL_MEMORY) ? node : -1; + h_cgroup->nodeinfo[node] = + kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), + GFP_KERNEL, node_to_alloc); + if (!h_cgroup->nodeinfo[node]) + goto fail_alloc_nodeinfo; + } + hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); return &h_cgroup->css; + +fail_alloc_nodeinfo: + hugetlb_cgroup_free(h_cgroup); + return ERR_PTR(-ENOMEM); } static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) { - struct hugetlb_cgroup *h_cgroup; - - h_cgroup = hugetlb_cgroup_from_css(css); - kfree(h_cgroup); + hugetlb_cgroup_free(hugetlb_cgroup_from_css(css)); } /* @@ -289,7 +318,17 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, return; __set_hugetlb_cgroup(page, h_cg, rsvd); - return; + if (!rsvd) { + unsigned long usage = + h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + /* + * This write is not atomic due to fetching usage and writing + * to it, but that's fine because we call this with + * hugetlb_lock held anyway. 
+ */ + WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + usage + nr_pages); + } } void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, @@ -328,8 +367,17 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, if (rsvd) css_put(&h_cg->css); - - return; + else { + unsigned long usage = + h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + /* + * This write is not atomic due to fetching usage and writing + * to it, but that's fine because we call this with + * hugetlb_lock held anyway. + */ + WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + usage - nr_pages); + } } void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, @@ -418,6 +466,59 @@ enum { RES_RSVD_FAILCNT, }; +static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy) +{ + int nid; + struct cftype *cft = seq_cft(seq); + int idx = MEMFILE_IDX(cft->private); + bool legacy = MEMFILE_ATTR(cft->private); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); + struct cgroup_subsys_state *css; + unsigned long usage; + + if (legacy) { + /* Add up usage across all nodes for the non-hierarchical total. */ + usage = 0; + for_each_node_state(nid, N_MEMORY) + usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]); + seq_printf(seq, "total=%lu", usage * PAGE_SIZE); + + /* Simply print the per-node usage for the non-hierarchical total. */ + for_each_node_state(nid, N_MEMORY) + seq_printf(seq, " N%d=%lu", nid, + READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) * + PAGE_SIZE); + seq_putc(seq, '\n'); + } + + /* + * The hierarchical total is pretty much the value recorded by the + * counter, so use that. + */ + seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "", + page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE); + + /* + * For each node, transverse the css tree to obtain the hierarchical + * node usage. 
+ */ + for_each_node_state(nid, N_MEMORY) { + usage = 0; + rcu_read_lock(); + css_for_each_descendant_pre(css, &h_cg->css) { + usage += READ_ONCE(hugetlb_cgroup_from_css(css) + ->nodeinfo[nid] + ->usage[idx]); + } + rcu_read_unlock(); + seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE); + } + + seq_putc(seq, '\n'); + + return 0; +} + static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -668,8 +769,14 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx) events_local_file[idx]); cft->flags = CFTYPE_NOT_ON_ROOT; - /* NULL terminate the last cft */ + /* Add the numa stat file */ cft = &h->cgroup_files_dfl[6]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); + cft->seq_show = hugetlb_cgroup_read_numa_stat; + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* NULL terminate the last cft */ + cft = &h->cgroup_files_dfl[7]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, @@ -739,8 +846,14 @@ static void __init __hugetlb_cgroup_file_legacy_init(int idx) cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; - /* NULL terminate the last cft */ + /* Add the numa stat file */ cft = &h->cgroup_files_legacy[8]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); + cft->private = MEMFILE_PRIVATE(idx, 1); + cft->seq_show = hugetlb_cgroup_read_numa_stat; + + /* NULL terminate the last cft */ + cft = &h->cgroup_files_legacy[9]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, -- cgit From f77a286de48c04f7dd248b41584645afc3613bb1 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 14 Jan 2022 14:07:52 -0800 Subject: mm, hugepages: make memory size variable in hugepage-mremap selftest The hugetlb vma mremap() test currently maps 1GB of memory to trigger pmd sharing and make sure that 'unshare' path in mremap code works. The test originally only mapped 10MB of memory (as specified by the header comment) but was later modified to 1GB to tackle this case. However, not all machines will have 1GB of memory to spare for this test. Adding a mapping size arg will allow run_vmtest.sh to pass an adequate mapping size, while allowing users to run the test independently with arbitrary size mappings. Link: https://lkml.kernel.org/r/20211124203805.3700355-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Cc: Shuah Khan Cc: Mina Almasry Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/hugepage-mremap.c | 46 ++++++++++++++++++---------- tools/testing/selftests/vm/run_vmtests.sh | 2 +- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c index 257df94697a5..2a7c33631a29 100644 --- a/tools/testing/selftests/vm/hugepage-mremap.c +++ b/tools/testing/selftests/vm/hugepage-mremap.c @@ -4,7 +4,11 @@ * * Example of remapping huge page memory in a user application using the * mremap system call. Code assumes a hugetlbfs filesystem is mounted - * at './huge'. The code will use 10MB worth of huge pages. + * at './huge'. The amount of memory used by this test is decided by a command + * line argument in MBs. If missing, the default amount is 10MB. + * + * To make sure the test triggers pmd sharing and goes through the 'unshare' + * path in the mremap code use 1GB (1024) or more. 
*/ #define _GNU_SOURCE @@ -18,8 +22,10 @@ #include #include -#define LENGTH (1UL * 1024 * 1024 * 1024) +#define DEFAULT_LENGTH_MB 10UL +#define MB_TO_BYTES(x) (x * 1024 * 1024) +#define FILE_NAME "huge/hugepagefile" #define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC) #define FLAGS (MAP_SHARED | MAP_ANONYMOUS) @@ -28,20 +34,20 @@ static void check_bytes(char *addr) printf("First hex is %x\n", *((unsigned int *)addr)); } -static void write_bytes(char *addr) +static void write_bytes(char *addr, size_t len) { unsigned long i; - for (i = 0; i < LENGTH; i++) + for (i = 0; i < len; i++) *(addr + i) = (char)i; } -static int read_bytes(char *addr) +static int read_bytes(char *addr, size_t len) { unsigned long i; check_bytes(addr); - for (i = 0; i < LENGTH; i++) + for (i = 0; i < len; i++) if (*(addr + i) != (char)i) { printf("Mismatch at %lu\n", i); return 1; @@ -99,11 +105,19 @@ static void register_region_with_uffd(char *addr, size_t len) } } -int main(void) +int main(int argc, char *argv[]) { + /* Read memory length as the first arg if valid, otherwise fallback to + * the default length. Any additional args are ignored. + */ + size_t length = argc > 1 ? (size_t)atoi(argv[1]) : 0UL; + + length = length > 0 ? length : DEFAULT_LENGTH_MB; + length = MB_TO_BYTES(length); + int ret = 0; - int fd = open("/huge/test", O_CREAT | O_RDWR, 0755); + int fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755); if (fd < 0) { perror("Open failed"); @@ -112,7 +126,7 @@ int main(void) /* mmap to a PUD aligned address to hopefully trigger pmd sharing. */ unsigned long suggested_addr = 0x7eaa40000000; - void *haddr = mmap((void *)suggested_addr, LENGTH, PROTECTION, + void *haddr = mmap((void *)suggested_addr, length, PROTECTION, MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); printf("Map haddr: Returned address is %p\n", haddr); if (haddr == MAP_FAILED) { @@ -122,7 +136,7 @@ int main(void) /* mmap again to a dummy address to hopefully trigger pmd sharing. */ suggested_addr = 0x7daa40000000; - void *daddr = mmap((void *)suggested_addr, LENGTH, PROTECTION, + void *daddr = mmap((void *)suggested_addr, length, PROTECTION, MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); printf("Map daddr: Returned address is %p\n", daddr); if (daddr == MAP_FAILED) { @@ -132,16 +146,16 @@ int main(void) suggested_addr = 0x7faa40000000; void *vaddr = - mmap((void *)suggested_addr, LENGTH, PROTECTION, FLAGS, -1, 0); + mmap((void *)suggested_addr, length, PROTECTION, FLAGS, -1, 0); printf("Map vaddr: Returned address is %p\n", vaddr); if (vaddr == MAP_FAILED) { perror("mmap2"); exit(1); } - register_region_with_uffd(haddr, LENGTH); + register_region_with_uffd(haddr, length); - void *addr = mremap(haddr, LENGTH, LENGTH, + void *addr = mremap(haddr, length, length, MREMAP_MAYMOVE | MREMAP_FIXED, vaddr); if (addr == MAP_FAILED) { perror("mremap"); @@ -150,10 +164,10 @@ int main(void) printf("Mremap: Returned address is %p\n", addr); check_bytes(addr); - write_bytes(addr); - ret = read_bytes(addr); + write_bytes(addr, length); + ret = read_bytes(addr, length); - munmap(addr, LENGTH); + munmap(addr, length); return ret; } diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index a24d30af3094..75d401741394 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -111,7 +111,7 @@ fi echo "-----------------------" echo "running hugepage-mremap" echo "-----------------------" -./hugepage-mremap +./hugepage-mremap 256 if [ $? 
-ne 0 ]; then echo "[FAIL]" exitcode=1 -- cgit From e9ea874a8ffb0f8ebed4f4981531a32c5b663d79 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Fri, 14 Jan 2022 14:07:55 -0800 Subject: mm/vmstat: add events for THP max_ptes_* exceeds There are interfaces to adjust max_ptes_none, max_ptes_swap, max_ptes_shared values, see /sys/kernel/mm/transparent_hugepage/khugepaged/. But system administrator may not know which value is the best. So Add those events to support adjusting max_ptes_* to suitable values. For example, if default max_ptes_swap value causes too much failures, and system uses zram whose IO is fast, administrator could increase max_ptes_swap until THP_SCAN_EXCEED_SWAP_PTE not increase anymore. Link: https://lkml.kernel.org/r/20211225094036.574157-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Cc: "Huang, Ying" Cc: Dave Hansen Cc: Minchan Kim Cc: Saravanan D Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 3 +++ mm/khugepaged.c | 7 +++++++ mm/vmstat.c | 3 +++ 3 files changed, 13 insertions(+) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index a185cc75ff52..7b2363388bfa 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -98,6 +98,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_SPLIT_PAGE_FAILED, THP_DEFERRED_SPLIT_PAGE, THP_SPLIT_PMD, + THP_SCAN_EXCEED_NONE_PTE, + THP_SCAN_EXCEED_SWAP_PTE, + THP_SCAN_EXCEED_SHARED_PTE, #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD THP_SPLIT_PUD, #endif diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 698ea19775ac..02071f213c58 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -618,6 +618,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, continue; } else { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out; } } @@ -636,6 +637,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (page_mapcount(page) > 1 && ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out; } @@ -1253,6 +1255,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, continue; } else { result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); goto out_unmap; } } @@ -1262,6 +1265,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, continue; } else { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out_unmap; } } @@ -1290,6 +1294,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, if (page_mapcount(page) > 1 && ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out_unmap; } @@ -2000,6 +2005,7 @@ static void khugepaged_scan_file(struct mm_struct *mm, if (xa_is_value(page)) { if (++swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; } continue; @@ -2046,6 +2052,7 @@ static void khugepaged_scan_file(struct mm_struct *mm, if (result == SCAN_SUCCEED) { if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { node = khugepaged_find_target_node(); collapse_file(mm, file, start, hpage, node); diff --git a/mm/vmstat.c b/mm/vmstat.c index d701c335628c..4057372745d0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1353,6 +1353,9 @@ const char * const vmstat_text[] = { "thp_split_page_failed", "thp_deferred_split_page", 
"thp_split_pmd", + "thp_scan_exceed_none_pte", + "thp_scan_exceed_swap_pte", + "thp_scan_exceed_share_pte", #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD "thp_split_pud", #endif -- cgit From 209376ed2a8431ccb4c40fdcef11194fc1e749b0 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 14 Jan 2022 14:07:58 -0800 Subject: selftests/vm: make charge_reserved_hugetlb.sh work with existing cgroup setting The hugetlb cgroup reservation test charge_reserved_hugetlb.sh assume that no cgroup filesystems are mounted before running the test. That is not true in many cases. As a result, the test fails to run. Fix that by querying the current cgroup mount setting and using the existing cgroup setup instead before attempting to freshly mount a cgroup filesystem. Similar change is also made for hugetlb_reparenting_test.sh as well, though it still has problem if cgroup v2 isn't used. The patched test scripts were run on a centos 8 based system to verify that they ran properly. Link: https://lkml.kernel.org/r/20220106201359.1646575-1-longman@redhat.com Fixes: 29750f71a9b4 ("hugetlb_cgroup: add hugetlb_cgroup reservation tests") Signed-off-by: Waiman Long Acked-by: Mina Almasry Cc: Shuah Khan Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../selftests/vm/charge_reserved_hugetlb.sh | 34 +++++++++++++--------- .../selftests/vm/hugetlb_reparenting_test.sh | 21 ++++++++----- tools/testing/selftests/vm/write_hugetlb_memory.sh | 2 +- 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh index fe8fcfb334e0..a5cb4b09a46c 100644 --- a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh @@ -24,19 +24,23 @@ if [[ "$1" == "-cgroup-v2" ]]; then reservation_usage_file=rsvd.current fi -cgroup_path=/dev/cgroup/memory -if [[ ! -e $cgroup_path ]]; then - mkdir -p $cgroup_path - if [[ $cgroup2 ]]; then +if [[ $cgroup2 ]]; then + cgroup_path=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') + if [[ -z "$cgroup_path" ]]; then + cgroup_path=/dev/cgroup/memory mount -t cgroup2 none $cgroup_path - else + do_umount=1 + fi + echo "+hugetlb" >$cgroup_path/cgroup.subtree_control +else + cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}') + if [[ -z "$cgroup_path" ]]; then + cgroup_path=/dev/cgroup/memory mount -t cgroup memory,hugetlb $cgroup_path + do_umount=1 fi fi - -if [[ $cgroup2 ]]; then - echo "+hugetlb" >/dev/cgroup/memory/cgroup.subtree_control -fi +export cgroup_path function cleanup() { if [[ $cgroup2 ]]; then @@ -108,7 +112,7 @@ function setup_cgroup() { function wait_for_hugetlb_memory_to_get_depleted() { local cgroup="$1" - local path="/dev/cgroup/memory/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" + local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" # Wait for hugetlbfs memory to get depleted. while [ $(cat $path) != 0 ]; do echo Waiting for hugetlb memory to get depleted. @@ -121,7 +125,7 @@ function wait_for_hugetlb_memory_to_get_reserved() { local cgroup="$1" local size="$2" - local path="/dev/cgroup/memory/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" + local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" # Wait for hugetlbfs memory to get written. while [ $(cat $path) != $size ]; do echo Waiting for hugetlb memory reservation to reach size $size. 
@@ -134,7 +138,7 @@ function wait_for_hugetlb_memory_to_get_written() { local cgroup="$1" local size="$2" - local path="/dev/cgroup/memory/$cgroup/hugetlb.${MB}MB.$fault_usage_file" + local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file" # Wait for hugetlbfs memory to get written. while [ $(cat $path) != $size ]; do echo Waiting for hugetlb memory to reach size $size. @@ -574,5 +578,7 @@ for populate in "" "-o"; do done # populate done # method -umount $cgroup_path -rmdir $cgroup_path +if [[ $do_umount ]]; then + umount $cgroup_path + rmdir $cgroup_path +fi diff --git a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh index 4a9a3afe9fd4..bf2d2a684edf 100644 --- a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh +++ b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh @@ -18,19 +18,24 @@ if [[ "$1" == "-cgroup-v2" ]]; then usage_file=current fi -CGROUP_ROOT='/dev/cgroup/memory' -MNT='/mnt/huge/' -if [[ ! -e $CGROUP_ROOT ]]; then - mkdir -p $CGROUP_ROOT - if [[ $cgroup2 ]]; then +if [[ $cgroup2 ]]; then + CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') + if [[ -z "$CGROUP_ROOT" ]]; then + CGROUP_ROOT=/dev/cgroup/memory mount -t cgroup2 none $CGROUP_ROOT - sleep 1 - echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control - else + do_umount=1 + fi + echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control +else + CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}') + if [[ -z "$CGROUP_ROOT" ]]; then + CGROUP_ROOT=/dev/cgroup/memory mount -t cgroup memory,hugetlb $CGROUP_ROOT + do_umount=1 fi fi +MNT='/mnt/huge/' function get_machine_hugepage_size() { hpz=$(grep -i hugepagesize /proc/meminfo) diff --git a/tools/testing/selftests/vm/write_hugetlb_memory.sh b/tools/testing/selftests/vm/write_hugetlb_memory.sh index d3d0d108924d..70a02301f4c2 100644 --- a/tools/testing/selftests/vm/write_hugetlb_memory.sh +++ b/tools/testing/selftests/vm/write_hugetlb_memory.sh @@ -14,7 +14,7 @@ want_sleep=$8 reserve=$9 echo "Putting task in cgroup '$cgroup'" -echo $$ > /dev/cgroup/memory/"$cgroup"/cgroup.procs +echo $$ > ${cgroup_path:-/dev/cgroup/memory}/"$cgroup"/cgroup.procs echo "Method is $method" -- cgit From fab51505480058dcb63d973515c748fa6c437cab Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 14 Jan 2022 14:08:01 -0800 Subject: selftests/uffd: allow EINTR/EAGAIN This allow test to continue with interruptions like gdb. 
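A minimal user-space sketch of the same retry idea, shown for illustration only (it is not part of the patch, and the read_retry() helper name is made up for the example); it restarts a blocking read that a debugger or signal interrupted, mirroring what the patch does in uffd_read_msg() and uffd_poll_thread():

	#include <errno.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Restart a blocking read that was interrupted, e.g. by an attached gdb. */
	static ssize_t read_retry(int fd, void *buf, size_t len)
	{
		for (;;) {
			ssize_t ret = read(fd, buf, len);

			if (ret < 0 && (errno == EINTR || errno == EAGAIN))
				continue;	/* interrupted or not ready yet: try again */
			return ret;
		}
	}

	int main(void)
	{
		char c;

		if (read_retry(STDIN_FILENO, &c, 1) == 1)
			printf("read 0x%02x\n", (unsigned char)c);
		return 0;
	}

As in the selftest, the EAGAIN case assumes the descriptor is expected to become readable again shortly, so simply retrying is acceptable.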
Link: https://lkml.kernel.org/r/20211115135219.85881-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Axel Rasmussen Cc: Andrea Arcangeli Cc: Nadav Amit Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 990f7aecf4a3..c9eddd25ba17 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -648,7 +648,7 @@ static int uffd_read_msg(int ufd, struct uffd_msg *msg) if (ret != sizeof(*msg)) { if (ret < 0) { - if (errno == EAGAIN) + if (errno == EAGAIN || errno == EINTR) return 1; err("blocking read error"); } else { @@ -724,8 +724,11 @@ static void *uffd_poll_thread(void *arg) for (;;) { ret = poll(pollfd, 2, -1); - if (ret <= 0) + if (ret <= 0) { + if (errno == EINTR || errno == EAGAIN) + continue; err("poll error: %d", ret); + } if (pollfd[1].revents & POLLIN) { if (read(pollfd[1].fd, &tmp_chr, 1) != 1) err("read pipefd error"); -- cgit From 692b55815cf970eb4ce428f48f2c94d1800acc4b Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 14 Jan 2022 14:08:04 -0800 Subject: userfaultfd/selftests: clean up hugetlb allocation code The message for commit f5c73297181c ("userfaultfd/selftests: fix hugetlb area allocations") says there is no need to create a hugetlb file in the non-shared testing case. However, the commit did not actually change the code to prevent creation of the file. While it is technically true that there is no need to create and use a hugetlb file in the case of non-shared-testing, it is useful. This is because 'hole punching' of a hugetlb file has the potentially incorrect side effect of also removing pages from private mappings. The userfaultfd test relies on this side effect for removing pages from the destination buffer during rounds of stress testing. Remove the incomplete code that was added to deal with no hugetlb file. Just keep the code that prevents reserves from being created for the destination area. Link: https://lkml.kernel.org/r/20220104021729.111006-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Axel Rasmussen Cc: Peter Xu Cc: Andrea Arcangeli Cc: Mina Almasry Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index c9eddd25ba17..d3fd24f9fae8 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -87,7 +87,7 @@ static bool test_uffdio_minor = false; static bool map_shared; static int shm_fd; -static int huge_fd = -1; /* only used for hugetlb_shared test */ +static int huge_fd; static char *huge_fd_off0; static unsigned long long *count_verify; static int uffd = -1; @@ -223,9 +223,6 @@ static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) static void hugetlb_release_pages(char *rel_area) { - if (huge_fd == -1) - return; - if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, rel_area == huge_fd_off0 ? 0 : nr_pages * page_size, nr_pages * page_size)) @@ -238,17 +235,17 @@ static void hugetlb_allocate_area(void **alloc_area) char **alloc_area_alias; *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - map_shared ? 
MAP_SHARED : - MAP_PRIVATE | MAP_HUGETLB | + (map_shared ? MAP_SHARED : MAP_PRIVATE) | + MAP_HUGETLB | (*alloc_area == area_src ? 0 : MAP_NORESERVE), - huge_fd, - *alloc_area == area_src ? 0 : nr_pages * page_size); + huge_fd, *alloc_area == area_src ? 0 : + nr_pages * page_size); if (*alloc_area == MAP_FAILED) err("mmap of hugetlbfs file failed"); if (map_shared) { area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_SHARED, + MAP_SHARED | MAP_HUGETLB, huge_fd, *alloc_area == area_src ? 0 : nr_pages * page_size); if (area_alias == MAP_FAILED) -- cgit From e4b424b7ec8791087375bb1f2480a3ba05d21e0b Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 14 Jan 2022 14:08:07 -0800 Subject: vmscan: make drop_slab_node static drop_slab_node is only used in drop_slab. So remove it's declaration from header file and add keyword static for it's definition. Link: https://lkml.kernel.org/r/20211111062445.5236-1-ligang.bdlg@bytedance.com Signed-off-by: Gang Li Reviewed-by: David Hildenbrand Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/vmscan.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index cef65f9cbdf2..eb67eb699b78 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3122,7 +3122,6 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *, #endif void drop_slab(void); -void drop_slab_node(int nid); #ifndef CONFIG_MMU #define randomize_va_space 0 diff --git a/mm/vmscan.c b/mm/vmscan.c index 700434db5735..090bfb605ecf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -951,7 +951,7 @@ out: return freed; } -void drop_slab_node(int nid) +static void drop_slab_node(int nid) { unsigned long freed; int shift = 0; -- cgit From 721fb891ad0b3956d5c168b2931e3e5e4fb7ca40 Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Fri, 14 Jan 2022 14:08:10 -0800 Subject: mm/page_isolation: unset migratetype directly for non Buddy page In unset_migratetype_isolate(), we can bypass the call to move_freepages_block() for non-buddy pages. It will save a few cpu cycles for some situations such as cma and hugetlb when allocating continue pages, in these situation function alloc_contig_pages will be called. alloc_contig_pages __alloc_contig_migrate_range isolate_freepages_range ==> pages has been remove from buddy undo_isolate_page_range unset_migratetype_isolate ==> can directly set migratetype [osalvador@suse.de: changelog tweak] Link: https://lkml.kernel.org/r/20211229033649.2760586-1-chenwandun@huawei.com Fixes: 3c605096d315 ("mm/page_alloc: restrict max order of merging on isolated pageblock") Signed-off-by: Chen Wandun Reviewed-by: Oscar Salvador Cc: Vlastimil Babka Cc: Joonsoo Kim Cc: Wang Kefeng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f67c4c70f17f..6a0ddda6b3c5 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -115,7 +115,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * onlining - just onlined memory won't immediately be considered for * allocation. 
*/ - if (!isolated_page) { + if (!isolated_page && PageBuddy(page)) { nr_pages = move_freepages_block(zone, page, migratetype, NULL); __mod_zone_freepage_state(zone, nr_pages, migratetype); } -- cgit From c04551162167368022a61899843821bbf015b473 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 14 Jan 2022 14:08:14 -0800 Subject: mm/mempolicy: use policy_node helper with MPOL_PREFERRED_MANY Patch series "mm: add new syscall set_mempolicy_home_node", v6. This patch (of 3): A followup patch will enable setting a home node with MPOL_PREFERRED_MANY memory policy. To facilitate that switch to using policy_node helper. There is no functional change in this patch. Link: https://lkml.kernel.org/r/20211202123810.267175-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20211202123810.267175-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Michal Hocko Cc: Ben Widawsky Cc: Dave Hansen Cc: Feng Tang Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ed7d15acb6a2..de70f119984a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2062,7 +2062,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); if (!page) - page = __alloc_pages(gfp, order, numa_node_id(), NULL); + page = __alloc_pages(gfp, order, nid, NULL); return page; } @@ -2104,6 +2104,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, } if (pol->mode == MPOL_PREFERRED_MANY) { + node = policy_node(gfp, pol, node); page = alloc_pages_preferred_many(gfp, order, node, pol); mpol_cond_put(pol); goto out; @@ -2187,7 +2188,7 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else if (pol->mode == MPOL_PREFERRED_MANY) page = alloc_pages_preferred_many(gfp, order, - numa_node_id(), pol); + policy_node(gfp, pol, numa_node_id()), pol); else page = __alloc_pages(gfp, order, policy_node(gfp, pol, numa_node_id()), -- cgit From c6018b4b254971863bd0ad36bb5e7d0fa0f0ddb0 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 14 Jan 2022 14:08:17 -0800 Subject: mm/mempolicy: add set_mempolicy_home_node syscall This syscall can be used to set a home node for the MPOL_BIND and MPOL_PREFERRED_MANY memory policy. Users should use this syscall after setting up a memory policy for the specified range as shown below. mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node((unsigned long)p, nr_pages * page_size, home_node, 0); The syscall allows specifying a home node/preferred node from which kernel will fulfill memory allocation requests first. For address range with MPOL_BIND memory policy, if nodemask specifies more than one node, page allocations will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. For MPOL_PREFERRED_MANY if the nodemask specifies more than one node, page allocation will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. 
If there is not enough memory in all the nodes specified in the nodemask, the allocation will be attempted from the closest numa node to the home node in the system. This helps applications to hint at a memory allocation preference node and fallback to _only_ a set of nodes if the memory is not available on the preferred node. Fallback allocation is attempted from the node which is nearest to the preferred node. This helps applications to have control on memory allocation numa nodes and avoids default fallback to slow memory NUMA nodes. For example a system with NUMA nodes 1,2 and 3 with DRAM memory and 10, 11 and 12 of slow memory new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(new_nodes, 1); numa_bitmask_setbit(new_nodes, 2); numa_bitmask_setbit(new_nodes, 3); p = mmap(NULL, nr_pages * page_size, protflag, mapflag, -1, 0); mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node(p, nr_pages * page_size, 2, 0); This will allocate from nodes closer to node 2 and will make sure the kernel will only allocate from nodes 1, 2, and 3. Memory will not be allocated from slow memory nodes 10, 11, and 12. This differs from default MPOL_BIND behavior in that with default MPOL_BIND the allocation will be attempted from node closer to the local node. One of the reasons to specify a home node is to allow allocations from cpu less NUMA node and its nearby NUMA nodes. With MPOL_PREFERRED_MANY on the other hand will first try to allocate from the closest node to node 2 from the node list 1, 2 and 3. If those nodes don't have enough memory, kernel will allocate from slow memory node 10, 11 and 12 which ever is closer to node 2. Link: https://lkml.kernel.org/r/20211202123810.267175-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Ben Widawsky Cc: Dave Hansen Cc: Feng Tang Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/mm/numa_memory_policy.rst | 16 ++++- include/linux/mempolicy.h | 1 + mm/mempolicy.c | 79 ++++++++++++++++++++++ 3 files changed, 95 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index 64fd0ba0d057..5a6afecbb0d0 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -408,7 +408,7 @@ follows: Memory Policy APIs ================== -Linux supports 3 system calls for controlling memory policy. These APIS +Linux supports 4 system calls for controlling memory policy. These APIS always affect only the calling task, the calling task's address space, or some shared object mapped into the calling task's address space. @@ -460,6 +460,20 @@ requested via the 'flags' argument. See the mbind(2) man page for more details. +Set home node for a Range of Task's Address Spacec:: + + long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, + unsigned long home_node, + unsigned long flags); + +sys_set_mempolicy_home_node set the home node for a VMA policy present in the +task's address range. The system call updates the home node only for the existing +mempolicy range. Other address ranges are ignored. A home node is the NUMA node +closest to which page allocation will come from. 
Specifying the home node override +the default allocation policy to allocate memory close to the local node for an +executing CPU. + + Memory Policy Command Line Interface ==================================== diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 3c7595e81150..668389b4b53d 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -46,6 +46,7 @@ struct mempolicy { unsigned short mode; /* See MPOL_* above */ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */ nodemask_t nodes; /* interleave/bind/perfer */ + int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ union { nodemask_t cpuset_mems_allowed; /* relative to these nodes */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index de70f119984a..fc6cae7926f3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -296,6 +296,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, atomic_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; + policy->home_node = NUMA_NO_NODE; return policy; } @@ -1478,6 +1479,77 @@ static long kernel_mbind(unsigned long start, unsigned long len, return do_mbind(start, len, lmode, mode_flags, &nodes, flags); } +SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, + unsigned long, home_node, unsigned long, flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct mempolicy *new; + unsigned long vmstart; + unsigned long vmend; + unsigned long end; + int err = -ENOENT; + + start = untagged_addr(start); + if (start & ~PAGE_MASK) + return -EINVAL; + /* + * flags is used for future extension if any. + */ + if (flags != 0) + return -EINVAL; + + /* + * Check home_node is online to avoid accessing uninitialized + * NODE_DATA. + */ + if (home_node >= MAX_NUMNODES || !node_online(home_node)) + return -EINVAL; + + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + + if (end < start) + return -EINVAL; + if (end == start) + return 0; + mmap_write_lock(mm); + vma = find_vma(mm, start); + for (; vma && vma->vm_start < end; vma = vma->vm_next) { + + vmstart = max(start, vma->vm_start); + vmend = min(end, vma->vm_end); + new = mpol_dup(vma_policy(vma)); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } + /* + * Only update home node if there is an existing vma policy + */ + if (!new) + continue; + + /* + * If any vma in the range got policy other than MPOL_BIND + * or MPOL_PREFERRED_MANY we return error. We don't reset + * the home node for vmas we already updated before. 
+ */ + if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) { + err = -EOPNOTSUPP; + break; + } + + new->home_node = home_node; + err = mbind_range(mm, vmstart, vmend, new); + mpol_put(new); + if (err) + break; + } + mmap_write_unlock(mm); + return err; +} + SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) @@ -1802,6 +1874,11 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); } + if ((policy->mode == MPOL_BIND || + policy->mode == MPOL_PREFERRED_MANY) && + policy->home_node != NUMA_NO_NODE) + return policy->home_node; + return nd; } @@ -2344,6 +2421,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) return false; if (a->flags != b->flags) return false; + if (a->home_node != b->home_node) + return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) return false; -- cgit From 21b084fdf2a49ca1634e8e360e9ab6f9ff0dee11 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 14 Jan 2022 14:08:21 -0800 Subject: mm/mempolicy: wire up syscall set_mempolicy_home_node Link: https://lkml.kernel.org/r/20211202123810.267175-4-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Ben Widawsky Cc: Dave Hansen Cc: Feng Tang Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 ++ arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/linux/syscalls.h | 3 +++ include/uapi/asm-generic/unistd.h | 5 ++++- kernel/sys_ni.c | 1 + 21 files changed, 27 insertions(+), 2 deletions(-) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index ca5a32228cd6..3515bc4f16a4 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -489,3 +489,4 @@ # 557 reserved for memfd_secret 558 common process_mrelease sys_process_mrelease 559 common futex_waitv sys_futex_waitv +560 common set_mempolicy_home_node sys_ni_syscall diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 543100151f2b..ac964612d8b0 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -463,3 +463,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 6bdb5f5db438..4e65da3445c7 100644 --- a/arch/arm64/include/asm/unistd.h +++ 
b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 450 +#define __NR_compat_syscalls 451 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 41ea1195e44b..604a2053d006 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -905,6 +905,8 @@ __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) __SYSCALL(__NR_process_mrelease, sys_process_mrelease) #define __NR_futex_waitv 449 __SYSCALL(__NR_futex_waitv, sys_futex_waitv) +#define __NR_set_mempolicy_home_node 450 +__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 707ae121f6d3..78b1d03e86e1 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -370,3 +370,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 45bc32a41b90..b1f3940bc298 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -449,3 +449,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 2204bde3ce4a..820145e47350 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -455,3 +455,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 72d02d363f36..253ff994ed2e 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -388,3 +388,4 @@ # 447 reserved for memfd_secret 448 n32 process_mrelease sys_process_mrelease 449 n32 futex_waitv sys_futex_waitv +450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index e2c481fcede6..3f1886ad9d80 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -364,3 +364,4 @@ # 447 reserved for memfd_secret 448 n64 process_mrelease sys_process_mrelease 449 n64 futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 3714c97b2643..8f243e35a7b2 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -437,3 +437,4 @@ # 447 reserved for memfd_secret 448 o32 process_mrelease sys_process_mrelease 449 o32 futex_waitv sys_futex_waitv +450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl 
index 358c00000755..68b46fe2f17c 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -447,3 +447,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 15109af9d075..2600b4237292 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -529,3 +529,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index ed9c5c2eafad..799147658dee 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -452,3 +452,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index d9539d28bdaa..2de85c977f54 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -452,3 +452,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 46adabcb1720..4398cc6fb68d 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -495,3 +495,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 7e25543693de..320480a8db4f 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -454,3 +454,4 @@ 447 i386 memfd_secret sys_memfd_secret 448 i386 process_mrelease sys_process_mrelease 449 i386 futex_waitv sys_futex_waitv +450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index fe8f8dd157b4..c84d12608cd2 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -371,6 +371,7 @@ 447 common memfd_secret sys_memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 3e3e1a506bed..52c94ab5c205 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -420,3 +420,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 
528a478dbda8..819c0cb00b6d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1057,6 +1057,9 @@ asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type ru const void __user *rule_attr, __u32 flags); asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags); asmlinkage long sys_memfd_secret(unsigned int flags); +asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, + unsigned long home_node, + unsigned long flags); /* * Architecture-specific system calls diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 4557a8b6086f..1c48b0ae3ba3 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -883,8 +883,11 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) #define __NR_futex_waitv 449 __SYSCALL(__NR_futex_waitv, sys_futex_waitv) +#define __NR_set_mempolicy_home_node 450 +__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) + #undef __NR_syscalls -#define __NR_syscalls 450 +#define __NR_syscalls 451 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d1944258cfc0..a492f159624f 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -297,6 +297,7 @@ COND_SYSCALL(get_mempolicy); COND_SYSCALL(set_mempolicy); COND_SYSCALL(migrate_pages); COND_SYSCALL(move_pages); +COND_SYSCALL(set_mempolicy_home_node); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); -- cgit From dad5b0232949818ae581ebd089c7013e2fdbb093 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 14 Jan 2022 14:08:24 -0800 Subject: mm/mempolicy: fix all kernel-doc warnings Fix kernel-doc warnings in mempolicy.c: mempolicy.c:139: warning: No description found for return value of 'numa_map_to_online_node' mempolicy.c:2165: warning: Excess function parameter 'node' description in 'alloc_pages_vma' mempolicy.c:2973: warning: No description found for return value of 'mpol_parse_str' Link: https://lkml.kernel.org/r/20211213233216.5477-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fc6cae7926f3..028e8dd82b44 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -134,6 +134,8 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES]; * @node: Node id to start the search * * Lookup the next closest node by distance if @nid is not online. + * + * Return: this @node if it is online, otherwise the closest node by distance */ int numa_map_to_online_node(int node) { @@ -2150,7 +2152,6 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * @order: Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual address of the allocation. Must be inside @vma. - * @node: Which node to prefer for allocation (modulo policy). * @hugepage: For hugepages try only the preferred node if possible. 
* * Allocate a page for a specific address in @vma, using the appropriate @@ -2966,7 +2967,7 @@ static const char * const policy_modes[] = * Format of input: * [=][:] * - * On success, returns 0, else 1 + * Return: %0 on success, else %1 */ int mpol_parse_str(char *str, struct mempolicy **mpol) { -- cgit From f530243a172d2ff03f88d0056f838928d6445c6d Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 14 Jan 2022 14:08:27 -0800 Subject: mm, oom: OOM sysrq should always kill a process The OOM kill sysrq (alt+sysrq+F) should allow the user to kill the process with the highest OOM badness with a single execution. However, at the moment, the OOM kill can bail out if an OOM notifier (e.g. the i915 one) says that it reclaimed a tiny amount of memory from somewhere. That's probably not what the user wants, so skip the bailout if the OOM was triggered via sysrq. Link: https://lkml.kernel.org/r/20220106102605.635656-1-jannh@google.com Signed-off-by: Jann Horn Acked-by: Michal Hocko Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3390316c8a32..3934ff500878 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1058,7 +1058,7 @@ bool out_of_memory(struct oom_control *oc) if (!is_memcg_oom(oc)) { blocking_notifier_call_chain(&oom_notify_list, 0, &freed); - if (freed > 0) + if (freed > 0 && !is_sysrq_oom(oc)) /* Got some memory back in the last second. */ return true; } -- cgit From d6aba4c8e20d4d2bf65d589953f6d891c178f3a3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 14 Jan 2022 14:08:30 -0800 Subject: hugetlbfs: fix off-by-one error in hugetlb_vmdelete_list() Pass "end - 1" instead of "end" when walking the interval tree in hugetlb_vmdelete_list() to fix an inclusive vs. exclusive bug. The two callers that pass a non-zero "end" treat it as exclusive, whereas the interval tree iterator expects an inclusive "last". E.g. punching a hole in a file that precisely matches the size of a single hugepage, with a vma starting right on the boundary, will result in unmap_hugepage_range() being called twice, with the second call having start==end. The off-by-one error doesn't cause functional problems as __unmap_hugepage_range() turns into a massive nop due to short-circuiting its for-loop on "address < end". But, the mmu_notifier invocations to invalid_range_{start,end}() are passed a bogus zero-sized range, which may be unexpected behavior for secondary MMUs. The bug was exposed by commit ed922739c919 ("KVM: Use interval tree to do fast hva lookup in memslots"), currently queued in the KVM tree for 5.17, which added a WARN to detect ranges with start==end. 
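The inclusive/exclusive mismatch is easy to see with concrete numbers. The stand-alone sketch below is illustrative only and is not the kernel code: the overlaps() helper is hypothetical, a 2MB hugepage is assumed, and byte offsets stand in for the pgoff units the interval tree really uses. It reproduces the spurious match for a vma that starts exactly at the exclusive end of the punched hole:

	#include <stdio.h>

	#define HPAGE_SIZE (2UL * 1024 * 1024)

	/* Overlap test over [first, last], both inclusive, like the interval tree. */
	static int overlaps(unsigned long vm_start, unsigned long vm_end_excl,
			    unsigned long first, unsigned long last)
	{
		return vm_start <= last && first < vm_end_excl;
	}

	int main(void)
	{
		unsigned long hole_start = 0;			/* punch exactly one hugepage */
		unsigned long hole_end = HPAGE_SIZE;		/* exclusive end */
		unsigned long vma_start = HPAGE_SIZE;		/* vma begins on the boundary */
		unsigned long vma_end = 2 * HPAGE_SIZE;		/* exclusive end */

		printf("last = end     -> overlap = %d (spurious, zero-sized unmap)\n",
		       overlaps(vma_start, vma_end, hole_start, hole_end));
		printf("last = end - 1 -> overlap = %d (correct, vma skipped)\n",
		       overlaps(vma_start, vma_end, hole_start, hole_end - 1));
		return 0;
	}

Passing the exclusive end directly reports an overlap, which is exactly the start==end unmap the commit message describes; passing end - 1 does not.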
Link: https://lkml.kernel.org/r/20211228234257.1926057-1-seanjc@google.com Fixes: 1bfad99ab425 ("hugetlbfs: hugetlb_vmtruncate_list() needs to take a range to delete") Signed-off-by: Sean Christopherson Reported-by: syzbot+4e697fe80a31aa7efe21@syzkaller.appspotmail.com Reviewed-by: Mike Kravetz Cc: Paolo Bonzini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 49d2e686be74..a7c6c7498be0 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -409,10 +409,11 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) struct vm_area_struct *vma; /* - * end == 0 indicates that the entire range after - * start should be unmapped. + * end == 0 indicates that the entire range after start should be + * unmapped. Note, end is exclusive, whereas the interval tree takes + * an inclusive "last". */ - vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { + vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) { unsigned long v_offset; unsigned long v_end; -- cgit From b5bade978e9b8f42521ccef711642bd21313cf44 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 14 Jan 2022 14:08:34 -0800 Subject: mm: migrate: fix the return value of migrate_pages() Patch series "Improve the migration stats". According to talk with Zi Yan [1], this patch set changes the return value of migrate_pages() to avoid returning a number which is larger than the number of pages the users tried to migrate by move_pages() syscall. Also fix the hugetlb migration stats and migration stats in trace_mm_compaction_migratepages(). [1] https://lore.kernel.org/linux-mm/7E44019D-2A5D-4BA7-B4D5-00D4712F1687@nvidia.com/ This patch (of 3): As Zi Yan pointed out, the syscall move_pages() can return a non-migrated number larger than the number of pages the users tried to migrate, when a THP page is failed to migrate. This is confusing for users. Since other migration scenarios do not care about the actual non-migrated number of pages except the memory compaction migration which will fix in following patch. Thus we can change the return value to return the number of {normal page, THP, hugetlb} instead to avoid this issue, and the number of THP splits will be considered as the number of non-migrated THP, no matter how many subpages of the THP are migrated successfully. Meanwhile we should still keep the migration counters using the number of normal pages. Link: https://lkml.kernel.org/r/cover.1636275127.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/6486fabc3e8c66ff613e150af25e89b3147977a6.1636275127.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Signed-off-by: Zi Yan Co-developed-by: Zi Yan Cc: Steven Rostedt (VMware) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 63 +++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 16 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 6aa4b5326784..57bc8b97490c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1421,7 +1421,7 @@ static inline int try_split_thp(struct page *page, struct page **page2, * @mode: The migration mode that specifies the constraints for * page migration, if any. * @reason: The reason for page migration. 
- * @ret_succeeded: Set to the number of pages migrated successfully if + * @ret_succeeded: Set to the number of normal pages migrated successfully if * the caller passes a non-NULL pointer. * * The function returns after 10 attempts or if no pages are movable any more @@ -1429,7 +1429,9 @@ static inline int try_split_thp(struct page *page, struct page **page2, * It is caller's responsibility to call putback_movable_pages() to return pages * to the LRU or free list only if ret != 0. * - * Returns the number of pages that were not migrated, or an error code. + * Returns the number of {normal page, THP} that were not migrated, or an error code. + * The number of THP splits will be considered as the number of non-migrated THP, + * no matter how many subpages of the THP are migrated successfully. */ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, @@ -1438,6 +1440,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int retry = 1; int thp_retry = 1; int nr_failed = 0; + int nr_failed_pages = 0; int nr_succeeded = 0; int nr_thp_succeeded = 0; int nr_thp_failed = 0; @@ -1449,13 +1452,16 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int swapwrite = current->flags & PF_SWAPWRITE; int rc, nr_subpages; LIST_HEAD(ret_pages); + LIST_HEAD(thp_split_pages); bool nosplit = (reason == MR_NUMA_MISPLACED); + bool no_subpage_counting = false; trace_mm_migrate_pages_start(mode, reason); if (!swapwrite) current->flags |= PF_SWAPWRITE; +thp_subpage_migration: for (pass = 0; pass < 10 && (retry || thp_retry); pass++) { retry = 0; thp_retry = 0; @@ -1504,18 +1510,20 @@ retry: case -ENOSYS: /* THP migration is unsupported */ if (is_thp) { - if (!try_split_thp(page, &page2, from)) { + nr_thp_failed++; + if (!try_split_thp(page, &page2, &thp_split_pages)) { nr_thp_split++; goto retry; } - nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; break; } /* Hugetlb migration is unsupported */ - nr_failed++; + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages++; break; case -ENOMEM: /* @@ -1524,16 +1532,19 @@ retry: * THP NUMA faulting doesn't split THP to retry. */ if (is_thp && !nosplit) { - if (!try_split_thp(page, &page2, from)) { + nr_thp_failed++; + if (!try_split_thp(page, &page2, &thp_split_pages)) { nr_thp_split++; goto retry; } - nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; goto out; } - nr_failed++; + + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages++; goto out; case -EAGAIN: if (is_thp) { @@ -1559,17 +1570,37 @@ retry: */ if (is_thp) { nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; break; } - nr_failed++; + + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages++; break; } } } - nr_failed += retry + thp_retry; + nr_failed += retry; nr_thp_failed += thp_retry; - rc = nr_failed; + /* + * Try to migrate subpages of fail-to-migrate THPs, no nr_failed + * counting in this round, since all subpages of a THP is counted + * as 1 failure in the first round. + */ + if (!list_empty(&thp_split_pages)) { + /* + * Move non-migrated pages (after 10 retries) to ret_pages + * to avoid migrating them again. 
+ */ + list_splice_init(from, &ret_pages); + list_splice_init(&thp_split_pages, from); + no_subpage_counting = true; + retry = 1; + goto thp_subpage_migration; + } + + rc = nr_failed + nr_thp_failed; out: /* * Put the permanent failure page back to migration list, they @@ -1578,11 +1609,11 @@ out: list_splice(&ret_pages, from); count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); - count_vm_events(PGMIGRATE_FAIL, nr_failed); + count_vm_events(PGMIGRATE_FAIL, nr_failed_pages); count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed); count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split); - trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded, + trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded, nr_thp_failed, nr_thp_split, mode, reason); if (!swapwrite) -- cgit From 5d39a7ebc8be70e30176aed6f98f799bfa7439d6 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 14 Jan 2022 14:08:37 -0800 Subject: mm: migrate: correct the hugetlb migration stats Correct the migration stats for hugetlb with using compound_nr() instead of thp_nr_pages(), meanwhile change 'nr_failed_pages' to record the number of normal pages failed to migrate, including THP and hugetlb, and 'nr_succeeded' will record the number of normal pages migrated successfully. [baolin.wang@linux.alibaba.com: fix docs, per Mike] Link: https://lkml.kernel.org/r/141bdfc6-f898-3cc3-f692-726c5f6cb74d@linux.alibaba.com Link: https://lkml.kernel.org/r/71a4b6c22f208728fe8c78ad26375436c4ff9704.1636275127.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Zi Yan Cc: Steven Rostedt (VMware) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page_migration.rst | 12 ++++++------ mm/migrate.c | 17 ++++++++--------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst index 08810f549f70..8c5cb8147e55 100644 --- a/Documentation/vm/page_migration.rst +++ b/Documentation/vm/page_migration.rst @@ -263,15 +263,15 @@ Monitoring Migration The following events (counters) can be used to monitor page migration. 1. PGMIGRATE_SUCCESS: Normal page migration success. Each count means that a - page was migrated. If the page was a non-THP page, then this counter is - increased by one. If the page was a THP, then this counter is increased by - the number of THP subpages. For example, migration of a single 2MB THP that - has 4KB-size base pages (subpages) will cause this counter to increase by - 512. + page was migrated. If the page was a non-THP and non-hugetlb page, then + this counter is increased by one. If the page was a THP or hugetlb, then + this counter is increased by the number of THP or hugetlb subpages. + For example, migration of a single 2MB THP that has 4KB-size base pages + (subpages) will cause this counter to increase by 512. 2. PGMIGRATE_FAIL: Normal page migration failure. Same counting rules as for PGMIGRATE_SUCCESS, above: this will be increased by the number of subpages, - if it was a THP. + if it was a THP or hugetlb. 3. THP_MIGRATION_SUCCESS: A THP was migrated without being split. diff --git a/mm/migrate.c b/mm/migrate.c index 57bc8b97490c..6f04aa2a3bd4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1429,9 +1429,9 @@ static inline int try_split_thp(struct page *page, struct page **page2, * It is caller's responsibility to call putback_movable_pages() to return pages * to the LRU or free list only if ret != 0. 
* - * Returns the number of {normal page, THP} that were not migrated, or an error code. - * The number of THP splits will be considered as the number of non-migrated THP, - * no matter how many subpages of the THP are migrated successfully. + * Returns the number of {normal page, THP, hugetlb} that were not migrated, or + * an error code. The number of THP splits will be considered as the number of + * non-migrated THP, no matter how many subpages of the THP are migrated successfully. */ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, @@ -1474,7 +1474,7 @@ retry: * during migration. */ is_thp = PageTransHuge(page) && !PageHuge(page); - nr_subpages = thp_nr_pages(page); + nr_subpages = compound_nr(page); cond_resched(); if (PageHuge(page)) @@ -1523,7 +1523,7 @@ retry: /* Hugetlb migration is unsupported */ if (!no_subpage_counting) nr_failed++; - nr_failed_pages++; + nr_failed_pages += nr_subpages; break; case -ENOMEM: /* @@ -1544,7 +1544,7 @@ retry: if (!no_subpage_counting) nr_failed++; - nr_failed_pages++; + nr_failed_pages += nr_subpages; goto out; case -EAGAIN: if (is_thp) { @@ -1554,12 +1554,11 @@ retry: retry++; break; case MIGRATEPAGE_SUCCESS: + nr_succeeded += nr_subpages; if (is_thp) { nr_thp_succeeded++; - nr_succeeded += nr_subpages; break; } - nr_succeeded++; break; default: /* @@ -1576,7 +1575,7 @@ retry: if (!no_subpage_counting) nr_failed++; - nr_failed_pages++; + nr_failed_pages += nr_subpages; break; } } -- cgit From 84b328aa81216e08804d8875d63f26bda1298788 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 14 Jan 2022 14:08:40 -0800 Subject: mm: compaction: fix the migration stats in trace_mm_compaction_migratepages() Now the migrate_pages() has changed to return the number of {normal page, THP, hugetlb} instead, thus we should not use the return value to calculate the number of pages migrated successfully. Instead we can just use the 'nr_succeeded' which indicates the number of normal pages migrated successfully to calculate the non-migrated pages in trace_mm_compaction_migratepages(). 
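To make the accounting change concrete, consider a hypothetical run in which a single 2MB THP (512 base pages) is queued for compaction and fails to migrate. The sketch below is illustrative arithmetic only, not kernel code, and the numbers are assumptions for the example:

	#include <stdio.h>

	int main(void)
	{
		unsigned int nr_all = 512;	/* base pages queued: one 2MB THP */
		unsigned int nr_succeeded = 0;	/* no base page actually migrated */
		int migrate_rc = 1;		/* migrate_pages() now returns 1 failed THP unit */

		/* Old scheme: subtract the unit-based return value from the base-page count. */
		printf("old: nr_migrated=%u\n", nr_all - migrate_rc);	/* prints 511, misleading */

		/* New scheme: report base pages directly via nr_succeeded. */
		printf("new: nr_migrated=%u nr_failed=%u\n",
		       nr_succeeded, nr_all - nr_succeeded);		/* prints 0 and 512 */
		return 0;
	}

Because the previous patch made the return value count {normal page, THP} units rather than base pages, mixing it with the base-page total would overstate progress; using nr_succeeded keeps the tracepoint in base pages.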
Link: https://lkml.kernel.org/r/b4225251c4bec068dcd90d275ab7de88a39e2bd7.1636275127.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Steven Rostedt (VMware) Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/compaction.h | 24 ++++-------------------- mm/compaction.c | 7 ++++--- 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 54e5bf081171..7d48e7079e48 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -68,10 +68,9 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, TRACE_EVENT(mm_compaction_migratepages, TP_PROTO(unsigned long nr_all, - int migrate_rc, - struct list_head *migratepages), + unsigned int nr_succeeded), - TP_ARGS(nr_all, migrate_rc, migratepages), + TP_ARGS(nr_all, nr_succeeded), TP_STRUCT__entry( __field(unsigned long, nr_migrated) @@ -79,23 +78,8 @@ TRACE_EVENT(mm_compaction_migratepages, ), TP_fast_assign( - unsigned long nr_failed = 0; - struct list_head *page_lru; - - /* - * migrate_pages() returns either a non-negative number - * with the number of pages that failed migration, or an - * error code, in which case we need to count the remaining - * pages manually - */ - if (migrate_rc >= 0) - nr_failed = migrate_rc; - else - list_for_each(page_lru, migratepages) - nr_failed++; - - __entry->nr_migrated = nr_all - nr_failed; - __entry->nr_failed = nr_failed; + __entry->nr_migrated = nr_succeeded; + __entry->nr_failed = nr_all - nr_succeeded; ), TP_printk("nr_migrated=%lu nr_failed=%lu", diff --git a/mm/compaction.c b/mm/compaction.c index 6e446094ce90..b4e94cda3019 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2280,6 +2280,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) unsigned long last_migrated_pfn; const bool sync = cc->mode != MIGRATE_ASYNC; bool update_cached; + unsigned int nr_succeeded = 0; /* * These counters track activities during zone compaction. Initialize @@ -2398,10 +2399,10 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, - MR_COMPACTION, NULL); + MR_COMPACTION, &nr_succeeded); - trace_mm_compaction_migratepages(cc->nr_migratepages, err, - &cc->migratepages); + trace_mm_compaction_migratepages(cc->nr_migratepages, + nr_succeeded); /* All pages were either migrated or will be released */ cc->nr_migratepages = 0; -- cgit From ac16ec835314677dd7405dfb5a5e007c3ca424c7 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 14 Jan 2022 14:08:43 -0800 Subject: mm: migrate: support multiple target nodes demotion We have some machines with multiple memory types like below, which have one fast (DRAM) memory node and two slow (persistent memory) memory nodes. According to current node demotion policy, if node 0 fills up, its memory should be migrated to node 1, when node 1 fills up, its memory will be migrated to node 2: node 0 -> node 1 -> node 2 ->stop. But this is not efficient and suitbale memory migration route for our machine with multiple slow memory nodes. Since the distance between node 0 to node 1 and node 0 to node 2 is equal, and memory migration between slow memory nodes will increase persistent memory bandwidth greatly, which will hurt the whole system's performance. 
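With the change made by this patch, both equally distant slow nodes become demotion targets of node 0, so a demotion lookup on the machine above behaves roughly like the following sketch (illustrative only, reusing the existing next_demotion_node() API):

    int t0 = next_demotion_node(0); /* returns 1 or 2, picked at random  */
    int t1 = next_demotion_node(1); /* NUMA_NO_NODE: slow node, terminal */
    int t2 = next_demotion_node(2); /* NUMA_NO_NODE: slow node, terminal */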
Thus for this case, we can treat the slow memory node 1 and node 2 as a whole slow memory region, and we should migrate memory from node 0 to node 1 and node 2 if node 0 fills up. This patch changes the node_demotion data structure to support multiple target nodes, and establishes the migration path to support multiple target nodes with validating if the node distance is the best or not. available: 3 nodes (0-2) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 node 0 size: 62153 MB node 0 free: 55135 MB node 1 cpus: node 1 size: 127007 MB node 1 free: 126930 MB node 2 cpus: node 2 size: 126968 MB node 2 free: 126878 MB node distances: node 0 1 2 0: 10 20 20 1: 20 10 20 2: 20 20 10 Link: https://lkml.kernel.org/r/00728da107789bb4ed9e0d28b1d08fd8056af2ef.1636697263.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: "Huang, Ying" Cc: Dave Hansen Cc: Zi Yan Cc: Oscar Salvador Cc: Yang Shi Cc: Baolin Wang Cc: zhongjiang-ali Cc: Xunlei Pang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 129 insertions(+), 35 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 6f04aa2a3bd4..9d2642a34018 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -50,6 +50,7 @@ #include #include #include +#include #include @@ -1118,12 +1119,25 @@ out: * * This is represented in the node_demotion[] like this: * - * { 1, // Node 0 migrates to 1 - * 2, // Node 1 migrates to 2 - * -1, // Node 2 does not migrate - * 4, // Node 3 migrates to 4 - * 5, // Node 4 migrates to 5 - * -1} // Node 5 does not migrate + * { nr=1, nodes[0]=1 }, // Node 0 migrates to 1 + * { nr=1, nodes[0]=2 }, // Node 1 migrates to 2 + * { nr=0, nodes[0]=-1 }, // Node 2 does not migrate + * { nr=1, nodes[0]=4 }, // Node 3 migrates to 4 + * { nr=1, nodes[0]=5 }, // Node 4 migrates to 5 + * { nr=0, nodes[0]=-1 }, // Node 5 does not migrate + * + * Moreover some systems may have multiple slow memory nodes. + * Suppose a system has one socket with 3 memory nodes, node 0 + * is fast memory type, and node 1/2 both are slow memory + * type, and the distance between fast memory node and slow + * memory node is same. So the migration path should be: + * + * 0 -> 1/2 -> stop + * + * This is represented in the node_demotion[] like this: + * { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2 + * { nr=0, nodes[0]=-1, }, // Node 1 dose not migrate + * { nr=0, nodes[0]=-1, }, // Node 2 does not migrate */ /* @@ -1134,8 +1148,20 @@ out: * must be held over all reads to ensure that no cycles are * observed. */ -static int node_demotion[MAX_NUMNODES] __read_mostly = - {[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE}; +#define DEFAULT_DEMOTION_TARGET_NODES 15 + +#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES +#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1) +#else +#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES +#endif + +struct demotion_nodes { + unsigned short nr; + short nodes[DEMOTION_TARGET_NODES]; +}; + +static struct demotion_nodes *node_demotion __read_mostly; /** * next_demotion_node() - Get the next node in the demotion path @@ -1148,8 +1174,15 @@ static int node_demotion[MAX_NUMNODES] __read_mostly = */ int next_demotion_node(int node) { + struct demotion_nodes *nd; + unsigned short target_nr, index; int target; + if (!node_demotion) + return NUMA_NO_NODE; + + nd = &node_demotion[node]; + /* * node_demotion[] is updated without excluding this * function from running. 
RCU doesn't provide any @@ -1160,9 +1193,28 @@ int next_demotion_node(int node) * node_demotion[] reads need to be consistent. */ rcu_read_lock(); - target = READ_ONCE(node_demotion[node]); - rcu_read_unlock(); + target_nr = READ_ONCE(nd->nr); + switch (target_nr) { + case 0: + target = NUMA_NO_NODE; + goto out; + case 1: + index = 0; + break; + default: + /* + * If there are multiple target nodes, just select one + * target node randomly. + */ + index = get_random_int() % target_nr; + break; + } + + target = READ_ONCE(nd->nodes[index]); + +out: + rcu_read_unlock(); return target; } @@ -3003,10 +3055,16 @@ EXPORT_SYMBOL(migrate_vma_finalize); /* Disable reclaim-based migration. */ static void __disable_all_migrate_targets(void) { - int node; + int node, i; - for_each_online_node(node) - node_demotion[node] = NUMA_NO_NODE; + if (!node_demotion) + return; + + for_each_online_node(node) { + node_demotion[node].nr = 0; + for (i = 0; i < DEMOTION_TARGET_NODES; i++) + node_demotion[node].nodes[i] = NUMA_NO_NODE; + } } static void disable_all_migrate_targets(void) @@ -3033,26 +3091,40 @@ static void disable_all_migrate_targets(void) * Failing here is OK. It might just indicate * being at the end of a chain. */ -static int establish_migrate_target(int node, nodemask_t *used) +static int establish_migrate_target(int node, nodemask_t *used, + int best_distance) { - int migration_target; + int migration_target, index, val; + struct demotion_nodes *nd; - /* - * Can not set a migration target on a - * node with it already set. - * - * No need for READ_ONCE() here since this - * in the write path for node_demotion[]. - * This should be the only thread writing. - */ - if (node_demotion[node] != NUMA_NO_NODE) + if (!node_demotion) return NUMA_NO_NODE; + nd = &node_demotion[node]; + migration_target = find_next_best_node(node, used); if (migration_target == NUMA_NO_NODE) return NUMA_NO_NODE; - node_demotion[node] = migration_target; + /* + * If the node has been set a migration target node before, + * which means it's the best distance between them. Still + * check if this node can be demoted to other target nodes + * if they have a same best distance. + */ + if (best_distance != -1) { + val = node_distance(node, migration_target); + if (val > best_distance) + return NUMA_NO_NODE; + } + + index = nd->nr; + if (WARN_ONCE(index >= DEMOTION_TARGET_NODES, + "Exceeds maximum demotion target nodes\n")) + return NUMA_NO_NODE; + + nd->nodes[index] = migration_target; + nd->nr++; return migration_target; } @@ -3068,7 +3140,9 @@ static int establish_migrate_target(int node, nodemask_t *used) * * The difference here is that cycles must be avoided. If * node0 migrates to node1, then neither node1, nor anything - * node1 migrates to can migrate to node0. + * node1 migrates to can migrate to node0. Also one node can + * be migrated to multiple nodes if the target nodes all have + * a same best-distance against the source node. * * This function can run simultaneously with readers of * node_demotion[]. However, it can not run simultaneously @@ -3080,7 +3154,7 @@ static void __set_migration_target_nodes(void) nodemask_t next_pass = NODE_MASK_NONE; nodemask_t this_pass = NODE_MASK_NONE; nodemask_t used_targets = NODE_MASK_NONE; - int node; + int node, best_distance; /* * Avoid any oddities like cycles that could occur @@ -3109,18 +3183,33 @@ again: * multiple source nodes to share a destination. 
*/ nodes_or(used_targets, used_targets, this_pass); - for_each_node_mask(node, this_pass) { - int target_node = establish_migrate_target(node, &used_targets); - if (target_node == NUMA_NO_NODE) - continue; + for_each_node_mask(node, this_pass) { + best_distance = -1; /* - * Visit targets from this pass in the next pass. - * Eventually, every node will have been part of - * a pass, and will become set in 'used_targets'. + * Try to set up the migration path for the node, and the target + * migration nodes can be multiple, so doing a loop to find all + * the target nodes if they all have a best node distance. */ - node_set(target_node, next_pass); + do { + int target_node = + establish_migrate_target(node, &used_targets, + best_distance); + + if (target_node == NUMA_NO_NODE) + break; + + if (best_distance == -1) + best_distance = node_distance(node, target_node); + + /* + * Visit targets from this pass in the next pass. + * Eventually, every node will have been part of + * a pass, and will become set in 'used_targets'. + */ + node_set(target_node, next_pass); + } while (1); } /* * 'next_pass' contains nodes which became migration @@ -3221,6 +3310,11 @@ static int __init migrate_on_reclaim_init(void) { int ret; + node_demotion = kmalloc_array(nr_node_ids, + sizeof(struct demotion_nodes), + GFP_KERNEL); + WARN_ON(!node_demotion); + ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline", NULL, migration_offline_cpu); /* -- cgit From 7813a1b5257b8eb2cb915cd08e7ba857070fdfd3 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 14 Jan 2022 14:08:46 -0800 Subject: mm: migrate: add more comments for selecting target node randomly As Yang Shi suggested [1], it will be helpful to explain why we should select target node randomly now if there are multiple target nodes. [1] https://lore.kernel.org/all/CAHbLzkqSqCL+g7dfzeOw8fPyeEC0BBv13Ny1UVGHDkadnQdR=g@mail.gmail.com/ Link: https://lkml.kernel.org/r/c31d36bd097c6e9e69fc0f409c43b78e53e64fc2.1637766801.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Yang Shi Cc: "Huang, Ying" Cc: Dave Hansen Cc: Zi Yan Cc: zhongjiang-ali Cc: Xunlei Pang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/migrate.c b/mm/migrate.c index 9d2642a34018..f50087d3ebf2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1206,6 +1206,14 @@ int next_demotion_node(int node) /* * If there are multiple target nodes, just select one * target node randomly. + * + * In addition, we can also use round-robin to select + * target node, but we should introduce another variable + * for node_demotion[] to record last selected target node, + * that may cause cache ping-pong due to the changing of + * last target node. Or introducing per-cpu data to avoid + * caching issue, which seems more complicated. So selecting + * target node randomly seems better until now. */ index = get_random_int() % target_nr; break; -- cgit From dcee9bf5bf2f59c173f3645ac2274595ac6c6aea Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 14 Jan 2022 14:08:49 -0800 Subject: mm/migrate: move node demotion code to near its user Now, node_demotion and next_demotion_node() are placed between __unmap_and_move() and unmap_and_move(). This hurts code readability. So move them near their users in the file. There's no functionality change in this patch. 
Link: https://lkml.kernel.org/r/20211206031227.3323097-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Baolin Wang Reviewed-by: Yang Shi Reviewed-by: Wei Xu Cc: Dave Hansen Cc: Zi Yan Cc: Oscar Salvador Cc: Michal Hocko Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 265 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 132 insertions(+), 133 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index f50087d3ebf2..e50b80534d80 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1093,139 +1093,6 @@ out: return rc; } - -/* - * node_demotion[] example: - * - * Consider a system with two sockets. Each socket has - * three classes of memory attached: fast, medium and slow. - * Each memory class is placed in its own NUMA node. The - * CPUs are placed in the node with the "fast" memory. The - * 6 NUMA nodes (0-5) might be split among the sockets like - * this: - * - * Socket A: 0, 1, 2 - * Socket B: 3, 4, 5 - * - * When Node 0 fills up, its memory should be migrated to - * Node 1. When Node 1 fills up, it should be migrated to - * Node 2. The migration path start on the nodes with the - * processors (since allocations default to this node) and - * fast memory, progress through medium and end with the - * slow memory: - * - * 0 -> 1 -> 2 -> stop - * 3 -> 4 -> 5 -> stop - * - * This is represented in the node_demotion[] like this: - * - * { nr=1, nodes[0]=1 }, // Node 0 migrates to 1 - * { nr=1, nodes[0]=2 }, // Node 1 migrates to 2 - * { nr=0, nodes[0]=-1 }, // Node 2 does not migrate - * { nr=1, nodes[0]=4 }, // Node 3 migrates to 4 - * { nr=1, nodes[0]=5 }, // Node 4 migrates to 5 - * { nr=0, nodes[0]=-1 }, // Node 5 does not migrate - * - * Moreover some systems may have multiple slow memory nodes. - * Suppose a system has one socket with 3 memory nodes, node 0 - * is fast memory type, and node 1/2 both are slow memory - * type, and the distance between fast memory node and slow - * memory node is same. So the migration path should be: - * - * 0 -> 1/2 -> stop - * - * This is represented in the node_demotion[] like this: - * { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2 - * { nr=0, nodes[0]=-1, }, // Node 1 dose not migrate - * { nr=0, nodes[0]=-1, }, // Node 2 does not migrate - */ - -/* - * Writes to this array occur without locking. Cycles are - * not allowed: Node X demotes to Y which demotes to X... - * - * If multiple reads are performed, a single rcu_read_lock() - * must be held over all reads to ensure that no cycles are - * observed. - */ -#define DEFAULT_DEMOTION_TARGET_NODES 15 - -#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES -#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1) -#else -#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES -#endif - -struct demotion_nodes { - unsigned short nr; - short nodes[DEMOTION_TARGET_NODES]; -}; - -static struct demotion_nodes *node_demotion __read_mostly; - -/** - * next_demotion_node() - Get the next node in the demotion path - * @node: The starting node to lookup the next node - * - * Return: node id for next memory node in the demotion path hierarchy - * from @node; NUMA_NO_NODE if @node is terminal. This does not keep - * @node online or guarantee that it *continues* to be the next demotion - * target. 
- */ -int next_demotion_node(int node) -{ - struct demotion_nodes *nd; - unsigned short target_nr, index; - int target; - - if (!node_demotion) - return NUMA_NO_NODE; - - nd = &node_demotion[node]; - - /* - * node_demotion[] is updated without excluding this - * function from running. RCU doesn't provide any - * compiler barriers, so the READ_ONCE() is required - * to avoid compiler reordering or read merging. - * - * Make sure to use RCU over entire code blocks if - * node_demotion[] reads need to be consistent. - */ - rcu_read_lock(); - target_nr = READ_ONCE(nd->nr); - - switch (target_nr) { - case 0: - target = NUMA_NO_NODE; - goto out; - case 1: - index = 0; - break; - default: - /* - * If there are multiple target nodes, just select one - * target node randomly. - * - * In addition, we can also use round-robin to select - * target node, but we should introduce another variable - * for node_demotion[] to record last selected target node, - * that may cause cache ping-pong due to the changing of - * last target node. Or introducing per-cpu data to avoid - * caching issue, which seems more complicated. So selecting - * target node randomly seems better until now. - */ - index = get_random_int() % target_nr; - break; - } - - target = READ_ONCE(nd->nodes[index]); - -out: - rcu_read_unlock(); - return target; -} - /* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. @@ -3059,6 +2926,138 @@ void migrate_vma_finalize(struct migrate_vma *migrate) EXPORT_SYMBOL(migrate_vma_finalize); #endif /* CONFIG_DEVICE_PRIVATE */ +/* + * node_demotion[] example: + * + * Consider a system with two sockets. Each socket has + * three classes of memory attached: fast, medium and slow. + * Each memory class is placed in its own NUMA node. The + * CPUs are placed in the node with the "fast" memory. The + * 6 NUMA nodes (0-5) might be split among the sockets like + * this: + * + * Socket A: 0, 1, 2 + * Socket B: 3, 4, 5 + * + * When Node 0 fills up, its memory should be migrated to + * Node 1. When Node 1 fills up, it should be migrated to + * Node 2. The migration path start on the nodes with the + * processors (since allocations default to this node) and + * fast memory, progress through medium and end with the + * slow memory: + * + * 0 -> 1 -> 2 -> stop + * 3 -> 4 -> 5 -> stop + * + * This is represented in the node_demotion[] like this: + * + * { nr=1, nodes[0]=1 }, // Node 0 migrates to 1 + * { nr=1, nodes[0]=2 }, // Node 1 migrates to 2 + * { nr=0, nodes[0]=-1 }, // Node 2 does not migrate + * { nr=1, nodes[0]=4 }, // Node 3 migrates to 4 + * { nr=1, nodes[0]=5 }, // Node 4 migrates to 5 + * { nr=0, nodes[0]=-1 }, // Node 5 does not migrate + * + * Moreover some systems may have multiple slow memory nodes. + * Suppose a system has one socket with 3 memory nodes, node 0 + * is fast memory type, and node 1/2 both are slow memory + * type, and the distance between fast memory node and slow + * memory node is same. So the migration path should be: + * + * 0 -> 1/2 -> stop + * + * This is represented in the node_demotion[] like this: + * { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2 + * { nr=0, nodes[0]=-1, }, // Node 1 dose not migrate + * { nr=0, nodes[0]=-1, }, // Node 2 does not migrate + */ + +/* + * Writes to this array occur without locking. Cycles are + * not allowed: Node X demotes to Y which demotes to X... 
+ * + * If multiple reads are performed, a single rcu_read_lock() + * must be held over all reads to ensure that no cycles are + * observed. + */ +#define DEFAULT_DEMOTION_TARGET_NODES 15 + +#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES +#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1) +#else +#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES +#endif + +struct demotion_nodes { + unsigned short nr; + short nodes[DEMOTION_TARGET_NODES]; +}; + +static struct demotion_nodes *node_demotion __read_mostly; + +/** + * next_demotion_node() - Get the next node in the demotion path + * @node: The starting node to lookup the next node + * + * Return: node id for next memory node in the demotion path hierarchy + * from @node; NUMA_NO_NODE if @node is terminal. This does not keep + * @node online or guarantee that it *continues* to be the next demotion + * target. + */ +int next_demotion_node(int node) +{ + struct demotion_nodes *nd; + unsigned short target_nr, index; + int target; + + if (!node_demotion) + return NUMA_NO_NODE; + + nd = &node_demotion[node]; + + /* + * node_demotion[] is updated without excluding this + * function from running. RCU doesn't provide any + * compiler barriers, so the READ_ONCE() is required + * to avoid compiler reordering or read merging. + * + * Make sure to use RCU over entire code blocks if + * node_demotion[] reads need to be consistent. + */ + rcu_read_lock(); + target_nr = READ_ONCE(nd->nr); + + switch (target_nr) { + case 0: + target = NUMA_NO_NODE; + goto out; + case 1: + index = 0; + break; + default: + /* + * If there are multiple target nodes, just select one + * target node randomly. + * + * In addition, we can also use round-robin to select + * target node, but we should introduce another variable + * for node_demotion[] to record last selected target node, + * that may cause cache ping-pong due to the changing of + * last target node. Or introducing per-cpu data to avoid + * caching issue, which seems more complicated. So selecting + * target node randomly seems better until now. + */ + index = get_random_int() % target_nr; + break; + } + + target = READ_ONCE(nd->nodes[index]); + +out: + rcu_read_unlock(); + return target; +} + #if defined(CONFIG_HOTPLUG_CPU) /* Disable reclaim-based migration. */ static void __disable_all_migrate_targets(void) -- cgit From f1e8db04b68cc56edc5baee5c7cb1f9b79c3da7e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 14 Jan 2022 14:08:53 -0800 Subject: mm/migrate: remove redundant variables used in a for-loop The variable addr is being set and incremented in a for-loop but not actually being used. It is redundant and so addr and also variable start can be removed. 
Link: https://lkml.kernel.org/r/20211221185729.609630-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index e50b80534d80..05af2b2336b9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2481,8 +2481,7 @@ static bool migrate_vma_check_page(struct page *page) static void migrate_vma_unmap(struct migrate_vma *migrate) { const unsigned long npages = migrate->npages; - const unsigned long start = migrate->start; - unsigned long addr, i, restore = 0; + unsigned long i, restore = 0; bool allow_drain = true; lru_add_drain(); @@ -2528,7 +2527,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) } } - for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) { + for (i = 0; i < npages && restore; i++) { struct page *page = migrate_pfn_to_page(migrate->src[i]); if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) -- cgit From c0e582de6066e97c83a466f0e5983e3148123526 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 14 Jan 2022 14:08:56 -0800 Subject: mm/thp: drop unused trace events hugepage_[invalidate|splitting] The trace events hugepage_[invalidate|splitting], were added via the commit 9e813308a5c1 ("powerpc/thp: Add tracepoints to track hugepage invalidate"). Afterwards their call sites i.e trace_hugepage_[invalidate|splitting] were just dropped off, leaving these trace points unused. Link: https://lkml.kernel.org/r/1641546351-15109-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: David Hildenbrand Cc: Steven Rostedt Cc: Ingo Molnar Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/thp.h | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h index d7fbbe551841..ca3f2767828a 100644 --- a/include/trace/events/thp.h +++ b/include/trace/events/thp.h @@ -8,24 +8,6 @@ #include #include -TRACE_EVENT(hugepage_invalidate, - - TP_PROTO(unsigned long addr, unsigned long pte), - TP_ARGS(addr, pte), - TP_STRUCT__entry( - __field(unsigned long, addr) - __field(unsigned long, pte) - ), - - TP_fast_assign( - __entry->addr = addr; - __entry->pte = pte; - ), - - TP_printk("hugepage invalidate at addr 0x%lx and pte = 0x%lx", - __entry->addr, __entry->pte) -); - TRACE_EVENT(hugepage_set_pmd, TP_PROTO(unsigned long addr, unsigned long pmd), @@ -65,23 +47,6 @@ TRACE_EVENT(hugepage_update, TP_printk("hugepage update at addr 0x%lx and pte = 0x%lx clr = 0x%lx, set = 0x%lx", __entry->addr, __entry->pte, __entry->clr, __entry->set) ); -TRACE_EVENT(hugepage_splitting, - - TP_PROTO(unsigned long addr, unsigned long pte), - TP_ARGS(addr, pte), - TP_STRUCT__entry( - __field(unsigned long, addr) - __field(unsigned long, pte) - ), - - TP_fast_assign( - __entry->addr = addr; - __entry->pte = pte; - ), - - TP_printk("hugepage splitting at addr 0x%lx and pte = 0x%lx", - __entry->addr, __entry->pte) -); #endif /* _TRACE_THP_H */ -- cgit From e1c63e110f977205ab9dfb38989c54e6e7b52a7b Mon Sep 17 00:00:00 2001 From: Nanyong Sun Date: Fri, 14 Jan 2022 14:08:59 -0800 Subject: mm: ksm: fix use-after-free kasan report in ksm_might_need_to_copy When under the stress of swapping in/out with KSM enabled, there is a low probability that kasan reports the BUG of use-after-free in ksm_might_need_to_copy() when do swap in. 
The freed object is the anon_vma got from page_anon_vma(page). It is because a swapcache page associated with one anon_vma now needed for another anon_vma, but the page's original vma was unmapped and the anon_vma was freed. In this case the if condition below always return false and then alloc a new page to copy. Swapin process then use the new page and can continue to run well, so this is harmless actually. } else if (anon_vma->root == vma->anon_vma->root && page->index == linear_page_index(vma, address)) { This patch exchange the order of above two judgment statement to avoid the kasan warning. Let cpu run "page->index == linear_page_index(vma, address)" firstly and return false basically to skip the read of anon_vma->root which may trigger the kasan use-after-free warning: ================================================================== BUG: KASAN: use-after-free in ksm_might_need_to_copy+0x12e/0x5b0 Read of size 8 at addr ffff88be9977dbd0 by task khugepaged/694 CPU: 8 PID: 694 Comm: khugepaged Kdump: loaded Tainted: G OE - 4.18.0.x86_64 Hardware name: 1288H V5/BC11SPSC0, BIOS 7.93 01/14/2021 Call Trace: dump_stack+0xf1/0x19b print_address_description+0x70/0x360 kasan_report+0x1b2/0x330 ksm_might_need_to_copy+0x12e/0x5b0 do_swap_page+0x452/0xe70 __collapse_huge_page_swapin+0x24b/0x720 khugepaged_scan_pmd+0xcae/0x1ff0 khugepaged+0x8ee/0xd70 kthread+0x1a2/0x1d0 ret_from_fork+0x1f/0x40 Allocated by task 2306153: kasan_kmalloc+0xa0/0xd0 kmem_cache_alloc+0xc0/0x1c0 anon_vma_clone+0xf7/0x380 anon_vma_fork+0xc0/0x390 copy_process+0x447b/0x4810 _do_fork+0x118/0x620 do_syscall_64+0x112/0x360 entry_SYSCALL_64_after_hwframe+0x65/0xca Freed by task 2306242: __kasan_slab_free+0x130/0x180 kmem_cache_free+0x78/0x1d0 unlink_anon_vmas+0x19c/0x4a0 free_pgtables+0x137/0x1b0 exit_mmap+0x133/0x320 mmput+0x15e/0x390 do_exit+0x8c5/0x1210 do_group_exit+0xb5/0x1b0 __x64_sys_exit_group+0x21/0x30 do_syscall_64+0x112/0x360 entry_SYSCALL_64_after_hwframe+0x65/0xca The buggy address belongs to the object at ffff88be9977dba0 which belongs to the cache anon_vma_chain of size 64 The buggy address is located 48 bytes inside of 64-byte region [ffff88be9977dba0, ffff88be9977dbe0) The buggy address belongs to the page: page:ffffea00fa65df40 count:1 mapcount:0 mapping:ffff888107717800 index:0x0 flags: 0x17ffffc0000100(slab) ================================================================== Link: https://lkml.kernel.org/r/20211202102940.1069634-1-sunnanyong@huawei.com Signed-off-by: Nanyong Sun Cc: Hugh Dickins Cc: Kefeng Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index f34476ac0a41..c20bd4d9a0d9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2576,8 +2576,8 @@ struct page *ksm_might_need_to_copy(struct page *page, return page; /* no need to copy it */ } else if (!anon_vma) { return page; /* no need to copy it */ - } else if (anon_vma->root == vma->anon_vma->root && - page->index == linear_page_index(vma, address)) { + } else if (page->index == linear_page_index(vma, address) && + anon_vma->root == vma->anon_vma->root) { return page; /* still no need to copy it */ } if (!PageUptodate(page)) -- cgit From 91d005479e06392617bacc114509d611b705eaac Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 14 Jan 2022 14:09:02 -0800 Subject: mm/hwpoison: mf_mutex for soft offline and unpoison Patch series "mm/hwpoison: fix unpoison_memory()", v4. 
The main purpose of this series is to sync unpoison code to recent changes around how hwpoison code takes page refcount. Unpoison should work or simply fail (without crash) if impossible. The recent works of keeping hwpoison pages in shmem pagecache introduce a new state of hwpoisoned pages, but unpoison for such pages is not supported yet with this series. It seems that soft-offline and unpoison can be used as general purpose page offline/online mechanism (not in the context of memory error). I think that we need some additional works to realize it because currently soft-offline and unpoison are assumed not to happen so frequently (print out too many messages for aggressive usecases). But anyway this could be another interesting next topic. v1: https://lore.kernel.org/linux-mm/20210614021212.223326-1-nao.horiguchi@gmail.com/ v2: https://lore.kernel.org/linux-mm/20211025230503.2650970-1-naoya.horiguchi@linux.dev/ v3: https://lore.kernel.org/linux-mm/20211105055058.3152564-1-naoya.horiguchi@linux.dev/ This patch (of 3): Originally mf_mutex is introduced to serialize multiple MCE events, but it is not that useful to allow unpoison to run in parallel with memory_failure() and soft offline. So apply mf_mutex to soft offline and unpoison. The memory failure handler and soft offline handler get simpler with this. Link: https://lkml.kernel.org/r/20211115084006.3728254-1-naoya.horiguchi@linux.dev Link: https://lkml.kernel.org/r/20211115084006.3728254-2-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reviewed-by: Yang Shi Cc: "Aneesh Kumar K.V" Cc: David Hildenbrand Cc: Ding Hui Cc: Miaohe Lin Cc: Michal Hocko Cc: Oscar Salvador Cc: Peter Xu Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 62 ++++++++++++++++------------------------------------- 1 file changed, 18 insertions(+), 44 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5f8ad5527506..607785491a52 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1502,14 +1502,6 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) lock_page(head); page_flags = head->flags; - if (!PageHWPoison(head)) { - pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_dec(); - unlock_page(head); - put_page(head); - return 0; - } - /* * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so * simply disable it. In order to make it work properly, we need @@ -1623,6 +1615,8 @@ out: return rc; } +static DEFINE_MUTEX(mf_mutex); + /** * memory_failure - Handle memory failure of a page. 
* @pfn: Page Number of the corrupted page @@ -1649,7 +1643,6 @@ int memory_failure(unsigned long pfn, int flags) int res = 0; unsigned long page_flags; bool retry = true; - static DEFINE_MUTEX(mf_mutex); if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); @@ -1783,16 +1776,6 @@ try_again: */ page_flags = p->flags; - /* - * unpoison always clear PG_hwpoison inside page lock - */ - if (!PageHWPoison(p)) { - pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_dec(); - unlock_page(p); - put_page(p); - goto unlock_mutex; - } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); @@ -1973,6 +1956,7 @@ int unpoison_memory(unsigned long pfn) struct page *page; struct page *p; int freeit = 0; + int ret = 0; unsigned long flags = 0; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1983,39 +1967,30 @@ int unpoison_memory(unsigned long pfn) p = pfn_to_page(pfn); page = compound_head(p); + mutex_lock(&mf_mutex); + if (!PageHWPoison(p)) { unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_count(page) > 1) { unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_mapped(page)) { unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_mapping(page)) { unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n", pfn, &unpoison_rs); - return 0; - } - - /* - * unpoison_memory() can encounter thp only when the thp is being - * worked by memory_failure() and the page lock is not held yet. - * In such case, we yield to memory_failure() and make unpoison fail. - */ - if (!PageHuge(page) && PageTransHuge(page)) { - unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n", - pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (!get_hwpoison_page(p, flags)) { @@ -2023,29 +1998,23 @@ int unpoison_memory(unsigned long pfn) num_poisoned_pages_dec(); unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } - lock_page(page); - /* - * This test is racy because PG_hwpoison is set outside of page lock. - * That's acceptable because that won't trigger kernel panic. Instead, - * the PG_hwpoison page will be caught and isolated on the entrance to - * the free buddy page pool. - */ if (TestClearPageHWPoison(page)) { unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", pfn, &unpoison_rs); num_poisoned_pages_dec(); freeit = 1; } - unlock_page(page); put_page(page); if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) put_page(page); - return 0; +unlock_mutex: + mutex_unlock(&mf_mutex); + return ret; } EXPORT_SYMBOL(unpoison_memory); @@ -2226,9 +2195,12 @@ int soft_offline_page(unsigned long pfn, int flags) return -EIO; } + mutex_lock(&mf_mutex); + if (PageHWPoison(page)) { pr_info("%s: %#lx page already poisoned\n", __func__, pfn); put_ref_page(ref_page); + mutex_unlock(&mf_mutex); return 0; } @@ -2247,5 +2219,7 @@ retry: } } + mutex_unlock(&mf_mutex); + return ret; } -- cgit From c9fdc4d5487a16bd1f003fc8b66e91f88efb50e6 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 14 Jan 2022 14:09:06 -0800 Subject: mm/hwpoison: remove MF_MSG_BUDDY_2ND and MF_MSG_POISONED_HUGE These action_page_types are no longer used, so remove them. 
Link: https://lkml.kernel.org/r/20211115084006.3728254-3-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Acked-by: Yang Shi Cc: "Aneesh Kumar K.V" Cc: David Hildenbrand Cc: Ding Hui Cc: Miaohe Lin Cc: Michal Hocko Cc: Oscar Salvador Cc: Peter Xu Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- include/ras/ras_event.h | 2 -- mm/memory-failure.c | 2 -- 3 files changed, 6 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index eb67eb699b78..7f594da84aca 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3201,7 +3201,6 @@ enum mf_action_page_type { MF_MSG_KERNEL_HIGH_ORDER, MF_MSG_SLAB, MF_MSG_DIFFERENT_COMPOUND, - MF_MSG_POISONED_HUGE, MF_MSG_HUGE, MF_MSG_FREE_HUGE, MF_MSG_NON_PMD_HUGE, @@ -3216,7 +3215,6 @@ enum mf_action_page_type { MF_MSG_CLEAN_LRU, MF_MSG_TRUNCATED_LRU, MF_MSG_BUDDY, - MF_MSG_BUDDY_2ND, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_UNKNOWN, diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 0bdbc0d17d2f..d0337a41141c 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -358,7 +358,6 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" ) \ EM ( MF_MSG_SLAB, "kernel slab page" ) \ EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \ - EM ( MF_MSG_POISONED_HUGE, "huge page already hardware poisoned" ) \ EM ( MF_MSG_HUGE, "huge page" ) \ EM ( MF_MSG_FREE_HUGE, "free huge page" ) \ EM ( MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page" ) \ @@ -373,7 +372,6 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_CLEAN_LRU, "clean LRU page" ) \ EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \ EM ( MF_MSG_BUDDY, "free buddy page" ) \ - EM ( MF_MSG_BUDDY_2ND, "free buddy page (2nd try)" ) \ EM ( MF_MSG_DAX, "dax page" ) \ EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 607785491a52..810328fe8adb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -723,7 +723,6 @@ static const char * const action_page_types[] = { [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", [MF_MSG_SLAB] = "kernel slab page", [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking", - [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned", [MF_MSG_HUGE] = "huge page", [MF_MSG_FREE_HUGE] = "free huge page", [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page", @@ -738,7 +737,6 @@ static const char * const action_page_types[] = { [MF_MSG_CLEAN_LRU] = "clean LRU page", [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page", [MF_MSG_BUDDY] = "free buddy page", - [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", [MF_MSG_UNKNOWN] = "unknown page", -- cgit From bf181c582588f8f7406d52f2ee228539b465f173 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 14 Jan 2022 14:09:09 -0800 Subject: mm/hwpoison: fix unpoison_memory() After recent soft-offline rework, error pages can be taken off from buddy allocator, but the existing unpoison_memory() does not properly undo the operation. Moreover, due to the recent change on __get_hwpoison_page(), get_page_unless_zero() is hardly called for hwpoisoned pages. So __get_hwpoison_page() highly likely returns -EBUSY (meaning to fail to grab page refcount) and unpoison just clears PG_hwpoison without releasing a refcount. 
That does not lead to a critical issue like kernel panic, but unpoisoned pages never get back to buddy (leaked permanently), which is not good. To (partially) fix this, we need to identify "taken off" pages from other types of hwpoisoned pages. We can't use refcount or page flags for this purpose, so a pseudo flag is defined by hacking ->private field. Someone might think that put_page() is enough to cancel taken-off pages, but the normal free path contains some operations not suitable for the current purpose, and can fire VM_BUG_ON(). Note that unpoison_memory() is now supposed to be cancel hwpoison events injected only by madvise() or /sys/devices/system/memory/{hard,soft}_offline_page, not by MCE injection, so please don't try to use unpoison when testing with MCE injection. [lkp@intel.com: report build failure for ARCH=i386] Link: https://lkml.kernel.org/r/20211115084006.3728254-4-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reviewed-by: Yang Shi Cc: David Hildenbrand Cc: Oscar Salvador Cc: Michal Hocko Cc: Ding Hui Cc: Tony Luck Cc: "Aneesh Kumar K.V" Cc: Miaohe Lin Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + include/linux/page-flags.h | 4 ++ mm/memory-failure.c | 109 +++++++++++++++++++++++++++++++++++++-------- mm/page_alloc.c | 27 +++++++++++ 4 files changed, 122 insertions(+), 19 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7f594da84aca..d4fb49a5d60d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3174,6 +3174,7 @@ enum mf_flags { MF_ACTION_REQUIRED = 1 << 1, MF_MUST_KILL = 1 << 2, MF_SOFT_OFFLINE = 1 << 3, + MF_UNPOISON = 1 << 4, }; extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 18423c2157e8..7e2b90dc7d3f 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -522,7 +522,11 @@ PAGEFLAG_FALSE(Uncached, uncached) PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) +#define MAGIC_HWPOISON 0x48575053U /* HWPS */ +extern void SetPageHWPoisonTakenOff(struct page *page); +extern void ClearPageHWPoisonTakenOff(struct page *page); extern bool take_page_off_buddy(struct page *page); +extern bool put_page_back_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 810328fe8adb..6a2b4b86b679 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1160,6 +1160,22 @@ static int page_action(struct page_state *ps, struct page *p, return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; } +static inline bool PageHWPoisonTakenOff(struct page *page) +{ + return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON; +} + +void SetPageHWPoisonTakenOff(struct page *page) +{ + set_page_private(page, MAGIC_HWPOISON); +} + +void ClearPageHWPoisonTakenOff(struct page *page) +{ + if (PageHWPoison(page)) + set_page_private(page, 0); +} + /* * Return true if a page type of a given page is supported by hwpoison * mechanism (while handling could fail), otherwise false. 
This function @@ -1262,6 +1278,27 @@ out: return ret; } +static int __get_unpoison_page(struct page *page) +{ + struct page *head = compound_head(page); + int ret = 0; + bool hugetlb = false; + + ret = get_hwpoison_huge_page(head, &hugetlb); + if (hugetlb) + return ret; + + /* + * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison, + * but also isolated from buddy freelist, so need to identify the + * state and have to cancel both operations to unpoison. + */ + if (PageHWPoisonTakenOff(page)) + return -EHWPOISON; + + return get_page_unless_zero(page) ? 1 : 0; +} + /** * get_hwpoison_page() - Get refcount for memory error handling * @p: Raw error page (hit by memory error) @@ -1278,18 +1315,26 @@ out: * extra care for the error page's state (as done in __get_hwpoison_page()), * and has some retry logic in get_any_page(). * + * When called from unpoison_memory(), the caller should already ensure that + * the given page has PG_hwpoison. So it's never reused for other page + * allocations, and __get_unpoison_page() never races with them. + * * Return: 0 on failure, * 1 on success for in-use pages in a well-defined state, * -EIO for pages on which we can not handle memory errors, * -EBUSY when get_hwpoison_page() has raced with page lifecycle - * operations like allocation and free. + * operations like allocation and free, + * -EHWPOISON when the page is hwpoisoned and taken off from buddy. */ static int get_hwpoison_page(struct page *p, unsigned long flags) { int ret; zone_pcp_disable(page_zone(p)); - ret = get_any_page(p, flags); + if (flags & MF_UNPOISON) + ret = __get_unpoison_page(p); + else + ret = get_any_page(p, flags); zone_pcp_enable(page_zone(p)); return ret; @@ -1937,6 +1982,28 @@ core_initcall(memory_failure_init); pr_info(fmt, pfn); \ }) +static inline int clear_page_hwpoison(struct ratelimit_state *rs, struct page *p) +{ + if (TestClearPageHWPoison(p)) { + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + page_to_pfn(p), rs); + num_poisoned_pages_dec(); + return 1; + } + return 0; +} + +static inline int unpoison_taken_off_page(struct ratelimit_state *rs, + struct page *p) +{ + if (put_page_back_buddy(p)) { + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + page_to_pfn(p), rs); + return 0; + } + return -EBUSY; +} + /** * unpoison_memory - Unpoison a previously poisoned page * @pfn: Page number of the to be unpoisoned page @@ -1953,9 +2020,7 @@ int unpoison_memory(unsigned long pfn) { struct page *page; struct page *p; - int freeit = 0; - int ret = 0; - unsigned long flags = 0; + int ret = -EBUSY; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1991,24 +2056,30 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } - if (!get_hwpoison_page(p, flags)) { - if (TestClearPageHWPoison(p)) - num_poisoned_pages_dec(); - unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", - pfn, &unpoison_rs); + if (PageSlab(page) || PageTable(page)) goto unlock_mutex; - } - if (TestClearPageHWPoison(page)) { - unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", - pfn, &unpoison_rs); - num_poisoned_pages_dec(); - freeit = 1; - } + ret = get_hwpoison_page(p, MF_UNPOISON); + if (!ret) { + if (clear_page_hwpoison(&unpoison_rs, page)) + ret = 0; + else + ret = -EBUSY; + } else if (ret < 0) { + if (ret == -EHWPOISON) { + ret = unpoison_taken_off_page(&unpoison_rs, p); + } else + unpoison_pr_info("Unpoison: failed to grab page %#lx\n", + pfn, &unpoison_rs); + } else { + int freeit = 
clear_page_hwpoison(&unpoison_rs, p); - put_page(page); - if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) put_page(page); + if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) { + put_page(page); + ret = 0; + } + } unlock_mutex: mutex_unlock(&mf_mutex); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 99fc65c532f0..d4205e5e41d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -9508,6 +9509,7 @@ bool take_page_off_buddy(struct page *page) del_page_from_free_list(page_head, zone, page_order); break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); + SetPageHWPoisonTakenOff(page); if (!is_migrate_isolate(migratetype)) __mod_zone_freepage_state(zone, -1, migratetype); ret = true; @@ -9519,6 +9521,31 @@ bool take_page_off_buddy(struct page *page) spin_unlock_irqrestore(&zone->lock, flags); return ret; } + +/* + * Cancel takeoff done by take_page_off_buddy(). + */ +bool put_page_back_buddy(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long pfn = page_to_pfn(page); + unsigned long flags; + int migratetype = get_pfnblock_migratetype(page, pfn); + bool ret = false; + + spin_lock_irqsave(&zone->lock, flags); + if (put_page_testzero(page)) { + ClearPageHWPoisonTakenOff(page); + __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); + if (TestClearPageHWPoison(page)) { + num_poisoned_pages_dec(); + ret = true; + } + } + spin_unlock_irqrestore(&zone->lock, flags); + + return ret; +} #endif #ifdef CONFIG_ZONE_DMA -- cgit From 8c57c07741bf28e7d867f1200aa80120b8ca663e Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 14 Jan 2022 14:09:12 -0800 Subject: mm: memcg/percpu: account extra objcg space to memory cgroups Similar to slab memory allocator, for each accounted percpu object there is an extra space which is used to store obj_cgroup membership. Charge it too. [akpm@linux-foundation.org: fix layout] Link: https://lkml.kernel.org/r/20211126040606.97836-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/percpu-internal.h | 18 ++++++++++++++++++ mm/percpu.c | 10 +++++----- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 639662c20c82..411d1593ef23 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -113,6 +113,24 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) return pcpu_nr_pages_to_map_bits(chunk->nr_pages); } +#ifdef CONFIG_MEMCG_KMEM +/** + * pcpu_obj_full_size - helper to calculate size of each accounted object + * @size: size of area to allocate in bytes + * + * For each accounted object there is an extra space which is used to store + * obj_cgroup membership. Charge it too. 
+ */ +static inline size_t pcpu_obj_full_size(size_t size) +{ + size_t extra_size; + + extra_size = size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); + + return size * num_possible_cpus() + extra_size; +} +#endif /* CONFIG_MEMCG_KMEM */ + #ifdef CONFIG_PERCPU_STATS #include diff --git a/mm/percpu.c b/mm/percpu.c index f5b2c2ea5a54..4199a0604c32 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1635,7 +1635,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, if (!objcg) return true; - if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) { + if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) { obj_cgroup_put(objcg); return false; } @@ -1656,10 +1656,10 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, - size * num_possible_cpus()); + pcpu_obj_full_size(size)); rcu_read_unlock(); } else { - obj_cgroup_uncharge(objcg, size * num_possible_cpus()); + obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); obj_cgroup_put(objcg); } } @@ -1676,11 +1676,11 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) return; chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL; - obj_cgroup_uncharge(objcg, size * num_possible_cpus()); + obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, - -(size * num_possible_cpus())); + -pcpu_obj_full_size(size)); rcu_read_unlock(); obj_cgroup_put(objcg); -- cgit From 5ee2fa2f063649570c702164f47a558a3432dd9e Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 14 Jan 2022 14:09:16 -0800 Subject: mm/rmap: fix potential batched TLB flush race In theory, the following race is possible for batched TLB flushing. CPU0 CPU1 ---- ---- shrink_page_list() unmap zap_pte_range() flush_tlb_batched_pending() flush_tlb_mm() try_to_unmap() set_tlb_ubc_flush_pending() mm->tlb_flush_batched = true mm->tlb_flush_batched = false After the TLB is flushed on CPU1 via flush_tlb_mm() and before mm->tlb_flush_batched is set to false, some PTE is unmapped on CPU0 and the TLB flushing is pended. Then the pended TLB flushing will be lost. Although both set_tlb_ubc_flush_pending() and flush_tlb_batched_pending() are called with PTL locked, different PTL instances may be used. Because the race window is really small, and the lost TLB flushing will cause problem only if a TLB entry is inserted before the unmapping in the race window, the race is only theoretical. But the fix is simple and cheap too. 
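The idea of the fix, condensed from the diff below: turn mm->tlb_flush_batched from a bool into an atomic_t holding two small generation counters, so that a flush can only mark as flushed the pendings it actually observed. A rough sketch of the flush side, using the macro names introduced by this patch:

    /*
     * bits 0-14  of mm->tlb_flush_batched: "pending" generation,
     *             bumped by set_tlb_ubc_flush_pending()
     * bits 16-30: "flushed" generation, advanced after flush_tlb_mm()
     */
    int batch   = atomic_read(&mm->tlb_flush_batched);
    int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;
    int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;

    if (pending != flushed) {
        flush_tlb_mm(mm);
        /*
         * Mark only the generations we observed as flushed; if a new
         * pending raced in meanwhile, the cmpxchg fails and the counters
         * stay unequal, so that flush is not lost.
         */
        atomic_cmpxchg(&mm->tlb_flush_batched, batch,
                       pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT));
    }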
Syzbot has reported this too as follows: ================================================================== BUG: KCSAN: data-race in flush_tlb_batched_pending / try_to_unmap_one write to 0xffff8881072cfbbc of 1 bytes by task 17406 on cpu 1: flush_tlb_batched_pending+0x5f/0x80 mm/rmap.c:691 madvise_free_pte_range+0xee/0x7d0 mm/madvise.c:594 walk_pmd_range mm/pagewalk.c:128 [inline] walk_pud_range mm/pagewalk.c:205 [inline] walk_p4d_range mm/pagewalk.c:240 [inline] walk_pgd_range mm/pagewalk.c:277 [inline] __walk_page_range+0x981/0x1160 mm/pagewalk.c:379 walk_page_range+0x131/0x300 mm/pagewalk.c:475 madvise_free_single_vma mm/madvise.c:734 [inline] madvise_dontneed_free mm/madvise.c:822 [inline] madvise_vma mm/madvise.c:996 [inline] do_madvise+0xe4a/0x1140 mm/madvise.c:1202 __do_sys_madvise mm/madvise.c:1228 [inline] __se_sys_madvise mm/madvise.c:1226 [inline] __x64_sys_madvise+0x5d/0x70 mm/madvise.c:1226 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae write to 0xffff8881072cfbbc of 1 bytes by task 71 on cpu 0: set_tlb_ubc_flush_pending mm/rmap.c:636 [inline] try_to_unmap_one+0x60e/0x1220 mm/rmap.c:1515 rmap_walk_anon+0x2fb/0x470 mm/rmap.c:2301 try_to_unmap+0xec/0x110 shrink_page_list+0xe91/0x2620 mm/vmscan.c:1719 shrink_inactive_list+0x3fb/0x730 mm/vmscan.c:2394 shrink_list mm/vmscan.c:2621 [inline] shrink_lruvec+0x3c9/0x710 mm/vmscan.c:2940 shrink_node_memcgs+0x23e/0x410 mm/vmscan.c:3129 shrink_node+0x8f6/0x1190 mm/vmscan.c:3252 kswapd_shrink_node mm/vmscan.c:4022 [inline] balance_pgdat+0x702/0xd30 mm/vmscan.c:4213 kswapd+0x200/0x340 mm/vmscan.c:4473 kthread+0x2c7/0x2e0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 value changed: 0x01 -> 0x00 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 71 Comm: kswapd0 Not tainted 5.16.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 ================================================================== [akpm@linux-foundation.org: tweak comments] Link: https://lkml.kernel.org/r/20211201021104.126469-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reported-by: syzbot+aa5bebed695edaccf0df@syzkaller.appspotmail.com Cc: Nadav Amit Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Dave Hansen Cc: Will Deacon Cc: Yu Zhao Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 +- mm/rmap.c | 43 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6a89f128c990..e3b0476a4fda 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -647,7 +647,7 @@ struct mm_struct { atomic_t tlb_flush_pending; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* See flush_tlb_batched_pending() */ - bool tlb_flush_batched; + atomic_t tlb_flush_batched; #endif struct uprobes_state uprobes_state; #ifdef CONFIG_PREEMPT_RT diff --git a/mm/rmap.c b/mm/rmap.c index 163ac4e6bcee..6a1e8c7f6213 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -621,9 +621,20 @@ void try_to_unmap_flush_dirty(void) try_to_unmap_flush(); } +/* + * Bits 0-14 of mm->tlb_flush_batched record pending generations. + * Bits 16-30 of mm->tlb_flush_batched bit record flushed generations. 
+ */ +#define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16 +#define TLB_FLUSH_BATCH_PENDING_MASK \ + ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1) +#define TLB_FLUSH_BATCH_PENDING_LARGE \ + (TLB_FLUSH_BATCH_PENDING_MASK / 2) + static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; + int batch, nbatch; arch_tlbbatch_add_mm(&tlb_ubc->arch, mm); tlb_ubc->flush_required = true; @@ -633,7 +644,22 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) * before the PTE is cleared. */ barrier(); - mm->tlb_flush_batched = true; + batch = atomic_read(&mm->tlb_flush_batched); +retry: + if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) { + /* + * Prevent `pending' from catching up with `flushed' because of + * overflow. Reset `pending' and `flushed' to be 1 and 0 if + * `pending' becomes large. + */ + nbatch = atomic_cmpxchg(&mm->tlb_flush_batched, batch, 1); + if (nbatch != batch) { + batch = nbatch; + goto retry; + } + } else { + atomic_inc(&mm->tlb_flush_batched); + } /* * If the PTE was dirty then it's best to assume it's writable. The @@ -680,15 +706,18 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) */ void flush_tlb_batched_pending(struct mm_struct *mm) { - if (data_race(mm->tlb_flush_batched)) { - flush_tlb_mm(mm); + int batch = atomic_read(&mm->tlb_flush_batched); + int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK; + int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT; + if (pending != flushed) { + flush_tlb_mm(mm); /* - * Do not allow the compiler to re-order the clearing of - * tlb_flush_batched before the tlb is flushed. + * If the new TLB flushing is pending during flushing, leave + * mm->tlb_flush_batched as is, to avoid losing flushing. */ - barrier(); - mm->tlb_flush_batched = false; + atomic_cmpxchg(&mm->tlb_flush_batched, batch, + pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT)); } } #else -- cgit From f44e1e697674335f837280d5e4485d1523206ea9 Mon Sep 17 00:00:00 2001 From: Zhaoyu Liu Date: Fri, 14 Jan 2022 14:09:19 -0800 Subject: zpool: remove the list of pools_head The list of pools_head is no longer needed because the caller has been deleted in commit 479305fd7172 ("zpool: remove zpool_evict()"). Link: https://lkml.kernel.org/r/20211215163727.GA17196@pc Signed-off-by: Zhaoyu Liu Cc: Dan Streetman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zpool.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/mm/zpool.c b/mm/zpool.c index 6d9ed48141e5..68facc193496 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -24,16 +24,11 @@ struct zpool { const struct zpool_ops *ops; bool evictable; bool can_sleep_mapped; - - struct list_head list; }; static LIST_HEAD(drivers_head); static DEFINE_SPINLOCK(drivers_lock); -static LIST_HEAD(pools_head); -static DEFINE_SPINLOCK(pools_lock); - /** * zpool_register_driver() - register a zpool implementation. 
* @driver: driver to register @@ -195,10 +190,6 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, pr_debug("created pool type %s\n", type); - spin_lock(&pools_lock); - list_add(&zpool->list, &pools_head); - spin_unlock(&pools_lock); - return zpool; } @@ -217,9 +208,6 @@ void zpool_destroy_pool(struct zpool *zpool) { pr_debug("destroying pool type %s\n", zpool->driver->type); - spin_lock(&pools_lock); - list_del(&zpool->list); - spin_unlock(&pools_lock); zpool->driver->destroy(zpool->pool); zpool_put_driver(zpool->driver); kfree(zpool); -- cgit From 7f0d267243aa9dd32944bd7d3b34afff60545edb Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 14 Jan 2022 14:09:22 -0800 Subject: zram: use ATTRIBUTE_GROUPS Embrace ATTRIBUTE_GROUPS to avoid boiler plate code. This should not introduce any functional changes. Link: https://lkml.kernel.org/r/20211028203600.2157356-1-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Reviewed-by: Bart Van Assche Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Nitin Gupta Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 25071126995b..9a46b2ef6951 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1903,14 +1903,7 @@ static struct attribute *zram_disk_attrs[] = { NULL, }; -static const struct attribute_group zram_disk_attr_group = { - .attrs = zram_disk_attrs, -}; - -static const struct attribute_group *zram_disk_attr_groups[] = { - &zram_disk_attr_group, - NULL, -}; +ATTRIBUTE_GROUPS(zram_disk); /* * Allocate and initialize new zram device. the function returns @@ -1982,7 +1975,7 @@ static int zram_add(void) blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue); - ret = device_add_disk(NULL, zram->disk, zram_disk_attr_groups); + ret = device_add_disk(NULL, zram->disk, zram_disk_groups); if (ret) goto out_cleanup_disk; -- cgit From 0b8f0d870020dbd7037bfacbb73a9b3213470f90 Mon Sep 17 00:00:00 2001 From: Quanfa Fu Date: Fri, 14 Jan 2022 14:09:25 -0800 Subject: mm: fix some comment errors Link: https://lkml.kernel.org/r/20211101040208.460810-1-fuqf0919@gmail.com Signed-off-by: Quanfa Fu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 2 +- mm/memory-failure.c | 2 +- mm/slab_common.c | 2 +- mm/swap.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 02071f213c58..7af84bac6fc2 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1303,7 +1303,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, /* * Record which node the original page is from and save this * information to khugepaged_node_load[]. - * Khupaged will allocate hugepage from the node has the max + * Khugepaged will allocate hugepage from the node has the max * hit record. 
*/ node = page_to_nid(page); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6a2b4b86b679..373837bb94cb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1306,7 +1306,7 @@ static int __get_unpoison_page(struct page *page) * * get_hwpoison_page() takes a page refcount of an error page to handle memory * error on it, after checking that the error page is in a well-defined state - * (defined as a page-type we can successfully handle the memor error on it, + * (defined as a page-type we can successfully handle the memory error on it, * such as LRU page and hugetlb page). * * Memory error handling could be triggered at any time on any type of page, diff --git a/mm/slab_common.c b/mm/slab_common.c index 1f75bd4e95d6..9513244457e6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -819,7 +819,7 @@ void __init setup_kmalloc_cache_index_table(void) if (KMALLOC_MIN_SIZE >= 64) { /* - * The 96 byte size cache is not used if the alignment + * The 96 byte sized cache is not used if the alignment * is 64 byte. */ for (i = 64 + 8; i <= 96; i += 8) diff --git a/mm/swap.c b/mm/swap.c index e8c9dc6d0377..b461814ce0cb 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -882,7 +882,7 @@ void lru_cache_disable(void) * all online CPUs so any calls of lru_cache_disabled wrapped by * local_lock or preemption disabled would be ordered by that. * The atomic operation doesn't need to have stronger ordering - * requirements because that is enforeced by the scheduling + * requirements because that is enforced by the scheduling * guarantees. */ __lru_add_drain_all(true); -- cgit From cab0a7c115546a4865fb7439558af9077a569574 Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Fri, 14 Jan 2022 14:09:28 -0800 Subject: mm: make some vars and functions static or __init "page_idle_ops" as a global var, but its scope of use within this document. So it should be static. "page_ext_ops" is a var used in the kernel initial phase. And other functions are aslo used in the kernel initial phase. So they should be __init or __initdata to reclaim memory. Link: https://lkml.kernel.org/r/20211217095023.67293-1-liuting.0x7c00@bytedance.com Signed-off-by: Ting Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_idle.h | 1 - mm/page_ext.c | 4 ++-- mm/page_owner.c | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 83abf95e9fa7..4663dfed1293 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -13,7 +13,6 @@ * If there is not enough space to store Idle and Young bits in page flags, use * page ext flags instead. 
*/ -extern struct page_ext_operations page_idle_ops; static inline bool folio_test_young(struct folio *folio) { diff --git a/mm/page_ext.c b/mm/page_ext.c index bee3240604dc..2e66d934d63f 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -64,12 +64,12 @@ static bool need_page_idle(void) { return true; } -struct page_ext_operations page_idle_ops = { +static struct page_ext_operations page_idle_ops __initdata = { .need = need_page_idle, }; #endif -static struct page_ext_operations *page_ext_ops[] = { +static struct page_ext_operations *page_ext_ops[] __initdata = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif diff --git a/mm/page_owner.c b/mm/page_owner.c index 4f924957ce7a..5eea061bb1e5 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -46,7 +46,7 @@ static int __init early_page_owner_param(char *buf) } early_param("page_owner", early_page_owner_param); -static bool need_page_owner(void) +static __init bool need_page_owner(void) { return page_owner_enabled; } @@ -75,7 +75,7 @@ static noinline void register_early_stack(void) early_handle = create_dummy_stack(); } -static void init_page_owner(void) +static __init void init_page_owner(void) { if (!page_owner_enabled) return; -- cgit From 87c01d57fa23de82fff593a7d070933d08755801 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 14 Jan 2022 14:09:31 -0800 Subject: mm/hmm.c: allow VM_MIXEDMAP to work with hmm_range_fault hmm_range_fault() can be used instead of get_user_pages() for devices which allow faulting however unlike get_user_pages() it will return an error when used on a VM_MIXEDMAP range. To make hmm_range_fault() more closely match get_user_pages() remove this restriction. This requires dealing with the !ARCH_HAS_PTE_SPECIAL case in hmm_vma_handle_pte(). Rather than replicating the logic of vm_normal_page() call it directly and do a check for the zero pfn similar to what get_user_pages() currently does. Also add a test to hmm selftest to verify functionality. 
Link: https://lkml.kernel.org/r/20211104012001.2555676-1-apopple@nvidia.com Fixes: da4c3c735ea4 ("mm/hmm/mirror: helper to snapshot CPU page table") Signed-off-by: Alistair Popple Reviewed-by: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Zi Yan Cc: Ralph Campbell Cc: Felix Kuehling Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_hmm.c | 24 +++++++++++++++++++ mm/hmm.c | 5 ++-- tools/testing/selftests/vm/hmm-tests.c | 42 ++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 2 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e2ce8f9b7605..767538089a62 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -1086,9 +1086,33 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp, return 0; } +static int dmirror_fops_mmap(struct file *file, struct vm_area_struct *vma) +{ + unsigned long addr; + + for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { + struct page *page; + int ret; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + return -ENOMEM; + + ret = vm_insert_page(vma, addr, page); + if (ret) { + __free_page(page); + return ret; + } + put_page(page); + } + + return 0; +} + static const struct file_operations dmirror_fops = { .open = dmirror_fops_open, .release = dmirror_fops_release, + .mmap = dmirror_fops_mmap, .unlocked_ioctl = dmirror_fops_unlocked_ioctl, .llseek = default_llseek, .owner = THIS_MODULE, diff --git a/mm/hmm.c b/mm/hmm.c index 842e26599238..bd56641c79d4 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -300,7 +300,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, * Since each architecture defines a struct page for the zero page, just * fall through and treat it like a normal page. */ - if (pte_special(pte) && !pte_devmap(pte) && + if (!vm_normal_page(walk->vma, addr, pte) && + !pte_devmap(pte) && !is_zero_pfn(pte_pfn(pte))) { if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) { pte_unmap(ptep); @@ -518,7 +519,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end, struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) && + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) && vma->vm_flags & VM_READ) return 0; diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index 864f126ffd78..203323967b50 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -1248,6 +1248,48 @@ TEST_F(hmm, anon_teardown) } } +/* + * Test memory snapshot without faulting in pages accessed by the device. + */ +TEST_F(hmm, mixedmap) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned char *m; + int ret; + + npages = 1; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, + self->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Simulate a device snapshotting CPU pagetables. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. 
*/ + m = buffer->mirror; + ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ); + + hmm_buffer_free(buffer); +} + /* * Test memory snapshot without faulting in pages accessed by the device. */ -- cgit From b627b774911660852ce7f3f3817955ddad2bd130 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:34 -0800 Subject: mm/damon: unified access_check function naming rules Patch series "mm/damon: Do some small changes", v4. This patch (of 4): In damon/paddr.c file, two functions names start with underscore, static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, struct damon_region *r) static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, struct damon_region *r) In damon/vaddr.c file, there are also two functions with the same function, static void damon_va_prepare_access_check(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) static void damon_va_check_access(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) It makes sense to keep consistent, and it is not easy to be confused with the function that call them. Link: https://lkml.kernel.org/r/cover.1636989871.git.xhao@linux.alibaba.com Link: https://lkml.kernel.org/r/529054aed932a42b9c09fc9977ad4574b9e7b0bd.1636989871.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 20a9a9d69eb1..73c5d1aafda6 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -410,7 +410,7 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) * Functions for the access checking of the regions */ -static void damon_va_prepare_access_check(struct damon_ctx *ctx, +static void __damon_va_prepare_access_check(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); @@ -429,7 +429,7 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) - damon_va_prepare_access_check(ctx, mm, r); + __damon_va_prepare_access_check(ctx, mm, r); mmput(mm); } } @@ -515,7 +515,7 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr, * mm 'mm_struct' for the given virtual address space * r the region to be checked */ -static void damon_va_check_access(struct damon_ctx *ctx, +static void __damon_va_check_access(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) { static struct mm_struct *last_mm; @@ -551,7 +551,7 @@ unsigned int damon_va_check_accesses(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) { - damon_va_check_access(ctx, mm, r); + __damon_va_check_access(ctx, mm, r); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } mmput(mm); -- cgit From c46b0bb6a735db0b6140e12e750b5acb1b032982 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:37 -0800 Subject: mm/damon: add 'age' of region tracepoint support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In Damon, we can get age information by analyzing the nr_access change, But short time sampling is not effective, we have to obtain enough data for analysis through long time trace, this also means that we need to consume more cpu resources and storage space. 
Now the region add a new 'age' variable, we only need to get the change of age value through a little time trace, for example, age has been increasing to 141, but nr_access shows a value of 0 at the same time, Through this,we can conclude that the region has a very low nr_access value for a long time. Link: https://lkml.kernel.org/r/b9def1262af95e0dc1d0caea447886434db01161.1636989871.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/damon.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 2f422f4f1fb9..99ffa601e351 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -22,6 +22,7 @@ TRACE_EVENT(damon_aggregated, __field(unsigned long, start) __field(unsigned long, end) __field(unsigned int, nr_accesses) + __field(unsigned int, age) ), TP_fast_assign( @@ -30,11 +31,13 @@ TRACE_EVENT(damon_aggregated, __entry->start = r->ar.start; __entry->end = r->ar.end; __entry->nr_accesses = r->nr_accesses; + __entry->age = r->age; ), - TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u", + TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u %u", __entry->target_id, __entry->nr_regions, - __entry->start, __entry->end, __entry->nr_accesses) + __entry->start, __entry->end, + __entry->nr_accesses, __entry->age) ); #endif /* _TRACE_DAMON_H */ -- cgit From d720bbbd70e968f8a0257393b575c3a29b56f990 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:40 -0800 Subject: mm/damon/core: use abs() instead of diff_of() In kernel, we can use abs(a - b) to get the absolute value, So there is no need to redefine a new one. Link: https://lkml.kernel.org/r/b24e7b82d9efa90daf150d62dea171e19390ad0b.1636989871.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: Muchun Song Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index e92497895202..04b8df7fd9e9 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -750,8 +750,6 @@ static void damon_merge_two_regions(struct damon_target *t, damon_destroy_region(r, t); } -#define diff_of(a, b) (a > b ? a - b : b - a) - /* * Merge adjacent regions having similar access frequencies * @@ -765,13 +763,13 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, struct damon_region *r, *prev = NULL, *next; damon_for_each_region_safe(r, next, t) { - if (diff_of(r->nr_accesses, r->last_nr_accesses) > thres) + if (abs(r->nr_accesses - r->last_nr_accesses) > thres) r->age = 0; else r->age++; if (prev && prev->ar.end == r->ar.start && - diff_of(prev->nr_accesses, r->nr_accesses) <= thres && + abs(prev->nr_accesses - r->nr_accesses) <= thres && sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) damon_merge_two_regions(t, prev, r); else -- cgit From cdeed009f3bceee41f73f0137db785fd29a05cb8 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:44 -0800 Subject: mm/damon: remove some unneeded function definitions in damon.h In damon.h some func definitions about VA & PA can only be used in its own file, so there no need to define in the header file, and the header file will look cleaner. If other files later need these functions, the prototypes can be added to damon.h at that time. 
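Returning briefly to the 'age' tracepoint patch a few entries above: once age is reported next to nr_accesses, a single damon_aggregated sample is enough to tell a region that has been idle for many aggregation intervals (the nr_accesses=0, age=141 case mentioned there) apart from one that only just turned idle. A minimal user-space sketch of that interpretation; the threshold, names and output here are illustrative only and are not part of any DAMON interface:

#include <stdio.h>

/* Classify a region from one aggregated sample; the 100-interval threshold
 * is an arbitrary example value. */
static const char *classify(unsigned int nr_accesses, unsigned int age)
{
	if (nr_accesses == 0 && age > 100)
		return "cold for a long time";
	if (nr_accesses == 0)
		return "recently became cold";
	return "hot";
}

int main(void)
{
	/* The case from the commit message: nr_accesses 0 with age 141. */
	printf("%s\n", classify(0, 141));
	/* A region that only just stopped being accessed. */
	printf("%s\n", classify(0, 1));
	return 0;
}

A real consumer would take the two values from each damon_aggregated trace line (which, after that patch, prints nr_accesses followed by age) rather than hard-coding them as above.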
[sj@kernel.org: remove unnecessary function prototype position changes] Link: https://lkml.kernel.org/r/20211118114827.20052-1-sj@kernel.org Link: https://lkml.kernel.org/r/45fd5b3ef6cce8e28dbc1c92f9dc845ccfc949d7.1636989871.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 21 --------------------- mm/damon/paddr.c | 11 ++++++----- mm/damon/vaddr.c | 18 ++++++++++-------- 3 files changed, 16 insertions(+), 34 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index b4d4be3cc987..1d1be348f506 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -461,34 +461,13 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); #endif /* CONFIG_DAMON */ #ifdef CONFIG_DAMON_VADDR - -/* Monitoring primitives for virtual memory address spaces */ -void damon_va_init(struct damon_ctx *ctx); -void damon_va_update(struct damon_ctx *ctx); -void damon_va_prepare_access_checks(struct damon_ctx *ctx); -unsigned int damon_va_check_accesses(struct damon_ctx *ctx); bool damon_va_target_valid(void *t); -void damon_va_cleanup(struct damon_ctx *ctx); -int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); -int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); void damon_va_set_primitives(struct damon_ctx *ctx); - #endif /* CONFIG_DAMON_VADDR */ #ifdef CONFIG_DAMON_PADDR - -/* Monitoring primitives for the physical memory address space */ -void damon_pa_prepare_access_checks(struct damon_ctx *ctx); -unsigned int damon_pa_check_accesses(struct damon_ctx *ctx); bool damon_pa_target_valid(void *t); -int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); -int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); void damon_pa_set_primitives(struct damon_ctx *ctx); - #endif /* CONFIG_DAMON_PADDR */ #endif /* _DAMON_H */ diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index a496d6f203d6..4318134cbc4c 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -73,7 +73,7 @@ static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, damon_pa_mkold(r->sampling_addr); } -void damon_pa_prepare_access_checks(struct damon_ctx *ctx) +static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) { struct damon_target *t; struct damon_region *r; @@ -192,7 +192,7 @@ static void __damon_pa_check_access(struct damon_ctx *ctx, last_addr = r->sampling_addr; } -unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) +static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) { struct damon_target *t; struct damon_region *r; @@ -213,7 +213,7 @@ bool damon_pa_target_valid(void *t) return true; } -int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, +static int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme) { unsigned long addr; @@ -246,8 +246,9 @@ int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, return 0; } -int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static int damon_pa_scheme_score(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + 
struct damos *scheme) { switch (scheme->action) { case DAMOS_PAGEOUT: diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 73c5d1aafda6..a9d3b4d96e29 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -272,7 +272,7 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, } /* Initialize '->regions_list' of every target (task) */ -void damon_va_init(struct damon_ctx *ctx) +static void damon_va_init(struct damon_ctx *ctx) { struct damon_target *t; @@ -292,7 +292,8 @@ void damon_va_init(struct damon_ctx *ctx) * * Returns true if it is. */ -static bool damon_intersect(struct damon_region *r, struct damon_addr_range *re) +static bool damon_intersect(struct damon_region *r, + struct damon_addr_range *re) { return !(r->ar.end <= re->start || re->end <= r->ar.start); } @@ -356,7 +357,7 @@ static void damon_va_apply_three_regions(struct damon_target *t, /* * Update regions for current memory mappings */ -void damon_va_update(struct damon_ctx *ctx) +static void damon_va_update(struct damon_ctx *ctx) { struct damon_addr_range three_regions[3]; struct damon_target *t; @@ -418,7 +419,7 @@ static void __damon_va_prepare_access_check(struct damon_ctx *ctx, damon_va_mkold(mm, r->sampling_addr); } -void damon_va_prepare_access_checks(struct damon_ctx *ctx) +static void damon_va_prepare_access_checks(struct damon_ctx *ctx) { struct damon_target *t; struct mm_struct *mm; @@ -539,7 +540,7 @@ static void __damon_va_check_access(struct damon_ctx *ctx, last_addr = r->sampling_addr; } -unsigned int damon_va_check_accesses(struct damon_ctx *ctx) +static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) { struct damon_target *t; struct mm_struct *mm; @@ -603,7 +604,7 @@ out: } #endif /* CONFIG_ADVISE_SYSCALLS */ -int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, +static int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme) { int madv_action; @@ -633,8 +634,9 @@ int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, return damos_madvise(t, r, madv_action); } -int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static int damon_va_scheme_score(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { switch (scheme->action) { -- cgit From 8bd0b9da03c9154e279b1a502636103887b9fbed Mon Sep 17 00:00:00 2001 From: Yihao Han Date: Fri, 14 Jan 2022 14:09:47 -0800 Subject: mm/damon/vaddr: remove swap_ranges() and replace it with swap() Remove 'swap_ranges()' and replace it with the macro 'swap()' defined in 'include/linux/minmax.h' to simplify code and improve efficiency Link: https://lkml.kernel.org/r/20211111115355.2808-1-hanyihao@vivo.com Signed-off-by: Yihao Han Reviewed-by: SeongJae Park Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index a9d3b4d96e29..78ff2bcb66eb 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -98,16 +98,6 @@ static unsigned long sz_range(struct damon_addr_range *r) return r->end - r->start; } -static void swap_ranges(struct damon_addr_range *r1, - struct damon_addr_range *r2) -{ - struct damon_addr_range tmp; - - tmp = *r1; - *r1 = *r2; - *r2 = tmp; -} - /* * Find three regions separated by two biggest unmapped regions * @@ -146,9 +136,9 @@ static int 
__damon_va_three_regions(struct vm_area_struct *vma, gap.start = last_vma->vm_end; gap.end = vma->vm_start; if (sz_range(&gap) > sz_range(&second_gap)) { - swap_ranges(&gap, &second_gap); + swap(gap, second_gap); if (sz_range(&second_gap) > sz_range(&first_gap)) - swap_ranges(&second_gap, &first_gap); + swap(second_gap, first_gap); } next: last_vma = vma; @@ -159,7 +149,7 @@ next: /* Sort the two biggest gaps by address */ if (first_gap.start > second_gap.start) - swap_ranges(&first_gap, &second_gap); + swap(first_gap, second_gap); /* Store the result */ regions[0].start = ALIGN(start, DAMON_MIN_REGION); -- cgit From c89ae63eb0662b6c9f82dbfad3ef010239b8c1b1 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:50 -0800 Subject: mm/damon/schemes: add the validity judgment of thresholds In dbgfs "schemes" interface, i do some test like this: # cd /sys/kernel/debug/damon # echo "2 1 2 1 10 1 3 10 1 1 1 1 1 1 1 1 2 3" > schemes # cat schemes # 2 1 2 1 10 1 3 10 1 1 1 1 1 1 1 1 2 3 0 0 There have some unreasonable places, i set the valules of these variables " , , " as "<2, 1>, <2, 1>, <10, 1>, <1, 2, 3>. So there add a validity judgment for these thresholds value. Link: https://lkml.kernel.org/r/d78360e52158d786fcbf20bc62c96785742e76d3.1637239568.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index ad65436756af..bf36a2756cfb 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -213,6 +213,13 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, if (!damos_action_valid(action)) goto fail; + if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age) + goto fail; + + if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low || + wmarks.mid < wmarks.low) + goto fail; + pos += parsed; scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, min_age, max_age, action, "a, &wmarks); -- cgit From 9b2a38d6ef25c1748e3964b0ff30a89e4ed26583 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:53 -0800 Subject: mm/damon: move damon_rand() definition into damon.h damon_rand() is called in three files:damon/core.c, damon/ paddr.c, damon/vaddr.c, i think there is no need to redefine this twice, So move it to damon.h will be a good choice. Link: https://lkml.kernel.org/r/20211202075859.51341-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 4 ++++ mm/damon/core.c | 4 ---- mm/damon/prmtv-common.h | 4 ---- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 1d1be348f506..3e91a597a1aa 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -11,12 +11,16 @@ #include #include #include +#include /* Minimal region size. Every damon_region is aligned by this. */ #define DAMON_MIN_REGION PAGE_SIZE /* Max priority score for DAMON-based operation schemes */ #define DAMOS_MAX_SCORE (99) +/* Get a random number in [l, r) */ +#define damon_rand(l, r) (l + prandom_u32_max(r - l)) + /** * struct damon_addr_range - Represents an address region of [@start, @end). * @start: Start address of the region (inclusive). 
diff --git a/mm/damon/core.c b/mm/damon/core.c index 04b8df7fd9e9..61e844d15b13 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -23,9 +22,6 @@ #define DAMON_MIN_REGION 1 #endif -/* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) - static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; diff --git a/mm/damon/prmtv-common.h b/mm/damon/prmtv-common.h index 61f27037603e..e790cb5f8fe0 100644 --- a/mm/damon/prmtv-common.h +++ b/mm/damon/prmtv-common.h @@ -6,10 +6,6 @@ */ #include -#include - -/* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) struct page *damon_get_page(unsigned long pfn); -- cgit From 234d68732b6c135087bdebfa0630a43ae8c27758 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:56 -0800 Subject: mm/damon: modify damon_rand() macro to static inline function damon_rand() cannot be implemented as a macro. Example: damon_rand(a++, b); The value of 'a' will be incremented twice, This is obviously unreasonable, So there fix it. Link: https://lkml.kernel.org/r/110ffcd4e420c86c42b41ce2bc9f0fe6a4f32cd3.1638795127.git.xhao@linux.alibaba.com Fixes: b9a6ac4e4ede ("mm/damon: adaptively adjust regions") Signed-off-by: Xin Hao Reported-by: Andrew Morton Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 3e91a597a1aa..e2c8152985b7 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -19,7 +19,10 @@ #define DAMOS_MAX_SCORE (99) /* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) +static inline unsigned long damon_rand(unsigned long l, unsigned long r) +{ + return l + prandom_u32_max(r - l); +} /** * struct damon_addr_range - Represents an address region of [@start, @end). -- cgit From 88f86dcfa454784f7de550966c60fc78a3e95d6d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:09:59 -0800 Subject: mm/damon: convert macro functions to static inline functions Patch series "mm/damon: Misc cleanups". This patchset contains miscellaneous cleanups for DAMON's macro functions and documentation. This patch (of 6): This commit converts macro functions in DAMON to static inline functions, for better type checking, code documentation, etc[1]. 
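A minimal user-space sketch of the two hazards such a conversion removes, double evaluation of macro arguments and the lack of argument type checking; the names below are purely illustrative stand-ins (not the DAMON helpers themselves) and the snippet compiles outside the kernel:

#include <stdio.h>
#include <stdlib.h>

static int lower_calls;

static int lower(void)			/* an argument with a side effect */
{
	lower_calls++;
	return 0;
}

/* Function-like macro: 'l' is textually pasted twice, so lower() runs twice,
 * and any argument that happens to expand into valid C is silently accepted. */
#define rand_in_macro(l, r)	((l) + rand() % ((r) - (l)))

/* static inline: the argument is evaluated exactly once and must be an int. */
static inline int rand_in_inline(int l, int r)
{
	return l + rand() % (r - l);
}

int main(void)
{
	int v;

	v = rand_in_macro(lower(), 10);
	printf("macro : value %d, argument evaluated %d time(s)\n", v, lower_calls);

	lower_calls = 0;
	v = rand_in_inline(lower(), 10);
	printf("inline: value %d, argument evaluated %d time(s)\n", v, lower_calls);
	return 0;
}

The macro call reports its argument being evaluated twice, the inline call once; this is the same double-evaluation problem the damon_rand(a++, b) example in the earlier patch describes.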
[1] https://lore.kernel.org/linux-mm/20211202151213.6ec830863342220da4141bc5@linux-foundation.org/ Link: https://lkml.kernel.org/r/20211209131806.19317-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211209131806.19317-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 18 ++++++++++++------ mm/damon/core.c | 5 ++++- mm/damon/vaddr.c | 6 ++++-- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index e2c8152985b7..2dbc1f545da2 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -399,14 +399,20 @@ struct damon_ctx { struct list_head schemes; }; -#define damon_next_region(r) \ - (container_of(r->list.next, struct damon_region, list)) +static inline struct damon_region *damon_next_region(struct damon_region *r) +{ + return container_of(r->list.next, struct damon_region, list); +} -#define damon_prev_region(r) \ - (container_of(r->list.prev, struct damon_region, list)) +static inline struct damon_region *damon_prev_region(struct damon_region *r) +{ + return container_of(r->list.prev, struct damon_region, list); +} -#define damon_last_region(t) \ - (list_last_entry(&t->regions_list, struct damon_region, list)) +static inline struct damon_region *damon_last_region(struct damon_target *t) +{ + return list_last_entry(&t->regions_list, struct damon_region, list); +} #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) diff --git a/mm/damon/core.c b/mm/damon/core.c index 61e844d15b13..4515cf82c433 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -729,7 +729,10 @@ static void kdamond_apply_schemes(struct damon_ctx *c) } } -#define sz_damon_region(r) (r->ar.end - r->ar.start) +static inline unsigned long sz_damon_region(struct damon_region *r) +{ + return r->ar.end - r->ar.start; +} /* * Merge two adjacent regions into one region diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 78ff2bcb66eb..68d9e4134816 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -26,8 +26,10 @@ * 't->id' should be the pointer to the relevant 'struct pid' having reference * count. Caller must put the returned task, unless it is NULL. */ -#define damon_get_task_struct(t) \ - (get_pid_task((struct pid *)t->id, PIDTYPE_PID)) +static inline struct task_struct *damon_get_task_struct(struct damon_target *t) +{ + return get_pid_task((struct pid *)t->id, PIDTYPE_PID); +} /* * Get the mm_struct of the given target -- cgit From 6322416b2d51f359efa7d875ab28bd195a5eb230 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:02 -0800 Subject: Docs/admin-guide/mm/damon/usage: update for scheme quotas and watermarks DAMOS features including time/space quota limits and watermarks are not described in the DAMON debugfs interface document. This commit updates the document for the features. 
Link: https://lkml.kernel.org/r/20211209131806.19317-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/usage.rst | 123 +++++++++++++++++++++------ 1 file changed, 98 insertions(+), 25 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index ed96bbf0daff..1ab9b714fca2 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -131,24 +131,38 @@ Schemes For usual DAMON-based data access aware memory management optimizations, users would simply want the system to apply a memory management action to a memory -region of a specific size having a specific access frequency for a specific -time. DAMON receives such formalized operation schemes from the user and -applies those to the target processes. It also counts the total number and -size of regions that each scheme is applied. This statistics can be used for -online analysis or tuning of the schemes. +region of a specific access pattern. DAMON receives such formalized operation +schemes from the user and applies those to the target processes. Users can get and set the schemes by reading from and writing to ``schemes`` debugfs file. Reading the file also shows the statistics of each scheme. To -the file, each of the schemes should be represented in each line in below form: +the file, each of the schemes should be represented in each line in below +form:: - min-size max-size min-acc max-acc min-age max-age action + -Note that the ranges are closed interval. Bytes for the size of regions -(``min-size`` and ``max-size``), number of monitored accesses per aggregate -interval for access frequency (``min-acc`` and ``max-acc``), number of -aggregate intervals for the age of regions (``min-age`` and ``max-age``), and a -predefined integer for memory management actions should be used. The supported -numbers and their meanings are as below. +You can disable schemes by simply writing an empty string to the file. + +Target Access Pattern +~~~~~~~~~~~~~~~~~~~~~ + +The ```` is constructed with three ranges in below +form:: + + min-size max-size min-acc max-acc min-age max-age + +Specifically, bytes for the size of regions (``min-size`` and ``max-size``), +number of monitored accesses per aggregate interval for access frequency +(``min-acc`` and ``max-acc``), number of aggregate intervals for the age of +regions (``min-age`` and ``max-age``) are specified. Note that the ranges are +closed interval. + +Action +~~~~~~ + +The ```` is a predefined integer for memory management actions, which +DAMON will apply to the regions having the target access pattern. The +supported numbers and their meanings are as below. - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED`` - 1: Call ``madvise()`` for the region with ``MADV_COLD`` @@ -157,20 +171,79 @@ numbers and their meanings are as below. - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` - 5: Do nothing but count the statistics -You can disable schemes by simply writing an empty string to the file. For -example, below commands applies a scheme saying "If a memory region of size in -[4KiB, 8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate -interval in [10, 20], page out the region", check the entered scheme again, and -finally remove the scheme. 
:: +Quota +~~~~~ - # cd /damon - # echo "4096 8192 0 5 10 20 2" > schemes - # cat schemes - 4096 8192 0 5 10 20 2 0 0 - # echo > schemes +Optimal ``target access pattern`` for each ``action`` is workload dependent, so +not easy to find. Worse yet, setting a scheme of some action too aggressive +can cause severe overhead. To avoid such overhead, users can limit time and +size quota for the scheme via the ```` in below form:: + + + +This makes DAMON to try to use only up to ```` milliseconds for applying +the action to memory regions of the ``target access pattern`` within the +```` milliseconds, and to apply the action to only up to +```` bytes of memory regions within the ````. Setting both +```` and ```` zero disables the quota limits. + +When the quota limit is expected to be exceeded, DAMON prioritizes found memory +regions of the ``target access pattern`` based on their size, access frequency, +and age. For personalized prioritization, users can set the weights for the +three properties in ```` in below form:: + + + +Watermarks +~~~~~~~~~~ -The last two integers in the 4th line of above example is the total number and -the total size of the regions that the scheme is applied. +Some schemes would need to run based on current value of the system's specific +metrics like free memory ratio. For such cases, users can specify watermarks +for the condition.:: + + + +```` is a predefined integer for the metric to be checked. The +supported numbers and their meanings are as below. + + - 0: Ignore the watermarks + - 1: System's free memory rate (per thousand) + +The value of the metric is checked every ```` microseconds. + +If the value is higher than ```` or lower than ````, the +scheme is deactivated. If the value is lower than ````, the scheme +is activated. + +Statistics +~~~~~~~~~~ + +It also counts the total number and bytes of regions that each scheme is +applied. This statistics can be used for online analysis or tuning of the +schemes. + +The statistics can be shown by reading the ``schemes`` file. Reading the file +will show each scheme you entered in each line, and the two numbers for the +statistics will be added at the end of each line. + +Example +~~~~~~~ + +Below commands applies a scheme saying "If a memory region of size in [4KiB, +8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate +interval in [10, 20], page out the region. For the paging out, use only up to +10ms per second, and also don't page out more than 1GiB per second. Under the +limitation, page out memory regions having longer age first. Also, check the +free memory rate of the system every 5 seconds, start the monitoring and paging +out when the free memory rate becomes lower than 50%, but stop it if the free +memory rate becomes larger than 60%, or lower than 30%".:: + + # cd /damon + # scheme="4096 8192 0 5 10 20 2" # target access pattern and action + # scheme+=" 10 $((1024*1024*1024)) 1000" # quotas + # scheme+=" 0 0 100" # prioritization weights + # scheme+=" 1 5000000 600 500 300" # watermarks + # echo "$scheme" > schemes Turning On/Off -- cgit From 35b43d4092008ad33d3bcccee4b262ffbf8a551c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:05 -0800 Subject: Docs/admin-guide/mm/damon/usage: remove redundant information DAMON usage document mentions DAMON user space tool and programming interface twice. This commit integrates those and remove unnecessary part. 
Link: https://lkml.kernel.org/r/20211209131806.19317-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/usage.rst | 44 ++++++++++++++-------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 1ab9b714fca2..24137312f601 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -7,30 +7,30 @@ Detailed Usages DAMON provides below three interfaces for different users. - *DAMON user space tool.* - This is for privileged people such as system administrators who want a - just-working human-friendly interface. Using this, users can use the DAMON’s - major features in a human-friendly way. It may not be highly tuned for - special cases, though. It supports both virtual and physical address spaces - monitoring. + `This `_ is for privileged people such as + system administrators who want a just-working human-friendly interface. + Using this, users can use the DAMON’s major features in a human-friendly way. + It may not be highly tuned for special cases, though. It supports both + virtual and physical address spaces monitoring. For more detail, please + refer to its `usage document + `_. - *debugfs interface.* - This is for privileged user space programmers who want more optimized use of - DAMON. Using this, users can use DAMON’s major features by reading - from and writing to special debugfs files. Therefore, you can write and use - your personalized DAMON debugfs wrapper programs that reads/writes the - debugfs files instead of you. The DAMON user space tool is also a reference - implementation of such programs. It supports both virtual and physical - address spaces monitoring. + :ref:`This ` is for privileged user space programmers who + want more optimized use of DAMON. Using this, users can use DAMON’s major + features by reading from and writing to special debugfs files. Therefore, + you can write and use your personalized DAMON debugfs wrapper programs that + reads/writes the debugfs files instead of you. The `DAMON user space tool + `_ is one example of such programs. It + supports both virtual and physical address spaces monitoring. - *Kernel Space Programming Interface.* - This is for kernel space programmers. Using this, users can utilize every - feature of DAMON most flexibly and efficiently by writing kernel space - DAMON application programs for you. You can even extend DAMON for various - address spaces. - -Nevertheless, you could write your own user space tool using the debugfs -interface. A reference implementation is available at -https://github.com/awslabs/damo. If you are a kernel programmer, you could -refer to :doc:`/vm/damon/api` for the kernel space programming interface. For -the reason, this document describes only the debugfs interface + :doc:`This ` is for kernel space programmers. Using this, + users can utilize every feature of DAMON most flexibly and efficiently by + writing kernel space DAMON application programs for you. You can even extend + DAMON for various address spaces. For detail, please refer to the interface + :doc:`document `. + + +.. 
_debugfs_interface: debugfs Interface ================= -- cgit From 4492bf452af532493b6591d2e090a0f8f7c11674 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:08 -0800 Subject: Docs/admin-guide/mm/damon/usage: mention tracepoint at the beginning To get detailed monitoring results from the user space, users need to use the damon_aggregated tracepoint. This commit adds a brief mention of it at the beginning of the usage document. Link: https://lkml.kernel.org/r/20211209131806.19317-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/usage.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 24137312f601..846c85bf4b9d 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -21,7 +21,10 @@ DAMON provides below three interfaces for different users. you can write and use your personalized DAMON debugfs wrapper programs that reads/writes the debugfs files instead of you. The `DAMON user space tool `_ is one example of such programs. It - supports both virtual and physical address spaces monitoring. + supports both virtual and physical address spaces monitoring. Note that this + interface provides only simple :ref:`statistics ` for the + monitoring results. For detailed monitoring results, DAMON provides a + :ref:`tracepoint `. - *Kernel Space Programming Interface.* :doc:`This ` is for kernel space programmers. Using this, users can utilize every feature of DAMON most flexibly and efficiently by @@ -215,6 +218,8 @@ If the value is higher than ```` or lower than ````, the scheme is deactivated. If the value is lower than ````, the scheme is activated. +.. _damos_stats: + Statistics ~~~~~~~~~~ @@ -268,6 +273,8 @@ the monitoring is turned on. If you write to the files while DAMON is running, an error code such as ``-EBUSY`` will be returned. +.. _tracepoint: + Tracepoint for Monitoring Results ================================= -- cgit From 995d739cde879a35ef6e890ecf80226b605ad36c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:11 -0800 Subject: Docs/admin-guide/mm/damon/usage: update for kdamond_pid and (mk|rm)_contexts The DAMON debugfs usage document is missing descriptions for 'kdamond_pid', 'mk_contexts', and 'rm_contexts' debugfs files. This commit adds those. Link: https://lkml.kernel.org/r/20211209131806.19317-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/usage.rst | 52 ++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 846c85bf4b9d..cb614c84ba9e 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -38,9 +38,9 @@ DAMON provides below three interfaces for different users. debugfs Interface ================= -DAMON exports five files, ``attrs``, ``target_ids``, ``init_regions``, -``schemes`` and ``monitor_on`` under its debugfs directory, -``/damon/``. +DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``, +``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and +``rm_contexts`` under its debugfs directory, ``/damon/``. 
Attributes @@ -273,6 +273,52 @@ the monitoring is turned on. If you write to the files while DAMON is running, an error code such as ``-EBUSY`` will be returned. +Monitoring Thread PID +--------------------- + +DAMON does requested monitoring with a kernel thread called ``kdamond``. You +can get the pid of the thread by reading the ``kdamond_pid`` file. When the +monitoring is turned off, reading the file returns ``none``. :: + + # cd /damon + # cat monitor_on + off + # cat kdamond_pid + none + # echo on > monitor_on + # cat kdamond_pid + 18594 + + +Using Multiple Monitoring Threads +--------------------------------- + +One ``kdamond`` thread is created for each monitoring context. You can create +and remove monitoring contexts for multiple ``kdamond`` required use case using +the ``mk_contexts`` and ``rm_contexts`` files. + +Writing the name of the new context to the ``mk_contexts`` file creates a +directory of the name on the DAMON debugfs directory. The directory will have +DAMON debugfs files for the context. :: + + # cd /damon + # ls foo + # ls: cannot access 'foo': No such file or directory + # echo foo > mk_contexts + # ls foo + # attrs init_regions kdamond_pid schemes target_ids + +If the context is not needed anymore, you can remove it and the corresponding +directory by putting the name of the context to the ``rm_contexts`` file. :: + + # echo foo > rm_contexts + # ls foo + # ls: cannot access 'foo': No such file or directory + +Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on`` files are in the +root directory only. + + .. _tracepoint: Tracepoint for Monitoring Results -- cgit From f4c6d22c6cf282ef7d24a724b9bd978ee2b74fc6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:14 -0800 Subject: mm/damon: remove a mistakenly added comment for a future feature Due to a mistake in patches reordering, a comment for a future feature called 'arbitrary monitoring target support'[1], which is still under development, has added. Because it only introduces confusion and we don't have a plan to post the patches soon, this commit removes the mistakenly added part. [1] https://lore.kernel.org/linux-mm/20201215115448.25633-3-sjpark@amazon.com/ Link: https://lkml.kernel.org/r/20211209131806.19317-7-sj@kernel.org Fixes: 1f366e421c8f ("mm/damon/core: implement DAMON-based Operation Schemes (DAMOS)") Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 2dbc1f545da2..97f4a224e950 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -281,7 +281,7 @@ struct damon_ctx; * as an integer in [0, &DAMOS_MAX_SCORE]. * @apply_scheme is called from @kdamond when a region for user provided * DAMON-based operation scheme is found. It should apply the scheme's action - * to the region. This is not used for &DAMON_ARBITRARY_TARGET case. + * to the region. * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. -- cgit From 0e92c2ee9f459542c5384d9cfab24873c3dd6398 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:17 -0800 Subject: mm/damon/schemes: account scheme actions that successfully applied Patch series "mm/damon/schemes: Extend stats for better online analysis and tuning". 
To help online access pattern analysis and tuning of DAMON-based Operation Schemes (DAMOS), DAMOS provides simple statistics for each scheme. Introduction of DAMOS time/space quota further made the tuning easier by making the risk management easier. However, that also made understanding of the working schemes a little bit more difficult. For an example, progress of a given scheme can now be throttled by not only the aggressiveness of the target access pattern, but also the time/space quotas. So, when a scheme is showing unexpectedly slow progress, it's difficult to know by what the progress of the scheme is throttled, with currently provided statistics. This patchset extends the statistics to contain some metrics that can be helpful for such online schemes analysis and tuning (patches 1-2), exports those to users (patches 3 and 5), and add documents (patches 4 and 6). This patch (of 6): DAMON-based operation schemes (DAMOS) stats provide only the number and the amount of regions that the action of the scheme has tried to be applied. Because the action could be failed for some reasons, the currently provided information is sometimes not useful or convenient enough for schemes profiling and tuning. To improve this situation, this commit extends the DAMOS stats to provide the number and the amount of regions that the action has successfully applied. Link: https://lkml.kernel.org/r/20211210150016.35349-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211210150016.35349-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 28 +++++++++++++++++++++------- mm/damon/core.c | 13 ++++++++----- mm/damon/dbgfs.c | 2 +- mm/damon/paddr.c | 13 +++++++------ mm/damon/vaddr.c | 30 ++++++++++++++++-------------- 5 files changed, 53 insertions(+), 33 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 97f4a224e950..e0ad3d9aaeed 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -192,6 +192,20 @@ struct damos_watermarks { bool activated; }; +/** + * struct damos_stat - Statistics on a given scheme. + * @nr_tried: Total number of regions that the scheme is tried to be applied. + * @sz_tried: Total size of regions that the scheme is tried to be applied. + * @nr_applied: Total number of regions that the scheme is applied. + * @sz_applied: Total size of regions that the scheme is applied. + */ +struct damos_stat { + unsigned long nr_tried; + unsigned long sz_tried; + unsigned long nr_applied; + unsigned long sz_applied; +}; + /** * struct damos - Represents a Data Access Monitoring-based Operation Scheme. * @min_sz_region: Minimum size of target regions. @@ -203,8 +217,7 @@ struct damos_watermarks { * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. - * @stat_count: Total number of regions that this scheme is applied. - * @stat_sz: Total size of regions that this scheme is applied. + * @stat: Statistics of this scheme. * @list: List head for siblings. * * For each aggregation interval, DAMON finds regions which fit in the @@ -235,8 +248,7 @@ struct damos { enum damos_action action; struct damos_quota quota; struct damos_watermarks wmarks; - unsigned long stat_count; - unsigned long stat_sz; + struct damos_stat stat; struct list_head list; }; @@ -281,7 +293,8 @@ struct damon_ctx; * as an integer in [0, &DAMOS_MAX_SCORE]. 
* @apply_scheme is called from @kdamond when a region for user provided * DAMON-based operation scheme is found. It should apply the scheme's action - * to the region. + * to the region and return bytes of the region that the action is successfully + * applied. * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. @@ -295,8 +308,9 @@ struct damon_primitive { int (*get_scheme_score)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); - int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); + unsigned long (*apply_scheme)(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme); bool (*target_valid)(void *target); void (*cleanup)(struct damon_ctx *context); }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 4515cf82c433..d745bf28509f 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -102,8 +102,7 @@ struct damos *damon_new_scheme( scheme->min_age_region = min_age_region; scheme->max_age_region = max_age_region; scheme->action = action; - scheme->stat_count = 0; - scheme->stat_sz = 0; + scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); scheme->quota.ms = quota->ms; @@ -574,6 +573,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, struct damos_quota *quota = &s->quota; unsigned long sz = r->ar.end - r->ar.start; struct timespec64 begin, end; + unsigned long sz_applied = 0; if (!s->wmarks.activated) continue; @@ -627,7 +627,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_split_region_at(c, t, r, sz); } ktime_get_coarse_ts64(&begin); - c->primitive.apply_scheme(c, t, r, s); + sz_applied = c->primitive.apply_scheme(c, t, r, s); ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); @@ -641,8 +641,11 @@ static void damon_do_apply_schemes(struct damon_ctx *c, r->age = 0; update_stat: - s->stat_count++; - s->stat_sz += sz; + s->stat.nr_tried++; + s->stat.sz_tried += sz; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; } } diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index bf36a2756cfb..9318b52d0b46 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -117,7 +117,7 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) s->quota.weight_age, s->wmarks.metric, s->wmarks.interval, s->wmarks.high, s->wmarks.mid, s->wmarks.low, - s->stat_count, s->stat_sz); + s->stat.nr_tried, s->stat.sz_tried); if (!rc) return -ENOMEM; diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 4318134cbc4c..5e8244f65a1a 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -213,14 +213,15 @@ bool damon_pa_target_valid(void *t) return true; } -static int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { - unsigned long addr; + unsigned long addr, applied; LIST_HEAD(page_list); if (scheme->action != DAMOS_PAGEOUT) - return -EINVAL; + return 0; for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { struct page *page = damon_get_page(PHYS_PFN(addr)); @@ -241,9 +242,9 @@ static int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, put_page(page); } } - reclaim_pages(&page_list); + 
applied = reclaim_pages(&page_list); cond_resched(); - return 0; + return applied * PAGE_SIZE; } static int damon_pa_scheme_score(struct damon_ctx *context, diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 68d9e4134816..a10df3fd3d02 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -572,32 +572,34 @@ bool damon_va_target_valid(void *target) } #ifndef CONFIG_ADVISE_SYSCALLS -static int damos_madvise(struct damon_target *target, struct damon_region *r, - int behavior) +static unsigned long damos_madvise(struct damon_target *target, + struct damon_region *r, int behavior) { - return -EINVAL; + return 0; } #else -static int damos_madvise(struct damon_target *target, struct damon_region *r, - int behavior) +static unsigned long damos_madvise(struct damon_target *target, + struct damon_region *r, int behavior) { struct mm_struct *mm; - int ret = -ENOMEM; + unsigned long start = PAGE_ALIGN(r->ar.start); + unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start); + unsigned long applied; mm = damon_get_mm(target); if (!mm) - goto out; + return 0; - ret = do_madvise(mm, PAGE_ALIGN(r->ar.start), - PAGE_ALIGN(r->ar.end - r->ar.start), behavior); + applied = do_madvise(mm, start, len, behavior) ? 0 : len; mmput(mm); -out: - return ret; + + return applied; } #endif /* CONFIG_ADVISE_SYSCALLS */ -static int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { int madv_action; @@ -620,7 +622,7 @@ static int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, case DAMOS_STAT: return 0; default: - return -EINVAL; + return 0; } return damos_madvise(t, r, madv_action); -- cgit From 6268eac34ca30af7f6313504d556ec7fcd295621 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:20 -0800 Subject: mm/damon/schemes: account how many times quota limit has exceeded If the time/space quotas of a given DAMON-based operation scheme are too small, the scheme can show unexpectedly slow progress, and there is no good way to notice this at runtime. This commit extends the DAMOS stats to record how many times the quota limits have been exceeded, so that users can easily notice the situation and tune the scheme. Link: https://lkml.kernel.org/r/20211210150016.35349-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 ++ mm/damon/core.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index e0ad3d9aaeed..af648388e759 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -198,12 +198,14 @@ struct damos_watermarks { * @sz_tried: Total size of regions that the scheme is tried to be applied. * @nr_applied: Total number of regions that the scheme is applied. * @sz_applied: Total size of regions that the scheme is applied. + * @qt_exceeds: Total number of times the quota of the scheme has exceeded. 
*/ struct damos_stat { unsigned long nr_tried; unsigned long sz_tried; unsigned long nr_applied; unsigned long sz_applied; + unsigned long qt_exceeds; }; /** diff --git a/mm/damon/core.c b/mm/damon/core.c index d745bf28509f..d5120b326e1b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -693,6 +693,8 @@ static void kdamond_apply_schemes(struct damon_ctx *c) if (time_after_eq(jiffies, quota->charged_from + msecs_to_jiffies( quota->reset_interval))) { + if (quota->esz && quota->charged_sz >= quota->esz) + s->stat.qt_exceeds++; quota->total_charged_sz += quota->charged_sz; quota->charged_from = jiffies; quota->charged_sz = 0; -- cgit From 60e52e7c46a127bca5ddd48b89002564f3862063 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:23 -0800 Subject: mm/damon/reclaim: provide reclamation statistics This implements new DAMON_RECLAIM parameters for statistics reporting. Those can be used for understanding how DAMON_RECLAIM is working, and for tuning the other parameters. Link: https://lkml.kernel.org/r/20211210150016.35349-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/reclaim.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index dc1485044eaf..bc476cef688e 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -185,6 +185,36 @@ module_param(monitor_region_end, ulong, 0600); static int kdamond_pid __read_mostly = -1; module_param(kdamond_pid, int, 0400); +/* + * Number of memory regions that tried to be reclaimed. + */ +static unsigned long nr_reclaim_tried_regions __read_mostly; +module_param(nr_reclaim_tried_regions, ulong, 0400); + +/* + * Total bytes of memory regions that tried to be reclaimed. + */ +static unsigned long bytes_reclaim_tried_regions __read_mostly; +module_param(bytes_reclaim_tried_regions, ulong, 0400); + +/* + * Number of memory regions that successfully be reclaimed. + */ +static unsigned long nr_reclaimed_regions __read_mostly; +module_param(nr_reclaimed_regions, ulong, 0400); + +/* + * Total bytes of memory regions that successfully be reclaimed. 
+ */ +static unsigned long bytes_reclaimed_regions __read_mostly; +module_param(bytes_reclaimed_regions, ulong, 0400); + +/* + * Number of times that the time/space quota limits have exceeded + */ +static unsigned long nr_quota_exceeds __read_mostly; +module_param(nr_quota_exceeds, ulong, 0400); + static struct damon_ctx *ctx; static struct damon_target *target; @@ -333,6 +363,21 @@ static void damon_reclaim_timer_fn(struct work_struct *work) } static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); +static int damon_reclaim_after_aggregation(struct damon_ctx *c) +{ + struct damos *s; + + /* update the stats parameter */ + damon_for_each_scheme(s, c) { + nr_reclaim_tried_regions = s->stat.nr_tried; + bytes_reclaim_tried_regions = s->stat.sz_tried; + nr_reclaimed_regions = s->stat.nr_applied; + bytes_reclaimed_regions = s->stat.sz_applied; + nr_quota_exceeds = s->stat.qt_exceeds; + } + return 0; +} + static int __init damon_reclaim_init(void) { ctx = damon_new_ctx(); @@ -340,6 +385,7 @@ static int __init damon_reclaim_init(void) return -ENOMEM; damon_pa_set_primitives(ctx); + ctx->callback.after_aggregation = damon_reclaim_after_aggregation; /* 4242 means nothing but fun */ target = damon_new_target(4242); -- cgit From 81f0895f1f5ed0d2bb80559ba9fbc6ce814e7235 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:26 -0800 Subject: Docs/admin-guide/mm/damon/reclaim: document statistics parameters This adds descriptions for the DAMON_RECLAIM statistics parameters. Link: https://lkml.kernel.org/r/20211210150016.35349-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/reclaim.rst | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index fb9def3a7355..0af51a9705b1 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -208,6 +208,31 @@ PID of the DAMON thread. If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread. Else, -1. +nr_reclaim_tried_regions +------------------------ + +Number of memory regions that tried to be reclaimed by DAMON_RECLAIM. + +bytes_reclaim_tried_regions +--------------------------- + +Total bytes of memory regions that tried to be reclaimed by DAMON_RECLAIM. + +nr_reclaimed_regions +-------------------- + +Number of memory regions that successfully be reclaimed by DAMON_RECLAIM. + +bytes_reclaimed_regions +----------------------- + +Total bytes of memory regions that successfully be reclaimed by DAMON_RECLAIM. + +nr_quota_exceeds +---------------- + +Number of times that the time/space quota limits have exceeded. + Example ======= -- cgit From 3a619fdb8de8a3ecd4200e7d183d2c8ceb32289e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:29 -0800 Subject: mm/damon/dbgfs: support all DAMOS stats Currently, DAMON debugfs interface is not supporting DAMON-based Operation Schemes (DAMOS) stats for schemes successfully applied regions and time/space quota limit exceeds. This adds the support. 
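Taken together, the counters above are what let one tell a pattern-limited scheme from a quota-limited one at runtime. Below is a minimal sketch of an in-kernel consumer, modeled on the DAMON_RECLAIM after_aggregation callback shown in the diff above; the function name and the pr_debug() reporting are illustrative only and not part of these patches.

#include <linux/damon.h>
#include <linux/printk.h>

/*
 * Illustrative after_aggregation-style callback: report how much of the
 * targeted memory each scheme's action actually touched, and how often
 * the time/space quota was hit in the meantime.
 */
static int stats_after_aggregation(struct damon_ctx *c)
{
	struct damos *s;

	damon_for_each_scheme(s, c) {
		if (!s->stat.sz_tried)
			continue;	/* nothing matched the access pattern */
		/* a low applied/tried ratio plus a growing qt_exceeds hints at quota throttling */
		pr_debug("scheme applied %lu of %lu tried bytes, quota exceeded %lu times\n",
			 s->stat.sz_applied, s->stat.sz_tried,
			 s->stat.qt_exceeds);
	}
	return 0;
}

Hooking it up would mirror the reclaim module above: assign the function to ctx->callback.after_aggregation after damon_new_ctx().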
Link: https://lkml.kernel.org/r/20211210150016.35349-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 9318b52d0b46..751c7b835684 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -105,7 +105,7 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu\n", + "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", s->min_sz_region, s->max_sz_region, s->min_nr_accesses, s->max_nr_accesses, s->min_age_region, s->max_age_region, @@ -117,7 +117,9 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) s->quota.weight_age, s->wmarks.metric, s->wmarks.interval, s->wmarks.high, s->wmarks.mid, s->wmarks.low, - s->stat.nr_tried, s->stat.sz_tried); + s->stat.nr_tried, s->stat.sz_tried, + s->stat.nr_applied, s->stat.sz_applied, + s->stat.qt_exceeds); if (!rc) return -ENOMEM; -- cgit From dbcb9b9f954f71fb46be34af624c9edaaa171414 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:32 -0800 Subject: Docs/admin-guide/mm/damon/usage: update for schemes statistics This updates the DAMON debugfs interface documentation for the new statistics: the regions that each scheme has successfully applied and the number of times the time/space quota limits have been exceeded. Link: https://lkml.kernel.org/r/20211210150016.35349-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/damon/usage.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index cb614c84ba9e..59b84904a854 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -223,12 +223,13 @@ is activated. Statistics ~~~~~~~~~~ -It also counts the total number and bytes of regions that each scheme is -applied. This statistics can be used for online analysis or tuning of the -schemes. +It also counts the total number and bytes of regions that each scheme is tried +to be applied, the two numbers for the regions that each scheme is successfully +applied, and the total number of the quota limit exceeds. This statistics can +be used for online analysis or tuning of the schemes. The statistics can be shown by reading the ``schemes`` file. Reading the file -will show each scheme you entered in each line, and the two numbers for the +will show each scheme you entered in each line, and the five numbers for the statistics will be added at the end of each line. Example -- cgit From 49f4203aae06ba9d67b500c90339b262b0a52637 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 14 Jan 2022 14:10:35 -0800 Subject: mm/damon: add access checking for hugetlb pages A process's VMAs can be mapped by hugetlb pages, but DAMON currently does not implement access checking for hugetlb PTEs, so, as the output below shows, we cannot get the actual access counts when a process's VMAs are mapped by hugetlb. 
damon_aggregated: target_id=18446614368406014464 nr_regions=12 4194304-5476352: 0 545 damon_aggregated: target_id=18446614368406014464 nr_regions=12 140662370467840-140662372970496: 0 545 damon_aggregated: target_id=18446614368406014464 nr_regions=12 140662372970496-140662375460864: 0 545 damon_aggregated: target_id=18446614368406014464 nr_regions=12 140662375460864-140662377951232: 0 545 damon_aggregated: target_id=18446614368406014464 nr_regions=12 140662377951232-140662380449792: 0 545 damon_aggregated: target_id=18446614368406014464 nr_regions=12 140662380449792-140662382944256: 0 545 ...... Thus this patch adds hugetlb access checking support, with this patch we can see below VMA mapped by hugetlb access count. damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296486649856-140296489914368: 1 3 damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296489914368-140296492978176: 1 3 damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296492978176-140296495439872: 1 3 damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296495439872-140296498311168: 1 3 damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296498311168-140296501198848: 1 3 damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296501198848-140296504320000: 1 3 damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296504320000-140296507568128: 1 2 ...... [baolin.wang@linux.alibaba.com: fix unused var warning] Link: https://lkml.kernel.org/r/1aaf9c11-0d8e-b92d-5c92-46e50a6e8d4e@linux.alibaba.com [baolin.wang@linux.alibaba.com: v3] Link: https://lkml.kernel.org/r/486927ecaaaecf2e3a7fbe0378ec6e1c58b50747.1640852276.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/6afcbd1fda5f9c7c24f320d26a98188c727ceec3.1639623751.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: SeongJae Park Cc: Mike Kravetz Cc: Randy Dunlap Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index a10df3fd3d02..ee465b380612 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -388,8 +388,65 @@ out: return 0; } +#ifdef CONFIG_HUGETLB_PAGE +static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr) +{ + bool referenced = false; + pte_t entry = huge_ptep_get(pte); + struct page *page = pte_page(entry); + + if (!page) + return; + + get_page(page); + + if (pte_young(entry)) { + referenced = true; + entry = pte_mkold(entry); + huge_ptep_set_access_flags(vma, addr, pte, entry, + vma->vm_flags & VM_WRITE); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, + addr + huge_page_size(hstate_vma(vma)))) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hstate *h = hstate_vma(walk->vma); + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(h, walk->mm, pte); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + goto out; + + damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr); + +out: + spin_unlock(ptl); + return 0; +} +#else +#define damon_mkold_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ + static const struct 
mm_walk_ops damon_mkold_ops = { .pmd_entry = damon_mkold_pmd_entry, + .hugetlb_entry = damon_mkold_hugetlb_entry, }; static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) @@ -484,8 +541,47 @@ out: return 0; } +#ifdef CONFIG_HUGETLB_PAGE +static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct damon_young_walk_private *priv = walk->private; + struct hstate *h = hstate_vma(walk->vma); + struct page *page; + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(h, walk->mm, pte); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + goto out; + + page = pte_page(entry); + if (!page) + goto out; + + get_page(page); + + if (pte_young(entry) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, addr)) { + *priv->page_sz = huge_page_size(h); + priv->young = true; + } + + put_page(page); + +out: + spin_unlock(ptl); + return 0; +} +#else +#define damon_young_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ + static const struct mm_walk_ops damon_young_ops = { .pmd_entry = damon_young_pmd_entry, + .hugetlb_entry = damon_young_hugetlb_entry, }; static bool damon_va_young(struct mm_struct *mm, unsigned long addr, -- cgit From 2cd4b8e10cc31eadb5b10b1d73b3f28156f3776c Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 14 Jan 2022 14:10:38 -0800 Subject: mm/damon: move the implementation of damon_insert_region to damon.h Usually, inline function is declared static since it should sit between storage and type. And implement it in a header file if used by multiple files. And this change also fixes compile issue when backport damon to 5.10. mm/damon/vaddr.c: In function `damon_va_evenly_split_region': ./include/linux/damon.h:425:13: error: inlining failed in call to `always_inline' `damon_insert_region': function body not available 425 | inline void damon_insert_region(struct damon_region *r, | ^~~~~~~~~~~~~~~~~~~ mm/damon/vaddr.c:86:3: note: called from here 86 | damon_insert_region(n, r, next, t); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Link: https://lkml.kernel.org/r/20211223085703.6142-1-guoqing.jiang@linux.dev Signed-off-by: Guoqing Jiang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 13 +++++++++++-- mm/damon/core.c | 11 ----------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index af648388e759..5e1e3a128b77 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -451,9 +451,18 @@ static inline struct damon_region *damon_last_region(struct damon_target *t) #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); -inline void damon_insert_region(struct damon_region *r, + +/* + * Add a region between two other regions + */ +static inline void damon_insert_region(struct damon_region *r, struct damon_region *prev, struct damon_region *next, - struct damon_target *t); + struct damon_target *t) +{ + __list_add(&r->list, &prev->list, &next->list); + t->nr_regions++; +} + void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); diff --git a/mm/damon/core.c b/mm/damon/core.c index d5120b326e1b..6482d510dcbe 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -49,17 +49,6 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) return region; } -/* - * Add a region between two other regions - 
 */ -inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next, - struct damon_target *t) -{ - __list_add(&r->list, &prev->list, &next->list); - t->nr_regions++; -} - void damon_add_region(struct damon_region *r, struct damon_target *t) { list_add_tail(&r->list, &t->regions_list); -- cgit From 70b8480812d0a3930049a44820a1fa149b090c10 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:41 -0800 Subject: mm/damon/dbgfs: remove an unnecessary variable Patch series "mm/damon: Hide unnecessary information disclosures". DAMON exposes some unnecessary information, including a kernel pointer, in the kernel log and a tracepoint. This patchset hides such information. The first patch is only a trivial cleanup, though. This patch (of 4): This commit removes an unnecessarily used variable in dbgfs_target_ids_write(). Link: https://lkml.kernel.org/r/20211229131016.23641-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211229131016.23641-2-sj@kernel.org Fixes: 4bc05954d007 ("mm/damon: implement a debugfs-based user space interface") Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 751c7b835684..5b899601e56c 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -364,7 +364,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file, struct damon_ctx *ctx = file->private_data; struct damon_target *t, *next_t; bool id_is_pid = true; - char *kbuf, *nrs; + char *kbuf; unsigned long *targets; ssize_t nr_targets; ssize_t ret; @@ -374,14 +374,13 @@ static ssize_t dbgfs_target_ids_write(struct file *file, if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - nrs = kbuf; if (!strncmp(kbuf, "paddr\n", count)) { id_is_pid = false; /* target id is meaningless here, but we set it just for fun */ scnprintf(kbuf, count, "42 "); } - targets = str_to_target_ids(nrs, count, &nr_targets); + targets = str_to_target_ids(kbuf, count, &nr_targets); if (!targets) { ret = -ENOMEM; goto out; -- cgit From 251403f19aab6a122f4dcfb14149814e85564202 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:44 -0800 Subject: mm/damon/vaddr: use pr_debug() for damon_va_three_regions() failure logging Failure of 'damon_va_three_regions()' is logged using 'pr_err()', but the function can fail in legitimate situations. To avoid surprising users and to keep the kernel log clean, this prints the message using 'pr_debug()' instead. Link: https://lkml.kernel.org/r/20211229131016.23641-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index ee465b380612..223829655d64 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -238,7 +238,7 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, int i; if (damon_va_three_regions(t, regions)) { - pr_err("Failed to get three regions of target %lu\n", t->id); + pr_debug("Failed to get three regions of target %lu\n", t->id); return; } -- cgit From 962fe7a6b1b2f9deb1b31b3344afa3b11afdf7ab Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:47 -0800 Subject: mm/damon/vaddr: hide kernel pointer from damon_va_three_regions() failure log The failure log message for 'damon_va_three_regions()' prints the target id, which in this case is a 'struct pid' pointer. 
To avoid exposing the kernel pointer via the log, this makes the log use the index of the target in the context's targets list instead. Link: https://lkml.kernel.org/r/20211229131016.23641-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 223829655d64..89b6468da2b9 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -232,13 +232,19 @@ static int damon_va_three_regions(struct damon_target *t, static void __damon_va_init_regions(struct damon_ctx *ctx, struct damon_target *t) { + struct damon_target *ti; struct damon_region *r; struct damon_addr_range regions[3]; unsigned long sz = 0, nr_pieces; - int i; + int i, tidx = 0; if (damon_va_three_regions(t, regions)) { - pr_debug("Failed to get three regions of target %lu\n", t->id); + damon_for_each_target(ti, ctx) { + if (ti == t) + break; + tidx++; + } + pr_debug("Failed to get three regions of %dth target\n", tidx); return; } -- cgit From 76fd0285b447991267e838842c0be7395eb454bb Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:50 -0800 Subject: mm/damon: hide kernel pointer from tracepoint event DAMON's virtual address spaces monitoring primitive uses the 'struct pid *' of the target process as its monitoring target id. The kernel address is exposed as-is to user space via the DAMON tracepoint, 'damon_aggregated'. Though primarily only privileged users are allowed to access it, it is better to avoid exposing kernel pointers unnecessarily. Because the trace result only needs to distinguish each target, we don't need to use the pointer as-is. This makes the tracepoint use the index of the target in the context's targets list as its id, hiding the kernel space address. 
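Before the diff, the idea shared by this patch and the previous one can be condensed into one illustrative helper; the name is made up here, and the actual patches open-code the loop at their call sites.

/*
 * Identify a target by its position in the context's target list rather
 * than by its kernel pointer, so nothing sensitive reaches logs or
 * tracepoints (illustrative helper, not part of the patches).
 */
static unsigned int damon_target_index(struct damon_ctx *ctx,
				       struct damon_target *target)
{
	struct damon_target *t;
	unsigned int idx = 0;

	damon_for_each_target(t, ctx) {
		if (t == target)
			break;
		idx++;
	}
	return idx;
}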
Link: https://lkml.kernel.org/r/20211229131016.23641-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/damon.h | 8 ++++---- mm/damon/core.c | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 99ffa601e351..c79f1d4c39af 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -11,10 +11,10 @@ TRACE_EVENT(damon_aggregated, - TP_PROTO(struct damon_target *t, struct damon_region *r, - unsigned int nr_regions), + TP_PROTO(struct damon_target *t, unsigned int target_id, + struct damon_region *r, unsigned int nr_regions), - TP_ARGS(t, r, nr_regions), + TP_ARGS(t, target_id, r, nr_regions), TP_STRUCT__entry( __field(unsigned long, target_id) @@ -26,7 +26,7 @@ TRACE_EVENT(damon_aggregated, ), TP_fast_assign( - __entry->target_id = t->id; + __entry->target_id = target_id; __entry->nr_regions = nr_regions; __entry->start = r->ar.start; __entry->end = r->ar.end; diff --git a/mm/damon/core.c b/mm/damon/core.c index 6482d510dcbe..1dd153c31c9e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -514,15 +514,17 @@ static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) static void kdamond_reset_aggregated(struct damon_ctx *c) { struct damon_target *t; + unsigned int ti = 0; /* target's index */ damon_for_each_target(t, c) { struct damon_region *r; damon_for_each_region(r, t) { - trace_damon_aggregated(t, r, damon_nr_regions(t)); + trace_damon_aggregated(t, ti, r, damon_nr_regions(t)); r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; } + ti++; } } -- cgit
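Finally, to tie the statistics changes above together from the user-space side, here is a hypothetical reader (not part of the patches) that prints the five per-scheme statistics the dbgfs patch appends to each line of the DAMON debugfs "schemes" file; the file path and the output wording are illustrative assumptions.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	FILE *fp = fopen("/sys/kernel/debug/damon/schemes", "r");
	char line[1024];
	int scheme = 0;

	if (!fp) {
		perror("schemes");
		return 1;
	}
	while (fgets(line, sizeof(line), fp)) {
		/* keep only the trailing five fields of each scheme line */
		unsigned long stat[5] = { 0 };
		int nr_fields = 0;
		char *tok;

		for (tok = strtok(line, " \n"); tok; tok = strtok(NULL, " \n")) {
			memmove(stat, stat + 1, 4 * sizeof(stat[0]));
			stat[4] = strtoul(tok, NULL, 10);
			nr_fields++;
		}
		if (nr_fields < 5)
			continue;
		printf("scheme %d: tried %lu regions (%lu bytes), applied %lu (%lu bytes), quota exceeded %lu times\n",
		       scheme++, stat[0], stat[1], stat[2], stat[3], stat[4]);
	}
	fclose(fp);
	return 0;
}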