From e72590fa56b73b99885b17c74017b6cd4355bf66 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Sep 2023 11:38:50 +0200 Subject: sh: mm: re-add lost __ref to ioremap_prot() to fix modpost warning When __ioremap_caller() was replaced by ioremap_prot(), the __ref annotation added in commit af1415314a4190b8 ("sh: Flag __ioremap_caller() __init_refok.") was removed, causing a modpost warning: WARNING: modpost: vmlinux: section mismatch in reference: ioremap_prot+0x88 (section: .text) -> ioremap_fixed (section: .init.text) ioremap_prot() calls ioremap_fixed() (which is marked __init), but only before mem_init_done becomes true, so this is safe. Hence fix this by re-adding the lost __ref. Link: https://lkml.kernel.org/r/20230911093850.1517389-1-geert+renesas@glider.be Fixes: 0453c9a78015cb22 ("sh: mm: convert to GENERIC_IOREMAP") Signed-off-by: Geert Uytterhoeven Reviewed-by: Baoquan He Reviewed-by: John Paul Adrian Glaubitz Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/sh/mm/ioremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sh/mm/ioremap.c b/arch/sh/mm/ioremap.c index c33b3daa4ad1..33d20f34560f 100644 --- a/arch/sh/mm/ioremap.c +++ b/arch/sh/mm/ioremap.c @@ -72,8 +72,8 @@ __ioremap_29bit(phys_addr_t offset, unsigned long size, pgprot_t prot) #define __ioremap_29bit(offset, size, prot) NULL #endif /* CONFIG_29BIT */ -void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - unsigned long prot) +void __iomem __ref *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) { void __iomem *mapped; pgprot_t pgprot = __pgprot(prot); -- cgit From 7b086755fb8cdbb6b3e45a1bbddc00e7f9b1dc03 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 11 Sep 2023 14:11:08 -0400 Subject: mm: page_alloc: fix CMA and HIGHATOMIC landing on the wrong buddy list Commit 4b23a68f9536 ("mm/page_alloc: protect PCP lists with a spinlock") bypasses the pcplist on lock contention and returns the page directly to the buddy list of the page's migratetype. For pages that don't have their own pcplist, such as CMA and HIGHATOMIC, the migratetype is temporarily updated such that the page can hitch a ride on the MOVABLE pcplist. Their true type is later reassessed when flushing in free_pcppages_bulk(). However, when lock contention is detected after the type was already overridden, the bypass will then put the page on the wrong buddy list. Once on the MOVABLE buddy list, the page becomes eligible for fallbacks and even stealing. In the case of HIGHATOMIC, otherwise ineligible allocations can dip into the highatomic reserves. In the case of CMA, the page can be lost from the CMA region permanently. Use a separate pcpmigratetype variable for the pcplist override. Use the original migratetype when going directly to the buddy. This fixes the bug and should make the intentions more obvious in the code. Originally sent here to address the HIGHATOMIC case: https://lore.kernel.org/lkml/20230821183733.106619-4-hannes@cmpxchg.org/ Changelog updated in response to the CMA-specific bug report. [mgorman@techsingularity.net: updated changelog] Link: https://lkml.kernel.org/r/20230911181108.GA104295@cmpxchg.org Fixes: 4b23a68f9536 ("mm/page_alloc: protect PCP lists with a spinlock") Signed-off-by: Johannes Weiner Reported-by: Joe Liu Reviewed-by: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0c5be12f9336..95546f376302 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2400,7 +2400,7 @@ void free_unref_page(struct page *page, unsigned int order) struct per_cpu_pages *pcp; struct zone *zone; unsigned long pfn = page_to_pfn(page); - int migratetype; + int migratetype, pcpmigratetype; if (!free_unref_page_prepare(page, pfn, order)) return; @@ -2408,24 +2408,24 @@ void free_unref_page(struct page *page, unsigned int order) /* * We only track unmovable, reclaimable and movable on pcp lists. * Place ISOLATE pages on the isolated list because they are being - * offlined but treat HIGHATOMIC as movable pages so we can get those - * areas back if necessary. Otherwise, we may have to free + * offlined but treat HIGHATOMIC and CMA as movable pages so we can + * get those areas back if necessary. Otherwise, we may have to free * excessively into the page allocator */ - migratetype = get_pcppage_migratetype(page); + migratetype = pcpmigratetype = get_pcppage_migratetype(page); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); return; } - migratetype = MIGRATE_MOVABLE; + pcpmigratetype = MIGRATE_MOVABLE; } zone = page_zone(page); pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { - free_unref_page_commit(zone, pcp, page, migratetype, order); + free_unref_page_commit(zone, pcp, page, pcpmigratetype, order); pcp_spin_unlock(pcp); } else { free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); -- cgit From 4653e5dd04cb869526477a76b87d0aa1a5c65101 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 11 Sep 2023 14:24:06 -0600 Subject: task_work: add kerneldoc annotation for 'data' argument A previous commit changed the arguments to task_work_cancel_match(), but didn't document all of them. Link: https://lkml.kernel.org/r/93938bff-baa3-4091-85f5-784aae297a07@kernel.dk Fixes: c7aab1a7c52b ("task_work: add helper for more targeted task_work canceling") Signed-off-by: Jens Axboe Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202309120307.zis3yQGe-lkp@intel.com/ Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton --- kernel/task_work.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/task_work.c b/kernel/task_work.c index 065e1ef8fc8d..95a7e1b7f1da 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -78,6 +78,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work, * task_work_cancel_match - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @match: match function to call + * @data: data to be passed in to match function * * RETURNS: * The found work or NULL if not found. -- cgit From c652df8a4a9d7853fa1100b244024fd6f1a9c18a Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 12 Sep 2023 14:50:48 +0100 Subject: selftests: link libasan statically for tests with -fsanitize=address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When dynamically linking, Address Sanitizer requires its library to be the first one to be loaded; this is apparently to ensure that every call to malloc is intercepted. If using LD_PRELOAD, those listed libraries will be loaded before the libraries listed in the program's ELF and will therefore violate this requirement, leading to the below failure and output from ASan. commit 58e2847ad2e6 ("selftests: line buffer test program's stdout") modified the kselftest runner to force line buffering by forcing the test programs to run through `stdbuf`. It turns out that stdbuf implements line buffering by injecting a library via LD_PRELOAD. Therefore selftests that use ASan started failing. Fix this by statically linking libasan in the affected test programs, using the `-static-libasan` option. Note this is already the default for Clang, but not got GCC. Test output sample for failing case: TAP version 13 1..3 # timeout set to 300 # selftests: openat2: openat2_test # ==4052==ASan runtime does not come first in initial library list; you should either link runtime to your application or manually preload it with LD_PRELOAD. not ok 1 selftests: openat2: openat2_test # exit=1 # timeout set to 300 # selftests: openat2: resolve_test # ==4070==ASan runtime does not come first in initial library list; you should either link runtime to your application or manually preload it with LD_PRELOAD. not ok 2 selftests: openat2: resolve_test # exit=1 Link: https://lkml.kernel.org/r/20230912135048.1755771-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Fixes: 58e2847ad2e6 ("selftests: line buffer test program's stdout") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202309121342.97e2f008-oliver.sang@intel.com Cc: David Hildenbrand Cc: Florent Revest Cc: Jérôme Glisse Cc: John Hubbard Cc: Mark Brown Cc: Peter Xu Cc: Shuah Khan Cc: Tom Rix Signed-off-by: Andrew Morton --- tools/testing/selftests/fchmodat2/Makefile | 2 +- tools/testing/selftests/openat2/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/fchmodat2/Makefile b/tools/testing/selftests/fchmodat2/Makefile index 20839f8e43f2..71ec34bf1501 100644 --- a/tools/testing/selftests/fchmodat2/Makefile +++ b/tools/testing/selftests/fchmodat2/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-or-later -CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined $(KHDR_INCLUDES) +CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined -static-libasan $(KHDR_INCLUDES) TEST_GEN_PROGS := fchmodat2_test include ../lib.mk diff --git a/tools/testing/selftests/openat2/Makefile b/tools/testing/selftests/openat2/Makefile index 843ba56d8e49..254d676a2689 100644 --- a/tools/testing/selftests/openat2/Makefile +++ b/tools/testing/selftests/openat2/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-or-later -CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined +CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined -static-libasan TEST_GEN_PROGS := openat2_test resolve_test rename_attack_test include ../lib.mk -- cgit From 493d4eecf45d15bb1850832f5f5ece2556308646 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 12 Sep 2023 09:19:10 -0700 Subject: revert "scripts/gdb/symbols: add specific ko module load command" Revert 11f956538c07 ("scripts/gdb/symbols: add specific ko module load command") due to breakage identified by Johannes Berg in [1]. Fixes: 11f956538c07 ("scripts/gdb/symbols: add specific ko module load command") Reported-by: Johannes Berg Closes: https://lkml.kernel.org/r/c44b748307a074d0c250002cdcfe209b8cce93c9.camel@sipsolutions.net [1] Cc: AngeloGioacchino Del Regno Cc: Chinwen Chang Cc: Jan Kiszka Cc: Kieran Bingham Cc: Kuan-Ying Lee Cc: Matthias Brugger Cc: Qun-Wei Lin Signed-off-by: Andrew Morton --- scripts/gdb/linux/symbols.py | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py index 5179edd1b627..c8047f4441e6 100644 --- a/scripts/gdb/linux/symbols.py +++ b/scripts/gdb/linux/symbols.py @@ -111,12 +111,11 @@ lx-symbols command.""" return "{textaddr} {sections}".format( textaddr=textaddr, sections="".join(args)) - def load_module_symbols(self, module, module_file=None): + def load_module_symbols(self, module): module_name = module['name'].string() module_addr = str(module['mem'][constants.LX_MOD_TEXT]['base']).split()[0] - if not module_file: - module_file = self._get_module_file(module_name) + module_file = self._get_module_file(module_name) if not module_file and not self.module_files_updated: self._update_module_files() module_file = self._get_module_file(module_name) @@ -139,19 +138,6 @@ lx-symbols command.""" else: gdb.write("no module object found for '{0}'\n".format(module_name)) - def load_ko_symbols(self, mod_path): - self.loaded_modules = [] - module_list = modules.module_list() - - for module in module_list: - module_name = module['name'].string() - module_pattern = ".*/{0}\.ko(?:.debug)?$".format( - module_name.replace("_", r"[_\-]")) - if re.match(module_pattern, mod_path) and os.path.exists(mod_path): - self.load_module_symbols(module, mod_path) - return - raise gdb.GdbError("%s is not a valid .ko\n" % mod_path) - def load_all_symbols(self): gdb.write("loading vmlinux\n") @@ -190,11 +176,6 @@ lx-symbols command.""" self.module_files = [] self.module_files_updated = False - argv = gdb.string_to_argv(arg) - if len(argv) == 1: - self.load_ko_symbols(argv[0]) - return - self.load_all_symbols() if hasattr(gdb, 'Breakpoint'): -- cgit From 9d1be94df5acd486b157e85ffa53d344e5b17e22 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 7 Sep 2023 14:10:12 +0300 Subject: selftests/proc: fixup proc-empty-vm test after KSM changes /proc/${pid}/smaps_rollup is not empty file even if process's address space is empty, update the test. Link: https://lkml.kernel.org/r/725e041f-e9df-4f3d-b267-d4cd2774a78d@p183 Signed-off-by: Alexey Dobriyan Cc: David Hildenbrand Cc: Stefan Roesch Signed-off-by: Andrew Morton --- tools/testing/selftests/proc/proc-empty-vm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/proc/proc-empty-vm.c b/tools/testing/selftests/proc/proc-empty-vm.c index b16c13688b88..ee71ce52cb6a 100644 --- a/tools/testing/selftests/proc/proc-empty-vm.c +++ b/tools/testing/selftests/proc/proc-empty-vm.c @@ -267,6 +267,7 @@ static const char g_smaps_rollup[] = "Private_Dirty: 0 kB\n" "Referenced: 0 kB\n" "Anonymous: 0 kB\n" +"KSM: 0 kB\n" "LazyFree: 0 kB\n" "AnonHugePages: 0 kB\n" "ShmemPmdMapped: 0 kB\n" -- cgit From c80da1fb85bf50decf0cc9803fbf9b0b926268f8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 11 Sep 2023 23:08:48 -0700 Subject: scatterlist: add missing function params to kernel-doc Describe missing function parameters to prevent kernel-doc warnings: lib/scatterlist.c:288: warning: Function parameter or member 'first_chunk' not described in '__sg_alloc_table' lib/scatterlist.c:800: warning: Function parameter or member 'flags' not described in 'sg_miter_start' Link: https://lkml.kernel.org/r/20230912060848.4673-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton --- lib/scatterlist.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/scatterlist.c b/lib/scatterlist.c index c65566b4dc66..68b45c82c37a 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -265,7 +265,8 @@ EXPORT_SYMBOL(sg_free_table); * @table: The sg table header to use * @nents: Number of entries in sg list * @max_ents: The maximum number of entries the allocator returns per call - * @nents_first_chunk: Number of entries int the (preallocated) first + * @first_chunk: first SGL if preallocated (may be %NULL) + * @nents_first_chunk: Number of entries in the (preallocated) first * scatterlist chunk, 0 means no such preallocated chunk provided by user * @gfp_mask: GFP allocation mask * @alloc_fn: Allocator to use @@ -788,6 +789,7 @@ EXPORT_SYMBOL(__sg_page_iter_dma_next); * @miter: sg mapping iter to be started * @sgl: sg list to iterate over * @nents: number of sg entries + * @flags: sg iterator flags * * Description: * Starts mapping iterator @miter. -- cgit From 36ee98b555c00c5b360d9cd63dce490f4dac2290 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 11 Sep 2023 23:08:38 -0700 Subject: argv_split: fix kernel-doc warnings Use proper kernel-doc notation to prevent build warnings: lib/argv_split.c:36: warning: Function parameter or member 'argv' not described in 'argv_free' lib/argv_split.c:61: warning: No description found for return value of 'argv_split' Link: https://lkml.kernel.org/r/20230912060838.3794-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton --- lib/argv_split.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/argv_split.c b/lib/argv_split.c index 1a19a0a93dc1..e28db8e3b58c 100644 --- a/lib/argv_split.c +++ b/lib/argv_split.c @@ -28,7 +28,7 @@ static int count_argc(const char *str) /** * argv_free - free an argv - * @argv - the argument vector to be freed + * @argv: the argument vector to be freed * * Frees an argv and the strings it points to. */ @@ -46,7 +46,7 @@ EXPORT_SYMBOL(argv_free); * @str: the string to be split * @argcp: returned argument count * - * Returns an array of pointers to strings which are split out from + * Returns: an array of pointers to strings which are split out from * @str. This is performed by strictly splitting on white-space; no * quote processing is performed. Multiple whitespace characters are * considered to be a single argument separator. The returned array -- cgit From 0c7752d5b1ac62c8b926c907f34073ef7e9ad42b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 11 Sep 2023 23:08:22 -0700 Subject: pidfd: prevent a kernel-doc warning Change the comment to match the function name that the SYSCALL_DEFINE() macros generate to prevent a kernel-doc warning. kernel/pid.c:628: warning: expecting prototype for pidfd_open(). Prototype was for sys_pidfd_open() instead Link: https://lkml.kernel.org/r/20230912060822.2500-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Cc: Christian Brauner Signed-off-by: Andrew Morton --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/pid.c b/kernel/pid.c index fee14a4486a3..6500ef956f2f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -609,7 +609,7 @@ int pidfd_create(struct pid *pid, unsigned int flags) } /** - * pidfd_open() - Open new pid file descriptor. + * sys_pidfd_open() - Open new pid file descriptor. * * @pid: pid for which to retrieve a pidfd * @flags: flags to pass -- cgit From 9ea9cb00a82b53ec39630eac718776d37e41b35a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Sep 2023 11:21:39 -0400 Subject: mm: memcontrol: fix GFP_NOFS recursion in memory.high enforcement Breno and Josef report a deadlock scenario from cgroup reclaim re-entering the filesystem: [ 361.546690] ====================================================== [ 361.559210] WARNING: possible circular locking dependency detected [ 361.571703] 6.5.0-0_fbk700_debug_rc0_kbuilder_13159_gbf787a128001 #1 Tainted: G S E [ 361.589704] ------------------------------------------------------ [ 361.602277] find/9315 is trying to acquire lock: [ 361.611625] ffff88837ba140c0 (&delayed_node->mutex){+.+.}-{4:4}, at: __btrfs_release_delayed_node+0x68/0x4f0 [ 361.631437] [ 361.631437] but task is already holding lock: [ 361.643243] ffff8881765b8678 (btrfs-tree-01){++++}-{4:4}, at: btrfs_tree_read_lock+0x1e/0x40 [ 362.904457] mutex_lock_nested+0x1c/0x30 [ 362.912414] __btrfs_release_delayed_node+0x68/0x4f0 [ 362.922460] btrfs_evict_inode+0x301/0x770 [ 362.982726] evict+0x17c/0x380 [ 362.988944] prune_icache_sb+0x100/0x1d0 [ 363.005559] super_cache_scan+0x1f8/0x260 [ 363.013695] do_shrink_slab+0x2a2/0x540 [ 363.021489] shrink_slab_memcg+0x237/0x3d0 [ 363.050606] shrink_slab+0xa7/0x240 [ 363.083382] shrink_node_memcgs+0x262/0x3b0 [ 363.091870] shrink_node+0x1a4/0x720 [ 363.099150] shrink_zones+0x1f6/0x5d0 [ 363.148798] do_try_to_free_pages+0x19b/0x5e0 [ 363.157633] try_to_free_mem_cgroup_pages+0x266/0x370 [ 363.190575] reclaim_high+0x16f/0x1f0 [ 363.208409] mem_cgroup_handle_over_high+0x10b/0x270 [ 363.246678] try_charge_memcg+0xaf2/0xc70 [ 363.304151] charge_memcg+0xf0/0x350 [ 363.320070] __mem_cgroup_charge+0x28/0x40 [ 363.328371] __filemap_add_folio+0x870/0xd50 [ 363.371303] filemap_add_folio+0xdd/0x310 [ 363.399696] __filemap_get_folio+0x2fc/0x7d0 [ 363.419086] pagecache_get_page+0xe/0x30 [ 363.427048] alloc_extent_buffer+0x1cd/0x6a0 [ 363.435704] read_tree_block+0x43/0xc0 [ 363.443316] read_block_for_search+0x361/0x510 [ 363.466690] btrfs_search_slot+0xc8c/0x1520 This is caused by the mem_cgroup_handle_over_high() not respecting the gfp_mask of the allocation context. We used to only call this function on resume to userspace, where no locks were held. But c9afe31ec443 ("memcg: synchronously enforce memory.high for large overcharges") added a call from the allocation context without considering the gfp. Link: https://lkml.kernel.org/r/20230914152139.100822-1-hannes@cmpxchg.org Fixes: c9afe31ec443 ("memcg: synchronously enforce memory.high for large overcharges") Signed-off-by: Johannes Weiner Reported-by: Breno Leitao Reported-by: Josef Bacik Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Muchun Song Cc: [5.17+] Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- include/linux/resume_user_mode.h | 2 +- mm/memcontrol.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ab94ad4597d0..e4e24da16d2c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -920,7 +920,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } -void mem_cgroup_handle_over_high(void); +void mem_cgroup_handle_over_high(gfp_t gfp_mask); unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); @@ -1458,7 +1458,7 @@ static inline void mem_cgroup_unlock_pages(void) rcu_read_unlock(); } -static inline void mem_cgroup_handle_over_high(void) +static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { } diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h index 285189454449..f8f3e958e9cf 100644 --- a/include/linux/resume_user_mode.h +++ b/include/linux/resume_user_mode.h @@ -55,7 +55,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs) } #endif - mem_cgroup_handle_over_high(); + mem_cgroup_handle_over_high(GFP_KERNEL); blkcg_maybe_throttle_current(); rseq_handle_notify_resume(NULL, regs); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a4d3282493b6..d13dde2f8b56 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2555,7 +2555,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg, * Scheduled by try_charge() to be executed from the userland return path * and reclaims memory over the high limit. */ -void mem_cgroup_handle_over_high(void) +void mem_cgroup_handle_over_high(gfp_t gfp_mask) { unsigned long penalty_jiffies; unsigned long pflags; @@ -2583,7 +2583,7 @@ retry_reclaim: */ nr_reclaimed = reclaim_high(memcg, in_retry ? SWAP_CLUSTER_MAX : nr_pages, - GFP_KERNEL); + gfp_mask); /* * memory.high is breached and reclaim is unable to keep up. Throttle @@ -2819,7 +2819,7 @@ done_restock: if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && !(current->flags & PF_MEMALLOC) && gfpflags_allow_blocking(gfp_mask)) { - mem_cgroup_handle_over_high(); + mem_cgroup_handle_over_high(gfp_mask); } return 0; } -- cgit From 578d7699e5c2add8c2e9549d9d75dfb56c460cb3 Mon Sep 17 00:00:00 2001 From: Ben Wolsieffer Date: Thu, 14 Sep 2023 12:30:20 -0400 Subject: proc: nommu: /proc//maps: release mmap read lock The no-MMU implementation of /proc//map doesn't normally release the mmap read lock, because it uses !IS_ERR_OR_NULL(_vml) to determine whether to release the lock. Since _vml is NULL when the end of the mappings is reached, the lock is not released. Reading /proc/1/maps twice doesn't cause a hang because it only takes the read lock, which can be taken multiple times and therefore doesn't show any problem if the lock isn't released. Instead, you need to perform some operation that attempts to take the write lock after reading /proc//maps. To actually reproduce the bug, compile the following code as 'proc_maps_bug': #include #include #include int main(int argc, char *argv[]) { void *buf; sleep(1); buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); puts("mmap returned"); return 0; } Then, run: ./proc_maps_bug &; cat /proc/$!/maps; fg Without this patch, mmap() will hang and the command will never complete. This code was incorrectly adapted from the MMU implementation, which at the time released the lock in m_next() before returning the last entry. The MMU implementation has diverged further from the no-MMU version since then, so this patch brings their locking and error handling into sync, fixing the bug and hopefully avoiding similar issues in the future. Link: https://lkml.kernel.org/r/20230914163019.4050530-2-ben.wolsieffer@hefring.com Fixes: 47fecca15c09 ("fs/proc/task_nommu.c: don't use priv->task->mm") Signed-off-by: Ben Wolsieffer Acked-by: Oleg Nesterov Cc: Giulio Benetti Cc: Greg Ungerer Cc: Signed-off-by: Andrew Morton --- fs/proc/task_nommu.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index a8ac0dd8041e..bc2e843f4810 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -192,11 +192,16 @@ static void *m_start(struct seq_file *m, loff_t *pos) return ERR_PTR(-ESRCH); mm = priv->mm; - if (!mm || !mmget_not_zero(mm)) + if (!mm || !mmget_not_zero(mm)) { + put_task_struct(priv->task); + priv->task = NULL; return NULL; + } if (mmap_read_lock_killable(mm)) { mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; return ERR_PTR(-EINTR); } @@ -205,23 +210,21 @@ static void *m_start(struct seq_file *m, loff_t *pos) if (vma) return vma; - mmap_read_unlock(mm); - mmput(mm); return NULL; } -static void m_stop(struct seq_file *m, void *_vml) +static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; + struct mm_struct *mm = priv->mm; - if (!IS_ERR_OR_NULL(_vml)) { - mmap_read_unlock(priv->mm); - mmput(priv->mm); - } - if (priv->task) { - put_task_struct(priv->task); - priv->task = NULL; - } + if (!priv->task) + return; + + mmap_read_unlock(mm); + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; } static void *m_next(struct seq_file *m, void *_p, loff_t *pos) -- cgit From c8be03806738c86521dbf1e0503bc90855fb99a3 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Thu, 14 Sep 2023 21:47:41 +0800 Subject: filemap: add filemap_map_order0_folio() to handle order0 folio Kernel test robot reported regressions for several benchmarks [1]. The regression are related with commit: de74976eb65151a2f568e477fc2e0032df5b22b4 ("filemap: add filemap_map_folio_range()") It turned out that function filemap_map_folio_range() brings these regressions when handle folio with order0. Add filemap_map_order0_folio() to handle order0 folio. The benefit come from two perspectives: - the code size is smaller (around 126 bytes) - no loop Testing showed the regressions reported by 0day [1] all are fixed: commit 9f1f5b60e76d44fa: parent commit of de74976eb65151a2 commit fbdf9263a3d7fdbd: latest mm-unstable commit commit 7fbfe2003f84686d: this fixing patch 9f1f5b60e76d44fa fbdf9263a3d7fdbd 7fbfe2003f84686d ---------------- --------------------------- --------------------------- 3843810 -21.4% 3020268 +4.6% 4018708 stress-ng.bad-altstack.ops 64061 -21.4% 50336 +4.6% 66977 stress-ng.bad-altstack.ops_per_sec 1709026 -14.4% 1462102 +2.4% 1750757 stress-ng.fork.ops 28483 -14.4% 24368 +2.4% 29179 stress-ng.fork.ops_per_sec 3685088 -53.6% 1710976 +0.5% 3702454 stress-ng.zombie.ops 56732 -65.3% 19667 +0.7% 57107 stress-ng.zombie.ops_per_sec 61874 -12.1% 54416 +0.4% 62136 vm-scalability.median 13527663 -11.7% 11942117 -0.1% 13513946 vm-scalability.throughput 4.066e+09 -11.7% 3.59e+09 -0.1% 4.061e+09 vm-scalability.workload [1]: https://lore.kernel.org/oe-lkp/72e017b9-deb6-44fa-91d6-716ee2c39cbc@intel.com/T/#m7d2bba30f75a9cee8eab07e5809abd9b3b206c84 Link: https://lkml.kernel.org/r/20230914134741.1937654-1-fengwei.yin@intel.com Fixes: de74976eb65151a2f568e477fc2e0032df5b22b4 ("filemap: add filemap_map_folio_range()") Signed-off-by: Yin Fengwei Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202309111556.b2aa3d7a-oliver.sang@intel.com Cc: Feng Tang Cc: Huang Ying Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 69 ++++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 582f5317ff71..4ea4387053e8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3475,13 +3475,11 @@ skip: */ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, - unsigned long addr, unsigned int nr_pages) + unsigned long addr, unsigned int nr_pages, + unsigned int *mmap_miss) { vm_fault_t ret = 0; - struct vm_area_struct *vma = vmf->vma; - struct file *file = vma->vm_file; struct page *page = folio_page(folio, start); - unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); unsigned int count = 0; pte_t *old_ptep = vmf->pte; @@ -3489,8 +3487,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, if (PageHWPoison(page + count)) goto skip; - if (mmap_miss > 0) - mmap_miss--; + (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be @@ -3525,7 +3522,35 @@ skip: } vmf->pte = old_ptep; - WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); + + return ret; +} + +static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, + struct folio *folio, unsigned long addr, + unsigned int *mmap_miss) +{ + vm_fault_t ret = 0; + struct page *page = &folio->page; + + if (PageHWPoison(page)) + return ret; + + (*mmap_miss)++; + + /* + * NOTE: If there're PTE markers, we'll leave them to be + * handled in the specific fault path, and it'll prohibit + * the fault-around logic. + */ + if (!pte_none(ptep_get(vmf->pte))) + return ret; + + if (vmf->address == addr) + ret = VM_FAULT_NOPAGE; + + set_pte_range(vmf, folio, page, 1, addr); + folio_ref_inc(folio); return ret; } @@ -3541,7 +3566,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, XA_STATE(xas, &mapping->i_pages, start_pgoff); struct folio *folio; vm_fault_t ret = 0; - int nr_pages = 0; + unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved; rcu_read_lock(); folio = next_uptodate_folio(&xas, mapping, end_pgoff); @@ -3569,25 +3594,27 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, end = folio->index + folio_nr_pages(folio) - 1; nr_pages = min(end, end_pgoff) - xas.xa_index + 1; - /* - * NOTE: If there're PTE markers, we'll leave them to be - * handled in the specific fault path, and it'll prohibit the - * fault-around logic. - */ - if (!pte_none(ptep_get(vmf->pte))) - goto unlock; - - ret |= filemap_map_folio_range(vmf, folio, - xas.xa_index - folio->index, addr, nr_pages); + if (!folio_test_large(folio)) + ret |= filemap_map_order0_folio(vmf, + folio, addr, &mmap_miss); + else + ret |= filemap_map_folio_range(vmf, folio, + xas.xa_index - folio->index, addr, + nr_pages, &mmap_miss); -unlock: folio_unlock(folio); folio_put(folio); - folio = next_uptodate_folio(&xas, mapping, end_pgoff); - } while (folio); + } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); pte_unmap_unlock(vmf->pte, vmf->ptl); out: rcu_read_unlock(); + + mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss); + if (mmap_miss >= mmap_miss_saved) + WRITE_ONCE(file->f_ra.mmap_miss, 0); + else + WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss); + return ret; } EXPORT_SYMBOL(filemap_map_pages); -- cgit From fe4419801617514765974f3e796269bc512ad146 Mon Sep 17 00:00:00 2001 From: Ben Wolsieffer Date: Fri, 15 Sep 2023 12:00:56 -0400 Subject: proc: nommu: fix empty /proc//maps On no-MMU, /proc//maps reads as an empty file. This happens because find_vma(mm, 0) always returns NULL (assuming no vma actually contains the zero address, which is normally the case). To fix this bug and improve the maintainability in the future, this patch makes the no-MMU implementation as similar as possible to the MMU implementation. The only remaining differences are the lack of hold/release_task_mempolicy and the extra code to shoehorn the gate vma into the iterator. This has been tested on top of 6.5.3 on an STM32F746. Link: https://lkml.kernel.org/r/20230915160055.971059-2-ben.wolsieffer@hefring.com Fixes: 0c563f148043 ("proc: remove VMA rbtree use from nommu") Signed-off-by: Ben Wolsieffer Cc: Davidlohr Bueso Cc: Giulio Benetti Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Oleg Nesterov Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/proc/internal.h | 2 -- fs/proc/task_nommu.c | 37 ++++++++++++++++++++++--------------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9dda7e54b2d0..9a8f32f21ff5 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -289,9 +289,7 @@ struct proc_maps_private { struct inode *inode; struct task_struct *task; struct mm_struct *mm; -#ifdef CONFIG_MMU struct vma_iterator iter; -#endif #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index bc2e843f4810..7cebd397cc26 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -175,15 +175,28 @@ static int show_map(struct seq_file *m, void *_p) return nommu_vma_show(m, _p); } -static void *m_start(struct seq_file *m, loff_t *pos) +static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, + loff_t *ppos) +{ + struct vm_area_struct *vma = vma_next(&priv->iter); + + if (vma) { + *ppos = vma->vm_start; + } else { + *ppos = -1UL; + } + + return vma; +} + +static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; + unsigned long last_addr = *ppos; struct mm_struct *mm; - struct vm_area_struct *vma; - unsigned long addr = *pos; - /* See m_next(). Zero at the start or after lseek. */ - if (addr == -1UL) + /* See proc_get_vma(). Zero at the start or after lseek. */ + if (last_addr == -1UL) return NULL; /* pin the task and mm whilst we play with them */ @@ -205,12 +218,9 @@ static void *m_start(struct seq_file *m, loff_t *pos) return ERR_PTR(-EINTR); } - /* start the next element from addr */ - vma = find_vma(mm, addr); - if (vma) - return vma; + vma_iter_init(&priv->iter, mm, last_addr); - return NULL; + return proc_get_vma(priv, ppos); } static void m_stop(struct seq_file *m, void *v) @@ -227,12 +237,9 @@ static void m_stop(struct seq_file *m, void *v) priv->task = NULL; } -static void *m_next(struct seq_file *m, void *_p, loff_t *pos) +static void *m_next(struct seq_file *m, void *_p, loff_t *ppos) { - struct vm_area_struct *vma = _p; - - *pos = vma->vm_end; - return find_vma(vma->vm_mm, vma->vm_end); + return proc_get_vma(m->private, ppos); } static const struct seq_operations proc_pid_maps_ops = { -- cgit