From 2c9feeaedfe1f34c4e327cb6f9b8b90052edced1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 8 Nov 2021 18:31:44 -0800 Subject: proc/vmcore: let pfn_is_ram() return a bool The callback should deal with errors internally, it doesn't make sense to expose these via pfn_is_ram(). We'll rework the callbacks next. Right now we consider errors as if "it's RAM"; no functional change. Link: https://lkml.kernel.org/r/20211005121430.30136-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Baoquan He Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Dave Young Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jason Wang Cc: Juergen Gross Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/proc/vmcore.c') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 9a15334da208..a9bd80ab670e 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -84,11 +84,11 @@ void unregister_oldmem_pfn_is_ram(void) } EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram); -static int pfn_is_ram(unsigned long pfn) +static bool pfn_is_ram(unsigned long pfn) { int (*fn)(unsigned long pfn); /* pfn is ram unless fn() checks pagetype */ - int ret = 1; + bool ret = true; /* * Ask hypervisor if the pfn is really ram. @@ -97,7 +97,7 @@ static int pfn_is_ram(unsigned long pfn) */ fn = oldmem_pfn_is_ram; if (fn) - ret = fn(pfn); + ret = !!fn(pfn); return ret; } @@ -124,7 +124,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, nr_bytes = count; /* If pfn is not ram, return zeros for sparse dump files */ - if (pfn_is_ram(pfn) == 0) + if (!pfn_is_ram(pfn)) memset(buf, 0, nr_bytes); else { if (encrypted) -- cgit From cc5f2704c93456e6f1c671c5cf7b582a55df5484 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 8 Nov 2021 18:31:48 -0800 Subject: proc/vmcore: convert oldmem_pfn_is_ram callback to more generic vmcore callbacks Let's support multiple registered callbacks, making sure that registering vmcore callbacks cannot fail. Make the callback return a bool instead of an int, handling how to deal with errors internally. Drop unused HAVE_OLDMEM_PFN_IS_RAM. We soon want to make use of this infrastructure from other drivers: virtio-mem, registering one callback for each virtio-mem device, to prevent reading unplugged virtio-mem memory. Handle it via a generic vmcore_cb structure, prepared for future extensions: for example, once we support virtio-mem on s390x where the vmcore is completely constructed in the second kernel, we want to detect and add plugged virtio-mem memory ranges to the vmcore in order for them to get dumped properly. Handle corner cases that are unexpected and shouldn't happen in sane setups: registering a callback after the vmcore has already been opened (warn only) and unregistering a callback after the vmcore has already been opened (warn and essentially read only zeroes from that point on). Link: https://lkml.kernel.org/r/20211005121430.30136-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Baoquan He Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Dave Young Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jason Wang Cc: Juergen Gross Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 99 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 69 insertions(+), 30 deletions(-) (limited to 'fs/proc/vmcore.c') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index a9bd80ab670e..7a04b2eca287 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -62,46 +62,75 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0); /* Device Dump Size */ static size_t vmcoredd_orig_sz; -/* - * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error - * The called function has to take care of module refcounting. - */ -static int (*oldmem_pfn_is_ram)(unsigned long pfn); - -int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn)) +static DECLARE_RWSEM(vmcore_cb_rwsem); +/* List of registered vmcore callbacks. */ +static LIST_HEAD(vmcore_cb_list); +/* Whether we had a surprise unregistration of a callback. */ +static bool vmcore_cb_unstable; +/* Whether the vmcore has been opened once. */ +static bool vmcore_opened; + +void register_vmcore_cb(struct vmcore_cb *cb) { - if (oldmem_pfn_is_ram) - return -EBUSY; - oldmem_pfn_is_ram = fn; - return 0; + down_write(&vmcore_cb_rwsem); + INIT_LIST_HEAD(&cb->next); + list_add_tail(&cb->next, &vmcore_cb_list); + /* + * Registering a vmcore callback after the vmcore was opened is + * very unusual (e.g., manual driver loading). + */ + if (vmcore_opened) + pr_warn_once("Unexpected vmcore callback registration\n"); + up_write(&vmcore_cb_rwsem); } -EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram); +EXPORT_SYMBOL_GPL(register_vmcore_cb); -void unregister_oldmem_pfn_is_ram(void) +void unregister_vmcore_cb(struct vmcore_cb *cb) { - oldmem_pfn_is_ram = NULL; - wmb(); + down_write(&vmcore_cb_rwsem); + list_del(&cb->next); + /* + * Unregistering a vmcore callback after the vmcore was opened is + * very unusual (e.g., forced driver removal), but we cannot stop + * unregistering. + */ + if (vmcore_opened) { + pr_warn_once("Unexpected vmcore callback unregistration\n"); + vmcore_cb_unstable = true; + } + up_write(&vmcore_cb_rwsem); } -EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram); +EXPORT_SYMBOL_GPL(unregister_vmcore_cb); static bool pfn_is_ram(unsigned long pfn) { - int (*fn)(unsigned long pfn); - /* pfn is ram unless fn() checks pagetype */ + struct vmcore_cb *cb; bool ret = true; - /* - * Ask hypervisor if the pfn is really ram. - * A ballooned page contains no data and reading from such a page - * will cause high load in the hypervisor. - */ - fn = oldmem_pfn_is_ram; - if (fn) - ret = !!fn(pfn); + lockdep_assert_held_read(&vmcore_cb_rwsem); + if (unlikely(vmcore_cb_unstable)) + return false; + + list_for_each_entry(cb, &vmcore_cb_list, next) { + if (unlikely(!cb->pfn_is_ram)) + continue; + ret = cb->pfn_is_ram(cb, pfn); + if (!ret) + break; + } return ret; } +static int open_vmcore(struct inode *inode, struct file *file) +{ + down_read(&vmcore_cb_rwsem); + vmcore_opened = true; + up_read(&vmcore_cb_rwsem); + + return 0; +} + /* Reads a page from the oldmem device from given offset. */ ssize_t read_from_oldmem(char *buf, size_t count, u64 *ppos, int userbuf, @@ -117,6 +146,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, offset = (unsigned long)(*ppos % PAGE_SIZE); pfn = (unsigned long)(*ppos / PAGE_SIZE); + down_read(&vmcore_cb_rwsem); do { if (count > (PAGE_SIZE - offset)) nr_bytes = PAGE_SIZE - offset; @@ -136,8 +166,10 @@ ssize_t read_from_oldmem(char *buf, size_t count, tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf); - if (tmp < 0) + if (tmp < 0) { + up_read(&vmcore_cb_rwsem); return tmp; + } } *ppos += nr_bytes; count -= nr_bytes; @@ -147,6 +179,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, offset = 0; } while (count); + up_read(&vmcore_cb_rwsem); return read; } @@ -537,14 +570,19 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot) { + int ret; + /* * Check if oldmem_pfn_is_ram was registered to avoid * looping over all pages without a reason. */ - if (oldmem_pfn_is_ram) - return remap_oldmem_pfn_checked(vma, from, pfn, size, prot); + down_read(&vmcore_cb_rwsem); + if (!list_empty(&vmcore_cb_list) || vmcore_cb_unstable) + ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot); else - return remap_oldmem_pfn_range(vma, from, pfn, size, prot); + ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot); + up_read(&vmcore_cb_rwsem); + return ret; } static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) @@ -668,6 +706,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) #endif static const struct proc_ops vmcore_proc_ops = { + .proc_open = open_vmcore, .proc_read = read_vmcore, .proc_lseek = default_llseek, .proc_mmap = mmap_vmcore, -- cgit From c1e63117711977cc4295b2ce73de29dd17066c82 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 19 Nov 2021 16:43:58 -0800 Subject: proc/vmcore: fix clearing user buffer by properly using clear_user() To clear a user buffer we cannot simply use memset, we have to use clear_user(). With a virtio-mem device that registers a vmcore_cb and has some logically unplugged memory inside an added Linux memory block, I can easily trigger a BUG by copying the vmcore via "cp": systemd[1]: Starting Kdump Vmcore Save Service... kdump[420]: Kdump is using the default log level(3). kdump[453]: saving to /sysroot/var/crash/127.0.0.1-2021-11-11-14:59:22/ kdump[458]: saving vmcore-dmesg.txt to /sysroot/var/crash/127.0.0.1-2021-11-11-14:59:22/ kdump[465]: saving vmcore-dmesg.txt complete kdump[467]: saving vmcore BUG: unable to handle page fault for address: 00007f2374e01000 #PF: supervisor write access in kernel mode #PF: error_code(0x0003) - permissions violation PGD 7a523067 P4D 7a523067 PUD 7a528067 PMD 7a525067 PTE 800000007048f867 Oops: 0003 [#1] PREEMPT SMP NOPTI CPU: 0 PID: 468 Comm: cp Not tainted 5.15.0+ #6 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.14.0-27-g64f37cc530f1-prebuilt.qemu.org 04/01/2014 RIP: 0010:read_from_oldmem.part.0.cold+0x1d/0x86 Code: ff ff ff e8 05 ff fe ff e9 b9 e9 7f ff 48 89 de 48 c7 c7 38 3b 60 82 e8 f1 fe fe ff 83 fd 08 72 3c 49 8d 7d 08 4c 89 e9 89 e8 <49> c7 45 00 00 00 00 00 49 c7 44 05 f8 00 00 00 00 48 83 e7 f81 RSP: 0018:ffffc9000073be08 EFLAGS: 00010212 RAX: 0000000000001000 RBX: 00000000002fd000 RCX: 00007f2374e01000 RDX: 0000000000000001 RSI: 00000000ffffdfff RDI: 00007f2374e01008 RBP: 0000000000001000 R08: 0000000000000000 R09: ffffc9000073bc50 R10: ffffc9000073bc48 R11: ffffffff829461a8 R12: 000000000000f000 R13: 00007f2374e01000 R14: 0000000000000000 R15: ffff88807bd421e8 FS: 00007f2374e12140(0000) GS:ffff88807f000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f2374e01000 CR3: 000000007a4aa000 CR4: 0000000000350eb0 Call Trace: read_vmcore+0x236/0x2c0 proc_reg_read+0x55/0xa0 vfs_read+0x95/0x190 ksys_read+0x4f/0xc0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Some x86-64 CPUs have a CPU feature called "Supervisor Mode Access Prevention (SMAP)", which is used to detect wrong access from the kernel to user buffers like this: SMAP triggers a permissions violation on wrong access. In the x86-64 variant of clear_user(), SMAP is properly handled via clac()+stac(). To fix, properly use clear_user() when we're dealing with a user buffer. Link: https://lkml.kernel.org/r/20211112092750.6921-1-david@redhat.com Fixes: 997c136f518c ("fs/proc/vmcore.c: add hook to read_from_oldmem() to check for non-ram pages") Signed-off-by: David Hildenbrand Acked-by: Baoquan He Cc: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Philipp Rudo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'fs/proc/vmcore.c') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 30a3b66f475a..509f85148fee 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -154,9 +154,13 @@ ssize_t read_from_oldmem(char *buf, size_t count, nr_bytes = count; /* If pfn is not ram, return zeros for sparse dump files */ - if (!pfn_is_ram(pfn)) - memset(buf, 0, nr_bytes); - else { + if (!pfn_is_ram(pfn)) { + tmp = 0; + if (!userbuf) + memset(buf, 0, nr_bytes); + else if (clear_user(buf, nr_bytes)) + tmp = -EFAULT; + } else { if (encrypted) tmp = copy_oldmem_page_encrypted(pfn, buf, nr_bytes, @@ -165,12 +169,12 @@ ssize_t read_from_oldmem(char *buf, size_t count, else tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf); - - if (tmp < 0) { - up_read(&vmcore_cb_rwsem); - return tmp; - } } + if (tmp < 0) { + up_read(&vmcore_cb_rwsem); + return tmp; + } + *ppos += nr_bytes; count -= nr_bytes; buf += nr_bytes; -- cgit