From a9e73998f9d705c94a8dca9687633adc0f24a19a Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 13 May 2019 17:15:40 -0700 Subject: kernel/sys.c: prctl: fix false positive in validate_prctl_map() While validating new map we require the @start_data to be strictly less than @end_data, which is fine for regular applications (this is why this nit didn't trigger for that long). These members are set from executable loaders such as elf handers, still it is pretty valid to have a loadable data section with zero size in file, in such case the start_data is equal to end_data once kernel loader finishes. As a result when we're trying to restore such programs the procedure fails and the kernel returns -EINVAL. From the image dump of a program: | "mm_start_code": "0x400000", | "mm_end_code": "0x8f5fb4", | "mm_start_data": "0xf1bfb0", | "mm_end_data": "0xf1bfb0", Thus we need to change validate_prctl_map from strictly less to less or equal operator use. Link: http://lkml.kernel.org/r/20190408143554.GY1421@uranus.lan Fixes: f606b77f1a9e3 ("prctl: PR_SET_MM -- introduce PR_SET_MM_MAP operation") Signed-off-by: Cyrill Gorcunov Cc: Andrey Vagin Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 12df0e5434b8..bdbfe8d37418 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1924,7 +1924,7 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) ((unsigned long)prctl_map->__m1 __op \ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL error = __prctl_check_order(start_code, <, end_code); - error |= __prctl_check_order(start_data, <, end_data); + error |= __prctl_check_order(start_data,<=, end_data); error |= __prctl_check_order(start_brk, <=, brk); error |= __prctl_check_order(arg_start, <=, arg_end); error |= __prctl_check_order(env_start, <=, env_end); -- cgit From cefdca0a86be517bc390fc4541e3674b8e7803b0 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 13 May 2019 17:16:41 -0700 Subject: userfaultfd/sysctl: add vm.unprivileged_userfaultfd Userfaultfd can be misued to make it easier to exploit existing use-after-free (and similar) bugs that might otherwise only make a short window or race condition available. By using userfaultfd to stall a kernel thread, a malicious program can keep some state that it wrote, stable for an extended period, which it can then access using an existing exploit. While it doesn't cause the exploit itself, and while it's not the only thing that can stall a kernel thread when accessing a memory location, it's one of the few that never needs privilege. We can add a flag, allowing userfaultfd to be restricted, so that in general it won't be useable by arbitrary user programs, but in environments that require userfaultfd it can be turned back on. Add a global sysctl knob "vm.unprivileged_userfaultfd" to control whether userfaultfd is allowed by unprivileged users. When this is set to zero, only privileged users (root user, or users with the CAP_SYS_PTRACE capability) will be able to use the userfaultfd syscalls. Andrea said: : The only difference between the bpf sysctl and the userfaultfd sysctl : this way is that the bpf sysctl adds the CAP_SYS_ADMIN capability : requirement, while userfaultfd adds the CAP_SYS_PTRACE requirement, : because the userfaultfd monitor is more likely to need CAP_SYS_PTRACE : already if it's doing other kind of tracking on processes runtime, in : addition of userfaultfd. In other words both syscalls works only for : root, when the two sysctl are opt-in set to 1. [dgilbert@redhat.com: changelog additions] [akpm@linux-foundation.org: documentation tweak, per Mike] Link: http://lkml.kernel.org/r/20190319030722.12441-2-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Andrea Arcangeli Suggested-by: Mike Rapoport Reviewed-by: Mike Rapoport Reviewed-by: Andrea Arcangeli Cc: Paolo Bonzini Cc: Hugh Dickins Cc: Luis Chamberlain Cc: Maxime Coquelin Cc: Maya Gokhale Cc: Jerome Glisse Cc: Pavel Emelyanov Cc: Johannes Weiner Cc: Martin Cracauer Cc: Denis Plotnikov Cc: Marty McFadden Cc: Mike Kravetz Cc: Kees Cook Cc: Mel Gorman Cc: "Kirill A . Shutemov" Cc: "Dr . David Alan Gilbert" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 599510a3355e..ba158f61aab4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -66,6 +66,7 @@ #include #include #include +#include #include "../lib/kstrtox.h" @@ -1719,6 +1720,17 @@ static struct ctl_table vm_table[] = { .extra1 = (void *)&mmap_rnd_compat_bits_min, .extra2 = (void *)&mmap_rnd_compat_bits_max, }, +#endif +#ifdef CONFIG_USERFAULTFD + { + .procname = "unprivileged_userfaultfd", + .data = &sysctl_unprivileged_userfaultfd, + .maxlen = sizeof(sysctl_unprivileged_userfaultfd), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } }; -- cgit From 73b0140bf0fe9df90fb267c00673c4b9bf285430 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 13 May 2019 17:17:11 -0700 Subject: mm/gup: change GUP fast to use flags rather than a write 'bool' To facilitate additional options to get_user_pages_fast() change the singular write parameter to be gup_flags. This patch does not change any functionality. New functionality will follow in subsequent patches. Some of the get_user_pages_fast() call sites were unchanged because they already passed FOLL_WRITE or 0 for the write parameter. NOTE: It was suggested to change the ordering of the get_user_pages_fast() arguments to ensure that callers were converted. This breaks the current GUP call site convention of having the returned pages be the final parameter. So the suggestion was rejected. Link: http://lkml.kernel.org/r/20190328084422.29911-4-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190317183438.2057-4-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: Mike Marshall Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Dan Williams Cc: "David S. Miller" Cc: Heiko Carstens Cc: Ingo Molnar Cc: James Hogan Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Thomas Gleixner Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6262f1534ac9..2268b97d5439 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -543,7 +543,7 @@ again: if (unlikely(should_fail_futex(fshared))) return -EFAULT; - err = get_user_pages_fast(address, 1, 1, &page); + err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. -- cgit From 6f4f13e8d9e27cefd2cd88dd4fd80aa6d68b9131 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:49 -0700 Subject: mm/mmu_notifier: contextual information for event triggering invalidation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening. This patchset do the initial mechanical convertion of all the places that calls mmu_notifier_range_init to also provide the default MMU_NOTIFY_UNMAP event as well as the vma if it is know (most invalidation happens against a given vma). Passing down the vma allows the users of mmu notifier to inspect the new vma page protection. The MMU_NOTIFY_UNMAP is always the safe default as users of mmu notifier should assume that every for the range is going away when that event happens. A latter patch do convert mm call path to use a more appropriate events for each call. This is done as 2 patches so that no call site is forgotten especialy as it uses this following coccinelle patch: %<---------------------------------------------------------------------- @@ identifier I1, I2, I3, I4; @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *I1, +enum mmu_notifier_event event, +unsigned flags, +struct vm_area_struct *vma, struct mm_struct *I2, unsigned long I3, unsigned long I4) { ... } @@ @@ -#define mmu_notifier_range_init(range, mm, start, end) +#define mmu_notifier_range_init(range, event, flags, vma, mm, start, end) @@ expression E1, E3, E4; identifier I1; @@ <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, I1, I1->vm_mm, E3, E4) ...> @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(..., struct vm_area_struct *VMA, ...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(...) { struct vm_area_struct *VMA; <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN; @@ FN(...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, NULL, E2, E3, E4) ...> } ---------------------------------------------------------------------->% Applied with: spatch --all-includes --sp-file mmu-notifier.spatch fs/proc/task_mmu.c --in-place spatch --sp-file mmu-notifier.spatch --dir kernel/events/ --in-place spatch --sp-file mmu-notifier.spatch --dir mm --in-place Link: http://lkml.kernel.org/r/20190326164747.24405-6-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4ca7364c956d..e34b699f3865 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); -- cgit From 7269f999934b289da7972e975b781417b07ef836 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:53 -0700 Subject: mm/mmu_notifier: use correct mmu_notifier events for each invalidation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This updates each existing invalidation to use the correct mmu notifier event that represent what is happening to the CPU page table. See the patch which introduced the events to see the rational behind this. Link: http://lkml.kernel.org/r/20190326164747.24405-7-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e34b699f3865..78f61bfc6b79 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); -- cgit From 940519f0c8b757fdcbc5d14c93cdaada20ded14c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 13 May 2019 17:21:26 -0700 Subject: mm, memory_hotplug: provide a more generic restrictions for memory hotplug arch_add_memory, __add_pages take a want_memblock which controls whether the newly added memory should get the sysfs memblock user API (e.g. ZONE_DEVICE users do not want/need this interface). Some callers even want to control where do we allocate the memmap from by configuring altmap. Add a more generic hotplug context for arch_add_memory and __add_pages. struct mhp_restrictions contains flags which contains additional features to be enabled by the memory hotplug (MHP_MEMBLOCK_API currently) and altmap for alternative memmap allocator. This patch shouldn't introduce any functional change. [akpm@linux-foundation.org: build fix] Link: http://lkml.kernel.org/r/20190408082633.2864-3-osalvador@suse.de Signed-off-by: Michal Hocko Signed-off-by: Oscar Salvador Cc: Dan Williams Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index a856cb5ff192..4e59d29245f4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -148,6 +148,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) &pgmap->altmap : NULL; struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; + struct mhp_restrictions restrictions = { + /* + * We do not want any optional features only our own memmap + */ + .altmap = altmap, + }; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; @@ -214,7 +220,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) */ if (pgmap->type == MEMORY_DEVICE_PRIVATE) { error = add_pages(nid, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, NULL, false); + align_size >> PAGE_SHIFT, &restrictions); } else { error = kasan_add_zero_shadow(__va(align_start), align_size); if (error) { @@ -222,8 +228,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_kasan; } - error = arch_add_memory(nid, align_start, align_size, altmap, - false); + error = arch_add_memory(nid, align_start, align_size, + &restrictions); } if (!error) { -- cgit From 350e88bad4964da6feabee02a1a70381bcdb087e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:22:59 -0700 Subject: mm: memblock: make keeping memblock memory opt-in rather than opt-out Most architectures do not need the memblock memory after the page allocator is initialized, but only few enable ARCH_DISCARD_MEMBLOCK in the arch Kconfig. Replacing ARCH_DISCARD_MEMBLOCK with ARCH_KEEP_MEMBLOCK and inverting the logic makes it clear which architectures actually use memblock after system initialization and skips the necessity to add ARCH_DISCARD_MEMBLOCK to the architectures that are still missing that option. Link: http://lkml.kernel.org/r/1556102150-32517-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Acked-by: Michael Ellerman (powerpc) Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Richard Kuo Cc: Tony Luck Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Ralf Baechle Cc: Paul Burton Cc: James Hogan Cc: Ley Foon Tan Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f7fb8f6a688f..072b6ee55e3f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -500,13 +500,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) return locate_mem_hole_bottom_up(start, end, kbuf); } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK -static int kexec_walk_memblock(struct kexec_buf *kbuf, - int (*func)(struct resource *, void *)) -{ - return 0; -} -#else +#ifdef CONFIG_ARCH_KEEP_MEMBLOCK static int kexec_walk_memblock(struct kexec_buf *kbuf, int (*func)(struct resource *, void *)) { @@ -550,6 +544,12 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, return ret; } +#else +static int kexec_walk_memblock(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 0; +} #endif /** @@ -589,7 +589,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) return 0; - if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback); -- cgit From 640be2d1ffbc1946f1547eb89b5005ed7542de99 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:23:23 -0700 Subject: kernel/memremap.c: remove the unused device_private_entry_fault() export This export has been entirely unused since it was added more than 1 1/2 years ago. Link: http://lkml.kernel.org/r/20190429115535.12793-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 4e59d29245f4..1490e63f69a9 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -45,7 +45,6 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, */ return devmem->page_fault(vma, addr, page, flags, pmdp); } -EXPORT_SYMBOL(device_private_entry_fault); #endif /* CONFIG_DEVICE_PRIVATE */ static void pgmap_array_delete(struct resource *res) -- cgit