From 45cd0faae3715e305bc46e23b34c5ed4d185ceb8 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 27 Aug 2018 15:56:02 +0300 Subject: vfs: add the fadvise() file operation This is going to be used by overlayfs and possibly useful for other filesystems. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- mm/fadvise.c | 78 +++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/mm/fadvise.c b/mm/fadvise.c index 2d8376e3c640..2f59bac1cb77 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -27,9 +27,9 @@ * deactivate the pages and clear PG_Referenced. */ -int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) +static int generic_fadvise(struct file *file, loff_t offset, loff_t len, + int advice) { - struct fd f = fdget(fd); struct inode *inode; struct address_space *mapping; struct backing_dev_info *bdi; @@ -37,22 +37,14 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) pgoff_t start_index; pgoff_t end_index; unsigned long nrpages; - int ret = 0; - if (!f.file) - return -EBADF; + inode = file_inode(file); + if (S_ISFIFO(inode->i_mode)) + return -ESPIPE; - inode = file_inode(f.file); - if (S_ISFIFO(inode->i_mode)) { - ret = -ESPIPE; - goto out; - } - - mapping = f.file->f_mapping; - if (!mapping || len < 0) { - ret = -EINVAL; - goto out; - } + mapping = file->f_mapping; + if (!mapping || len < 0) + return -EINVAL; bdi = inode_to_bdi(mapping->host); @@ -67,9 +59,9 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) /* no bad return value, but ignore advice */ break; default: - ret = -EINVAL; + return -EINVAL; } - goto out; + return 0; } /* @@ -85,21 +77,21 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) switch (advice) { case POSIX_FADV_NORMAL: - f.file->f_ra.ra_pages = bdi->ra_pages; - spin_lock(&f.file->f_lock); - f.file->f_mode &= ~FMODE_RANDOM; - spin_unlock(&f.file->f_lock); + file->f_ra.ra_pages = bdi->ra_pages; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_RANDOM: - spin_lock(&f.file->f_lock); - f.file->f_mode |= FMODE_RANDOM; - spin_unlock(&f.file->f_lock); + spin_lock(&file->f_lock); + file->f_mode |= FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_SEQUENTIAL: - f.file->f_ra.ra_pages = bdi->ra_pages * 2; - spin_lock(&f.file->f_lock); - f.file->f_mode &= ~FMODE_RANDOM; - spin_unlock(&f.file->f_lock); + file->f_ra.ra_pages = bdi->ra_pages * 2; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_WILLNEED: /* First and last PARTIAL page! */ @@ -115,8 +107,7 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) * Ignore return value because fadvise() shall return * success even if filesystem can't retrieve a hint, */ - force_page_cache_readahead(mapping, f.file, start_index, - nrpages); + force_page_cache_readahead(mapping, file, start_index, nrpages); break; case POSIX_FADV_NOREUSE: break; @@ -183,9 +174,30 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) } break; default: - ret = -EINVAL; + return -EINVAL; } -out: + return 0; +} + +int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) +{ + if (file->f_op->fadvise) + return file->f_op->fadvise(file, offset, len, advice); + + return generic_fadvise(file, offset, len, advice); +} +EXPORT_SYMBOL(vfs_fadvise); + +int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) +{ + struct fd f = fdget(fd); + int ret; + + if (!f.file) + return -EBADF; + + ret = vfs_fadvise(f.file, offset, len, advice); + fdput(f); return ret; } -- cgit From 3d8f7615319b2bca87a4815e13787439e3339a93 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 29 Aug 2018 08:41:29 +0300 Subject: vfs: implement readahead(2) using POSIX_FADV_WILLNEED The implementation of readahead(2) syscall is identical to that of fadvise64(POSIX_FADV_WILLNEED) with a few exceptions: 1. readahead(2) returns -EINVAL for !mapping->a_ops and fadvise64() ignores the request and returns 0. 2. fadvise64() checks for integer overflow corner case 3. fadvise64() calls the optional filesystem fadvise() file operation Unite the two implementations by calling vfs_fadvise() from readahead(2) syscall. Check the !mapping->a_ops in readahead(2) syscall to preserve documented syscall ABI behaviour. Suggested-by: Miklos Szeredi Fixes: d1d04ef8572b ("ovl: stack file ops") Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- mm/Makefile | 3 +-- mm/fadvise.c | 3 +++ mm/readahead.c | 45 +++++++++++++++++---------------------------- 3 files changed, 21 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index 8716bdabe1e6..26ef77a3883b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -32,7 +32,7 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o endif -obj-y := filemap.o mempool.o oom_kill.o \ +obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ @@ -49,7 +49,6 @@ else obj-y += bootmem.o endif -obj-$(CONFIG_ADVISE_SYSCALLS) += fadvise.o ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o endif diff --git a/mm/fadvise.c b/mm/fadvise.c index 2f59bac1cb77..467bcd032037 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -188,6 +188,8 @@ int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) } EXPORT_SYMBOL(vfs_fadvise); +#ifdef CONFIG_ADVISE_SYSCALLS + int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) { struct fd f = fdget(fd); @@ -215,3 +217,4 @@ SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice) } #endif +#endif diff --git a/mm/readahead.c b/mm/readahead.c index a59ea70527b9..4e630143a0ba 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "internal.h" @@ -575,24 +576,6 @@ page_cache_async_readahead(struct address_space *mapping, } EXPORT_SYMBOL_GPL(page_cache_async_readahead); -static ssize_t -do_readahead(struct address_space *mapping, struct file *filp, - pgoff_t index, unsigned long nr) -{ - if (!mapping || !mapping->a_ops) - return -EINVAL; - - /* - * Readahead doesn't make sense for DAX inodes, but we don't want it - * to report a failure either. Instead, we just return success and - * don't do any work. - */ - if (dax_mapping(mapping)) - return 0; - - return force_page_cache_readahead(mapping, filp, index, nr); -} - ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { ssize_t ret; @@ -600,16 +583,22 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count) ret = -EBADF; f = fdget(fd); - if (f.file) { - if (f.file->f_mode & FMODE_READ) { - struct address_space *mapping = f.file->f_mapping; - pgoff_t start = offset >> PAGE_SHIFT; - pgoff_t end = (offset + count - 1) >> PAGE_SHIFT; - unsigned long len = end - start + 1; - ret = do_readahead(mapping, f.file, start, len); - } - fdput(f); - } + if (!f.file || !(f.file->f_mode & FMODE_READ)) + goto out; + + /* + * The readahead() syscall is intended to run only on files + * that can execute readahead. If readahead is not possible + * on this file, then we must return -EINVAL. + */ + ret = -EINVAL; + if (!f.file->f_mapping || !f.file->f_mapping->a_ops || + !S_ISREG(file_inode(f.file)->i_mode)) + goto out; + + ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED); +out: + fdput(f); return ret; } -- cgit From 9da3f2b74054406f87dff7101a569217ffceb29b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 28 Aug 2018 22:14:20 +0200 Subject: x86/fault: BUG() when uaccess helpers fault on kernel addresses There have been multiple kernel vulnerabilities that permitted userspace to pass completely unchecked pointers through to userspace accessors: - the waitid() bug - commit 96ca579a1ecc ("waitid(): Add missing access_ok() checks") - the sg/bsg read/write APIs - the infiniband read/write APIs These don't happen all that often, but when they do happen, it is hard to test for them properly; and it is probably also hard to discover them with fuzzing. Even when an unmapped kernel address is supplied to such buggy code, it just returns -EFAULT instead of doing a proper BUG() or at least WARN(). Try to make such misbehaving code a bit more visible by refusing to do a fixup in the pagefault handler code when a userspace accessor causes a #PF on a kernel address and the current context isn't whitelisted. Signed-off-by: Jann Horn Signed-off-by: Thomas Gleixner Tested-by: Kees Cook Cc: Andy Lutomirski Cc: kernel-hardening@lists.openwall.com Cc: dvyukov@google.com Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Cc: Borislav Petkov Link: https://lkml.kernel.org/r/20180828201421.157735-7-jannh@google.com --- mm/maccess.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/maccess.c b/mm/maccess.c index ec00be51a24f..f3416632e5a4 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -30,8 +30,10 @@ long __probe_kernel_read(void *dst, const void *src, size_t size) set_fs(KERNEL_DS); pagefault_disable(); + current->kernel_uaccess_faults_ok++; ret = __copy_from_user_inatomic(dst, (__force const void __user *)src, size); + current->kernel_uaccess_faults_ok--; pagefault_enable(); set_fs(old_fs); @@ -58,7 +60,9 @@ long __probe_kernel_write(void *dst, const void *src, size_t size) set_fs(KERNEL_DS); pagefault_disable(); + current->kernel_uaccess_faults_ok++; ret = __copy_to_user_inatomic((__force void __user *)dst, src, size); + current->kernel_uaccess_faults_ok--; pagefault_enable(); set_fs(old_fs); @@ -94,11 +98,13 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) set_fs(KERNEL_DS); pagefault_disable(); + current->kernel_uaccess_faults_ok++; do { ret = __get_user(*dst++, (const char __user __force *)src++); } while (dst[-1] && ret == 0 && src - unsafe_addr < count); + current->kernel_uaccess_faults_ok--; dst[-1] = '\0'; pagefault_enable(); set_fs(old_fs); -- cgit From 7a9cdebdcc17e426fb5287e4a82db1dfe86339b2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 12 Sep 2018 23:57:48 -1000 Subject: mm: get rid of vmacache_flush_all() entirely Jann Horn points out that the vmacache_flush_all() function is not only potentially expensive, it's buggy too. It also happens to be entirely unnecessary, because the sequence number overflow case can be avoided by simply making the sequence number be 64-bit. That doesn't even grow the data structures in question, because the other adjacent fields are already 64-bit. So simplify the whole thing by just making the sequence number overflow case go away entirely, which gets rid of all the complications and makes the code faster too. Win-win. [ Oleg Nesterov points out that the VMACACHE_FULL_FLUSHES statistics also just goes away entirely with this ] Reported-by: Jann Horn Suggested-by: Will Deacon Acked-by: Davidlohr Bueso Cc: Oleg Nesterov Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/debug.c | 4 ++-- mm/vmacache.c | 38 -------------------------------------- 2 files changed, 2 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/mm/debug.c b/mm/debug.c index 38c926520c97..bd10aad8539a 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -114,7 +114,7 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { - pr_emerg("mm %px mmap %px seqnum %d task_size %lu\n" + pr_emerg("mm %px mmap %px seqnum %llu task_size %lu\n" #ifdef CONFIG_MMU "get_unmapped_area %px\n" #endif @@ -142,7 +142,7 @@ void dump_mm(const struct mm_struct *mm) "tlb_flush_pending %d\n" "def_flags: %#lx(%pGv)\n", - mm, mm->mmap, mm->vmacache_seqnum, mm->task_size, + mm, mm->mmap, (long long) mm->vmacache_seqnum, mm->task_size, #ifdef CONFIG_MMU mm->get_unmapped_area, #endif diff --git a/mm/vmacache.c b/mm/vmacache.c index ea517bef7dc5..cdc32a3b02fa 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -19,44 +19,6 @@ #endif #define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK) -/* - * Flush vma caches for threads that share a given mm. - * - * The operation is safe because the caller holds the mmap_sem - * exclusively and other threads accessing the vma cache will - * have mmap_sem held at least for read, so no extra locking - * is required to maintain the vma cache. - */ -void vmacache_flush_all(struct mm_struct *mm) -{ - struct task_struct *g, *p; - - count_vm_vmacache_event(VMACACHE_FULL_FLUSHES); - - /* - * Single threaded tasks need not iterate the entire - * list of process. We can avoid the flushing as well - * since the mm's seqnum was increased and don't have - * to worry about other threads' seqnum. Current's - * flush will occur upon the next lookup. - */ - if (atomic_read(&mm->mm_users) == 1) - return; - - rcu_read_lock(); - for_each_process_thread(g, p) { - /* - * Only flush the vmacache pointers as the - * mm seqnum is already set and curr's will - * be set upon invalidation when the next - * lookup is done. - */ - if (mm == p->mm) - vmacache_flush(p); - } - rcu_read_unlock(); -} - /* * This task may be accessing a foreign mm via (for example) * get_user_pages()->find_vma(). The vmacache is task-local and this -- cgit From 889c695d419f19e5db52592dafbaf26143c36d1f Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 20 Sep 2018 12:22:30 -0700 Subject: mm: disable deferred struct page for 32-bit arches Deferred struct page init is needed only on systems with large amount of physical memory to improve boot performance. 32-bit systems do not benefit from this feature. Jiri reported a problem where deferred struct pages do not work well with x86-32: [ 0.035162] Dentry cache hash table entries: 131072 (order: 7, 524288 bytes) [ 0.035725] Inode-cache hash table entries: 65536 (order: 6, 262144 bytes) [ 0.036269] Initializing CPU#0 [ 0.036513] Initializing HighMem for node 0 (00036ffe:0007ffe0) [ 0.038459] page:f6780000 is uninitialized and poisoned [ 0.038460] raw: ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff [ 0.039509] page dumped because: VM_BUG_ON_PAGE(1 && PageCompound(page)) [ 0.040038] ------------[ cut here ]------------ [ 0.040399] kernel BUG at include/linux/page-flags.h:293! [ 0.040823] invalid opcode: 0000 [#1] SMP PTI [ 0.041166] CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.0-rc1_pt_jiri #9 [ 0.041694] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-20171110_100015-anatol 04/01/2014 [ 0.042496] EIP: free_highmem_page+0x64/0x80 [ 0.042839] Code: 13 46 d8 c1 e8 18 5d 83 e0 03 8d 04 c0 c1 e0 06 ff 80 ec 5f 44 d8 c3 8d b4 26 00 00 00 00 ba 08 65 28 d8 89 d8 e8 fc 71 02 00 <0f> 0b 8d 76 00 8d bc 27 00 00 00 00 ba d0 b1 26 d8 89 d8 e8 e4 71 [ 0.044338] EAX: 0000003c EBX: f6780000 ECX: 00000000 EDX: d856cbe8 [ 0.044868] ESI: 0007ffe0 EDI: d838df20 EBP: d838df00 ESP: d838defc [ 0.045372] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00210086 [ 0.045913] CR0: 80050033 CR2: 00000000 CR3: 18556000 CR4: 00040690 [ 0.046413] DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 [ 0.046913] DR6: fffe0ff0 DR7: 00000400 [ 0.047220] Call Trace: [ 0.047419] add_highpages_with_active_regions+0xbd/0x10d [ 0.047854] set_highmem_pages_init+0x5b/0x71 [ 0.048202] mem_init+0x2b/0x1e8 [ 0.048460] start_kernel+0x1d2/0x425 [ 0.048757] i386_start_kernel+0x93/0x97 [ 0.049073] startup_32_smp+0x164/0x168 [ 0.049379] Modules linked in: [ 0.049626] ---[ end trace 337949378db0abbb ]--- We free highmem pages before their struct pages are initialized: mem_init() set_highmem_pages_init() add_highpages_with_active_regions() free_highmem_page() .. Access uninitialized struct page here.. Because there is no reason to have this feature on 32-bit systems, just disable it. Link: http://lkml.kernel.org/r/20180831150506.31246-1-pavel.tatashin@microsoft.com Fixes: 2e3ca40f03bb ("mm: relax deferred struct page requirements") Signed-off-by: Pavel Tatashin Reported-by: Jiri Slaby Acked-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index a550635ea5c3..de64ea658716 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -637,6 +637,7 @@ config DEFERRED_STRUCT_PAGE_INIT depends on NO_BOOTMEM depends on SPARSEMEM depends on !NEED_PER_CPU_KM + depends on 64BIT help Ordinarily all struct pages are initialised during early boot in a single thread. On very large machines this can take a considerable -- cgit From b45d71fb89ab8adfe727b9d0ee188ed58582a647 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 20 Sep 2018 12:22:39 -0700 Subject: mm: shmem.c: Correctly annotate new inodes for lockdep Directories and inodes don't necessarily need to be in the same lockdep class. For ex, hugetlbfs splits them out too to prevent false positives in lockdep. Annotate correctly after new inode creation. If its a directory inode, it will be put into a different class. This should fix a lockdep splat reported by syzbot: > ====================================================== > WARNING: possible circular locking dependency detected > 4.18.0-rc8-next-20180810+ #36 Not tainted > ------------------------------------------------------ > syz-executor900/4483 is trying to acquire lock: > 00000000d2bfc8fe (&sb->s_type->i_mutex_key#9){++++}, at: inode_lock > include/linux/fs.h:765 [inline] > 00000000d2bfc8fe (&sb->s_type->i_mutex_key#9){++++}, at: > shmem_fallocate+0x18b/0x12e0 mm/shmem.c:2602 > > but task is already holding lock: > 0000000025208078 (ashmem_mutex){+.+.}, at: ashmem_shrink_scan+0xb4/0x630 > drivers/staging/android/ashmem.c:448 > > which lock already depends on the new lock. > > -> #2 (ashmem_mutex){+.+.}: > __mutex_lock_common kernel/locking/mutex.c:925 [inline] > __mutex_lock+0x171/0x1700 kernel/locking/mutex.c:1073 > mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:1088 > ashmem_mmap+0x55/0x520 drivers/staging/android/ashmem.c:361 > call_mmap include/linux/fs.h:1844 [inline] > mmap_region+0xf27/0x1c50 mm/mmap.c:1762 > do_mmap+0xa10/0x1220 mm/mmap.c:1535 > do_mmap_pgoff include/linux/mm.h:2298 [inline] > vm_mmap_pgoff+0x213/0x2c0 mm/util.c:357 > ksys_mmap_pgoff+0x4da/0x660 mm/mmap.c:1585 > __do_sys_mmap arch/x86/kernel/sys_x86_64.c:100 [inline] > __se_sys_mmap arch/x86/kernel/sys_x86_64.c:91 [inline] > __x64_sys_mmap+0xe9/0x1b0 arch/x86/kernel/sys_x86_64.c:91 > do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 > entry_SYSCALL_64_after_hwframe+0x49/0xbe > > -> #1 (&mm->mmap_sem){++++}: > __might_fault+0x155/0x1e0 mm/memory.c:4568 > _copy_to_user+0x30/0x110 lib/usercopy.c:25 > copy_to_user include/linux/uaccess.h:155 [inline] > filldir+0x1ea/0x3a0 fs/readdir.c:196 > dir_emit_dot include/linux/fs.h:3464 [inline] > dir_emit_dots include/linux/fs.h:3475 [inline] > dcache_readdir+0x13a/0x620 fs/libfs.c:193 > iterate_dir+0x48b/0x5d0 fs/readdir.c:51 > __do_sys_getdents fs/readdir.c:231 [inline] > __se_sys_getdents fs/readdir.c:212 [inline] > __x64_sys_getdents+0x29f/0x510 fs/readdir.c:212 > do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 > entry_SYSCALL_64_after_hwframe+0x49/0xbe > > -> #0 (&sb->s_type->i_mutex_key#9){++++}: > lock_acquire+0x1e4/0x540 kernel/locking/lockdep.c:3924 > down_write+0x8f/0x130 kernel/locking/rwsem.c:70 > inode_lock include/linux/fs.h:765 [inline] > shmem_fallocate+0x18b/0x12e0 mm/shmem.c:2602 > ashmem_shrink_scan+0x236/0x630 drivers/staging/android/ashmem.c:455 > ashmem_ioctl+0x3ae/0x13a0 drivers/staging/android/ashmem.c:797 > vfs_ioctl fs/ioctl.c:46 [inline] > file_ioctl fs/ioctl.c:501 [inline] > do_vfs_ioctl+0x1de/0x1720 fs/ioctl.c:685 > ksys_ioctl+0xa9/0xd0 fs/ioctl.c:702 > __do_sys_ioctl fs/ioctl.c:709 [inline] > __se_sys_ioctl fs/ioctl.c:707 [inline] > __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:707 > do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 > entry_SYSCALL_64_after_hwframe+0x49/0xbe > > other info that might help us debug this: > > Chain exists of: > &sb->s_type->i_mutex_key#9 --> &mm->mmap_sem --> ashmem_mutex > > Possible unsafe locking scenario: > > CPU0 CPU1 > ---- ---- > lock(ashmem_mutex); > lock(&mm->mmap_sem); > lock(ashmem_mutex); > lock(&sb->s_type->i_mutex_key#9); > > *** DEADLOCK *** > > 1 lock held by syz-executor900/4483: > #0: 0000000025208078 (ashmem_mutex){+.+.}, at: > ashmem_shrink_scan+0xb4/0x630 drivers/staging/android/ashmem.c:448 Link: http://lkml.kernel.org/r/20180821231835.166639-1-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reported-by: syzbot Reviewed-by: NeilBrown Suggested-by: NeilBrown Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/shmem.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 0376c124b043..446942677cd4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2227,6 +2227,8 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode mpol_shared_policy_init(&info->policy, NULL); break; } + + lockdep_annotate_inode_mutex_key(inode); } else shmem_free_inode(sb); return inode; -- cgit From 172b06c32b949759fe6313abec514bc4f15014f4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 20 Sep 2018 12:22:46 -0700 Subject: mm: slowly shrink slabs with a relatively small number of objects 9092c71bb724 ("mm: use sc->priority for slab shrink targets") changed the way that the target slab pressure is calculated and made it priority-based: delta = freeable >> priority; delta *= 4; do_div(delta, shrinker->seeks); The problem is that on a default priority (which is 12) no pressure is applied at all, if the number of potentially reclaimable objects is less than 4096 (1<<12). This causes the last objects on slab caches of no longer used cgroups to (almost) never get reclaimed. It's obviously a waste of memory. It can be especially painful, if these stale objects are holding a reference to a dying cgroup. Slab LRU lists are reparented on memcg offlining, but corresponding objects are still holding a reference to the dying cgroup. If we don't scan these objects, the dying cgroup can't go away. Most likely, the parent cgroup hasn't any directly charged objects, only remaining objects from dying children cgroups. So it can easily hold a reference to hundreds of dying cgroups. If there are no big spikes in memory pressure, and new memory cgroups are created and destroyed periodically, this causes the number of dying cgroups grow steadily, causing a slow-ish and hard-to-detect memory "leak". It's not a real leak, as the memory can be eventually reclaimed, but it could not happen in a real life at all. I've seen hosts with a steadily climbing number of dying cgroups, which doesn't show any signs of a decline in months, despite the host is loaded with a production workload. It is an obvious waste of memory, and to prevent it, let's apply a minimal pressure even on small shrinker lists. E.g. if there are freeable objects, let's scan at least min(freeable, scan_batch) objects. This fix significantly improves a chance of a dying cgroup to be reclaimed, and together with some previous patches stops the steady growth of the dying cgroups number on some of our hosts. Link: http://lkml.kernel.org/r/20180905230759.12236-1-guro@fb.com Fixes: 9092c71bb724 ("mm: use sc->priority for slab shrink targets") Signed-off-by: Roman Gushchin Acked-by: Rik van Riel Cc: Josef Bacik Cc: Johannes Weiner Cc: Shakeel Butt Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 7e7d25504651..c7ce2c161225 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -476,6 +476,17 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, delta = freeable >> priority; delta *= 4; do_div(delta, shrinker->seeks); + + /* + * Make sure we apply some minimal pressure on default priority + * even on small cgroups. Stale objects are not only consuming memory + * by themselves, but can also hold a reference to a dying cgroup, + * preventing it from being reclaimed. A dying cgroup with all + * corresponding structures like per-cpu stats and kmem caches + * can be really big, so it may lead to a significant waste of memory. + */ + delta = max_t(unsigned long long, delta, min(freeable, batch_size)); + total_scan += delta; if (total_scan < 0) { pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", -- cgit