From e1fb4a0864958fac2fb1b23f9f4562a9f90e3e8f Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 17 Aug 2018 15:43:40 -0700 Subject: dax: remove VM_MIXEDMAP for fsdax and device dax This patch is reworked from an earlier patch that Dan has posted: https://patchwork.kernel.org/patch/10131727/ VM_MIXEDMAP is used by dax to direct mm paths like vm_normal_page() that the memory page it is dealing with is not typical memory from the linear map. The get_user_pages_fast() path, since it does not resolve the vma, is already using {pte,pmd}_devmap() as a stand-in for VM_MIXEDMAP, so we use that as a VM_MIXEDMAP replacement in some locations. In the cases where there is no pte to consult we fallback to using vma_is_dax() to detect the VM_MIXEDMAP special case. Now that we have explicit driver pfn_t-flag opt-in/opt-out for get_user_pages() support for DAX we can stop setting VM_MIXEDMAP. This also means we no longer need to worry about safely manipulating vm_flags in a future where we support dynamically changing the dax mode of a file. DAX should also now be supported with madvise_behavior(), vma_merge(), and copy_page_range(). This patch has been tested against ndctl unit test. It has also been tested against xfstests commit: 625515d using fake pmem created by memmap and no additional issues have been observed. 
Link: http://lkml.kernel.org/r/152847720311.55924.16999195879201817653.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dave Jiang Acked-by: Dan Williams Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/file.c | 1 - fs/ext4/file.c | 2 +- fs/xfs/xfs_file.c | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 047c327a6b23..28b2609f25c1 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -126,7 +126,6 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &ext2_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP; return 0; } #else diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 7f8023340eb8..69d65d49837b 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -374,7 +374,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + vma->vm_flags |= VM_HUGEPAGE; } else { vma->vm_ops = &ext4_file_vm_ops; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 181e9084519b..5eaef2c17293 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1169,7 +1169,7 @@ xfs_file_mmap( file_accessed(filp); vma->vm_ops = &xfs_file_vm_ops; if (IS_DAX(file_inode(filp))) - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + vma->vm_flags |= VM_HUGEPAGE; return 0; } -- cgit From a3fda0ffeaf0114328024aee4a9ec3b08af4b077 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Aug 2018 15:43:47 -0700 Subject: fs/ufs: use ktime_get_real_seconds for sb and cg timestamps get_seconds() is deprecated because of the 32-bit overflow and will be removed. 
All callers in ufs also truncate to a 32-bit number, so nothing changes during the conversion, but this should be harmless as the superblock and cylinder group timestamps are not visible to user space, except for checking the fs-dirty state, which works fine across the overflow. This moves the call to get_seconds() into a new inline function, with a comment explaining the constraints, while converting it to ktime_get_real_seconds(). Link: http://lkml.kernel.org/r/20180718115017.742609-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Thomas Gleixner Cc: Al Viro Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/balloc.c | 4 ++-- fs/ufs/ialloc.c | 2 +- fs/ufs/super.c | 4 ++-- fs/ufs/util.h | 14 ++++++++++++++ 4 files changed, 19 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index e727ee07dbe4..075d3d9114c8 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -547,7 +547,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, /* * Block can be extended */ - ucg->cg_time = cpu_to_fs32(sb, get_seconds()); + ucg->cg_time = ufs_get_seconds(sb); for (i = newcount; i < (uspi->s_fpb - fragoff); i++) if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i)) break; @@ -639,7 +639,7 @@ cg_found: if (!ufs_cg_chkmagic(sb, ucg)) ufs_panic (sb, "ufs_alloc_fragments", "internal error, bad magic number on cg %u", cgno); - ucg->cg_time = cpu_to_fs32(sb, get_seconds()); + ucg->cg_time = ufs_get_seconds(sb); if (count == uspi->s_fpb) { result = ufs_alloccg_block (inode, ucpi, goal, err); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 02c0a4be4212..969fd60436d3 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -89,7 +89,7 @@ void ufs_free_inode (struct inode * inode) if (!ufs_cg_chkmagic(sb, ucg)) ufs_panic (sb, "ufs_free_fragments", "internal error, bad cg magic number"); - ucg->cg_time = cpu_to_fs32(sb, get_seconds()); + ucg->cg_time = ufs_get_seconds(sb); 
is_directory = S_ISDIR(inode->i_mode); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 488088141451..a4e07e910f1b 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -698,7 +698,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait) usb1 = ubh_get_usb_first(uspi); usb3 = ubh_get_usb_third(uspi); - usb1->fs_time = cpu_to_fs32(sb, get_seconds()); + usb1->fs_time = ufs_get_seconds(sb); if ((flags & UFS_ST_MASK) == UFS_ST_SUN || (flags & UFS_ST_MASK) == UFS_ST_SUNOS || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) @@ -1342,7 +1342,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) */ if (*mount_flags & SB_RDONLY) { ufs_put_super_internal(sb); - usb1->fs_time = cpu_to_fs32(sb, get_seconds()); + usb1->fs_time = ufs_get_seconds(sb); if ((flags & UFS_ST_MASK) == UFS_ST_SUN || (flags & UFS_ST_MASK) == UFS_ST_SUNOS || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 1907be6d5808..1fd3011ea623 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -590,3 +590,17 @@ static inline int ufs_is_data_ptr_zero(struct ufs_sb_private_info *uspi, else return *(__fs32 *)p == 0; } + +static inline __fs32 ufs_get_seconds(struct super_block *sbp) +{ + time64_t now = ktime_get_real_seconds(); + + /* Signed 32-bit interpretation wraps around in 2038, which + * happens in ufs1 inode stamps but not ufs2 using 64-bits + * stamps. For superblock and blockgroup, let's assume + * unsigned 32-bit stamps, which are good until y2106. + * Wrap around rather than clamp here to make the dirty + * file system detection work in the superblock stamp. 
+ */ + return cpu_to_fs32(sbp, lower_32_bits(now)); +} -- cgit From bcf451ecfc8d45618d13c9e4abcbbd770af20cc9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Aug 2018 15:43:50 -0700 Subject: fs/ntfs: use timespec64 directly for timestamp conversion Now that the VFS has been converted from timespec to timespec64 timestamps, only the conversion to/from ntfs timestamps uses 32-bit seconds. This changes that last missing piece to get the ntfs implementation y2038 safe on 32-bit architectures. Link: http://lkml.kernel.org/r/20180718115017.742609-2-arnd@arndb.de Signed-off-by: Arnd Bergmann Cc: Anton Altaparmakov Cc: Al Viro Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/inode.c | 12 ++++++------ fs/ntfs/time.h | 27 +++++++++++++++------------ 2 files changed, 21 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index decaf75d1cd5..bd3221cbdd95 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -667,18 +667,18 @@ static int ntfs_read_locked_inode(struct inode *vi) * mtime is the last change of the data within the file. Not changed * when only metadata is changed, e.g. a rename doesn't affect mtime. */ - vi->i_mtime = timespec_to_timespec64(ntfs2utc(si->last_data_change_time)); + vi->i_mtime = ntfs2utc(si->last_data_change_time); /* * ctime is the last change of the metadata of the file. This obviously * always changes, when mtime is changed. ctime can be changed on its * own, mtime is then not changed, e.g. when a file is renamed. */ - vi->i_ctime = timespec_to_timespec64(ntfs2utc(si->last_mft_change_time)); + vi->i_ctime = ntfs2utc(si->last_mft_change_time); /* * Last access to the data within the file. Not changed during a rename * for example but changed whenever the file is written to. */ - vi->i_atime = timespec_to_timespec64(ntfs2utc(si->last_access_time)); + vi->i_atime = ntfs2utc(si->last_access_time); /* Find the attribute list attribute if present. 
*/ ntfs_attr_reinit_search_ctx(ctx); @@ -2997,7 +2997,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si = (STANDARD_INFORMATION*)((u8*)ctx->attr + le16_to_cpu(ctx->attr->data.resident.value_offset)); /* Update the access times if they have changed. */ - nt = utc2ntfs(timespec64_to_timespec(vi->i_mtime)); + nt = utc2ntfs(vi->i_mtime); if (si->last_data_change_time != nt) { ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, (long long) @@ -3006,7 +3006,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si->last_data_change_time = nt; modified = true; } - nt = utc2ntfs(timespec64_to_timespec(vi->i_ctime)); + nt = utc2ntfs(vi->i_ctime); if (si->last_mft_change_time != nt) { ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, (long long) @@ -3015,7 +3015,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si->last_mft_change_time = nt; modified = true; } - nt = utc2ntfs(timespec64_to_timespec(vi->i_atime)); + nt = utc2ntfs(vi->i_atime); if (si->last_access_time != nt) { ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h index 01233989d5d1..24cd719f1fd2 100644 --- a/fs/ntfs/time.h +++ b/fs/ntfs/time.h @@ -36,16 +36,16 @@ * Convert the Linux UTC time @ts to its corresponding NTFS time and return * that in little endian format. * - * Linux stores time in a struct timespec consisting of a time_t (long at - * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second - * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of - * 1-nano-second intervals since the value of tv_sec. + * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec + * and a long tv_nsec where tv_sec is the number of 1-second intervals since + * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second + * intervals since the value of tv_sec. 
* * NTFS uses Microsoft's standard time format which is stored in a s64 and is * measured as the number of 100-nano-second intervals since 1st January 1601, * 00:00:00 UTC. */ -static inline sle64 utc2ntfs(const struct timespec ts) +static inline sle64 utc2ntfs(const struct timespec64 ts) { /* * Convert the seconds to 100ns intervals, add the nano-seconds @@ -63,7 +63,10 @@ static inline sle64 utc2ntfs(const struct timespec ts) */ static inline sle64 get_current_ntfs_time(void) { - return utc2ntfs(current_kernel_time()); + struct timespec64 ts; + + ktime_get_coarse_real_ts64(&ts); + return utc2ntfs(ts); } /** @@ -73,18 +76,18 @@ static inline sle64 get_current_ntfs_time(void) * Convert the little endian NTFS time @time to its corresponding Linux UTC * time and return that in cpu format. * - * Linux stores time in a struct timespec consisting of a time_t (long at - * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second - * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of - * 1-nano-second intervals since the value of tv_sec. + * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec + * and a long tv_nsec where tv_sec is the number of 1-second intervals since + * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second + * intervals since the value of tv_sec. * * NTFS uses Microsoft's standard time format which is stored in a s64 and is * measured as the number of 100 nano-second intervals since 1st January 1601, * 00:00:00 UTC. */ -static inline struct timespec ntfs2utc(const sle64 time) +static inline struct timespec64 ntfs2utc(const sle64 time) { - struct timespec ts; + struct timespec64 ts; /* Subtract the NTFS time offset. 
*/ u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET); -- cgit From f08957d0ffe91f346c47cef95139c54aa7275cfe Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Aug 2018 15:43:54 -0700 Subject: fs/hpfs: extend gmt_to_local() conversion to 64-bit times The VFS timestamps are all 64-bit now, the only missing piece for hpfs is the internal conversion function. One interesting bit about hpfs is that it can already deal with moving the 136 year window of its timestamps to support a much wider range than other file systems with 32-bit timestamps. It also treats the timestamps as 'unsigned' on 64-bit architectures (but signed on 32-bit, because time_t always wraps around to negative numbers in 2038). Changing the conversion to use time64_t makes 32-bit architectures behave the same way as 64-bit. For completeness, this also adds a clamp_t call for each conversion, so we don't wrap the timestamps but instead stay within the [0..U32_MAX] range of the on-disk timestamps. Link: http://lkml.kernel.org/r/20180718115017.742609-3-arnd@arndb.de Signed-off-by: Arnd Bergmann Cc: Mikulas Patocka Cc: Al Viro Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hpfs/hpfs_fn.h | 13 ++++++++++--- fs/hpfs/namei.c | 12 ++++++------ 2 files changed, 16 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 2a153aed4c19..ab2e7cc2ff33 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -334,16 +334,23 @@ long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg); * local time (HPFS) to GMT (Unix) */ -static inline time_t local_to_gmt(struct super_block *s, time32_t t) +static inline time64_t local_to_gmt(struct super_block *s, time32_t t) { extern struct timezone sys_tz; return t + sys_tz.tz_minuteswest * 60 + hpfs_sb(s)->sb_timeshift; } -static inline time32_t gmt_to_local(struct super_block *s, time_t t) +static inline time32_t gmt_to_local(struct super_block *s, time64_t t) { extern struct 
timezone sys_tz; - return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift; + t = t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift; + + return clamp_t(time64_t, t, 0, U32_MAX); +} + +static inline time32_t local_get_seconds(struct super_block *s) +{ + return gmt_to_local(s, ktime_get_real_seconds()); } /* diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index a3615e4c730d..082b7c76dd0c 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -11,7 +11,7 @@ static void hpfs_update_directory_times(struct inode *dir) { - time_t t = get_seconds(); + time64_t t = local_to_gmt(dir->i_sb, local_get_seconds(dir->i_sb)); if (t == dir->i_mtime.tv_sec && t == dir->i_ctime.tv_sec) return; @@ -50,7 +50,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) /*dee.archive = 0;*/ dee.hidden = name[0] == '.'; dee.fnode = cpu_to_le32(fno); - dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb)); result = new_inode(dir->i_sb); if (!result) goto bail2; @@ -91,7 +91,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) dnode->root_dnode = 1; dnode->up = cpu_to_le32(fno); de = hpfs_add_de(dir->i_sb, dnode, "\001\001", 2, 0); - de->creation_date = de->write_date = de->read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + de->creation_date = de->write_date = de->read_date = cpu_to_le32(local_get_seconds(dir->i_sb)); if (!(mode & 0222)) de->read_only = 1; de->first = de->directory = 1; /*de->hidden = de->system = 0;*/ @@ -151,7 +151,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, b dee.archive = 1; dee.hidden = name[0] == '.'; dee.fnode = cpu_to_le32(fno); - dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + dee.creation_date = dee.write_date = dee.read_date = 
cpu_to_le32(local_get_seconds(dir->i_sb)); result = new_inode(dir->i_sb); if (!result) @@ -238,7 +238,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, de dee.archive = 1; dee.hidden = name[0] == '.'; dee.fnode = cpu_to_le32(fno); - dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb)); result = new_inode(dir->i_sb); if (!result) @@ -314,7 +314,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy dee.archive = 1; dee.hidden = name[0] == '.'; dee.fnode = cpu_to_le32(fno); - dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds())); + dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb)); result = new_inode(dir->i_sb); if (!result) -- cgit From a10dcebacdb0cf6eb29c211e99cf190cd131a16a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 17 Aug 2018 15:44:04 -0700 Subject: fs/ntfs/aops.c: don't disable interrupts during kmap_atomic() ntfs_end_buffer_async_read() disables interrupts around kmap_atomic(). This is a leftover from the old kmap_atomic() implementation which relied on fixed mapping slots, so the caller had to make sure that the same slot could not be reused from an interrupting context. kmap_atomic() was changed to dynamic slots long ago and commit 1ec9c5ddc17a ("include/linux/highmem.h: remove the second argument of k[un]map_atomic()") removed the slot assignments, but the callers were not checked for now redundant interrupt disabling. Remove the conditional interrupt disable. 
Link: http://lkml.kernel.org/r/20180611144913.gln5mklhqcrfsoom@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Cc: Anton Altaparmakov Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/aops.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 3a2e509c77c5..01c770979921 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -93,13 +93,11 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) ofs = 0; if (file_ofs < init_size) ofs = init_size - file_ofs; - local_irq_save(flags); kaddr = kmap_atomic(page); memset(kaddr + bh_offset(bh) + ofs, 0, bh->b_size - ofs); flush_dcache_page(page); kunmap_atomic(kaddr); - local_irq_restore(flags); } } else { clear_buffer_uptodate(bh); @@ -146,13 +144,11 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) recs = PAGE_SIZE / rec_size; /* Should have been verified before we got here... */ BUG_ON(!recs); - local_irq_save(flags); kaddr = kmap_atomic(page); for (i = 0; i < recs; i++) post_read_mst_fixup((NTFS_RECORD*)(kaddr + i * rec_size), rec_size); kunmap_atomic(kaddr); - local_irq_restore(flags); flush_dcache_page(page); if (likely(page_uptodate && !PageError(page))) SetPageUptodate(page); -- cgit From ac4ecf968acb9e54c335f99d842d56d6b90e28fb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Aug 2018 15:44:07 -0700 Subject: ntfs: aops: remove VLA usage In the quest to remove all stack VLA usage from the kernel[1], this uses the maximum size needed on the stack and adds a sanity check for robustness: index.block_size cannot be larger than PAGE_SIZE nor less than NTFS_BLOCK_SIZE. 
[1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com Link: http://lkml.kernel.org/r/20180626172909.41453-2-keescook@chromium.org Signed-off-by: Kees Cook Cc: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/aops.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 01c770979921..8946130c87ad 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -922,7 +922,7 @@ static int ntfs_write_mst_block(struct page *page, ntfs_volume *vol = ni->vol; u8 *kaddr; unsigned int rec_size = ni->itype.index.block_size; - ntfs_inode *locked_nis[PAGE_SIZE / rec_size]; + ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE]; struct buffer_head *bh, *head, *tbh, *rec_start_bh; struct buffer_head *bhs[MAX_BUF_PER_PAGE]; runlist_element *rl; @@ -931,6 +931,9 @@ static int ntfs_write_mst_block(struct page *page, bool sync, is_mft, page_is_dirty, rec_is_dirty; unsigned char bh_size_bits; + if (WARN_ON(rec_size < NTFS_BLOCK_SIZE)) + return -EINVAL; + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " "0x%lx.", vi->i_ino, ni->type, page->index); BUG_ON(!NInoNonResident(ni)); -- cgit From 2c27ce915078a5822aefb5db7bc2481664b26044 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Aug 2018 15:44:11 -0700 Subject: ntfs: decompress: remove VLA usage In the quest to remove all stack VLA usage from the kernel[1], this moves the stack buffer used during decompression to be allocated externally. The existing "dest_max_index" used in the VLA is bounded by cb_max_page. cb_max_page is bounded by max_page, and max_page is bounded by nr_pages. Since nr_pages is used for the "pages" allocation, it can similarly be used for the "completed_pages" allocation and passed into the decompression function. The error paths are updated to free the new allocation. 
[1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com Link: http://lkml.kernel.org/r/20180626172909.41453-3-keescook@chromium.org Signed-off-by: Kees Cook Cc: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/compress.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index fbd0090d7d0c..df7c32b5fac7 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -128,6 +128,7 @@ static inline void handle_bounds_compressed_page(struct page *page, /** * ntfs_decompress - decompress a compression block into an array of pages * @dest_pages: destination array of pages + * @completed_pages: scratch space to track completed pages * @dest_index: current index into @dest_pages (IN/OUT) * @dest_ofs: current offset within @dest_pages[@dest_index] (IN/OUT) * @dest_max_index: maximum index into @dest_pages (IN) @@ -162,10 +163,10 @@ static inline void handle_bounds_compressed_page(struct page *page, * Note to hackers: This function may not sleep until it has finished accessing * the compression block @cb_start as it is a per-CPU buffer. */ -static int ntfs_decompress(struct page *dest_pages[], int *dest_index, - int *dest_ofs, const int dest_max_index, const int dest_max_ofs, - const int xpage, char *xpage_done, u8 *const cb_start, - const u32 cb_size, const loff_t i_size, +static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], + int *dest_index, int *dest_ofs, const int dest_max_index, + const int dest_max_ofs, const int xpage, char *xpage_done, + u8 *const cb_start, const u32 cb_size, const loff_t i_size, const s64 initialized_size) { /* @@ -190,9 +191,6 @@ static int ntfs_decompress(struct page *dest_pages[], int *dest_index, /* Variables for tag and token parsing. */ u8 tag; /* Current tag. */ int token; /* Loop counter for the eight tokens in tag. 
*/ - - /* Need this because we can't sleep, so need two stages. */ - int completed_pages[dest_max_index - *dest_index + 1]; int nr_completed_pages = 0; /* Default error code. */ @@ -516,6 +514,7 @@ int ntfs_read_compressed_block(struct page *page) unsigned int cb_clusters, cb_max_ofs; int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0; struct page **pages; + int *completed_pages; unsigned char xpage_done = 0; ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = " @@ -528,14 +527,16 @@ int ntfs_read_compressed_block(struct page *page) BUG_ON(ni->name_len); pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); + completed_pages = kmalloc_array(nr_pages + 1, sizeof(int), GFP_NOFS); /* Allocate memory to store the buffer heads we need. */ bhs_size = cb_size / block_size * sizeof(struct buffer_head *); bhs = kmalloc(bhs_size, GFP_NOFS); - if (unlikely(!pages || !bhs)) { + if (unlikely(!pages || !bhs || !completed_pages)) { kfree(bhs); kfree(pages); + kfree(completed_pages); unlock_page(page); ntfs_error(vol->sb, "Failed to allocate internal buffers."); return -ENOMEM; @@ -562,6 +563,7 @@ int ntfs_read_compressed_block(struct page *page) if (xpage >= max_page) { kfree(bhs); kfree(pages); + kfree(completed_pages); zero_user(page, 0, PAGE_SIZE); ntfs_debug("Compressed read outside i_size - truncated?"); SetPageUptodate(page); @@ -854,10 +856,10 @@ lock_retry_remap: unsigned int prev_cur_page = cur_page; ntfs_debug("Found compressed compression block."); - err = ntfs_decompress(pages, &cur_page, &cur_ofs, - cb_max_page, cb_max_ofs, xpage, &xpage_done, - cb_pos, cb_size - (cb_pos - cb), i_size, - initialized_size); + err = ntfs_decompress(pages, completed_pages, &cur_page, + &cur_ofs, cb_max_page, cb_max_ofs, xpage, + &xpage_done, cb_pos, cb_size - (cb_pos - cb), + i_size, initialized_size); /* * We can sleep from now on, lock already dropped by * ntfs_decompress(). 
@@ -912,6 +914,7 @@ lock_retry_remap: /* We no longer need the list of pages. */ kfree(pages); + kfree(completed_pages); /* If we have completed the requested page, we return success. */ if (likely(xpage_done)) @@ -956,5 +959,6 @@ err_out: } } kfree(pages); + kfree(completed_pages); return -EIO; } -- cgit From ab62ef82ea49b8814f4b0e2fe61426acda793fb9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Aug 2018 15:44:14 -0700 Subject: ntfs: mft: remove VLA usage In the quest to remove all stack VLA usage from the kernel[1], this allocates the maximum size stack buffer. Existing checks already require that blocksize >= NTFS_BLOCK_SIZE and mft_record_size <= PAGE_SIZE, so max_bhs can be at most PAGE_SIZE / NTFS_BLOCK_SIZE. Sanity checks are added for robustness. [1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com Link: http://lkml.kernel.org/r/20180626172909.41453-4-keescook@chromium.org Signed-off-by: Kees Cook Cc: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/mft.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 32c523cf5a2d..fb14d17666c8 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -35,6 +35,8 @@ #include "mft.h" #include "ntfs.h" +#define MAX_BHS (PAGE_SIZE / NTFS_BLOCK_SIZE) + /** * map_mft_record_page - map the page in which a specific mft record resides * @ni: ntfs inode whose mft record page to map @@ -469,7 +471,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, struct page *page; unsigned int blocksize = vol->sb->s_blocksize; int max_bhs = vol->mft_record_size / blocksize; - struct buffer_head *bhs[max_bhs]; + struct buffer_head *bhs[MAX_BHS]; struct buffer_head *bh, *head; u8 *kmirr; runlist_element *rl; @@ -479,6 +481,8 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, ntfs_debug("Entering for inode 0x%lx.", mft_no); 
BUG_ON(!max_bhs); + if (WARN_ON(max_bhs > MAX_BHS)) + return -EINVAL; if (unlikely(!vol->mftmirr_ino)) { /* This could happen during umount... */ err = ntfs_sync_mft_mirror_umount(vol, mft_no, m); @@ -674,7 +678,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) unsigned int blocksize = vol->sb->s_blocksize; unsigned char blocksize_bits = vol->sb->s_blocksize_bits; int max_bhs = vol->mft_record_size / blocksize; - struct buffer_head *bhs[max_bhs]; + struct buffer_head *bhs[MAX_BHS]; struct buffer_head *bh, *head; runlist_element *rl; unsigned int block_start, block_end, m_start, m_end; @@ -684,6 +688,10 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) BUG_ON(NInoAttr(ni)); BUG_ON(!max_bhs); BUG_ON(!PageLocked(page)); + if (WARN_ON(max_bhs > MAX_BHS)) { + err = -EINVAL; + goto err_out; + } /* * If the ntfs_inode is clean no need to do anything. If it is dirty, * mark it as clean now so that it can be redirtied later on if needed. -- cgit From 93f5920d8607c5e3f2d3b159377a7e7d7875ffdd Mon Sep 17 00:00:00 2001 From: Jun Piao Date: Fri, 17 Aug 2018 15:44:24 -0700 Subject: ocfs2: return -EROFS when filesystem becomes read-only We should return -EROFS rather than other errno if filesystem becomes read-only. 
[akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/5B191B26.9010501@huawei.com Signed-off-by: Jun Piao Reviewed-by: Yiwen Jiang Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 43 +++++++++++++++++++------------------------ fs/ocfs2/localalloc.c | 9 ++++----- fs/ocfs2/quota_local.c | 15 +++++++-------- 3 files changed, 30 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 0f157bbd3e0f..676714fef869 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1481,19 +1481,17 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, while(le16_to_cpu(el->l_tree_depth) > 1) { if (le16_to_cpu(el->l_next_free_rec) == 0) { - ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has empty extent list (next_free_rec == 0)\n", - (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); - status = -EIO; + status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has empty extent list (next_free_rec == 0)\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); goto bail; } i = le16_to_cpu(el->l_next_free_rec) - 1; blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (!blkno) { - ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has extent list where extent # %d has no physical block start\n", - (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); - status = -EIO; + status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has extent list where extent # %d has no physical block start\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); goto bail; } @@ -3214,11 +3212,10 @@ rightmost_no_delete: goto rightmost_no_delete; if (le16_to_cpu(el->l_next_free_rec) == 0) { - ret = -EIO; - ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has empty extent block at 
%llu\n", - (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), - (unsigned long long)le64_to_cpu(eb->h_blkno)); + ret = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has empty extent block at %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)le64_to_cpu(eb->h_blkno)); goto out; } @@ -4411,12 +4408,11 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, le16_to_cpu(new_el->l_count)) { bh = path_leaf_bh(left_path); eb = (struct ocfs2_extent_block *)bh->b_data; - ocfs2_error(sb, - "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n", - (unsigned long long)le64_to_cpu(eb->h_blkno), - le16_to_cpu(new_el->l_next_free_rec), - le16_to_cpu(new_el->l_count)); - status = -EINVAL; + status = ocfs2_error(sb, + "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n", + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(new_el->l_next_free_rec), + le16_to_cpu(new_el->l_count)); goto free_left_path; } rec = &new_el->l_recs[ @@ -4466,11 +4462,10 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { bh = path_leaf_bh(right_path); eb = (struct ocfs2_extent_block *)bh->b_data; - ocfs2_error(sb, - "Extent block #%llu has an invalid l_next_free_rec of %d\n", - (unsigned long long)le64_to_cpu(eb->h_blkno), - le16_to_cpu(new_el->l_next_free_rec)); - status = -EINVAL; + status = ocfs2_error(sb, + "Extent block #%llu has an invalid l_next_free_rec of %d\n", + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(new_el->l_next_free_rec)); goto free_right_path; } rec = &new_el->l_recs[1]; diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index fe0d1f9571bb..7642b6712c39 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -663,11 +663,10 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, 
#ifdef CONFIG_OCFS2_DEBUG_FS if (le32_to_cpu(alloc->id1.bitmap1.i_used) != ocfs2_local_alloc_count_bits(alloc)) { - ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n", - (unsigned long long)le64_to_cpu(alloc->i_blkno), - le32_to_cpu(alloc->id1.bitmap1.i_used), - ocfs2_local_alloc_count_bits(alloc)); - status = -EIO; + status = ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n", + (unsigned long long)le64_to_cpu(alloc->i_blkno), + le32_to_cpu(alloc->id1.bitmap1.i_used), + ocfs2_local_alloc_count_bits(alloc)); goto bail; } #endif diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 16c42ed0dca8..b1a8b046f4c2 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -137,14 +137,13 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block, int rc = 0; struct buffer_head *tmp = *bh; - if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { - ocfs2_error(inode->i_sb, - "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)v_block, - (unsigned long long)i_size_read(inode)); - return -EIO; - } + if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) + return ocfs2_error(inode->i_sb, + "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)v_block, + (unsigned long long)i_size_read(inode)); + rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, ocfs2_validate_quota_block); if (rc) -- cgit From 229ba1f82abe4c942e2ab5862daafdfe471fedd8 Mon Sep 17 00:00:00 2001 From: wangyan Date: Fri, 17 Aug 2018 15:44:27 -0700 Subject: ocfs2: clean up some unnecessary code Several functions have some unnecessary code, clean up these code. 
Link: http://lkml.kernel.org/r/5B14DF72.5020800@huawei.com Signed-off-by: Yan Wang Reviewed-by: Jun Piao Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 17 ++++------------- fs/ocfs2/cluster/tcp.c | 2 -- fs/ocfs2/inode.c | 5 +---- 3 files changed, 5 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 676714fef869..a342f008e42f 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -932,13 +932,11 @@ static int ocfs2_validate_extent_block(struct super_block *sb, goto bail; } - if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { + if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) rc = ocfs2_error(sb, "Extent block #%llu has an invalid h_fs_generation of #%u\n", (unsigned long long)bh->b_blocknr, le32_to_cpu(eb->h_fs_generation)); - goto bail; - } bail: return rc; } @@ -1596,10 +1594,8 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, * the new data. 
*/ ret = ocfs2_add_branch(handle, et, bh, last_eb_bh, meta_ac); - if (ret < 0) { + if (ret < 0) mlog_errno(ret); - goto out; - } out: if (final_depth) @@ -5518,10 +5514,8 @@ static int ocfs2_truncate_rec(handle_t *handle, ocfs2_journal_dirty(handle, path_leaf_bh(path)); ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); - if (ret) { + if (ret) mlog_errno(ret); - goto out; - } out: ocfs2_free_path(left_path); @@ -5654,10 +5648,8 @@ int ocfs2_remove_extent(handle_t *handle, ret = ocfs2_truncate_rec(handle, et, path, index, dealloc, cpos, len); - if (ret) { + if (ret) mlog_errno(ret); - goto out; - } } out: @@ -5702,7 +5694,6 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode, if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); - goto out; } } diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 1296f78ae966..7d9eea7d4a87 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -872,8 +872,6 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, "for type %u key %08x\n", msg_type, key); } write_unlock(&o2net_handler_lock); - if (ret) - goto out; out: if (ret) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index ddc3e9470c87..79279240fb6e 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -637,10 +637,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, handle = NULL; status = ocfs2_commit_truncate(osb, inode, fe_bh); - if (status < 0) { + if (status < 0) mlog_errno(status); - goto out; - } } out: @@ -1499,7 +1497,6 @@ static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, (unsigned long long)bh->b_blocknr, le32_to_cpu(di->i_fs_generation)); rc = -OCFS2_FILECHECK_ERR_GENERATION; - goto bail; } bail: -- cgit From 480bd56485b77c36e17a411921266c6f06623d98 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 17 Aug 2018 15:44:31 -0700 Subject: ocfs2: make several functions and variables static (and some const) There are a variety of functions and variables that are local 
to the source and do not need to be in global scope, so make them static. Also make a couple of char arrays static const. Cleans up sparse warnings: symbol 'o2hb_heartbeat_mode_desc' was not declared. Should it be static? symbol 'o2hb_heartbeat_mode' was not declared. Should it be static? symbol 'o2hb_dependent_users' was not declared. Should it be static? symbol 'o2hb_region_dec_user' was not declared. Should it be static? symbol 'o2nm_fence_method_desc' was not declared. Should it be static? symbol 'lockdep_keys' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20180628131659.12133-1-colin.king@canonical.com Signed-off-by: Colin Ian King Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 12 ++++++------ fs/ocfs2/cluster/nodemanager.c | 6 +++--- fs/ocfs2/dlmglue.c | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index ea8c551bcd7e..9b2ed62dd638 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -127,13 +127,13 @@ enum o2hb_heartbeat_modes { O2HB_HEARTBEAT_NUM_MODES, }; -char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { - "local", /* O2HB_HEARTBEAT_LOCAL */ - "global", /* O2HB_HEARTBEAT_GLOBAL */ +static const char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { + "local", /* O2HB_HEARTBEAT_LOCAL */ + "global", /* O2HB_HEARTBEAT_GLOBAL */ }; unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; -unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; +static unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; /* * o2hb_dependent_users tracks the number of registered callbacks that depend @@ -141,7 +141,7 @@ unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; * However only o2dlm depends on the heartbeat. 
It does not want the heartbeat * to stop while a dlm domain is still active. */ -unsigned int o2hb_dependent_users; +static unsigned int o2hb_dependent_users; /* * In global heartbeat mode, all regions are pinned if there are one or more @@ -2486,7 +2486,7 @@ unlock: return ret; } -void o2hb_region_dec_user(const char *region_uuid) +static void o2hb_region_dec_user(const char *region_uuid) { spin_lock(&o2hb_live_lock); diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index da64c3a20eeb..0e4166cc23a0 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -35,9 +35,9 @@ * cluster references throughout where nodes are looked up */ struct o2nm_cluster *o2nm_single_cluster = NULL; -char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { - "reset", /* O2NM_FENCE_RESET */ - "panic", /* O2NM_FENCE_PANIC */ +static const char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { + "reset", /* O2NM_FENCE_RESET */ + "panic", /* O2NM_FENCE_PANIC */ }; static inline void o2nm_lock_subsystem(void); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 0ff424c6d17c..8e712b614e6e 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -96,7 +96,7 @@ struct ocfs2_unblock_ctl { }; /* Lockdep class keys */ -struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES]; +static struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES]; static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, int new_level); -- cgit From 6cd00a01f0c1ae6a852b09c59b8dd55cc6c35d1d Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 17 Aug 2018 15:44:34 -0700 Subject: fs/dcache.c: fix kmemcheck splat at take_dentry_name_snapshot() Since only dentry->d_name.len + 1 bytes out of DNAME_INLINE_LEN bytes are initialized at __d_alloc(), we can't copy the whole size unconditionally. 
WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff8fa27465ac50) 636f6e66696766732e746d70000000000010000000000000020000000188ffff i i i i i i i i i i i i i u u u u u u u u u u i i i i i u u u u ^ RIP: 0010:take_dentry_name_snapshot+0x28/0x50 RSP: 0018:ffffa83000f5bdf8 EFLAGS: 00010246 RAX: 0000000000000020 RBX: ffff8fa274b20550 RCX: 0000000000000002 RDX: ffffa83000f5be40 RSI: ffff8fa27465ac50 RDI: ffffa83000f5be60 RBP: ffffa83000f5bdf8 R08: ffffa83000f5be48 R09: 0000000000000001 R10: ffff8fa27465ac00 R11: ffff8fa27465acc0 R12: ffff8fa27465ac00 R13: ffff8fa27465acc0 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f79737ac8c0(0000) GS:ffffffff8fc30000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff8fa274c0b000 CR3: 0000000134aa7002 CR4: 00000000000606f0 take_dentry_name_snapshot+0x28/0x50 vfs_rename+0x128/0x870 SyS_rename+0x3b2/0x3d0 entry_SYSCALL_64_fastpath+0x1a/0xa4 0xffffffffffffffff Link: http://lkml.kernel.org/r/201709131912.GBG39012.QMJLOVFSFFOOtH@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Cc: Vegard Nossum Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 8d2ec4898c2b..2e7e8d85e9b4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -292,7 +292,8 @@ void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry spin_unlock(&dentry->d_lock); name->name = p->name; } else { - memcpy(name->inline_name, dentry->d_iname, DNAME_INLINE_LEN); + memcpy(name->inline_name, dentry->d_iname, + dentry->d_name.len + 1); spin_unlock(&dentry->d_lock); name->name = name->inline_name; } -- cgit From 4cdfffc8722e99be8d400d8fa1fcd615d078ad43 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 17 Aug 2018 15:44:37 -0700 Subject: vfs: discard ATTR_ATTR_FLAG This flag was introduced in
2.1.43pre1. The flag was never set. Let's discard it properly. Link: http://lkml.kernel.org/r/877en0hewz.fsf@notabene.neil.brown.name Signed-off-by: NeilBrown Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hostfs/hostfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index cb8374af08a6..33b8423ef0c9 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -19,7 +19,7 @@ #define HOSTFS_ATTR_ATIME_SET 128 #define HOSTFS_ATTR_MTIME_SET 256 -/* These two are unused by hostfs. */ +/* This one is unused by hostfs. */ #define HOSTFS_ATTR_FORCE 512 /* Not a change, but a change it */ #define HOSTFS_ATTR_ATTR_FLAG 1024 -- cgit From 1f4aace60b0edc2d885aaa263abf4df42c8c65a8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 17 Aug 2018 15:44:41 -0700 Subject: fs/seq_file.c: simplify seq_file iteration code and interface The documentation for seq_file suggests that it is necessary to be able to move the iterator to a given offset, however that is not the case. If the iterator is stored in the private data and is stable from one read() syscall to the next, it is only necessary to support first/next interactions. Implementing this in a client is a little clumsy. - if ->start() is given a pos of zero, it should go to start of sequence. - if ->start() is given the same pos that was given to the most recent next() or start(), it should restore the iterator to state just before that last call - if ->start is given another number, it should set the iterator one beyond the start just before the last ->start or ->next call. Also, the documentation says that the implementation can interpret the pos however it likes (other than zero meaning start), but seq_file increments the pos sometimes which does impose on the implementation. This patch simplifies the interface for first/next iteration and simplifies the code, while maintaining complete backward compatibility.
Now: - if ->start() is given a pos of zero, it should return an iterator placed at the start of the sequence - if ->start() is given a non-zero pos, it should return the iterator in the same state it was in after the last ->start or ->next. This is particularly useful for iterators which walk the multiple chains in a hash table, e.g. using rhashtable_walk*. See fs/gfs2/glock.c and drivers/staging/lustre/lustre/llite/vvp_dev.c A large part of achieving this is to *always* call ->next after ->show has successfully stored all of an entry in the buffer. Never just increment the index instead. Also: - always pass &m->index to ->start() and ->next(), never a temp variable - don't clear ->from when ->count is zero, as ->from is dead when ->count is zero. Some ->next functions do not increment *pos when they return NULL. To maintain compatibility with this, we still need to increment m->index in one place, if ->next didn't increment it. Note that such ->next functions are buggy and should be fixed. A simple demonstration is dd if=/proc/swaps bs=1000 skip=1 Choose any block size larger than the size of /proc/swaps. This will always show the whole last line of /proc/swaps. This patch doesn't work around buggy next() functions for this case.
[neilb@suse.com: ensure ->from is valid] Link: http://lkml.kernel.org/r/87601ryb8a.fsf@notabene.neil.brown.name Signed-off-by: NeilBrown Acked-by: Jonathan Corbet [docs] Tested-by: Jann Horn Cc: Alexander Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 54 +++++++++++++++++++++--------------------------------- 1 file changed, 21 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index 4cc090b50cc5..1dea7a8a5255 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -90,23 +90,22 @@ EXPORT_SYMBOL(seq_open); static int traverse(struct seq_file *m, loff_t offset) { - loff_t pos = 0, index; + loff_t pos = 0; int error = 0; void *p; m->version = 0; - index = 0; + m->index = 0; m->count = m->from = 0; - if (!offset) { - m->index = index; + if (!offset) return 0; - } + if (!m->buf) { m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) return -ENOMEM; } - p = m->op->start(m, &index); + p = m->op->start(m, &m->index); while (p) { error = PTR_ERR(p); if (IS_ERR(p)) @@ -123,20 +122,15 @@ static int traverse(struct seq_file *m, loff_t offset) if (pos + m->count > offset) { m->from = offset - pos; m->count -= m->from; - m->index = index; break; } pos += m->count; m->count = 0; - if (pos == offset) { - index++; - m->index = index; + p = m->op->next(m, p, &m->index); + if (pos == offset) break; - } - p = m->op->next(m, p, &index); } m->op->stop(m, p); - m->index = index; return error; Eoverflow: @@ -160,7 +154,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct seq_file *m = file->private_data; size_t copied = 0; - loff_t pos; size_t n; void *p; int err = 0; @@ -223,16 +216,12 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) size -= n; buf += n; copied += n; - if (!m->count) { - m->from = 0; - m->index++; - } if (!size) goto Done; } /* we need at least one record in buffer */ - pos = m->index; - p = 
m->op->start(m, &pos); + m->from = 0; + p = m->op->start(m, &m->index); while (1) { err = PTR_ERR(p); if (!p || IS_ERR(p)) @@ -243,8 +232,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (unlikely(err)) m->count = 0; if (unlikely(!m->count)) { - p = m->op->next(m, p, &pos); - m->index = pos; + p = m->op->next(m, p, &m->index); continue; } if (m->count < m->size) @@ -256,29 +244,33 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (!m->buf) goto Enomem; m->version = 0; - pos = m->index; - p = m->op->start(m, &pos); + p = m->op->start(m, &m->index); } m->op->stop(m, p); m->count = 0; goto Done; Fill: /* they want more? let's try to get some more */ - while (m->count < size) { + while (1) { size_t offs = m->count; - loff_t next = pos; - p = m->op->next(m, p, &next); + loff_t pos = m->index; + + p = m->op->next(m, p, &m->index); + if (pos == m->index) + /* Buggy ->next function */ + m->index++; if (!p || IS_ERR(p)) { err = PTR_ERR(p); break; } + if (m->count >= size) + break; err = m->op->show(m, p); if (seq_has_overflowed(m) || err) { m->count = offs; if (likely(err <= 0)) break; } - pos = next; } m->op->stop(m, p); n = min(m->count, size); @@ -287,11 +279,7 @@ Fill: goto Efault; copied += n; m->count -= n; - if (m->count) - m->from = n; - else - pos++; - m->index = pos; + m->from = n; Done: if (!copied) copied = err; -- cgit From 357c1206520da7a40e383fe329ce379bda722cd9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Aug 2018 15:45:32 -0700 Subject: mpage: add argument structure for do_mpage_readpage() Patch series "Submit ->readpages() IO as read-ahead", v4. The only caller of ->readpages() is from read-ahead, yet we don't submit IO flagged with REQ_RAHEAD. This means we don't see it in blktrace, for instance, which is a shame. Additionally, it's preventing further functional changes in the block layer for dealing with read-ahead more intelligently.
We already make assumptions about ->readpages() just being for read-ahead in the mpage implementation, using readahead_gfp_mask(mapping) as our GFP mask of choice. This small series fixes up mpage_readpages() to submit with REQ_RAHEAD, which takes care of file systems using mpage_readpages(). The first patch is a prep patch, that makes do_mpage_readpage() take an argument structure. This patch (of 4): We're currently passing 8 arguments to this function, clean it up a bit by packing the arguments in an args structure we pass to it. No intentional functional changes in this patch. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20180621010725.17813-2-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Andrew Morton Cc: Al Viro Cc: Christoph Hellwig Cc: Theodore Ts'o Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/mpage.c | 109 ++++++++++++++++++++++++++++++++----------------------------- 1 file changed, 57 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/mpage.c b/fs/mpage.c index b73638db9866..6dc90e456abf 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -133,6 +133,17 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) } while (page_bh != head); } +struct mpage_readpage_args { + struct bio *bio; + struct page *page; + unsigned int nr_pages; + sector_t last_block_in_bio; + struct buffer_head map_bh; + unsigned long first_logical_block; + get_block_t *get_block; + gfp_t gfp; +}; + /* * This is the worker routine which does all the work of mapping the disk * blocks and constructs largest possible bios, submits them for IO if the @@ -142,16 +153,14 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) * represent the validity of its disk mapping and to decide when to do the next * get_block() call.
*/ -static struct bio * -do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, - sector_t *last_block_in_bio, struct buffer_head *map_bh, - unsigned long *first_logical_block, get_block_t get_block, - gfp_t gfp) +static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) { + struct page *page = args->page; struct inode *inode = page->mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; + struct buffer_head *map_bh = &args->map_bh; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; @@ -168,7 +177,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, goto confused; block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); - last_block = block_in_file + nr_pages * blocks_per_page; + last_block = block_in_file + args->nr_pages * blocks_per_page; last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; @@ -178,9 +187,10 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, * Map blocks using the result from the previous get_blocks call first. 
*/ nblocks = map_bh->b_size >> blkbits; - if (buffer_mapped(map_bh) && block_in_file > *first_logical_block && - block_in_file < (*first_logical_block + nblocks)) { - unsigned map_offset = block_in_file - *first_logical_block; + if (buffer_mapped(map_bh) && + block_in_file > args->first_logical_block && + block_in_file < (args->first_logical_block + nblocks)) { + unsigned map_offset = block_in_file - args->first_logical_block; unsigned last = nblocks - map_offset; for (relative_block = 0; ; relative_block++) { @@ -208,9 +218,9 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, if (block_in_file < last_block) { map_bh->b_size = (last_block-block_in_file) << blkbits; - if (get_block(inode, block_in_file, map_bh, 0)) + if (args->get_block(inode, block_in_file, map_bh, 0)) goto confused; - *first_logical_block = block_in_file; + args->first_logical_block = block_in_file; } if (!buffer_mapped(map_bh)) { @@ -273,43 +283,45 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, /* * This page will go to BIO. Do we need to send this BIO off first? 
*/ - if (bio && (*last_block_in_bio != blocks[0] - 1)) - bio = mpage_bio_submit(REQ_OP_READ, 0, bio); + if (args->bio && (args->last_block_in_bio != blocks[0] - 1)) + args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); alloc_new: - if (bio == NULL) { + if (args->bio == NULL) { if (first_hole == blocks_per_page) { if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9), page)) goto out; } - bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), - min_t(int, nr_pages, BIO_MAX_PAGES), gfp); - if (bio == NULL) + args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), + min_t(int, args->nr_pages, + BIO_MAX_PAGES), + args->gfp); + if (args->bio == NULL) goto confused; } length = first_hole << blkbits; - if (bio_add_page(bio, page, length, 0) < length) { - bio = mpage_bio_submit(REQ_OP_READ, 0, bio); + if (bio_add_page(args->bio, page, length, 0) < length) { + args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); goto alloc_new; } - relative_block = block_in_file - *first_logical_block; + relative_block = block_in_file - args->first_logical_block; nblocks = map_bh->b_size >> blkbits; if ((buffer_boundary(map_bh) && relative_block == nblocks) || (first_hole != blocks_per_page)) - bio = mpage_bio_submit(REQ_OP_READ, 0, bio); + args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); else - *last_block_in_bio = blocks[blocks_per_page - 1]; + args->last_block_in_bio = blocks[blocks_per_page - 1]; out: - return bio; + return args->bio; confused: - if (bio) - bio = mpage_bio_submit(REQ_OP_READ, 0, bio); + if (args->bio) + args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); if (!PageUptodate(page)) - block_read_full_page(page, get_block); + block_read_full_page(page, args->get_block); else unlock_page(page); goto out; @@ -363,15 +375,12 @@ int mpage_readpages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages, get_block_t get_block) { - struct bio *bio = NULL; + struct mpage_readpage_args args = { + .get_block = get_block, + .gfp = 
readahead_gfp_mask(mapping), + }; unsigned page_idx; - sector_t last_block_in_bio = 0; - struct buffer_head map_bh; - unsigned long first_logical_block = 0; - gfp_t gfp = readahead_gfp_mask(mapping); - map_bh.b_state = 0; - map_bh.b_size = 0; for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = lru_to_page(pages); @@ -379,18 +388,16 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, list_del(&page->lru); if (!add_to_page_cache_lru(page, mapping, page->index, - gfp)) { - bio = do_mpage_readpage(bio, page, - nr_pages - page_idx, - &last_block_in_bio, &map_bh, - &first_logical_block, - get_block, gfp); + args.gfp)) { + args.page = page; + args.nr_pages = nr_pages - page_idx; + args.bio = do_mpage_readpage(&args); } put_page(page); } BUG_ON(!list_empty(pages)); - if (bio) - mpage_bio_submit(REQ_OP_READ, 0, bio); + if (args.bio) + mpage_bio_submit(REQ_OP_READ, 0, args.bio); return 0; } EXPORT_SYMBOL(mpage_readpages); @@ -400,18 +407,16 @@ EXPORT_SYMBOL(mpage_readpages); */ int mpage_readpage(struct page *page, get_block_t get_block) { - struct bio *bio = NULL; - sector_t last_block_in_bio = 0; - struct buffer_head map_bh; - unsigned long first_logical_block = 0; - gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); + struct mpage_readpage_args args = { + .page = page, + .nr_pages = 1, + .get_block = get_block, + .gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL), + }; - map_bh.b_state = 0; - map_bh.b_size = 0; - bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, - &map_bh, &first_logical_block, get_block, gfp); - if (bio) - mpage_bio_submit(REQ_OP_READ, 0, bio); + args.bio = do_mpage_readpage(&args); + if (args.bio) + mpage_bio_submit(REQ_OP_READ, 0, args.bio); return 0; } EXPORT_SYMBOL(mpage_readpage); -- cgit From 74c8164e1cdb1eb22f1d49d54e515e81821a8ad0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Aug 2018 15:45:36 -0700 Subject: mpage: mpage_readpages() should submit IO as read-ahead 
a_ops->readpages() is only ever used for read-ahead, yet we don't flag the IO being submitted as such. Fix that up. Any file system that uses mpage_readpages() as its ->readpages() implementation will now get this right. Since we're passing in whether the IO is read-ahead or not, we don't need to pass in the 'gfp' separately, as it is dependent on the IO being read-ahead. Kill off that member. Add some documentation notes on ->readpages() being purely for read-ahead. Link: http://lkml.kernel.org/r/20180621010725.17813-3-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Andrew Morton Cc: Al Viro Cc: Chris Mason Cc: Christoph Hellwig Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/data.c | 5 +++++ fs/mpage.c | 29 +++++++++++++++++++---------- 2 files changed, 24 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8f931d699287..b7c9b58acf3e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1421,6 +1421,11 @@ out: /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. + * + * Note that the aops->readpages() function is ONLY used for read-ahead. If + * this function ever deviates from doing just read-ahead, it should either + * use ->readpage() or do the necessary surgery to decouple ->readpages() + * from read-ahead.
*/ static int f2fs_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, diff --git a/fs/mpage.c b/fs/mpage.c index 6dc90e456abf..c820dc9bebab 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -137,11 +137,11 @@ struct mpage_readpage_args { struct bio *bio; struct page *page; unsigned int nr_pages; + bool is_readahead; sector_t last_block_in_bio; struct buffer_head map_bh; unsigned long first_logical_block; get_block_t *get_block; - gfp_t gfp; }; /* @@ -170,8 +170,18 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) struct block_device *bdev = NULL; int length; int fully_mapped = 1; + int op_flags; unsigned nblocks; unsigned relative_block; + gfp_t gfp; + + if (args->is_readahead) { + op_flags = REQ_RAHEAD; + gfp = readahead_gfp_mask(page->mapping); + } else { + op_flags = 0; + gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); + } if (page_has_buffers(page)) goto confused; @@ -284,7 +294,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) * This page will go to BIO. Do we need to send this BIO off first? 
*/ if (args->bio && (args->last_block_in_bio != blocks[0] - 1)) - args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); + args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); alloc_new: if (args->bio == NULL) { @@ -296,14 +306,14 @@ alloc_new: args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), min_t(int, args->nr_pages, BIO_MAX_PAGES), - args->gfp); + gfp); if (args->bio == NULL) goto confused; } length = first_hole << blkbits; if (bio_add_page(args->bio, page, length, 0) < length) { - args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); + args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); goto alloc_new; } @@ -311,7 +321,7 @@ alloc_new: nblocks = map_bh->b_size >> blkbits; if ((buffer_boundary(map_bh) && relative_block == nblocks) || (first_hole != blocks_per_page)) - args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); + args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); else args->last_block_in_bio = blocks[blocks_per_page - 1]; out: @@ -319,7 +329,7 @@ out: confused: if (args->bio) - args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio); + args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio); if (!PageUptodate(page)) block_read_full_page(page, args->get_block); else @@ -377,7 +387,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, { struct mpage_readpage_args args = { .get_block = get_block, - .gfp = readahead_gfp_mask(mapping), + .is_readahead = true, }; unsigned page_idx; @@ -388,7 +398,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, list_del(&page->lru); if (!add_to_page_cache_lru(page, mapping, page->index, - args.gfp)) { + readahead_gfp_mask(mapping))) { args.page = page; args.nr_pages = nr_pages - page_idx; args.bio = do_mpage_readpage(&args); @@ -397,7 +407,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, } BUG_ON(!list_empty(pages)); if (args.bio) - mpage_bio_submit(REQ_OP_READ, 0, args.bio); + 
mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio); return 0; } EXPORT_SYMBOL(mpage_readpages); @@ -411,7 +421,6 @@ int mpage_readpage(struct page *page, get_block_t get_block) .page = page, .nr_pages = 1, .get_block = get_block, - .gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL), }; args.bio = do_mpage_readpage(&args); -- cgit From 5e9d398240b2292b1091f921d29bbab374b755fd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Aug 2018 15:45:39 -0700 Subject: btrfs: readpages() should submit IO as read-ahead a_ops->readpages() is only ever used for read-ahead. Ensure that we pass this information down to the block layer. Link: http://lkml.kernel.org/r/20180621010725.17813-4-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Andrew Morton Cc: Al Viro Cc: Chris Mason Cc: Christoph Hellwig Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 628f1aef34b0..4dd6faab02bb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3102,7 +3102,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, for (index = 0; index < nr_pages; index++) { __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, - bio, 0, bio_flags, 0, prev_em_start); + bio, 0, bio_flags, REQ_RAHEAD, prev_em_start); put_page(pages[index]); } } -- cgit From ac22b46a0b65dbeccbf4d458db95687e825bde90 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Aug 2018 15:45:42 -0700 Subject: ext4: readpages() should submit IO as read-ahead a_ops->readpages() is only ever used for read-ahead. Ensure that we pass this information down to the block layer. 
Link: http://lkml.kernel.org/r/20180621010725.17813-5-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Andrew Morton Cc: Al Viro Cc: Chris Mason Cc: Christoph Hellwig Cc: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 5 +++-- fs/ext4/readpage.c | 5 +++-- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1fc013f3d944..0f0edd1cd0cd 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3062,7 +3062,7 @@ static inline void ext4_set_de_type(struct super_block *sb, /* readpages.c */ extern int ext4_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, - unsigned nr_pages); + unsigned nr_pages, bool is_readahead); /* symlink.c */ extern const struct inode_operations ext4_encrypted_symlink_inode_operations; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8f6ad7667974..d0dd585add6a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3325,7 +3325,8 @@ static int ext4_readpage(struct file *file, struct page *page) ret = ext4_readpage_inline(inode, page); if (ret == -EAGAIN) - return ext4_mpage_readpages(page->mapping, NULL, page, 1); + return ext4_mpage_readpages(page->mapping, NULL, page, 1, + false); return ret; } @@ -3340,7 +3341,7 @@ ext4_readpages(struct file *file, struct address_space *mapping, if (ext4_has_inline_data(inode)) return 0; - return ext4_mpage_readpages(mapping, pages, NULL, nr_pages); + return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true); } static void ext4_invalidatepage(struct page *page, unsigned int offset, diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 19b87a8de6ff..f461d75ac049 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -98,7 +98,7 @@ static void mpage_end_io(struct bio *bio) int ext4_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, - unsigned nr_pages) + unsigned nr_pages, bool 
is_readahead) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; @@ -259,7 +259,8 @@ int ext4_mpage_readpages(struct address_space *mapping, bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; bio->bi_private = ctx; - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio_set_op_attrs(bio, REQ_OP_READ, + is_readahead ? REQ_RAHEAD : 0); } length = first_hole << blkbits; -- cgit From d46eb14b735b11927d4bdc2d1854c311af19de6d Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 17 Aug 2018 15:46:39 -0700 Subject: fs: fsnotify: account fsnotify metadata to kmemcg Patch series "Directed kmem charging", v8. The Linux kernel's memory cgroup allows limiting the memory usage of the jobs running on the system to provide isolation between the jobs. All the kernel memory allocated in the context of the job and marked with __GFP_ACCOUNT will also be included in the memory usage and be limited by the job's limit. The kernel memory can only be charged to the memcg of the process in whose context kernel memory was allocated. However there are cases where the allocated kernel memory should be charged to the memcg different from the current process's memcg. This patch series contains two such concrete use-cases i.e. fsnotify and buffer_head. The fsnotify event objects can consume a lot of system memory for large or unlimited queues if there is either no or slow listener. The events are allocated in the context of the event producer. However they should be charged to the event consumer. Similarly the buffer_head objects can be allocated in a memcg different from the memcg of the page for which buffer_head objects are being allocated. To solve this issue, this patch series introduces a mechanism to charge kernel memory to a given memcg. In case of fsnotify events, the memcg of the consumer can be used for charging and for buffer_head, the memcg of the page can be charged. 
For directed charging, the caller can use the scope API memalloc_[un]use_memcg() to specify the memcg to charge for all the __GFP_ACCOUNT allocations within the scope. This patch (of 2): A lot of memory can be consumed by the events generated for the huge or unlimited queues if there is either no or slow listener. This can cause system level memory pressure or OOMs. So, it's better to account the fsnotify kmem caches to the memcg of the listener. However the listener can be in a different memcg than the memcg of the producer and these allocations happen in the context of the event producer. This patch introduces remote memcg charging API which the producer can use to charge the allocations to the memcg of the listener. There are seven fsnotify kmem caches and among them allocations from dnotify_struct_cache, dnotify_mark_cache, fanotify_mark_cache and inotify_inode_mark_cachep happens in the context of syscall from the listener. So, SLAB_ACCOUNT is enough for these caches. The objects from fsnotify_mark_connector_cachep are not accounted as they are small compared to the notification mark or events and it is unclear whom to account connector to since it is shared by all events attached to the inode. The allocations from the event caches happen in the context of the event producer. For such caches we will need to remote charge the allocations to the listener's memcg. Thus we save the memcg reference in the fsnotify_group structure of the listener. This patch has also moved the members of fsnotify_group to keep the size same, at least for 64 bit build, even with additional member by filling the holes. 
[shakeelb@google.com: use GFP_KERNEL_ACCOUNT rather than open-coding it] Link: http://lkml.kernel.org/r/20180702215439.211597-1-shakeelb@google.com Link: http://lkml.kernel.org/r/20180627191250.209150-2-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Jan Kara Cc: Amir Goldstein Cc: Greg Thelen Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/dnotify/dnotify.c | 5 +++-- fs/notify/fanotify/fanotify.c | 14 ++++++++++---- fs/notify/fanotify/fanotify_user.c | 5 ++++- fs/notify/group.c | 3 +++ fs/notify/inotify/inotify_fsnotify.c | 7 ++++++- fs/notify/inotify/inotify_user.c | 5 ++++- 6 files changed, 30 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index e2bea2ac5dfb..a6365e6bc047 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -384,8 +384,9 @@ out_err: static int __init dnotify_init(void) { - dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC); - dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC); + dnotify_struct_cache = KMEM_CACHE(dnotify_struct, + SLAB_PANIC|SLAB_ACCOUNT); + dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops); if (IS_ERR(dnotify_group)) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index f90842efea13..eb4e75175cfb 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "fanotify.h" @@ -140,8 +141,8 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, struct inode *inode, u32 mask, const struct path *path) { - struct fanotify_event_info *event; - gfp_t gfp = GFP_KERNEL; + struct fanotify_event_info *event = NULL; + gfp_t gfp = GFP_KERNEL_ACCOUNT; /* * For queues with unlimited length 
lost events are not expected and @@ -151,19 +152,22 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, if (group->max_events == UINT_MAX) gfp |= __GFP_NOFAIL; + /* Whoever is interested in the event, pays for the allocation. */ + memalloc_use_memcg(group->memcg); + if (fanotify_is_perm_event(mask)) { struct fanotify_perm_event_info *pevent; pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp); if (!pevent) - return NULL; + goto out; event = &pevent->fae; pevent->response = 0; goto init; } event = kmem_cache_alloc(fanotify_event_cachep, gfp); if (!event) - return NULL; + goto out; init: __maybe_unused fsnotify_init_event(&event->fse, inode, mask); event->tgid = get_pid(task_tgid(current)); @@ -174,6 +178,8 @@ init: __maybe_unused event->path.mnt = NULL; event->path.dentry = NULL; } +out: + memalloc_unuse_memcg(); return event; } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index ec4d8c59d0e3..0cf45041dc32 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -756,6 +757,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.user = user; atomic_inc(&user->fanotify_listeners); + group->memcg = get_mem_cgroup_from_mm(current->mm); oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL); if (unlikely(!oevent)) { @@ -957,7 +959,8 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, */ static int __init fanotify_user_setup(void) { - fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); + fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, + SLAB_PANIC|SLAB_ACCOUNT); fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) { fanotify_perm_event_cachep = diff --git a/fs/notify/group.c b/fs/notify/group.c index aa5468f23e45..c03b83662876 100644 --- a/fs/notify/group.c +++ 
b/fs/notify/group.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "fsnotify.h" @@ -36,6 +37,8 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) if (group->ops->free_group_priv) group->ops->free_group_priv(group); + mem_cgroup_put(group->memcg); + kfree(group); } diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 9ab6dde38a14..f4184b4f3815 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "inotify.h" @@ -98,7 +99,11 @@ int inotify_handle_event(struct fsnotify_group *group, i_mark = container_of(inode_mark, struct inotify_inode_mark, fsn_mark); - event = kmalloc(alloc_len, GFP_KERNEL); + /* Whoever is interested in the event, pays for the allocation. */ + memalloc_use_memcg(group->memcg); + event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT); + memalloc_unuse_memcg(); + if (unlikely(!event)) { /* * Treat lost event due to ENOMEM the same way as queue diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 1cf5b779d862..749c46ababa0 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "inotify.h" #include "../fdinfo.h" @@ -636,6 +637,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) oevent->name_len = 0; group->max_events = max_events; + group->memcg = get_mem_cgroup_from_mm(current->mm); spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); @@ -808,7 +810,8 @@ static int __init inotify_user_setup(void) BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); - inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); + inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, + SLAB_PANIC|SLAB_ACCOUNT); inotify_max_queued_events = 16384; init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 
128; -- cgit From f745c6f5fe75734f3b35d9d4e6ebe2a7d010ddda Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 17 Aug 2018 15:46:44 -0700 Subject: fs, mm: account buffer_head to kmemcg The buffer_head can consume a significant amount of system memory and is directly related to the amount of page cache. In our production environment we have observed that a lot of machines are spending a significant amount of memory as buffer_head and can not be left as system memory overhead. Charging buffer_head is not as simple as adding __GFP_ACCOUNT to the allocation. The buffer_heads can be allocated in a memcg different from the memcg of the page for which buffer_heads are being allocated. One concrete example is memory reclaim. The reclaim can trigger I/O of pages of any memcg on the system. So, the right way to charge buffer_head is to extract the memcg from the page for which buffer_heads are being allocated and then use targeted memcg charging API. [shakeelb@google.com: use __GFP_ACCOUNT for directed memcg charging] Link: http://lkml.kernel.org/r/20180702220208.213380-1-shakeelb@google.com Link: http://lkml.kernel.org/r/20180627191250.209150-3-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Jan Kara Cc: Amir Goldstein Cc: Greg Thelen Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index c8c2b7d8b8d6..4cc679d5bf58 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -45,6 +45,7 @@ #include #include #include +#include #include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -813,12 +814,16 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry) { struct buffer_head *bh, *head; - gfp_t gfp = GFP_NOFS; + gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; long offset; + struct 
mem_cgroup *memcg; if (retry) gfp |= __GFP_NOFAIL; + memcg = get_mem_cgroup_from_page(page); + memalloc_use_memcg(memcg); + head = NULL; offset = PAGE_SIZE; while ((offset -= size) >= 0) { @@ -835,6 +840,9 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, /* Link the buffer to its page */ set_bh_page(bh, page, offset); } +out: + memalloc_unuse_memcg(); + mem_cgroup_put(memcg); return head; /* * In case anything failed, we just free everything we got. @@ -848,7 +856,7 @@ no_grow: } while (head); } - return NULL; + goto out; } EXPORT_SYMBOL_GPL(alloc_page_buffers); -- cgit From 2b3648a6ff83bd2a59b427d3537cc570933659b5 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:47:45 -0700 Subject: fs/super.c: refactor alloc_super() Do two list_lru_init_memcg() calls after prealloc_shrinker(). destroy_unused_super() in fail path is OK with this. The next patch needs this order. Link: http://lkml.kernel.org/r/153063058712.1818.3382490999719078571.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 50728d9c1a05..78227c4ddb21 100644 --- a/fs/super.c +++ b/fs/super.c @@ -244,10 +244,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, INIT_LIST_HEAD(&s->s_inodes_wb); spin_lock_init(&s->s_inode_wblist_lock); - if (list_lru_init_memcg(&s->s_dentry_lru)) - goto fail; - if 
(list_lru_init_memcg(&s->s_inode_lru)) - goto fail; s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); @@ -265,6 +261,10 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; if (prealloc_shrinker(&s->s_shrink)) goto fail; + if (list_lru_init_memcg(&s->s_dentry_lru)) + goto fail; + if (list_lru_init_memcg(&s->s_inode_lru)) + goto fail; return s; fail: -- cgit From c92e8e10cafeaaedc84f23fed1bfcf9cf07399c2 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:47:50 -0700 Subject: fs: propagate shrinker::id to list_lru Add list_lru::shrinker_id field and populate it by registered shrinker id. This will be used to set correct bit in memcg shrinkers map by lru code in next patches, after there appeared the first related to memcg element in list_lru. Link: http://lkml.kernel.org/r/153063059758.1818.14866596416857717800.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 78227c4ddb21..f5f96e52e0cd 100644 --- a/fs/super.c +++ b/fs/super.c @@ -261,9 +261,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; if (prealloc_shrinker(&s->s_shrink)) goto fail; - if (list_lru_init_memcg(&s->s_dentry_lru)) + if 
(list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink)) goto fail; - if (list_lru_init_memcg(&s->s_inode_lru)) + if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink)) goto fail; return s; -- cgit From 9b996468cfdba09f688f52dba4287de596194613 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 17 Aug 2018 15:48:21 -0700 Subject: mm: add SHRINK_EMPTY shrinker methods return value We need to distinguish the situations when shrinker has very small amount of objects (see vfs_pressure_ratio() called from super_cache_count()), and when it has no objects at all. Currently, in both of these cases, shrinker::count_objects() returns 0. The patch introduces new SHRINK_EMPTY return value, which will be used for "no objects at all" case. It is mostly a refactoring, as SHRINK_EMPTY is replaced by 0 by all callers of do_shrink_slab() in this patch, and all the magic will happen in further patches. Link: http://lkml.kernel.org/r/153063069574.1818.11037751256699341813.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov Tested-by: Shakeel Butt Cc: Al Viro Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Josef Bacik Cc: Li RongQing Cc: Matthew Wilcox Cc: Matthias Kaehlcke Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Philippe Ombredanne Cc: Roman Gushchin Cc: Sahitya Tummala Cc: Stephen Rothwell Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index f5f96e52e0cd..7429588d6b49 100644 --- a/fs/super.c +++ b/fs/super.c @@ -144,6 +144,9 @@ static unsigned long super_cache_count(struct shrinker *shrink, total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); + if (!total_objects) + return SHRINK_EMPTY; + total_objects = 
vfs_pressure_ratio(total_objects); return total_objects; } -- cgit From 5241d4727479aad77af50b80757c38268bfa4560 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 17 Aug 2018 15:50:01 -0700 Subject: fs/userfaultfd.c: remove redundant pointer uwq Pointer uwq is being assigned but is never used hence it is redundant and can be removed. Cleans up clang warning: warning: variable 'uwq' set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/20180717090802.18357-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index bad9cea37f12..15c265d450bf 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1849,17 +1849,14 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) { struct userfaultfd_ctx *ctx = f->private_data; wait_queue_entry_t *wq; - struct userfaultfd_wait_queue *uwq; unsigned long pending = 0, total = 0; spin_lock(&ctx->fault_pending_wqh.lock); list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) { - uwq = container_of(wq, struct userfaultfd_wait_queue, wq); pending++; total++; } list_for_each_entry(wq, &ctx->fault_wqh.head, entry) { - uwq = container_of(wq, struct userfaultfd_wait_queue, wq); total++; } spin_unlock(&ctx->fault_pending_wqh.lock); -- cgit