From e1fb4a0864958fac2fb1b23f9f4562a9f90e3e8f Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 17 Aug 2018 15:43:40 -0700
Subject: dax: remove VM_MIXEDMAP for fsdax and device dax

This patch is reworked from an earlier patch that Dan has posted:
https://patchwork.kernel.org/patch/10131727/

VM_MIXEDMAP is used by dax to direct mm paths like vm_normal_page() that
the memory page it is dealing with is not typical memory from the linear
map.  The get_user_pages_fast() path, since it does not resolve the vma,
is already using {pte,pmd}_devmap() as a stand-in for VM_MIXEDMAP, so we
use that as a VM_MIXEDMAP replacement in some locations.  In the cases
where there is no pte to consult we fallback to using vma_is_dax() to
detect the VM_MIXEDMAP special case.

Now that we have explicit driver pfn_t-flag opt-in/opt-out for
get_user_pages() support for DAX we can stop setting VM_MIXEDMAP.  This
also means we no longer need to worry about safely manipulating vm_flags
in a future where we support dynamically changing the dax mode of a
file.

DAX should also now be supported with madvise_behavior(), vma_merge(),
and copy_page_range().

This patch has been tested against ndctl unit test.  It has also been
tested against xfstests commit: 625515d using fake pmem created by
memmap and no additional issues have been observed.

Link: http://lkml.kernel.org/r/152847720311.55924.16999195879201817653.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/file.c    | 1 -
 fs/ext4/file.c    | 2 +-
 fs/xfs/xfs_file.c | 2 +-
 3 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 047c327a6b23..28b2609f25c1 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -126,7 +126,6 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
 
 	file_accessed(file);
 	vma->vm_ops = &ext2_dax_vm_ops;
-	vma->vm_flags |= VM_MIXEDMAP;
 	return 0;
 }
 #else
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 7f8023340eb8..69d65d49837b 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -374,7 +374,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 	file_accessed(file);
 	if (IS_DAX(file_inode(file))) {
 		vma->vm_ops = &ext4_dax_vm_ops;
-		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+		vma->vm_flags |= VM_HUGEPAGE;
 	} else {
 		vma->vm_ops = &ext4_file_vm_ops;
 	}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 181e9084519b..5eaef2c17293 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1169,7 +1169,7 @@ xfs_file_mmap(
 	file_accessed(filp);
 	vma->vm_ops = &xfs_file_vm_ops;
 	if (IS_DAX(file_inode(filp)))
-		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+		vma->vm_flags |= VM_HUGEPAGE;
 	return 0;
 }
 
-- 
cgit 


From a3fda0ffeaf0114328024aee4a9ec3b08af4b077 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 17 Aug 2018 15:43:47 -0700
Subject: fs/ufs: use ktime_get_real_seconds for sb and cg timestamps

get_seconds() is deprecated because of the 32-bit overflow and will be
removed.  All callers in ufs also truncate to a 32-bit number, so
nothing changes during the conversion, but this should be harmless as
the superblock and cylinder group timestamps are not visible to user
space, except for checking the fs-dirty state, wich works fine across
the overflow.

This moves the call to get_seconds() into a new inline function, with a
comment explaining the constraints, while converting it to
ktime_get_real_seconds().

Link: http://lkml.kernel.org/r/20180718115017.742609-1-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ufs/balloc.c |  4 ++--
 fs/ufs/ialloc.c |  2 +-
 fs/ufs/super.c  |  4 ++--
 fs/ufs/util.h   | 14 ++++++++++++++
 4 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index e727ee07dbe4..075d3d9114c8 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -547,7 +547,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
 	/*
 	 * Block can be extended
 	 */
-	ucg->cg_time = cpu_to_fs32(sb, get_seconds());
+	ucg->cg_time = ufs_get_seconds(sb);
 	for (i = newcount; i < (uspi->s_fpb - fragoff); i++)
 		if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i))
 			break;
@@ -639,7 +639,7 @@ cg_found:
 	if (!ufs_cg_chkmagic(sb, ucg)) 
 		ufs_panic (sb, "ufs_alloc_fragments",
 			"internal error, bad magic number on cg %u", cgno);
-	ucg->cg_time = cpu_to_fs32(sb, get_seconds());
+	ucg->cg_time = ufs_get_seconds(sb);
 
 	if (count == uspi->s_fpb) {
 		result = ufs_alloccg_block (inode, ucpi, goal, err);
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 02c0a4be4212..969fd60436d3 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -89,7 +89,7 @@ void ufs_free_inode (struct inode * inode)
 	if (!ufs_cg_chkmagic(sb, ucg))
 		ufs_panic (sb, "ufs_free_fragments", "internal error, bad cg magic number");
 
-	ucg->cg_time = cpu_to_fs32(sb, get_seconds());
+	ucg->cg_time = ufs_get_seconds(sb);
 
 	is_directory = S_ISDIR(inode->i_mode);
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 488088141451..a4e07e910f1b 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -698,7 +698,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
 	usb1 = ubh_get_usb_first(uspi);
 	usb3 = ubh_get_usb_third(uspi);
 
-	usb1->fs_time = cpu_to_fs32(sb, get_seconds());
+	usb1->fs_time = ufs_get_seconds(sb);
 	if ((flags & UFS_ST_MASK) == UFS_ST_SUN  ||
 	    (flags & UFS_ST_MASK) == UFS_ST_SUNOS ||
 	    (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
@@ -1342,7 +1342,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	 */
 	if (*mount_flags & SB_RDONLY) {
 		ufs_put_super_internal(sb);
-		usb1->fs_time = cpu_to_fs32(sb, get_seconds());
+		usb1->fs_time = ufs_get_seconds(sb);
 		if ((flags & UFS_ST_MASK) == UFS_ST_SUN
 		  || (flags & UFS_ST_MASK) == UFS_ST_SUNOS
 		  || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) 
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 1907be6d5808..1fd3011ea623 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -590,3 +590,17 @@ static inline int ufs_is_data_ptr_zero(struct ufs_sb_private_info *uspi,
 	else
 		return *(__fs32 *)p == 0;
 }
+
+static inline __fs32 ufs_get_seconds(struct super_block *sbp)
+{
+	time64_t now = ktime_get_real_seconds();
+
+	/* Signed 32-bit interpretation wraps around in 2038, which
+	 * happens in ufs1 inode stamps but not ufs2 using 64-bits
+	 * stamps. For superblock and blockgroup, let's assume
+	 * unsigned 32-bit stamps, which are good until y2106.
+	 * Wrap around rather than clamp here to make the dirty
+	 * file system detection work in the superblock stamp.
+	 */
+	return cpu_to_fs32(sbp, lower_32_bits(now));
+}
-- 
cgit 


From bcf451ecfc8d45618d13c9e4abcbbd770af20cc9 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 17 Aug 2018 15:43:50 -0700
Subject: fs/ntfs: use timespec64 directly for timestamp conversion

Now that the VFS has been converted from timespec to timespec64
timestamps, only the conversion to/from ntfs timestamps uses 32-bit
seconds.

This changes that last missing piece to get the ntfs implementation
y2038 safe on 32-bit architectures.

Link: http://lkml.kernel.org/r/20180718115017.742609-2-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Anton Altaparmakov <anton@tuxera.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ntfs/inode.c | 12 ++++++------
 fs/ntfs/time.h  | 27 +++++++++++++++------------
 2 files changed, 21 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index decaf75d1cd5..bd3221cbdd95 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -667,18 +667,18 @@ static int ntfs_read_locked_inode(struct inode *vi)
 	 * mtime is the last change of the data within the file. Not changed
 	 * when only metadata is changed, e.g. a rename doesn't affect mtime.
 	 */
-	vi->i_mtime = timespec_to_timespec64(ntfs2utc(si->last_data_change_time));
+	vi->i_mtime = ntfs2utc(si->last_data_change_time);
 	/*
 	 * ctime is the last change of the metadata of the file. This obviously
 	 * always changes, when mtime is changed. ctime can be changed on its
 	 * own, mtime is then not changed, e.g. when a file is renamed.
 	 */
-	vi->i_ctime = timespec_to_timespec64(ntfs2utc(si->last_mft_change_time));
+	vi->i_ctime = ntfs2utc(si->last_mft_change_time);
 	/*
 	 * Last access to the data within the file. Not changed during a rename
 	 * for example but changed whenever the file is written to.
 	 */
-	vi->i_atime = timespec_to_timespec64(ntfs2utc(si->last_access_time));
+	vi->i_atime = ntfs2utc(si->last_access_time);
 
 	/* Find the attribute list attribute if present. */
 	ntfs_attr_reinit_search_ctx(ctx);
@@ -2997,7 +2997,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
 	si = (STANDARD_INFORMATION*)((u8*)ctx->attr +
 			le16_to_cpu(ctx->attr->data.resident.value_offset));
 	/* Update the access times if they have changed. */
-	nt = utc2ntfs(timespec64_to_timespec(vi->i_mtime));
+	nt = utc2ntfs(vi->i_mtime);
 	if (si->last_data_change_time != nt) {
 		ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, "
 				"new = 0x%llx", vi->i_ino, (long long)
@@ -3006,7 +3006,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
 		si->last_data_change_time = nt;
 		modified = true;
 	}
-	nt = utc2ntfs(timespec64_to_timespec(vi->i_ctime));
+	nt = utc2ntfs(vi->i_ctime);
 	if (si->last_mft_change_time != nt) {
 		ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, "
 				"new = 0x%llx", vi->i_ino, (long long)
@@ -3015,7 +3015,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
 		si->last_mft_change_time = nt;
 		modified = true;
 	}
-	nt = utc2ntfs(timespec64_to_timespec(vi->i_atime));
+	nt = utc2ntfs(vi->i_atime);
 	if (si->last_access_time != nt) {
 		ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, "
 				"new = 0x%llx", vi->i_ino,
diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h
index 01233989d5d1..24cd719f1fd2 100644
--- a/fs/ntfs/time.h
+++ b/fs/ntfs/time.h
@@ -36,16 +36,16 @@
  * Convert the Linux UTC time @ts to its corresponding NTFS time and return
  * that in little endian format.
  *
- * Linux stores time in a struct timespec consisting of a time_t (long at
- * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second
- * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of
- * 1-nano-second intervals since the value of tv_sec.
+ * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec
+ * and a long tv_nsec where tv_sec is the number of 1-second intervals since
+ * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second
+ * intervals since the value of tv_sec.
  *
  * NTFS uses Microsoft's standard time format which is stored in a s64 and is
  * measured as the number of 100-nano-second intervals since 1st January 1601,
  * 00:00:00 UTC.
  */
-static inline sle64 utc2ntfs(const struct timespec ts)
+static inline sle64 utc2ntfs(const struct timespec64 ts)
 {
 	/*
 	 * Convert the seconds to 100ns intervals, add the nano-seconds
@@ -63,7 +63,10 @@ static inline sle64 utc2ntfs(const struct timespec ts)
  */
 static inline sle64 get_current_ntfs_time(void)
 {
-	return utc2ntfs(current_kernel_time());
+	struct timespec64 ts;
+
+	ktime_get_coarse_real_ts64(&ts);
+	return utc2ntfs(ts);
 }
 
 /**
@@ -73,18 +76,18 @@ static inline sle64 get_current_ntfs_time(void)
  * Convert the little endian NTFS time @time to its corresponding Linux UTC
  * time and return that in cpu format.
  *
- * Linux stores time in a struct timespec consisting of a time_t (long at
- * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second
- * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of
- * 1-nano-second intervals since the value of tv_sec.
+ * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec
+ * and a long tv_nsec where tv_sec is the number of 1-second intervals since
+ * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second
+ * intervals since the value of tv_sec.
  *
  * NTFS uses Microsoft's standard time format which is stored in a s64 and is
  * measured as the number of 100 nano-second intervals since 1st January 1601,
  * 00:00:00 UTC.
  */
-static inline struct timespec ntfs2utc(const sle64 time)
+static inline struct timespec64 ntfs2utc(const sle64 time)
 {
-	struct timespec ts;
+	struct timespec64 ts;
 
 	/* Subtract the NTFS time offset. */
 	u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET);
-- 
cgit 


From f08957d0ffe91f346c47cef95139c54aa7275cfe Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 17 Aug 2018 15:43:54 -0700
Subject: fs/hpfs: extend gmt_to_local() conversion to 64-bit times

The VFS timestamps are all 64-bit now, the only missing piece for hpfs
is the internal conversion function.  One interesting bit about hpfs is
that it can already deal with moving the 136 year window of its
timestamps to support a much wider range than other file systems with
32-bit timestamps.  It also treats the timestamps as 'unsigned' on
64-bit architectures (but signed on 32-bit, because time_t always around
to negative numbers in 2038).

Changing the conversion to use time64_t makes 32-bit architectures
behave the same way as 64-bit.  For completeness, this also adds a
clamp_t call for each conversion, so we don't wrap the timestamps but
instead stay within the [0..U32_MAX] range of the on-disk timestamps.

Link: http://lkml.kernel.org/r/20180718115017.742609-3-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Mikulas Patocka <mikulas@artax.karlin.mff.cuni.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hpfs/hpfs_fn.h | 13 ++++++++++---
 fs/hpfs/namei.c   | 12 ++++++------
 2 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 2a153aed4c19..ab2e7cc2ff33 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -334,16 +334,23 @@ long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg);
  * local time (HPFS) to GMT (Unix)
  */
 
-static inline time_t local_to_gmt(struct super_block *s, time32_t t)
+static inline time64_t local_to_gmt(struct super_block *s, time32_t t)
 {
 	extern struct timezone sys_tz;
 	return t + sys_tz.tz_minuteswest * 60 + hpfs_sb(s)->sb_timeshift;
 }
 
-static inline time32_t gmt_to_local(struct super_block *s, time_t t)
+static inline time32_t gmt_to_local(struct super_block *s, time64_t t)
 {
 	extern struct timezone sys_tz;
-	return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift;
+	t = t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift;
+
+	return clamp_t(time64_t, t, 0, U32_MAX);
+}
+
+static inline time32_t local_get_seconds(struct super_block *s)
+{
+	return gmt_to_local(s, ktime_get_real_seconds());
 }
 
 /*
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index a3615e4c730d..082b7c76dd0c 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -11,7 +11,7 @@
 
 static void hpfs_update_directory_times(struct inode *dir)
 {
-	time_t t = get_seconds();
+	time64_t t = local_to_gmt(dir->i_sb, local_get_seconds(dir->i_sb));
 	if (t == dir->i_mtime.tv_sec &&
 	    t == dir->i_ctime.tv_sec)
 		return;
@@ -50,7 +50,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	/*dee.archive = 0;*/
 	dee.hidden = name[0] == '.';
 	dee.fnode = cpu_to_le32(fno);
-	dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+	dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb));
 	result = new_inode(dir->i_sb);
 	if (!result)
 		goto bail2;
@@ -91,7 +91,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	dnode->root_dnode = 1;
 	dnode->up = cpu_to_le32(fno);
 	de = hpfs_add_de(dir->i_sb, dnode, "\001\001", 2, 0);
-	de->creation_date = de->write_date = de->read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+	de->creation_date = de->write_date = de->read_date = cpu_to_le32(local_get_seconds(dir->i_sb));
 	if (!(mode & 0222)) de->read_only = 1;
 	de->first = de->directory = 1;
 	/*de->hidden = de->system = 0;*/
@@ -151,7 +151,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, b
 	dee.archive = 1;
 	dee.hidden = name[0] == '.';
 	dee.fnode = cpu_to_le32(fno);
-	dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+	dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb));
 
 	result = new_inode(dir->i_sb);
 	if (!result)
@@ -238,7 +238,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, de
 	dee.archive = 1;
 	dee.hidden = name[0] == '.';
 	dee.fnode = cpu_to_le32(fno);
-	dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+	dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb));
 
 	result = new_inode(dir->i_sb);
 	if (!result)
@@ -314,7 +314,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
 	dee.archive = 1;
 	dee.hidden = name[0] == '.';
 	dee.fnode = cpu_to_le32(fno);
-	dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+	dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb));
 
 	result = new_inode(dir->i_sb);
 	if (!result)
-- 
cgit 


From a10dcebacdb0cf6eb29c211e99cf190cd131a16a Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 17 Aug 2018 15:44:04 -0700
Subject: fs/ntfs/aops.c: don't disable interrupts during kmap_atomic()

ntfs_end_buffer_async_read() disables interrupts around kmap_atomic().
This is a leftover from the old kmap_atomic() implementation which
relied on fixed mapping slots, so the caller had to make sure that the
same slot could not be reused from an interrupting context.

kmap_atomic() was changed to dynamic slots long ago and commit
1ec9c5ddc17a ("include/linux/highmem.h: remove the second argument of
k[un]map_atomic()") removed the slot assignements, but the callers were
not checked for now redundant interrupt disabling.

Remove the conditional interrupt disable.

Link: http://lkml.kernel.org/r/20180611144913.gln5mklhqcrfsoom@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Anton Altaparmakov <anton@tuxera.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ntfs/aops.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 3a2e509c77c5..01c770979921 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -93,13 +93,11 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 			ofs = 0;
 			if (file_ofs < init_size)
 				ofs = init_size - file_ofs;
-			local_irq_save(flags);
 			kaddr = kmap_atomic(page);
 			memset(kaddr + bh_offset(bh) + ofs, 0,
 					bh->b_size - ofs);
 			flush_dcache_page(page);
 			kunmap_atomic(kaddr);
-			local_irq_restore(flags);
 		}
 	} else {
 		clear_buffer_uptodate(bh);
@@ -146,13 +144,11 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		recs = PAGE_SIZE / rec_size;
 		/* Should have been verified before we got here... */
 		BUG_ON(!recs);
-		local_irq_save(flags);
 		kaddr = kmap_atomic(page);
 		for (i = 0; i < recs; i++)
 			post_read_mst_fixup((NTFS_RECORD*)(kaddr +
 					i * rec_size), rec_size);
 		kunmap_atomic(kaddr);
-		local_irq_restore(flags);
 		flush_dcache_page(page);
 		if (likely(page_uptodate && !PageError(page)))
 			SetPageUptodate(page);
-- 
cgit 


From ac4ecf968acb9e54c335f99d842d56d6b90e28fb Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 17 Aug 2018 15:44:07 -0700
Subject: ntfs: aops: remove VLA usage

In the quest to remove all stack VLA usage from the kernel[1], this uses
the maximum size needed on the stack and adds a sanity check for
robustness: index.block_size cannot be larger than PAGE_SIZE nor less
than NTFS_BLOCK_SIZE.

[1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com

Link: http://lkml.kernel.org/r/20180626172909.41453-2-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Anton Altaparmakov <anton@tuxera.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ntfs/aops.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 01c770979921..8946130c87ad 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -922,7 +922,7 @@ static int ntfs_write_mst_block(struct page *page,
 	ntfs_volume *vol = ni->vol;
 	u8 *kaddr;
 	unsigned int rec_size = ni->itype.index.block_size;
-	ntfs_inode *locked_nis[PAGE_SIZE / rec_size];
+	ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE];
 	struct buffer_head *bh, *head, *tbh, *rec_start_bh;
 	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
 	runlist_element *rl;
@@ -931,6 +931,9 @@ static int ntfs_write_mst_block(struct page *page,
 	bool sync, is_mft, page_is_dirty, rec_is_dirty;
 	unsigned char bh_size_bits;
 
+	if (WARN_ON(rec_size < NTFS_BLOCK_SIZE))
+		return -EINVAL;
+
 	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
 			"0x%lx.", vi->i_ino, ni->type, page->index);
 	BUG_ON(!NInoNonResident(ni));
-- 
cgit 


From 2c27ce915078a5822aefb5db7bc2481664b26044 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 17 Aug 2018 15:44:11 -0700
Subject: ntfs: decompress: remove VLA usage

In the quest to remove all stack VLA usage from the kernel[1], this
moves the stack buffer used during decompression to be allocated
externally.

The existing "dest_max_index" used in the VLA is bounded by cb_max_page.
cb_max_page is bounded by max_page, and max_page is bounded by nr_pages.
Since nr_pages is used for the "pages" allocation, it can similarly be
used for the "completed_pages" allocation and passed into the
decompression function.  The error paths are updated to free the new
allocation.

[1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com

Link: http://lkml.kernel.org/r/20180626172909.41453-3-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Anton Altaparmakov <anton@tuxera.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ntfs/compress.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index fbd0090d7d0c..df7c32b5fac7 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -128,6 +128,7 @@ static inline void handle_bounds_compressed_page(struct page *page,
 /**
  * ntfs_decompress - decompress a compression block into an array of pages
  * @dest_pages:		destination array of pages
+ * @completed_pages:	scratch space to track completed pages
  * @dest_index:		current index into @dest_pages (IN/OUT)
  * @dest_ofs:		current offset within @dest_pages[@dest_index] (IN/OUT)
  * @dest_max_index:	maximum index into @dest_pages (IN)
@@ -162,10 +163,10 @@ static inline void handle_bounds_compressed_page(struct page *page,
  * Note to hackers: This function may not sleep until it has finished accessing
  * the compression block @cb_start as it is a per-CPU buffer.
  */
-static int ntfs_decompress(struct page *dest_pages[], int *dest_index,
-		int *dest_ofs, const int dest_max_index, const int dest_max_ofs,
-		const int xpage, char *xpage_done, u8 *const cb_start,
-		const u32 cb_size, const loff_t i_size,
+static int ntfs_decompress(struct page *dest_pages[], int completed_pages[],
+		int *dest_index, int *dest_ofs, const int dest_max_index,
+		const int dest_max_ofs, const int xpage, char *xpage_done,
+		u8 *const cb_start, const u32 cb_size, const loff_t i_size,
 		const s64 initialized_size)
 {
 	/*
@@ -190,9 +191,6 @@ static int ntfs_decompress(struct page *dest_pages[], int *dest_index,
 	/* Variables for tag and token parsing. */
 	u8 tag;			/* Current tag. */
 	int token;		/* Loop counter for the eight tokens in tag. */
-
-	/* Need this because we can't sleep, so need two stages. */
-	int completed_pages[dest_max_index - *dest_index + 1];
 	int nr_completed_pages = 0;
 
 	/* Default error code. */
@@ -516,6 +514,7 @@ int ntfs_read_compressed_block(struct page *page)
 	unsigned int cb_clusters, cb_max_ofs;
 	int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0;
 	struct page **pages;
+	int *completed_pages;
 	unsigned char xpage_done = 0;
 
 	ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = "
@@ -528,14 +527,16 @@ int ntfs_read_compressed_block(struct page *page)
 	BUG_ON(ni->name_len);
 
 	pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
+	completed_pages = kmalloc_array(nr_pages + 1, sizeof(int), GFP_NOFS);
 
 	/* Allocate memory to store the buffer heads we need. */
 	bhs_size = cb_size / block_size * sizeof(struct buffer_head *);
 	bhs = kmalloc(bhs_size, GFP_NOFS);
 
-	if (unlikely(!pages || !bhs)) {
+	if (unlikely(!pages || !bhs || !completed_pages)) {
 		kfree(bhs);
 		kfree(pages);
+		kfree(completed_pages);
 		unlock_page(page);
 		ntfs_error(vol->sb, "Failed to allocate internal buffers.");
 		return -ENOMEM;
@@ -562,6 +563,7 @@ int ntfs_read_compressed_block(struct page *page)
 	if (xpage >= max_page) {
 		kfree(bhs);
 		kfree(pages);
+		kfree(completed_pages);
 		zero_user(page, 0, PAGE_SIZE);
 		ntfs_debug("Compressed read outside i_size - truncated?");
 		SetPageUptodate(page);
@@ -854,10 +856,10 @@ lock_retry_remap:
 		unsigned int prev_cur_page = cur_page;
 
 		ntfs_debug("Found compressed compression block.");
-		err = ntfs_decompress(pages, &cur_page, &cur_ofs,
-				cb_max_page, cb_max_ofs, xpage, &xpage_done,
-				cb_pos,	cb_size - (cb_pos - cb), i_size,
-				initialized_size);
+		err = ntfs_decompress(pages, completed_pages, &cur_page,
+				&cur_ofs, cb_max_page, cb_max_ofs, xpage,
+				&xpage_done, cb_pos, cb_size - (cb_pos - cb),
+				i_size, initialized_size);
 		/*
 		 * We can sleep from now on, lock already dropped by
 		 * ntfs_decompress().
@@ -912,6 +914,7 @@ lock_retry_remap:
 
 	/* We no longer need the list of pages. */
 	kfree(pages);
+	kfree(completed_pages);
 
 	/* If we have completed the requested page, we return success. */
 	if (likely(xpage_done))
@@ -956,5 +959,6 @@ err_out:
 		}
 	}
 	kfree(pages);
+	kfree(completed_pages);
 	return -EIO;
 }
-- 
cgit 


From ab62ef82ea49b8814f4b0e2fe61426acda793fb9 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 17 Aug 2018 15:44:14 -0700
Subject: ntfs: mft: remove VLA usage

In the quest to remove all stack VLA usage from the kernel[1], this
allocates the maximum size stack buffer.  Existing checks already
require that blocksize >= NTFS_BLOCK_SIZE and mft_record_size <=
PAGE_SIZE, so max_bhs can be at most PAGE_SIZE / NTFS_BLOCK_SIZE.
Sanity checks are added for robustness.

[1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com

Link: http://lkml.kernel.org/r/20180626172909.41453-4-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Anton Altaparmakov <anton@tuxera.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ntfs/mft.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 32c523cf5a2d..fb14d17666c8 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -35,6 +35,8 @@
 #include "mft.h"
 #include "ntfs.h"
 
+#define MAX_BHS	(PAGE_SIZE / NTFS_BLOCK_SIZE)
+
 /**
  * map_mft_record_page - map the page in which a specific mft record resides
  * @ni:		ntfs inode whose mft record page to map
@@ -469,7 +471,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
 	struct page *page;
 	unsigned int blocksize = vol->sb->s_blocksize;
 	int max_bhs = vol->mft_record_size / blocksize;
-	struct buffer_head *bhs[max_bhs];
+	struct buffer_head *bhs[MAX_BHS];
 	struct buffer_head *bh, *head;
 	u8 *kmirr;
 	runlist_element *rl;
@@ -479,6 +481,8 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
 
 	ntfs_debug("Entering for inode 0x%lx.", mft_no);
 	BUG_ON(!max_bhs);
+	if (WARN_ON(max_bhs > MAX_BHS))
+		return -EINVAL;
 	if (unlikely(!vol->mftmirr_ino)) {
 		/* This could happen during umount... */
 		err = ntfs_sync_mft_mirror_umount(vol, mft_no, m);
@@ -674,7 +678,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
 	unsigned int blocksize = vol->sb->s_blocksize;
 	unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
 	int max_bhs = vol->mft_record_size / blocksize;
-	struct buffer_head *bhs[max_bhs];
+	struct buffer_head *bhs[MAX_BHS];
 	struct buffer_head *bh, *head;
 	runlist_element *rl;
 	unsigned int block_start, block_end, m_start, m_end;
@@ -684,6 +688,10 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
 	BUG_ON(NInoAttr(ni));
 	BUG_ON(!max_bhs);
 	BUG_ON(!PageLocked(page));
+	if (WARN_ON(max_bhs > MAX_BHS)) {
+		err = -EINVAL;
+		goto err_out;
+	}
 	/*
 	 * If the ntfs_inode is clean no need to do anything.  If it is dirty,
 	 * mark it as clean now so that it can be redirtied later on if needed.
-- 
cgit 


From 93f5920d8607c5e3f2d3b159377a7e7d7875ffdd Mon Sep 17 00:00:00 2001
From: Jun Piao <piaojun@huawei.com>
Date: Fri, 17 Aug 2018 15:44:24 -0700
Subject: ocfs2: return -EROFS when filesystem becomes read-only

We should return -EROFS rather than other errno if filesystem becomes
read-only.

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/5B191B26.9010501@huawei.com
Signed-off-by: Jun Piao <piaojun@huawei.com>
Reviewed-by: Yiwen Jiang <jiangyiwen@huawei.com>
Acked-by: Joseph Qi <jiangqi903@gmail.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <ge.changwei@h3c.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/alloc.c       | 43 +++++++++++++++++++------------------------
 fs/ocfs2/localalloc.c  |  9 ++++-----
 fs/ocfs2/quota_local.c | 15 +++++++--------
 3 files changed, 30 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0f157bbd3e0f..676714fef869 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1481,19 +1481,17 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
 
 	while(le16_to_cpu(el->l_tree_depth) > 1) {
 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
-			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
-				    "Owner %llu has empty extent list (next_free_rec == 0)\n",
-				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
-			status = -EIO;
+			status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+					"Owner %llu has empty extent list (next_free_rec == 0)\n",
+					(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
 			goto bail;
 		}
 		i = le16_to_cpu(el->l_next_free_rec) - 1;
 		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
 		if (!blkno) {
-			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
-				    "Owner %llu has extent list where extent # %d has no physical block start\n",
-				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
-			status = -EIO;
+			status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+					"Owner %llu has extent list where extent # %d has no physical block start\n",
+					(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
 			goto bail;
 		}
 
@@ -3214,11 +3212,10 @@ rightmost_no_delete:
 			goto rightmost_no_delete;
 
 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
-			ret = -EIO;
-			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
-				    "Owner %llu has empty extent block at %llu\n",
-				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
-				    (unsigned long long)le64_to_cpu(eb->h_blkno));
+			ret = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+					"Owner %llu has empty extent block at %llu\n",
+					(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+					(unsigned long long)le64_to_cpu(eb->h_blkno));
 			goto out;
 		}
 
@@ -4411,12 +4408,11 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
 			    le16_to_cpu(new_el->l_count)) {
 				bh = path_leaf_bh(left_path);
 				eb = (struct ocfs2_extent_block *)bh->b_data;
-				ocfs2_error(sb,
-					    "Extent block #%llu has an invalid l_next_free_rec of %d.  It should have matched the l_count of %d\n",
-					    (unsigned long long)le64_to_cpu(eb->h_blkno),
-					    le16_to_cpu(new_el->l_next_free_rec),
-					    le16_to_cpu(new_el->l_count));
-				status = -EINVAL;
+				status = ocfs2_error(sb,
+						"Extent block #%llu has an invalid l_next_free_rec of %d.  It should have matched the l_count of %d\n",
+						(unsigned long long)le64_to_cpu(eb->h_blkno),
+						le16_to_cpu(new_el->l_next_free_rec),
+						le16_to_cpu(new_el->l_count));
 				goto free_left_path;
 			}
 			rec = &new_el->l_recs[
@@ -4466,11 +4462,10 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
 			if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
 				bh = path_leaf_bh(right_path);
 				eb = (struct ocfs2_extent_block *)bh->b_data;
-				ocfs2_error(sb,
-					    "Extent block #%llu has an invalid l_next_free_rec of %d\n",
-					    (unsigned long long)le64_to_cpu(eb->h_blkno),
-					    le16_to_cpu(new_el->l_next_free_rec));
-				status = -EINVAL;
+				status = ocfs2_error(sb,
+						"Extent block #%llu has an invalid l_next_free_rec of %d\n",
+						(unsigned long long)le64_to_cpu(eb->h_blkno),
+						le16_to_cpu(new_el->l_next_free_rec));
 				goto free_right_path;
 			}
 			rec = &new_el->l_recs[1];
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index fe0d1f9571bb..7642b6712c39 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -663,11 +663,10 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 #ifdef CONFIG_OCFS2_DEBUG_FS
 	if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
 	    ocfs2_local_alloc_count_bits(alloc)) {
-		ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n",
-			    (unsigned long long)le64_to_cpu(alloc->i_blkno),
-			    le32_to_cpu(alloc->id1.bitmap1.i_used),
-			    ocfs2_local_alloc_count_bits(alloc));
-		status = -EIO;
+		status = ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n",
+				(unsigned long long)le64_to_cpu(alloc->i_blkno),
+				le32_to_cpu(alloc->id1.bitmap1.i_used),
+				ocfs2_local_alloc_count_bits(alloc));
 		goto bail;
 	}
 #endif
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 16c42ed0dca8..b1a8b046f4c2 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -137,14 +137,13 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
 	int rc = 0;
 	struct buffer_head *tmp = *bh;
 
-	if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
-		ocfs2_error(inode->i_sb,
-			    "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n",
-			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-			    (unsigned long long)v_block,
-			    (unsigned long long)i_size_read(inode));
-		return -EIO;
-	}
+	if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block)
+		return ocfs2_error(inode->i_sb,
+				"Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n",
+				(unsigned long long)OCFS2_I(inode)->ip_blkno,
+				(unsigned long long)v_block,
+				(unsigned long long)i_size_read(inode));
+
 	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
 				    ocfs2_validate_quota_block);
 	if (rc)
-- 
cgit 


From 229ba1f82abe4c942e2ab5862daafdfe471fedd8 Mon Sep 17 00:00:00 2001
From: wangyan <wangyan122@huawei.com>
Date: Fri, 17 Aug 2018 15:44:27 -0700
Subject: ocfs2: clean up some unnecessary code

Several functions have some unnecessary code, clean up these code.

Link: http://lkml.kernel.org/r/5B14DF72.5020800@huawei.com
Signed-off-by: Yan Wang <wangyan122@huawei.com>
Reviewed-by: Jun Piao <piaojun@huawei.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <jiangqi903@gmail.com>
Cc: Changwei Ge <ge.changwei@h3c.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/alloc.c       | 17 ++++-------------
 fs/ocfs2/cluster/tcp.c |  2 --
 fs/ocfs2/inode.c       |  5 +----
 3 files changed, 5 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 676714fef869..a342f008e42f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -932,13 +932,11 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
 		goto bail;
 	}
 
-	if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+	if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation)
 		rc = ocfs2_error(sb,
 				 "Extent block #%llu has an invalid h_fs_generation of #%u\n",
 				 (unsigned long long)bh->b_blocknr,
 				 le32_to_cpu(eb->h_fs_generation));
-		goto bail;
-	}
 bail:
 	return rc;
 }
@@ -1596,10 +1594,8 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
 	 * the new data. */
 	ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
 			       meta_ac);
-	if (ret < 0) {
+	if (ret < 0)
 		mlog_errno(ret);
-		goto out;
-	}
 
 out:
 	if (final_depth)
@@ -5518,10 +5514,8 @@ static int ocfs2_truncate_rec(handle_t *handle,
 	ocfs2_journal_dirty(handle, path_leaf_bh(path));
 
 	ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
-	if (ret) {
+	if (ret)
 		mlog_errno(ret);
-		goto out;
-	}
 
 out:
 	ocfs2_free_path(left_path);
@@ -5654,10 +5648,8 @@ int ocfs2_remove_extent(handle_t *handle,
 
 		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
 					 cpos, len);
-		if (ret) {
+		if (ret)
 			mlog_errno(ret);
-			goto out;
-		}
 	}
 
 out:
@@ -5702,7 +5694,6 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
 		if (ret < 0) {
 			if (ret != -ENOSPC)
 				mlog_errno(ret);
-			goto out;
 		}
 	}
 
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 1296f78ae966..7d9eea7d4a87 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -872,8 +872,6 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
 				"for type %u key %08x\n", msg_type, key);
 	}
 	write_unlock(&o2net_handler_lock);
-	if (ret)
-		goto out;
 
 out:
 	if (ret)
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index ddc3e9470c87..79279240fb6e 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -637,10 +637,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 		handle = NULL;
 
 		status = ocfs2_commit_truncate(osb, inode, fe_bh);
-		if (status < 0) {
+		if (status < 0)
 			mlog_errno(status);
-			goto out;
-		}
 	}
 
 out:
@@ -1499,7 +1497,6 @@ static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
 		     (unsigned long long)bh->b_blocknr,
 		     le32_to_cpu(di->i_fs_generation));
 		rc = -OCFS2_FILECHECK_ERR_GENERATION;
-		goto bail;
 	}
 
 bail:
-- 
cgit 


From 480bd56485b77c36e17a411921266c6f06623d98 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 17 Aug 2018 15:44:31 -0700
Subject: ocfs2: make several functions and variables static (and some const)

There are a variety of functions and variables that are local to the
source and do not need to be in global scope, so make them static.  Also
make a couple of char arrays static const.

Cleans up sparse warnings:
  symbol 'o2hb_heartbeat_mode_desc' was not declared. Should it be static?
  symbol 'o2hb_heartbeat_mode' was not declared. Should it be static?
  symbol 'o2hb_dependent_users' was not declared. Should it be static?
  symbol 'o2hb_region_dec_user' was not declared. Should it be static?
  symbol 'o2nm_fence_method_desc' was not declared. Should it be static?
  symbol 'lockdep_keys' was not declared. Should it be static?

Link: http://lkml.kernel.org/r/20180628131659.12133-1-colin.king@canonical.com
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Cc: Mark Fasheh <mfasheh@versity.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/cluster/heartbeat.c   | 12 ++++++------
 fs/ocfs2/cluster/nodemanager.c |  6 +++---
 fs/ocfs2/dlmglue.c             |  2 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index ea8c551bcd7e..9b2ed62dd638 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -127,13 +127,13 @@ enum o2hb_heartbeat_modes {
 	O2HB_HEARTBEAT_NUM_MODES,
 };
 
-char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
-		"local",	/* O2HB_HEARTBEAT_LOCAL */
-		"global",	/* O2HB_HEARTBEAT_GLOBAL */
+static const char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
+	"local",	/* O2HB_HEARTBEAT_LOCAL */
+	"global",	/* O2HB_HEARTBEAT_GLOBAL */
 };
 
 unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
-unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
+static unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
 
 /*
  * o2hb_dependent_users tracks the number of registered callbacks that depend
@@ -141,7 +141,7 @@ unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
  * However only o2dlm depends on the heartbeat. It does not want the heartbeat
  * to stop while a dlm domain is still active.
  */
-unsigned int o2hb_dependent_users;
+static unsigned int o2hb_dependent_users;
 
 /*
  * In global heartbeat mode, all regions are pinned if there are one or more
@@ -2486,7 +2486,7 @@ unlock:
 	return ret;
 }
 
-void o2hb_region_dec_user(const char *region_uuid)
+static void o2hb_region_dec_user(const char *region_uuid)
 {
 	spin_lock(&o2hb_live_lock);
 
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index da64c3a20eeb..0e4166cc23a0 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -35,9 +35,9 @@
  * cluster references throughout where nodes are looked up */
 struct o2nm_cluster *o2nm_single_cluster = NULL;
 
-char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
-		"reset",	/* O2NM_FENCE_RESET */
-		"panic",	/* O2NM_FENCE_PANIC */
+static const char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
+	"reset",	/* O2NM_FENCE_RESET */
+	"panic",	/* O2NM_FENCE_PANIC */
 };
 
 static inline void o2nm_lock_subsystem(void);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 0ff424c6d17c..8e712b614e6e 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -96,7 +96,7 @@ struct ocfs2_unblock_ctl {
 };
 
 /* Lockdep class keys */
-struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
+static struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
 
 static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
 					int new_level);
-- 
cgit 


From 6cd00a01f0c1ae6a852b09c59b8dd55cc6c35d1d Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Fri, 17 Aug 2018 15:44:34 -0700
Subject: fs/dcache.c: fix kmemcheck splat at take_dentry_name_snapshot()

Since only dentry->d_name.len + 1 bytes out of DNAME_INLINE_LEN bytes
are initialized at __d_alloc(), we can't copy the whole size
unconditionally.

 WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff8fa27465ac50)
 636f6e66696766732e746d70000000000010000000000000020000000188ffff
  i i i i i i i i i i i i i u u u u u u u u u u i i i i i u u u u
                                  ^
 RIP: 0010:take_dentry_name_snapshot+0x28/0x50
 RSP: 0018:ffffa83000f5bdf8 EFLAGS: 00010246
 RAX: 0000000000000020 RBX: ffff8fa274b20550 RCX: 0000000000000002
 RDX: ffffa83000f5be40 RSI: ffff8fa27465ac50 RDI: ffffa83000f5be60
 RBP: ffffa83000f5bdf8 R08: ffffa83000f5be48 R09: 0000000000000001
 R10: ffff8fa27465ac00 R11: ffff8fa27465acc0 R12: ffff8fa27465ac00
 R13: ffff8fa27465acc0 R14: 0000000000000000 R15: 0000000000000000
 FS:  00007f79737ac8c0(0000) GS:ffffffff8fc30000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: ffff8fa274c0b000 CR3: 0000000134aa7002 CR4: 00000000000606f0
  take_dentry_name_snapshot+0x28/0x50
  vfs_rename+0x128/0x870
  SyS_rename+0x3b2/0x3d0
  entry_SYSCALL_64_fastpath+0x1a/0xa4
  0xffffffffffffffff

Link: http://lkml.kernel.org/r/201709131912.GBG39012.QMJLOVFSFFOOtH@I-love.SAKURA.ne.jp
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 8d2ec4898c2b..2e7e8d85e9b4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -292,7 +292,8 @@ void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry
 		spin_unlock(&dentry->d_lock);
 		name->name = p->name;
 	} else {
-		memcpy(name->inline_name, dentry->d_iname, DNAME_INLINE_LEN);
+		memcpy(name->inline_name, dentry->d_iname,
+		       dentry->d_name.len + 1);
 		spin_unlock(&dentry->d_lock);
 		name->name = name->inline_name;
 	}
-- 
cgit 


From 4cdfffc8722e99be8d400d8fa1fcd615d078ad43 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 17 Aug 2018 15:44:37 -0700
Subject: vfs: discard ATTR_ATTR_FLAG

This flag was introduce in 2.1.37pre1 and the only place it was tested
was removed in 2.1.43pre1.  The flag was never set.

Let's discard it properly.

Link: http://lkml.kernel.org/r/877en0hewz.fsf@notabene.neil.brown.name
Signed-off-by: NeilBrown <neilb@suse.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hostfs/hostfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index cb8374af08a6..33b8423ef0c9 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -19,7 +19,7 @@
 #define HOSTFS_ATTR_ATIME_SET	128
 #define HOSTFS_ATTR_MTIME_SET	256
 
-/* These two are unused by hostfs. */
+/* This one is unused by hostfs. */
 #define HOSTFS_ATTR_FORCE	512	/* Not a change, but a change it */
 #define HOSTFS_ATTR_ATTR_FLAG	1024
 
-- 
cgit 


From 1f4aace60b0edc2d885aaa263abf4df42c8c65a8 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 17 Aug 2018 15:44:41 -0700
Subject: fs/seq_file.c: simplify seq_file iteration code and interface

The documentation for seq_file suggests that it is necessary to be able
to move the iterator to a given offset, however that is not the case.
If the iterator is stored in the private data and is stable from one
read() syscall to the next, it is only necessary to support first/next
interactions.  Implementing this in a client is a little clumsy.

 - if ->start() is given a pos of zero, it should go to start of
   sequence.

 - if ->start() is given the name pos that was given to the most recent
   next() or start(), it should restore the iterator to state just
   before that last call

 - if ->start is given another number, it should set the iterator one
   beyond the start just before the last ->start or ->next call.

Also, the documentation says that the implementation can interpret the
pos however it likes (other than zero meaning start), but seq_file
increments the pos sometimes which does impose on the implementation.

This patch simplifies the interface for first/next iteration and
simplifies the code, while maintaining complete backward compatability.
Now:

 - if ->start() is given a pos of zero, it should return an iterator
   placed at the start of the sequence

 - if ->start() is given a non-zero pos, it should return the iterator
   in the same state it was after the last ->start or ->next.

This is particularly useful for interators which walk the multiple
chains in a hash table, e.g.  using rhashtable_walk*.  See
fs/gfs2/glock.c and drivers/staging/lustre/lustre/llite/vvp_dev.c

A large part of achieving this is to *always* call ->next after ->show
has successfully stored all of an entry in the buffer.  Never just
increment the index instead.  Also:

 - always pass &m->index to ->start() and ->next(), never a temp
   variable

 - don't clear ->from when ->count is zero, as ->from is dead when
   ->count is zero.

Some ->next functions do not increment *pos when they return NULL.  To
maintain compatability with this, we still need to increment m->index in
one place, if ->next didn't increment it.  Note that such ->next
functions are buggy and should be fixed.  A simple demonstration is

   dd if=/proc/swaps bs=1000 skip=1

Choose any block size larger than the size of /proc/swaps.  This will
always show the whole last line of /proc/swaps.

This patch doesn't work around buggy next() functions for this case.

[neilb@suse.com: ensure ->from is valid]
  Link: http://lkml.kernel.org/r/87601ryb8a.fsf@notabene.neil.brown.name
Signed-off-by: NeilBrown <neilb@suse.com>
Acked-by: Jonathan Corbet <corbet@lwn.net>	[docs]
Tested-by: Jann Horn <jannh@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c | 54 +++++++++++++++++++++---------------------------------
 1 file changed, 21 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 4cc090b50cc5..1dea7a8a5255 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -90,23 +90,22 @@ EXPORT_SYMBOL(seq_open);
 
 static int traverse(struct seq_file *m, loff_t offset)
 {
-	loff_t pos = 0, index;
+	loff_t pos = 0;
 	int error = 0;
 	void *p;
 
 	m->version = 0;
-	index = 0;
+	m->index = 0;
 	m->count = m->from = 0;
-	if (!offset) {
-		m->index = index;
+	if (!offset)
 		return 0;
-	}
+
 	if (!m->buf) {
 		m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
 		if (!m->buf)
 			return -ENOMEM;
 	}
-	p = m->op->start(m, &index);
+	p = m->op->start(m, &m->index);
 	while (p) {
 		error = PTR_ERR(p);
 		if (IS_ERR(p))
@@ -123,20 +122,15 @@ static int traverse(struct seq_file *m, loff_t offset)
 		if (pos + m->count > offset) {
 			m->from = offset - pos;
 			m->count -= m->from;
-			m->index = index;
 			break;
 		}
 		pos += m->count;
 		m->count = 0;
-		if (pos == offset) {
-			index++;
-			m->index = index;
+		p = m->op->next(m, p, &m->index);
+		if (pos == offset)
 			break;
-		}
-		p = m->op->next(m, p, &index);
 	}
 	m->op->stop(m, p);
-	m->index = index;
 	return error;
 
 Eoverflow:
@@ -160,7 +154,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 {
 	struct seq_file *m = file->private_data;
 	size_t copied = 0;
-	loff_t pos;
 	size_t n;
 	void *p;
 	int err = 0;
@@ -223,16 +216,12 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 		size -= n;
 		buf += n;
 		copied += n;
-		if (!m->count) {
-			m->from = 0;
-			m->index++;
-		}
 		if (!size)
 			goto Done;
 	}
 	/* we need at least one record in buffer */
-	pos = m->index;
-	p = m->op->start(m, &pos);
+	m->from = 0;
+	p = m->op->start(m, &m->index);
 	while (1) {
 		err = PTR_ERR(p);
 		if (!p || IS_ERR(p))
@@ -243,8 +232,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 		if (unlikely(err))
 			m->count = 0;
 		if (unlikely(!m->count)) {
-			p = m->op->next(m, p, &pos);
-			m->index = pos;
+			p = m->op->next(m, p, &m->index);
 			continue;
 		}
 		if (m->count < m->size)
@@ -256,29 +244,33 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 		if (!m->buf)
 			goto Enomem;
 		m->version = 0;
-		pos = m->index;
-		p = m->op->start(m, &pos);
+		p = m->op->start(m, &m->index);
 	}
 	m->op->stop(m, p);
 	m->count = 0;
 	goto Done;
 Fill:
 	/* they want more? let's try to get some more */
-	while (m->count < size) {
+	while (1) {
 		size_t offs = m->count;
-		loff_t next = pos;
-		p = m->op->next(m, p, &next);
+		loff_t pos = m->index;
+
+		p = m->op->next(m, p, &m->index);
+		if (pos == m->index)
+			/* Buggy ->next function */
+			m->index++;
 		if (!p || IS_ERR(p)) {
 			err = PTR_ERR(p);
 			break;
 		}
+		if (m->count >= size)
+			break;
 		err = m->op->show(m, p);
 		if (seq_has_overflowed(m) || err) {
 			m->count = offs;
 			if (likely(err <= 0))
 				break;
 		}
-		pos = next;
 	}
 	m->op->stop(m, p);
 	n = min(m->count, size);
@@ -287,11 +279,7 @@ Fill:
 		goto Efault;
 	copied += n;
 	m->count -= n;
-	if (m->count)
-		m->from = n;
-	else
-		pos++;
-	m->index = pos;
+	m->from = n;
 Done:
 	if (!copied)
 		copied = err;
-- 
cgit 


From 357c1206520da7a40e383fe329ce379bda722cd9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 17 Aug 2018 15:45:32 -0700
Subject: mpage: add argument structure for do_mpage_readpage()

Patch series "Submit ->readpages() IO as read-ahead", v4.

The only caller of ->readpages() is from read-ahead, yet we don't submit
IO flagged with REQ_RAHEAD.  This means we don't see it in blktrace, for
instance, which is a shame.  Additionally, it's preventing further
functional changes in the block layer for deadling with read-ahead more
intelligently.  We already make assumptions about ->readpages() just
being for read-ahead in the mpage implementation, using
readahead_gfp_mask(mapping) as out GFP mask of choice.

This small series fixes up mpage_readpages() to submit with REQ_RAHEAD,
which takes care of file systems using mpage_readpages().  The first
patch is a prep patch, that makes do_mpage_readpage() take an argument
structure.

This patch (of 4):

We're currently passing 8 arguments to this function, clean it up a bit
by packing the arguments in an args structure we pass to it.

No intentional functional changes in this patch.

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/20180621010725.17813-2-axboe@kernel.dk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Chris Mason <clm@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/mpage.c | 109 ++++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 57 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/mpage.c b/fs/mpage.c
index b73638db9866..6dc90e456abf 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -133,6 +133,17 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
 	} while (page_bh != head);
 }
 
+struct mpage_readpage_args {
+	struct bio *bio;
+	struct page *page;
+	unsigned int nr_pages;
+	sector_t last_block_in_bio;
+	struct buffer_head map_bh;
+	unsigned long first_logical_block;
+	get_block_t *get_block;
+	gfp_t gfp;
+};
+
 /*
  * This is the worker routine which does all the work of mapping the disk
  * blocks and constructs largest possible bios, submits them for IO if the
@@ -142,16 +153,14 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
  * represent the validity of its disk mapping and to decide when to do the next
  * get_block() call.
  */
-static struct bio *
-do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
-		sector_t *last_block_in_bio, struct buffer_head *map_bh,
-		unsigned long *first_logical_block, get_block_t get_block,
-		gfp_t gfp)
+static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
 {
+	struct page *page = args->page;
 	struct inode *inode = page->mapping->host;
 	const unsigned blkbits = inode->i_blkbits;
 	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
 	const unsigned blocksize = 1 << blkbits;
+	struct buffer_head *map_bh = &args->map_bh;
 	sector_t block_in_file;
 	sector_t last_block;
 	sector_t last_block_in_file;
@@ -168,7 +177,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 		goto confused;
 
 	block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
-	last_block = block_in_file + nr_pages * blocks_per_page;
+	last_block = block_in_file + args->nr_pages * blocks_per_page;
 	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
 	if (last_block > last_block_in_file)
 		last_block = last_block_in_file;
@@ -178,9 +187,10 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 	 * Map blocks using the result from the previous get_blocks call first.
 	 */
 	nblocks = map_bh->b_size >> blkbits;
-	if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
-			block_in_file < (*first_logical_block + nblocks)) {
-		unsigned map_offset = block_in_file - *first_logical_block;
+	if (buffer_mapped(map_bh) &&
+			block_in_file > args->first_logical_block &&
+			block_in_file < (args->first_logical_block + nblocks)) {
+		unsigned map_offset = block_in_file - args->first_logical_block;
 		unsigned last = nblocks - map_offset;
 
 		for (relative_block = 0; ; relative_block++) {
@@ -208,9 +218,9 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 
 		if (block_in_file < last_block) {
 			map_bh->b_size = (last_block-block_in_file) << blkbits;
-			if (get_block(inode, block_in_file, map_bh, 0))
+			if (args->get_block(inode, block_in_file, map_bh, 0))
 				goto confused;
-			*first_logical_block = block_in_file;
+			args->first_logical_block = block_in_file;
 		}
 
 		if (!buffer_mapped(map_bh)) {
@@ -273,43 +283,45 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 	/*
 	 * This page will go to BIO.  Do we need to send this BIO off first?
 	 */
-	if (bio && (*last_block_in_bio != blocks[0] - 1))
-		bio = mpage_bio_submit(REQ_OP_READ, 0, bio);
+	if (args->bio && (args->last_block_in_bio != blocks[0] - 1))
+		args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio);
 
 alloc_new:
-	if (bio == NULL) {
+	if (args->bio == NULL) {
 		if (first_hole == blocks_per_page) {
 			if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9),
 								page))
 				goto out;
 		}
-		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
-				min_t(int, nr_pages, BIO_MAX_PAGES), gfp);
-		if (bio == NULL)
+		args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
+					min_t(int, args->nr_pages,
+					      BIO_MAX_PAGES),
+					args->gfp);
+		if (args->bio == NULL)
 			goto confused;
 	}
 
 	length = first_hole << blkbits;
-	if (bio_add_page(bio, page, length, 0) < length) {
-		bio = mpage_bio_submit(REQ_OP_READ, 0, bio);
+	if (bio_add_page(args->bio, page, length, 0) < length) {
+		args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio);
 		goto alloc_new;
 	}
 
-	relative_block = block_in_file - *first_logical_block;
+	relative_block = block_in_file - args->first_logical_block;
 	nblocks = map_bh->b_size >> blkbits;
 	if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
 	    (first_hole != blocks_per_page))
-		bio = mpage_bio_submit(REQ_OP_READ, 0, bio);
+		args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio);
 	else
-		*last_block_in_bio = blocks[blocks_per_page - 1];
+		args->last_block_in_bio = blocks[blocks_per_page - 1];
 out:
-	return bio;
+	return args->bio;
 
 confused:
-	if (bio)
-		bio = mpage_bio_submit(REQ_OP_READ, 0, bio);
+	if (args->bio)
+		args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio);
 	if (!PageUptodate(page))
-	        block_read_full_page(page, get_block);
+		block_read_full_page(page, args->get_block);
 	else
 		unlock_page(page);
 	goto out;
@@ -363,15 +375,12 @@ int
 mpage_readpages(struct address_space *mapping, struct list_head *pages,
 				unsigned nr_pages, get_block_t get_block)
 {
-	struct bio *bio = NULL;
+	struct mpage_readpage_args args = {
+		.get_block = get_block,
+		.gfp = readahead_gfp_mask(mapping),
+	};
 	unsigned page_idx;
-	sector_t last_block_in_bio = 0;
-	struct buffer_head map_bh;
-	unsigned long first_logical_block = 0;
-	gfp_t gfp = readahead_gfp_mask(mapping);
 
-	map_bh.b_state = 0;
-	map_bh.b_size = 0;
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 		struct page *page = lru_to_page(pages);
 
@@ -379,18 +388,16 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 		list_del(&page->lru);
 		if (!add_to_page_cache_lru(page, mapping,
 					page->index,
-					gfp)) {
-			bio = do_mpage_readpage(bio, page,
-					nr_pages - page_idx,
-					&last_block_in_bio, &map_bh,
-					&first_logical_block,
-					get_block, gfp);
+					args.gfp)) {
+			args.page = page;
+			args.nr_pages = nr_pages - page_idx;
+			args.bio = do_mpage_readpage(&args);
 		}
 		put_page(page);
 	}
 	BUG_ON(!list_empty(pages));
-	if (bio)
-		mpage_bio_submit(REQ_OP_READ, 0, bio);
+	if (args.bio)
+		mpage_bio_submit(REQ_OP_READ, 0, args.bio);
 	return 0;
 }
 EXPORT_SYMBOL(mpage_readpages);
@@ -400,18 +407,16 @@ EXPORT_SYMBOL(mpage_readpages);
  */
 int mpage_readpage(struct page *page, get_block_t get_block)
 {
-	struct bio *bio = NULL;
-	sector_t last_block_in_bio = 0;
-	struct buffer_head map_bh;
-	unsigned long first_logical_block = 0;
-	gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
+	struct mpage_readpage_args args = {
+		.page = page,
+		.nr_pages = 1,
+		.get_block = get_block,
+		.gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL),
+	};
 
-	map_bh.b_state = 0;
-	map_bh.b_size = 0;
-	bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
-			&map_bh, &first_logical_block, get_block, gfp);
-	if (bio)
-		mpage_bio_submit(REQ_OP_READ, 0, bio);
+	args.bio = do_mpage_readpage(&args);
+	if (args.bio)
+		mpage_bio_submit(REQ_OP_READ, 0, args.bio);
 	return 0;
 }
 EXPORT_SYMBOL(mpage_readpage);
-- 
cgit 


From 74c8164e1cdb1eb22f1d49d54e515e81821a8ad0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 17 Aug 2018 15:45:36 -0700
Subject: mpage: mpage_readpages() should submit IO as read-ahead

a_ops->readpages() is only ever used for read-ahead, yet we don't flag
the IO being submitted as such.  Fix that up.  Any file system that uses
mpage_readpages() as its ->readpages() implementation will now get this
right.

Since we're passing in whether the IO is read-ahead or not, we don't
need to pass in the 'gfp' separately, as it is dependent on the IO being
read-ahead.  Kill off that member.

Add some documentation notes on ->readpages() being purely for
read-ahead.

Link: http://lkml.kernel.org/r/20180621010725.17813-3-axboe@kernel.dk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Chris Mason <clm@fb.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/f2fs/data.c |  5 +++++
 fs/mpage.c     | 29 +++++++++++++++++++----------
 2 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 8f931d699287..b7c9b58acf3e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1421,6 +1421,11 @@ out:
 /*
  * This function was originally taken from fs/mpage.c, and customized for f2fs.
  * Major change was from block_size == page_size in f2fs by default.
+ *
+ * Note that the aops->readpages() function is ONLY used for read-ahead. If
+ * this function ever deviates from doing just read-ahead, it should either
+ * use ->readpage() or do the necessary surgery to decouple ->readpages()
+ * readom read-ahead.
  */
 static int f2fs_mpage_readpages(struct address_space *mapping,
 			struct list_head *pages, struct page *page,
diff --git a/fs/mpage.c b/fs/mpage.c
index 6dc90e456abf..c820dc9bebab 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -137,11 +137,11 @@ struct mpage_readpage_args {
 	struct bio *bio;
 	struct page *page;
 	unsigned int nr_pages;
+	bool is_readahead;
 	sector_t last_block_in_bio;
 	struct buffer_head map_bh;
 	unsigned long first_logical_block;
 	get_block_t *get_block;
-	gfp_t gfp;
 };
 
 /*
@@ -170,8 +170,18 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
 	struct block_device *bdev = NULL;
 	int length;
 	int fully_mapped = 1;
+	int op_flags;
 	unsigned nblocks;
 	unsigned relative_block;
+	gfp_t gfp;
+
+	if (args->is_readahead) {
+		op_flags = REQ_RAHEAD;
+		gfp = readahead_gfp_mask(page->mapping);
+	} else {
+		op_flags = 0;
+		gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
+	}
 
 	if (page_has_buffers(page))
 		goto confused;
@@ -284,7 +294,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
 	 * This page will go to BIO.  Do we need to send this BIO off first?
 	 */
 	if (args->bio && (args->last_block_in_bio != blocks[0] - 1))
-		args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio);
+		args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
 
 alloc_new:
 	if (args->bio == NULL) {
@@ -296,14 +306,14 @@ alloc_new:
 		args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
 					min_t(int, args->nr_pages,
 					      BIO_MAX_PAGES),
-					args->gfp);
+					gfp);
 		if (args->bio == NULL)
 			goto confused;
 	}
 
 	length = first_hole << blkbits;
 	if (bio_add_page(args->bio, page, length, 0) < length) {
-		args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio);
+		args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
 		goto alloc_new;
 	}
 
@@ -311,7 +321,7 @@ alloc_new:
 	nblocks = map_bh->b_size >> blkbits;
 	if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
 	    (first_hole != blocks_per_page))
-		args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio);
+		args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
 	else
 		args->last_block_in_bio = blocks[blocks_per_page - 1];
 out:
@@ -319,7 +329,7 @@ out:
 
 confused:
 	if (args->bio)
-		args->bio = mpage_bio_submit(REQ_OP_READ, 0, args->bio);
+		args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
 	if (!PageUptodate(page))
 		block_read_full_page(page, args->get_block);
 	else
@@ -377,7 +387,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 {
 	struct mpage_readpage_args args = {
 		.get_block = get_block,
-		.gfp = readahead_gfp_mask(mapping),
+		.is_readahead = true,
 	};
 	unsigned page_idx;
 
@@ -388,7 +398,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 		list_del(&page->lru);
 		if (!add_to_page_cache_lru(page, mapping,
 					page->index,
-					args.gfp)) {
+					readahead_gfp_mask(mapping))) {
 			args.page = page;
 			args.nr_pages = nr_pages - page_idx;
 			args.bio = do_mpage_readpage(&args);
@@ -397,7 +407,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 	}
 	BUG_ON(!list_empty(pages));
 	if (args.bio)
-		mpage_bio_submit(REQ_OP_READ, 0, args.bio);
+		mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio);
 	return 0;
 }
 EXPORT_SYMBOL(mpage_readpages);
@@ -411,7 +421,6 @@ int mpage_readpage(struct page *page, get_block_t get_block)
 		.page = page,
 		.nr_pages = 1,
 		.get_block = get_block,
-		.gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL),
 	};
 
 	args.bio = do_mpage_readpage(&args);
-- 
cgit 


From 5e9d398240b2292b1091f921d29bbab374b755fd Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 17 Aug 2018 15:45:39 -0700
Subject: btrfs: readpages() should submit IO as read-ahead

a_ops->readpages() is only ever used for read-ahead.  Ensure that we
pass this information down to the block layer.

Link: http://lkml.kernel.org/r/20180621010725.17813-4-axboe@kernel.dk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Chris Mason <clm@fb.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/btrfs/extent_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 628f1aef34b0..4dd6faab02bb 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3102,7 +3102,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
 
 	for (index = 0; index < nr_pages; index++) {
 		__do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
-				bio, 0, bio_flags, 0, prev_em_start);
+				bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
 		put_page(pages[index]);
 	}
 }
-- 
cgit 


From ac22b46a0b65dbeccbf4d458db95687e825bde90 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 17 Aug 2018 15:45:42 -0700
Subject: ext4: readpages() should submit IO as read-ahead

a_ops->readpages() is only ever used for read-ahead.  Ensure that we
pass this information down to the block layer.

Link: http://lkml.kernel.org/r/20180621010725.17813-5-axboe@kernel.dk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Chris Mason <clm@fb.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/ext4.h     | 2 +-
 fs/ext4/inode.c    | 5 +++--
 fs/ext4/readpage.c | 5 +++--
 3 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1fc013f3d944..0f0edd1cd0cd 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3062,7 +3062,7 @@ static inline void ext4_set_de_type(struct super_block *sb,
 /* readpages.c */
 extern int ext4_mpage_readpages(struct address_space *mapping,
 				struct list_head *pages, struct page *page,
-				unsigned nr_pages);
+				unsigned nr_pages, bool is_readahead);
 
 /* symlink.c */
 extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8f6ad7667974..d0dd585add6a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3325,7 +3325,8 @@ static int ext4_readpage(struct file *file, struct page *page)
 		ret = ext4_readpage_inline(inode, page);
 
 	if (ret == -EAGAIN)
-		return ext4_mpage_readpages(page->mapping, NULL, page, 1);
+		return ext4_mpage_readpages(page->mapping, NULL, page, 1,
+						false);
 
 	return ret;
 }
@@ -3340,7 +3341,7 @@ ext4_readpages(struct file *file, struct address_space *mapping,
 	if (ext4_has_inline_data(inode))
 		return 0;
 
-	return ext4_mpage_readpages(mapping, pages, NULL, nr_pages);
+	return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true);
 }
 
 static void ext4_invalidatepage(struct page *page, unsigned int offset,
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 19b87a8de6ff..f461d75ac049 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -98,7 +98,7 @@ static void mpage_end_io(struct bio *bio)
 
 int ext4_mpage_readpages(struct address_space *mapping,
 			 struct list_head *pages, struct page *page,
-			 unsigned nr_pages)
+			 unsigned nr_pages, bool is_readahead)
 {
 	struct bio *bio = NULL;
 	sector_t last_block_in_bio = 0;
@@ -259,7 +259,8 @@ int ext4_mpage_readpages(struct address_space *mapping,
 			bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
 			bio->bi_end_io = mpage_end_io;
 			bio->bi_private = ctx;
-			bio_set_op_attrs(bio, REQ_OP_READ, 0);
+			bio_set_op_attrs(bio, REQ_OP_READ,
+						is_readahead ? REQ_RAHEAD : 0);
 		}
 
 		length = first_hole << blkbits;
-- 
cgit 


From d46eb14b735b11927d4bdc2d1854c311af19de6d Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb@google.com>
Date: Fri, 17 Aug 2018 15:46:39 -0700
Subject: fs: fsnotify: account fsnotify metadata to kmemcg

Patch series "Directed kmem charging", v8.

The Linux kernel's memory cgroup allows limiting the memory usage of the
jobs running on the system to provide isolation between the jobs.  All
the kernel memory allocated in the context of the job and marked with
__GFP_ACCOUNT will also be included in the memory usage and be limited
by the job's limit.

The kernel memory can only be charged to the memcg of the process in
whose context kernel memory was allocated.  However there are cases
where the allocated kernel memory should be charged to the memcg
different from the current processes's memcg.  This patch series
contains two such concrete use-cases i.e.  fsnotify and buffer_head.

The fsnotify event objects can consume a lot of system memory for large
or unlimited queues if there is either no or slow listener.  The events
are allocated in the context of the event producer.  However they should
be charged to the event consumer.  Similarly the buffer_head objects can
be allocated in a memcg different from the memcg of the page for which
buffer_head objects are being allocated.

To solve this issue, this patch series introduces mechanism to charge
kernel memory to a given memcg.  In case of fsnotify events, the memcg
of the consumer can be used for charging and for buffer_head, the memcg
of the page can be charged.  For directed charging, the caller can use
the scope API memalloc_[un]use_memcg() to specify the memcg to charge
for all the __GFP_ACCOUNT allocations within the scope.

This patch (of 2):

A lot of memory can be consumed by the events generated for the huge or
unlimited queues if there is either no or slow listener.  This can cause
system level memory pressure or OOMs.  So, it's better to account the
fsnotify kmem caches to the memcg of the listener.

However the listener can be in a different memcg than the memcg of the
producer and these allocations happen in the context of the event
producer.  This patch introduces remote memcg charging API which the
producer can use to charge the allocations to the memcg of the listener.

There are seven fsnotify kmem caches and among them allocations from
dnotify_struct_cache, dnotify_mark_cache, fanotify_mark_cache and
inotify_inode_mark_cachep happens in the context of syscall from the
listener.  So, SLAB_ACCOUNT is enough for these caches.

The objects from fsnotify_mark_connector_cachep are not accounted as
they are small compared to the notification mark or events and it is
unclear whom to account connector to since it is shared by all events
attached to the inode.

The allocations from the event caches happen in the context of the event
producer.  For such caches we will need to remote charge the allocations
to the listener's memcg.  Thus we save the memcg reference in the
fsnotify_group structure of the listener.

This patch has also moved the members of fsnotify_group to keep the size
same, at least for 64 bit build, even with additional member by filling
the holes.

[shakeelb@google.com: use GFP_KERNEL_ACCOUNT rather than open-coding it]
  Link: http://lkml.kernel.org/r/20180702215439.211597-1-shakeelb@google.com
Link: http://lkml.kernel.org/r/20180627191250.209150-2-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/dnotify/dnotify.c          |  5 +++--
 fs/notify/fanotify/fanotify.c        | 14 ++++++++++----
 fs/notify/fanotify/fanotify_user.c   |  5 ++++-
 fs/notify/group.c                    |  3 +++
 fs/notify/inotify/inotify_fsnotify.c |  7 ++++++-
 fs/notify/inotify/inotify_user.c     |  5 ++++-
 6 files changed, 30 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index e2bea2ac5dfb..a6365e6bc047 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -384,8 +384,9 @@ out_err:
 
 static int __init dnotify_init(void)
 {
-	dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
-	dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC);
+	dnotify_struct_cache = KMEM_CACHE(dnotify_struct,
+					  SLAB_PANIC|SLAB_ACCOUNT);
+	dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT);
 
 	dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
 	if (IS_ERR(dnotify_group))
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index f90842efea13..eb4e75175cfb 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -11,6 +11,7 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 #include <linux/audit.h>
+#include <linux/sched/mm.h>
 
 #include "fanotify.h"
 
@@ -140,8 +141,8 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
 						 struct inode *inode, u32 mask,
 						 const struct path *path)
 {
-	struct fanotify_event_info *event;
-	gfp_t gfp = GFP_KERNEL;
+	struct fanotify_event_info *event = NULL;
+	gfp_t gfp = GFP_KERNEL_ACCOUNT;
 
 	/*
 	 * For queues with unlimited length lost events are not expected and
@@ -151,19 +152,22 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
 	if (group->max_events == UINT_MAX)
 		gfp |= __GFP_NOFAIL;
 
+	/* Whoever is interested in the event, pays for the allocation. */
+	memalloc_use_memcg(group->memcg);
+
 	if (fanotify_is_perm_event(mask)) {
 		struct fanotify_perm_event_info *pevent;
 
 		pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
 		if (!pevent)
-			return NULL;
+			goto out;
 		event = &pevent->fae;
 		pevent->response = 0;
 		goto init;
 	}
 	event = kmem_cache_alloc(fanotify_event_cachep, gfp);
 	if (!event)
-		return NULL;
+		goto out;
 init: __maybe_unused
 	fsnotify_init_event(&event->fse, inode, mask);
 	event->tgid = get_pid(task_tgid(current));
@@ -174,6 +178,8 @@ init: __maybe_unused
 		event->path.mnt = NULL;
 		event->path.dentry = NULL;
 	}
+out:
+	memalloc_unuse_memcg();
 	return event;
 }
 
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index ec4d8c59d0e3..0cf45041dc32 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -16,6 +16,7 @@
 #include <linux/uaccess.h>
 #include <linux/compat.h>
 #include <linux/sched/signal.h>
+#include <linux/memcontrol.h>
 
 #include <asm/ioctls.h>
 
@@ -756,6 +757,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 
 	group->fanotify_data.user = user;
 	atomic_inc(&user->fanotify_listeners);
+	group->memcg = get_mem_cgroup_from_mm(current->mm);
 
 	oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL);
 	if (unlikely(!oevent)) {
@@ -957,7 +959,8 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
  */
 static int __init fanotify_user_setup(void)
 {
-	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
+	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
+					 SLAB_PANIC|SLAB_ACCOUNT);
 	fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
 	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
 		fanotify_perm_event_cachep =
diff --git a/fs/notify/group.c b/fs/notify/group.c
index aa5468f23e45..c03b83662876 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -22,6 +22,7 @@
 #include <linux/srcu.h>
 #include <linux/rculist.h>
 #include <linux/wait.h>
+#include <linux/memcontrol.h>
 
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
@@ -36,6 +37,8 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
 	if (group->ops->free_group_priv)
 		group->ops->free_group_priv(group);
 
+	mem_cgroup_put(group->memcg);
+
 	kfree(group);
 }
 
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 9ab6dde38a14..f4184b4f3815 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -31,6 +31,7 @@
 #include <linux/types.h>
 #include <linux/sched.h>
 #include <linux/sched/user.h>
+#include <linux/sched/mm.h>
 
 #include "inotify.h"
 
@@ -98,7 +99,11 @@ int inotify_handle_event(struct fsnotify_group *group,
 	i_mark = container_of(inode_mark, struct inotify_inode_mark,
 			      fsn_mark);
 
-	event = kmalloc(alloc_len, GFP_KERNEL);
+	/* Whoever is interested in the event, pays for the allocation. */
+	memalloc_use_memcg(group->memcg);
+	event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT);
+	memalloc_unuse_memcg();
+
 	if (unlikely(!event)) {
 		/*
 		 * Treat lost event due to ENOMEM the same way as queue
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 1cf5b779d862..749c46ababa0 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -38,6 +38,7 @@
 #include <linux/uaccess.h>
 #include <linux/poll.h>
 #include <linux/wait.h>
+#include <linux/memcontrol.h>
 
 #include "inotify.h"
 #include "../fdinfo.h"
@@ -636,6 +637,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
 	oevent->name_len = 0;
 
 	group->max_events = max_events;
+	group->memcg = get_mem_cgroup_from_mm(current->mm);
 
 	spin_lock_init(&group->inotify_data.idr_lock);
 	idr_init(&group->inotify_data.idr);
@@ -808,7 +810,8 @@ static int __init inotify_user_setup(void)
 
 	BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
 
-	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
+	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark,
+					       SLAB_PANIC|SLAB_ACCOUNT);
 
 	inotify_max_queued_events = 16384;
 	init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
-- 
cgit 


From f745c6f5fe75734f3b35d9d4e6ebe2a7d010ddda Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb@google.com>
Date: Fri, 17 Aug 2018 15:46:44 -0700
Subject: fs, mm: account buffer_head to kmemcg

The buffer_head can consume a significant amount of system memory and is
directly related to the amount of page cache.  In our production
environment we have observed that a lot of machines are spending a
significant amount of memory as buffer_head and can not be left as
system memory overhead.

Charging buffer_head is not as simple as adding __GFP_ACCOUNT to the
allocation.  The buffer_heads can be allocated in a memcg different from
the memcg of the page for which buffer_heads are being allocated.  One
concrete example is memory reclaim.  The reclaim can trigger I/O of
pages of any memcg on the system.  So, the right way to charge
buffer_head is to extract the memcg from the page for which buffer_heads
are being allocated and then use targeted memcg charging API.

[shakeelb@google.com: use __GFP_ACCOUNT for directed memcg charging]
  Link: http://lkml.kernel.org/r/20180702220208.213380-1-shakeelb@google.com
Link: http://lkml.kernel.org/r/20180627191250.209150-3-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index c8c2b7d8b8d6..4cc679d5bf58 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -45,6 +45,7 @@
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
 #include <linux/pagevec.h>
+#include <linux/sched/mm.h>
 #include <trace/events/block.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
@@ -813,12 +814,16 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 		bool retry)
 {
 	struct buffer_head *bh, *head;
-	gfp_t gfp = GFP_NOFS;
+	gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
 	long offset;
+	struct mem_cgroup *memcg;
 
 	if (retry)
 		gfp |= __GFP_NOFAIL;
 
+	memcg = get_mem_cgroup_from_page(page);
+	memalloc_use_memcg(memcg);
+
 	head = NULL;
 	offset = PAGE_SIZE;
 	while ((offset -= size) >= 0) {
@@ -835,6 +840,9 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 		/* Link the buffer to its page */
 		set_bh_page(bh, page, offset);
 	}
+out:
+	memalloc_unuse_memcg();
+	mem_cgroup_put(memcg);
 	return head;
 /*
  * In case anything failed, we just free everything we got.
@@ -848,7 +856,7 @@ no_grow:
 		} while (head);
 	}
 
-	return NULL;
+	goto out;
 }
 EXPORT_SYMBOL_GPL(alloc_page_buffers);
 
-- 
cgit 


From 2b3648a6ff83bd2a59b427d3537cc570933659b5 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Fri, 17 Aug 2018 15:47:45 -0700
Subject: fs/super.c: refactor alloc_super()

Do two list_lru_init_memcg() calls after prealloc_super().
destroy_unused_super() in fail path is OK with this.  Next patch needs
such the order.

Link: http://lkml.kernel.org/r/153063058712.1818.3382490999719078571.stgit@localhost.localdomain
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Tested-by: Shakeel Butt <shakeelb@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sahitya Tummala <stummala@codeaurora.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/super.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index 50728d9c1a05..78227c4ddb21 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -244,10 +244,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	INIT_LIST_HEAD(&s->s_inodes_wb);
 	spin_lock_init(&s->s_inode_wblist_lock);
 
-	if (list_lru_init_memcg(&s->s_dentry_lru))
-		goto fail;
-	if (list_lru_init_memcg(&s->s_inode_lru))
-		goto fail;
 	s->s_count = 1;
 	atomic_set(&s->s_active, 1);
 	mutex_init(&s->s_vfs_rename_mutex);
@@ -265,6 +261,10 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
 	if (prealloc_shrinker(&s->s_shrink))
 		goto fail;
+	if (list_lru_init_memcg(&s->s_dentry_lru))
+		goto fail;
+	if (list_lru_init_memcg(&s->s_inode_lru))
+		goto fail;
 	return s;
 
 fail:
-- 
cgit 


From c92e8e10cafeaaedc84f23fed1bfcf9cf07399c2 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Fri, 17 Aug 2018 15:47:50 -0700
Subject: fs: propagate shrinker::id to list_lru

Add list_lru::shrinker_id field and populate it by registered shrinker
id.

This will be used to set correct bit in memcg shrinkers map by lru code
in next patches, after there appeared the first related to memcg element
in list_lru.

Link: http://lkml.kernel.org/r/153063059758.1818.14866596416857717800.stgit@localhost.localdomain
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Tested-by: Shakeel Butt <shakeelb@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sahitya Tummala <stummala@codeaurora.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index 78227c4ddb21..f5f96e52e0cd 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -261,9 +261,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
 	if (prealloc_shrinker(&s->s_shrink))
 		goto fail;
-	if (list_lru_init_memcg(&s->s_dentry_lru))
+	if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
 		goto fail;
-	if (list_lru_init_memcg(&s->s_inode_lru))
+	if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink))
 		goto fail;
 	return s;
 
-- 
cgit 


From 9b996468cfdba09f688f52dba4287de596194613 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Fri, 17 Aug 2018 15:48:21 -0700
Subject: mm: add SHRINK_EMPTY shrinker methods return value

We need to distinguish the situations when shrinker has very small
amount of objects (see vfs_pressure_ratio() called from
super_cache_count()), and when it has no objects at all.  Currently, in
the both of these cases, shrinker::count_objects() returns 0.

The patch introduces new SHRINK_EMPTY return value, which will be used
for "no objects at all" case.  It's is a refactoring mostly, as
SHRINK_EMPTY is replaced by 0 by all callers of do_shrink_slab() in this
patch, and all the magic will happen in further.

Link: http://lkml.kernel.org/r/153063069574.1818.11037751256699341813.stgit@localhost.localdomain
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Tested-by: Shakeel Butt <shakeelb@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Matthias Kaehlcke <mka@chromium.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Sahitya Tummala <stummala@codeaurora.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/super.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index f5f96e52e0cd..7429588d6b49 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -144,6 +144,9 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 	total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
 	total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
 
+	if (!total_objects)
+		return SHRINK_EMPTY;
+
 	total_objects = vfs_pressure_ratio(total_objects);
 	return total_objects;
 }
-- 
cgit 


From 5241d4727479aad77af50b80757c38268bfa4560 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 17 Aug 2018 15:50:01 -0700
Subject: fs/userfaultfd.c: remove redundant pointer uwq

Pointer uwq is being assigned but is never used hence it is redundant
and can be removed.

Cleans up clang warning:
  warning: variable 'uwq' set but not used [-Wunused-but-set-variable]

Link: http://lkml.kernel.org/r/20180717090802.18357-1-colin.king@canonical.com
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index bad9cea37f12..15c265d450bf 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1849,17 +1849,14 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
 {
 	struct userfaultfd_ctx *ctx = f->private_data;
 	wait_queue_entry_t *wq;
-	struct userfaultfd_wait_queue *uwq;
 	unsigned long pending = 0, total = 0;
 
 	spin_lock(&ctx->fault_pending_wqh.lock);
 	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
-		uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
 		pending++;
 		total++;
 	}
 	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
-		uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
 		total++;
 	}
 	spin_unlock(&ctx->fault_pending_wqh.lock);
-- 
cgit