From 5b49f64db299d0b3f7c2170088186aa593d0be7d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:40:22 +1100 Subject: vfs: vfs_clone_file_prep_inodes should return EINVAL for a clone from beyond EOF vfs_clone_file_prep_inodes cannot return 0 if it is asked to remap from a zero byte file because that's what btrfs does. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/read_write.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/read_write.c') diff --git a/fs/read_write.c b/fs/read_write.c index 8a2737f0d61d..260797b01851 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1740,10 +1740,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; - /* Are we going all the way to the end? */ isize = i_size_read(inode_in); - if (isize == 0) - return 0; /* Zero length dedupe exits immediately; reflink goes to EOF. */ if (*len == 0) { -- cgit From 1383a7ed67490fb00d793e36c7a4d599ff88a64d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:40:31 +1100 Subject: vfs: check file ranges before cloning files Move the file range checks from vfs_clone_file_prep into a separate generic_remap_checks function so that all the checks are collected in a central location. This forms the basis for adding more checks from generic_write_checks that will make cloning's input checking more consistent with write input checking. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Reviewed-by: Amir Goldstein Signed-off-by: Dave Chinner --- fs/read_write.c | 55 +++++++++++++------------------------------------------ 1 file changed, 13 insertions(+), 42 deletions(-) (limited to 'fs/read_write.c') diff --git a/fs/read_write.c b/fs/read_write.c index 260797b01851..d6e8e242a15f 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1717,13 +1717,12 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) * Returns: 0 for "nothing to clone", 1 for "something to clone", or * the usual negative error code. */ -int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, - struct inode *inode_out, loff_t pos_out, - u64 *len, bool is_dedupe) +int vfs_clone_file_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 *len, bool is_dedupe) { - loff_t bs = inode_out->i_sb->s_blocksize; - loff_t blen; - loff_t isize; + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); bool same_inode = (inode_in == inode_out); int ret; @@ -1740,10 +1739,10 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; - isize = i_size_read(inode_in); - /* Zero length dedupe exits immediately; reflink goes to EOF. 
*/ if (*len == 0) { + loff_t isize = i_size_read(inode_in); + if (is_dedupe || pos_in == isize) return 0; if (pos_in > isize) @@ -1751,36 +1750,11 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, *len = isize - pos_in; } - /* Ensure offsets don't wrap and the input is inside i_size */ - if (pos_in + *len < pos_in || pos_out + *len < pos_out || - pos_in + *len > isize) - return -EINVAL; - - /* Don't allow dedupe past EOF in the dest file */ - if (is_dedupe) { - loff_t disize; - - disize = i_size_read(inode_out); - if (pos_out >= disize || pos_out + *len > disize) - return -EINVAL; - } - - /* If we're linking to EOF, continue to the block boundary. */ - if (pos_in + *len == isize) - blen = ALIGN(isize, bs) - pos_in; - else - blen = *len; - - /* Only reflink if we're aligned to block boundaries */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || - !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) - return -EINVAL; - - /* Don't allow overlapped reflink within the same file */ - if (same_inode) { - if (pos_out + blen > pos_in && pos_out < pos_in + blen) - return -EINVAL; - } + /* Check that we don't violate system file offset limits. 
*/ + ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, + is_dedupe); + if (ret) + return ret; /* Wait for the completion of any pending IOs on both files */ inode_dio_wait(inode_in); @@ -1813,7 +1787,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, return 1; } -EXPORT_SYMBOL(vfs_clone_file_prep_inodes); +EXPORT_SYMBOL(vfs_clone_file_prep); int do_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len) @@ -1851,9 +1825,6 @@ int do_clone_file_range(struct file *file_in, loff_t pos_in, if (ret) return ret; - if (pos_in + len > i_size_read(inode_in)) - return -EINVAL; - ret = file_in->f_op->clone_file_range(file_in, pos_in, file_out, pos_out, len); if (!ret) { -- cgit From 2c5773f102c9bb07d5328467f61f0a88f2f2892d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:40:39 +1100 Subject: vfs: exit early from zero length remap operations If a remap caller asks us to remap to the source file's EOF and the source file length leaves us with a zero byte request, exit early. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/read_write.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/read_write.c') diff --git a/fs/read_write.c b/fs/read_write.c index d6e8e242a15f..2456da3f8a41 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1748,6 +1748,8 @@ int vfs_clone_file_prep(struct file *file_in, loff_t pos_in, if (pos_in > isize) return -EINVAL; *len = isize - pos_in; + if (*len == 0) + return 0; } /* Check that we don't violate system file offset limits. */ -- cgit From 07d19dc9fbe9128378b9e226abe886fd8fd473df Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:40:55 +1100 Subject: vfs: avoid problematic remapping requests into partial EOF block A deduplication data corruption is exposed in XFS and btrfs. 
It is caused by extending the block match range to include the partial EOF block, but then allowing unknown data beyond EOF to be considered a "match" to data in the destination file because the comparison is only made to the end of the source file. This corrupts the destination file when the source extent is shared with it. The VFS remapping prep functions only support whole block dedupe, but we still need to appear to support whole file dedupe correctly. Hence if the dedupe request includes the last block of the source file, don't include it in the actual dedupe operation. If the rest of the range dedupes successfully, then reject the entire request. A subsequent patch will enable us to shorten dedupe requests correctly. When reflinking sub-file ranges, a data corruption can occur when the source file range includes a partial EOF block. This shares the unknown data beyond EOF into the second file at a position inside EOF, exposing stale data in the second file. If the reflink request includes the last block of the source file, only proceed with the reflink operation if it lands at or past the destination file's current EOF. If it lands within the destination file EOF, reject the entire request with -EINVAL and make the caller go the hard way. A subsequent patch will enable us to shorten reflink requests correctly. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/read_write.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'fs/read_write.c') diff --git a/fs/read_write.c b/fs/read_write.c index 2456da3f8a41..0f0a6efdd502 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1708,6 +1708,34 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) return security_file_permission(file, write ? MAY_WRITE : MAY_READ); } +/* + * Ensure that we don't remap a partial EOF block in the middle of something + * else.
Assume that the offsets have already been checked for block + * alignment. + * + * For deduplication we always scale down to the previous block because we + * can't meaningfully compare post-EOF contents. + * + * For clone we only link a partial EOF block above the destination file's EOF. + */ +static int generic_remap_check_len(struct inode *inode_in, + struct inode *inode_out, + loff_t pos_out, + u64 *len, + bool is_dedupe) +{ + u64 blkmask = i_blocksize(inode_in) - 1; + + if ((*len & blkmask) == 0) + return 0; + + if (is_dedupe) + *len &= ~blkmask; + else if (pos_out + *len < i_size_read(inode_out)) + return -EINVAL; + + return 0; +} /* * Check that the two inodes are eligible for cloning, the ranges make @@ -1787,6 +1815,11 @@ int vfs_clone_file_prep(struct file *file_in, loff_t pos_in, return -EBADE; } + ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, + is_dedupe); + if (ret) + return ret; + return 1; } EXPORT_SYMBOL(vfs_clone_file_prep); -- cgit From 9aae20500d9cd3e7d55d0536d359bdd1c869db89 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:41:01 +1100 Subject: vfs: skip zero-length dedupe requests Don't bother calling the filesystem for a zero-length dedupe request; we can return zero and exit. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Reviewed-by: Amir Goldstein Signed-off-by: Dave Chinner --- fs/read_write.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/read_write.c') diff --git a/fs/read_write.c b/fs/read_write.c index 0f0a6efdd502..f5395d8da741 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -2009,6 +2009,11 @@ int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, if (!dst_file->f_op->dedupe_file_range) goto out_drop_write; + if (len == 0) { + ret = 0; + goto out_drop_write; + } + ret = dst_file->f_op->dedupe_file_range(src_file, src_pos, dst_file, dst_pos, len); out_drop_write: -- cgit From a83ab01a62e61616ebb8b97f90f568c1214dc10d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:41:08 +1100 Subject: vfs: rename vfs_clone_file_prep to be more descriptive The vfs_clone_file_prep is a generic function to be called by filesystem implementations only. Rename the prefix to generic_ and make it more clear that it applies to remap operations, not just clones. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein Signed-off-by: Dave Chinner --- fs/read_write.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/read_write.c') diff --git a/fs/read_write.c b/fs/read_write.c index f5395d8da741..aca75a97a695 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1745,9 +1745,9 @@ static int generic_remap_check_len(struct inode *inode_in, * Returns: 0 for "nothing to clone", 1 for "something to clone", or * the usual negative error code. 
*/ -int vfs_clone_file_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - u64 *len, bool is_dedupe) +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 *len, bool is_dedupe) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -1822,7 +1822,7 @@ int vfs_clone_file_prep(struct file *file_in, loff_t pos_in, return 1; } -EXPORT_SYMBOL(vfs_clone_file_prep); +EXPORT_SYMBOL(generic_remap_file_range_prep); int do_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len) -- cgit From 6095028b455d775e369ae27875f698ff0f6fdeb8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:41:14 +1100 Subject: vfs: rename clone_verify_area to remap_verify_area Since we use clone_verify_area for both clone and dedupe range checks, rename the function to make it clear that it's for both. Signed-off-by: Darrick J. 
Wong Reviewed-by: Amir Goldstein Signed-off-by: Dave Chinner --- fs/read_write.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/read_write.c') diff --git a/fs/read_write.c b/fs/read_write.c index aca75a97a695..734c5661fb69 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1686,7 +1686,7 @@ out2: return ret; } -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) +static int remap_verify_area(struct file *file, loff_t pos, u64 len, bool write) { struct inode *inode = file_inode(file); @@ -1852,11 +1852,11 @@ int do_clone_file_range(struct file *file_in, loff_t pos_in, if (!file_in->f_op->clone_file_range) return -EOPNOTSUPP; - ret = clone_verify_area(file_in, pos_in, len, false); + ret = remap_verify_area(file_in, pos_in, len, false); if (ret) return ret; - ret = clone_verify_area(file_out, pos_out, len, true); + ret = remap_verify_area(file_out, pos_out, len, true); if (ret) return ret; @@ -1989,7 +1989,7 @@ int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, if (ret) return ret; - ret = clone_verify_area(dst_file, dst_pos, len, true); + ret = remap_verify_area(dst_file, dst_pos, len, true); if (ret < 0) goto out_drop_write; @@ -2051,7 +2051,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) if (!S_ISREG(src->i_mode)) goto out; - ret = clone_verify_area(file, off, len, false); + ret = remap_verify_area(file, off, len, false); if (ret < 0) goto out; ret = 0; -- cgit From 2e5dfc99f2e61c42083ba742395e7a7b353513d1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:41:21 +1100 Subject: vfs: combine the clone and dedupe into a single remap_file_range Combine the clone_file_range and dedupe_file_range operations into a single remap_file_range file operation dispatch since they're fundamentally the same operation. The differences between the two can be made in the prep functions. Signed-off-by: Darrick J. 
Wong Reviewed-by: Amir Goldstein Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/read_write.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs/read_write.c') diff --git a/fs/read_write.c b/fs/read_write.c index 734c5661fb69..766bdcb381f3 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1588,9 +1588,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * Try cloning first, this is supported by more file systems, and * more efficient if both clone and copy are supported (e.g. NFS). */ - if (file_in->f_op->clone_file_range) { - ret = file_in->f_op->clone_file_range(file_in, pos_in, - file_out, pos_out, len); + if (file_in->f_op->remap_file_range) { + ret = file_in->f_op->remap_file_range(file_in, pos_in, + file_out, pos_out, len, 0); if (ret == 0) { ret = len; goto done; @@ -1849,7 +1849,7 @@ int do_clone_file_range(struct file *file_in, loff_t pos_in, (file_out->f_flags & O_APPEND)) return -EBADF; - if (!file_in->f_op->clone_file_range) + if (!file_in->f_op->remap_file_range) return -EOPNOTSUPP; ret = remap_verify_area(file_in, pos_in, len, false); @@ -1860,8 +1860,8 @@ int do_clone_file_range(struct file *file_in, loff_t pos_in, if (ret) return ret; - ret = file_in->f_op->clone_file_range(file_in, pos_in, - file_out, pos_out, len); + ret = file_in->f_op->remap_file_range(file_in, pos_in, + file_out, pos_out, len, 0); if (!ret) { fsnotify_access(file_in); fsnotify_modify(file_out); @@ -2006,7 +2006,7 @@ int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, goto out_drop_write; ret = -EINVAL; - if (!dst_file->f_op->dedupe_file_range) + if (!dst_file->f_op->remap_file_range) goto out_drop_write; if (len == 0) { @@ -2014,8 +2014,8 @@ int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, goto out_drop_write; } - ret = dst_file->f_op->dedupe_file_range(src_file, src_pos, - dst_file, dst_pos, len); + ret = dst_file->f_op->remap_file_range(src_file, src_pos, 
dst_file, + dst_pos, len, REMAP_FILE_DEDUP); out_drop_write: mnt_drop_write_file(dst_file); -- cgit From a91ae49bbaf43910edb09e03fedf26b23875bd52 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:41:28 +1100 Subject: vfs: pass remap flags to generic_remap_file_range_prep Plumb the remap flags through the filesystem from the vfs function dispatcher all the way to the prep function to prepare for behavior changes in subsequent patches. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/read_write.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/read_write.c') diff --git a/fs/read_write.c b/fs/read_write.c index 766bdcb381f3..201381689284 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1722,14 +1722,14 @@ static int generic_remap_check_len(struct inode *inode_in, struct inode *inode_out, loff_t pos_out, u64 *len, - bool is_dedupe) + unsigned int remap_flags) { u64 blkmask = i_blocksize(inode_in) - 1; if ((*len & blkmask) == 0) return 0; - if (is_dedupe) + if (remap_flags & REMAP_FILE_DEDUP) *len &= ~blkmask; else if (pos_out + *len < i_size_read(inode_out)) return -EINVAL; @@ -1747,7 +1747,7 @@ static int generic_remap_check_len(struct inode *inode_in, */ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 *len, bool is_dedupe) + u64 *len, unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -1771,7 +1771,7 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (*len == 0) { loff_t isize = i_size_read(inode_in); - if (is_dedupe || pos_in == isize) + if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) return 0; if (pos_in > isize) return -EINVAL; @@ -1782,7 +1782,7 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, /* Check that we don't violate 
system file offset limits. */ ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - is_dedupe); + (remap_flags & REMAP_FILE_DEDUP)); if (ret) return ret; @@ -1804,7 +1804,7 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, /* * Check that the extents are the same. */ - if (is_dedupe) { + if (remap_flags & REMAP_FILE_DEDUP) { bool is_same = false; ret = vfs_dedupe_file_range_compare(inode_in, pos_in, @@ -1816,7 +1816,7 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, } ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, - is_dedupe); + remap_flags); if (ret) return ret; -- cgit From 3d28193e1df043764deb7abdaba5e3a6660bc393 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:41:34 +1100 Subject: vfs: pass remap flags to generic_remap_checks Pass the same remap flags to generic_remap_checks for consistency. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/read_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/read_write.c') diff --git a/fs/read_write.c b/fs/read_write.c index 201381689284..ebcbfc4f2907 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1782,7 +1782,7 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, /* Check that we don't violate system file offset limits. */ ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - (remap_flags & REMAP_FILE_DEDUP)); + remap_flags); if (ret) return ret; -- cgit From 8dde90bca6fca3736ea20109654bcf6dcf2ecf1d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:41:41 +1100 Subject: vfs: remap helper should update destination inode metadata Extend generic_remap_file_range_prep to handle inode metadata updates when remapping into a file. 
If the operation can possibly alter the file contents, we must update the ctime and mtime and remove security privileges, just like we do for regular file writes. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein Signed-off-by: Dave Chinner --- fs/read_write.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs/read_write.c') diff --git a/fs/read_write.c b/fs/read_write.c index ebcbfc4f2907..b61bd3fc7154 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1820,6 +1820,25 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (ret) return ret; + /* If can't alter the file contents, we're done. */ + if (!(remap_flags & REMAP_FILE_DEDUP)) { + /* Update the timestamps, since we can alter file contents. */ + if (!(file_out->f_mode & FMODE_NOCMTIME)) { + ret = file_update_time(file_out); + if (ret) + return ret; + } + + /* + * Clear the security bits if the process is not being run by + * root. This keeps people from modifying setuid and setgid + * binaries. + */ + ret = file_remove_privs(file_out); + if (ret) + return ret; + } + return 1; } EXPORT_SYMBOL(generic_remap_file_range_prep); -- cgit From 42ec3d4c02187a18e27ff94b409ec27234bf2ffd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:41:49 +1100 Subject: vfs: make remap_file_range functions take and return bytes completed Change the remap_file_range functions to take a number of bytes to operate upon and return the number of bytes they operated on. This is a requirement for allowing fs implementations to return short clone/dedupe results to the user, which will enable us to obey resource limits in a graceful manner. A subsequent patch will enable copy_file_range to signal to the ->clone_file_range implementation that it can handle a short length, which will be returned in the function's return value. 
For now the short return is not implemented anywhere so the behavior won't change -- either copy_file_range manages to clone the entire range or it tries an alternative. Neither clone ioctl can take advantage of this, alas. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein Signed-off-by: Dave Chinner --- fs/read_write.c | 49 +++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) (limited to 'fs/read_write.c') diff --git a/fs/read_write.c b/fs/read_write.c index b61bd3fc7154..356641afa487 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1589,10 +1589,13 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * more efficient if both clone and copy are supported (e.g. NFS). */ if (file_in->f_op->remap_file_range) { - ret = file_in->f_op->remap_file_range(file_in, pos_in, - file_out, pos_out, len, 0); - if (ret == 0) { - ret = len; + loff_t cloned; + + cloned = file_in->f_op->remap_file_range(file_in, pos_in, + file_out, pos_out, + min_t(loff_t, MAX_RW_COUNT, len), 0); + if (cloned > 0) { + ret = cloned; goto done; } } @@ -1686,11 +1689,12 @@ out2: return ret; } -static int remap_verify_area(struct file *file, loff_t pos, u64 len, bool write) +static int remap_verify_area(struct file *file, loff_t pos, loff_t len, + bool write) { struct inode *inode = file_inode(file); - if (unlikely(pos < 0)) + if (unlikely(pos < 0 || len < 0)) return -EINVAL; if (unlikely((loff_t) (pos + len) < 0)) @@ -1721,7 +1725,7 @@ static int remap_verify_area(struct file *file, loff_t pos, u64 len, bool write) static int generic_remap_check_len(struct inode *inode_in, struct inode *inode_out, loff_t pos_out, - u64 *len, + loff_t *len, unsigned int remap_flags) { u64 blkmask = i_blocksize(inode_in) - 1; @@ -1747,7 +1751,7 @@ static int generic_remap_check_len(struct inode *inode_in, */ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 *len, unsigned int 
remap_flags) + loff_t *len, unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -1843,12 +1847,12 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, } EXPORT_SYMBOL(generic_remap_file_range_prep); -int do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) +loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, loff_t len) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); - int ret; + loff_t ret; if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; @@ -1881,19 +1885,19 @@ int do_clone_file_range(struct file *file_in, loff_t pos_in, ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, len, 0); - if (!ret) { - fsnotify_access(file_in); - fsnotify_modify(file_out); - } + if (ret < 0) + return ret; + fsnotify_access(file_in); + fsnotify_modify(file_out); return ret; } EXPORT_SYMBOL(do_clone_file_range); -int vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) +loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, loff_t len) { - int ret; + loff_t ret; file_start_write(file_out); ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len); @@ -1999,10 +2003,11 @@ out_error: } EXPORT_SYMBOL(vfs_dedupe_file_range_compare); -int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, - struct file *dst_file, loff_t dst_pos, u64 len) +loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, + struct file *dst_file, loff_t dst_pos, + loff_t len) { - s64 ret; + loff_t ret; ret = mnt_want_write_file(dst_file); if (ret) @@ -2051,7 +2056,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) int i; int ret; u16 count = 
same->dest_count; - int deduped; + loff_t deduped; if (!(file->f_mode & FMODE_READ)) return -EINVAL; -- cgit From 452ce65951a2f0719e4e119ecca134c06cfe22ee Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:41:56 +1100 Subject: vfs: plumb remap flags through the vfs clone functions Plumb a remap_flags argument through the {do,vfs}_clone_file_range functions so that clone can take advantage of it. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein Signed-off-by: Dave Chinner --- fs/read_write.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs/read_write.c') diff --git a/fs/read_write.c b/fs/read_write.c index 356641afa487..0d1ac1b9bc22 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1848,12 +1848,15 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, EXPORT_SYMBOL(generic_remap_file_range_prep); loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, loff_t len) + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); loff_t ret; + WARN_ON_ONCE(remap_flags); + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) @@ -1884,7 +1887,7 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, return ret; ret = file_in->f_op->remap_file_range(file_in, pos_in, - file_out, pos_out, len, 0); + file_out, pos_out, len, remap_flags); if (ret < 0) return ret; @@ -1895,12 +1898,14 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, EXPORT_SYMBOL(do_clone_file_range); loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, loff_t len) + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { loff_t ret; file_start_write(file_out); - ret = 
do_clone_file_range(file_in, pos_in, file_out, pos_out, len); + ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, + remap_flags); file_end_write(file_out); return ret; -- cgit From df3658361951e17364f1e1c3fa92862a990ad8bd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:42:03 +1100 Subject: vfs: plumb remap flags through the vfs dedupe functions Plumb a remap_flags argument through the vfs_dedupe_file_range_one functions so that dedupe can take advantage of it. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein Signed-off-by: Dave Chinner --- fs/read_write.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs/read_write.c') diff --git a/fs/read_write.c b/fs/read_write.c index 0d1ac1b9bc22..ea30666013b0 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -2010,10 +2010,12 @@ EXPORT_SYMBOL(vfs_dedupe_file_range_compare); loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, - loff_t len) + loff_t len, unsigned int remap_flags) { loff_t ret; + WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP)); + ret = mnt_want_write_file(dst_file); if (ret) return ret; @@ -2044,7 +2046,7 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, } ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file, - dst_pos, len, REMAP_FILE_DEDUP); + dst_pos, len, remap_flags | REMAP_FILE_DEDUP); out_drop_write: mnt_drop_write_file(dst_file); @@ -2112,7 +2114,8 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) } deduped = vfs_dedupe_file_range_one(file, off, dst_file, - info->dest_offset, len); + info->dest_offset, len, + 0); if (deduped == -EBADE) info->status = FILE_DEDUPE_RANGE_DIFFERS; else if (deduped < 0) -- cgit From eca3654e3cc7d93e9734d0fa96cfb15c7f356244 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Tue, 30 Oct 2018 10:42:10 +1100 Subject: vfs: enable remap callers that can handle short operations Plumb in a remap flag that enables the filesystem remap handler to shorten remapping requests for callers that can handle it. Now copy_file_range can report partial success (in case we run up against alignment problems, resource limits, etc.). We also enable CAN_SHORTEN for fideduperange to maintain existing userspace-visible behavior where xfs/btrfs shorten the dedupe range to avoid stale post-eof data exposure. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein Signed-off-by: Dave Chinner --- fs/read_write.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'fs/read_write.c') diff --git a/fs/read_write.c b/fs/read_write.c index ea30666013b0..c0bcc1a20650 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1593,7 +1593,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, cloned = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, - min_t(loff_t, MAX_RW_COUNT, len), 0); + min_t(loff_t, MAX_RW_COUNT, len), + REMAP_FILE_CAN_SHORTEN); if (cloned > 0) { ret = cloned; goto done; @@ -1721,6 +1722,8 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len, * can't meaningfully compare post-EOF contents. * * For clone we only link a partial EOF block above the destination file's EOF. + * + * Shorten the request if possible. 
*/ static int generic_remap_check_len(struct inode *inode_in, struct inode *inode_out, @@ -1729,16 +1732,24 @@ static int generic_remap_check_len(struct inode *inode_in, unsigned int remap_flags) { u64 blkmask = i_blocksize(inode_in) - 1; + loff_t new_len = *len; if ((*len & blkmask) == 0) return 0; - if (remap_flags & REMAP_FILE_DEDUP) - *len &= ~blkmask; - else if (pos_out + *len < i_size_read(inode_out)) - return -EINVAL; + if ((remap_flags & REMAP_FILE_DEDUP) || + pos_out + *len < i_size_read(inode_out)) + new_len &= ~blkmask; - return 0; + if (new_len == *len) + return 0; + + if (remap_flags & REMAP_FILE_CAN_SHORTEN) { + *len = new_len; + return 0; + } + + return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; } /* @@ -2014,7 +2025,8 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, { loff_t ret; - WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP)); + WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | + REMAP_FILE_CAN_SHORTEN)); ret = mnt_want_write_file(dst_file); if (ret) @@ -2115,7 +2127,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) deduped = vfs_dedupe_file_range_one(file, off, dst_file, info->dest_offset, len, - 0); + REMAP_FILE_CAN_SHORTEN); if (deduped == -EBADE) info->status = FILE_DEDUPE_RANGE_DIFFERS; else if (deduped < 0) -- cgit From c32e5f39953fa6bbff35c655bdcb7b3128f1e79f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:42:17 +1100 Subject: vfs: hide file range comparison function There are no callers of vfs_dedupe_file_range_compare, so we might as well make it a static helper and remove the export. Signed-off-by: Darrick J. 
Wong Reviewed-by: Amir Goldstein Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/read_write.c | 187 +++++++++++++++++++++++++++----------------------------- 1 file changed, 91 insertions(+), 96 deletions(-) (limited to 'fs/read_write.c') diff --git a/fs/read_write.c b/fs/read_write.c index c0bcc1a20650..e4d295d0d236 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1752,6 +1752,97 @@ static int generic_remap_check_len(struct inode *inode_in, return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; } +/* + * Read a page's worth of file data into the page cache. Return the page + * locked. + */ +static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) +{ + struct page *page; + + page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + lock_page(page); + return page; +} + +/* + * Compare extents of two files to see if they are the same. + * Caller must have locked both inodes to prevent write races. 
+ */ +static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same) +{ + loff_t src_poff; + loff_t dest_poff; + void *src_addr; + void *dest_addr; + struct page *src_page; + struct page *dest_page; + loff_t cmp_len; + bool same; + int error; + + error = -EINVAL; + same = true; + while (len) { + src_poff = srcoff & (PAGE_SIZE - 1); + dest_poff = destoff & (PAGE_SIZE - 1); + cmp_len = min(PAGE_SIZE - src_poff, + PAGE_SIZE - dest_poff); + cmp_len = min(cmp_len, len); + if (cmp_len <= 0) + goto out_error; + + src_page = vfs_dedupe_get_page(src, srcoff); + if (IS_ERR(src_page)) { + error = PTR_ERR(src_page); + goto out_error; + } + dest_page = vfs_dedupe_get_page(dest, destoff); + if (IS_ERR(dest_page)) { + error = PTR_ERR(dest_page); + unlock_page(src_page); + put_page(src_page); + goto out_error; + } + src_addr = kmap_atomic(src_page); + dest_addr = kmap_atomic(dest_page); + + flush_dcache_page(src_page); + flush_dcache_page(dest_page); + + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + same = false; + + kunmap_atomic(dest_addr); + kunmap_atomic(src_addr); + unlock_page(dest_page); + unlock_page(src_page); + put_page(dest_page); + put_page(src_page); + + if (!same) + break; + + srcoff += cmp_len; + destoff += cmp_len; + len -= cmp_len; + } + + *is_same = same; + return 0; + +out_error: + return error; +} + /* * Check that the two inodes are eligible for cloning, the ranges make * sense, and then flush all dirty data. Caller must ensure that the @@ -1923,102 +2014,6 @@ loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, } EXPORT_SYMBOL(vfs_clone_file_range); -/* - * Read a page's worth of file data into the page cache. Return the page - * locked. 
- */ -static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) -{ - struct address_space *mapping; - struct page *page; - pgoff_t n; - - n = offset >> PAGE_SHIFT; - mapping = inode->i_mapping; - page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); - return ERR_PTR(-EIO); - } - lock_page(page); - return page; -} - -/* - * Compare extents of two files to see if they are the same. - * Caller must have locked both inodes to prevent write races. - */ -int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, - struct inode *dest, loff_t destoff, - loff_t len, bool *is_same) -{ - loff_t src_poff; - loff_t dest_poff; - void *src_addr; - void *dest_addr; - struct page *src_page; - struct page *dest_page; - loff_t cmp_len; - bool same; - int error; - - error = -EINVAL; - same = true; - while (len) { - src_poff = srcoff & (PAGE_SIZE - 1); - dest_poff = destoff & (PAGE_SIZE - 1); - cmp_len = min(PAGE_SIZE - src_poff, - PAGE_SIZE - dest_poff); - cmp_len = min(cmp_len, len); - if (cmp_len <= 0) - goto out_error; - - src_page = vfs_dedupe_get_page(src, srcoff); - if (IS_ERR(src_page)) { - error = PTR_ERR(src_page); - goto out_error; - } - dest_page = vfs_dedupe_get_page(dest, destoff); - if (IS_ERR(dest_page)) { - error = PTR_ERR(dest_page); - unlock_page(src_page); - put_page(src_page); - goto out_error; - } - src_addr = kmap_atomic(src_page); - dest_addr = kmap_atomic(dest_page); - - flush_dcache_page(src_page); - flush_dcache_page(dest_page); - - if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) - same = false; - - kunmap_atomic(dest_addr); - kunmap_atomic(src_addr); - unlock_page(dest_page); - unlock_page(src_page); - put_page(dest_page); - put_page(src_page); - - if (!same) - break; - - srcoff += cmp_len; - destoff += cmp_len; - len -= cmp_len; - } - - *is_same = same; - return 0; - -out_error: - return error; -} -EXPORT_SYMBOL(vfs_dedupe_file_range_compare); 
- loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags) -- cgit From 8c5c836bd6c3b9f9fc1c5a210d630b8c42f4f7df Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 30 Oct 2018 10:42:24 +1100 Subject: vfs: clean up generic_remap_file_range_prep return value Since the remap prep function can update the length of the remap request, we can change this function to return the usual return status instead of the odd behavior it has now. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/read_write.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/read_write.c') diff --git a/fs/read_write.c b/fs/read_write.c index e4d295d0d236..6b40a43edf18 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1848,8 +1848,8 @@ out_error: * sense, and then flush all dirty data. Caller must ensure that the * inodes have been locked against any other modifications. * - * Returns: 0 for "nothing to clone", 1 for "something to clone", or - * the usual negative error code. + * If there's an error, then the usual negative error code is returned. + * Otherwise returns 0 with *len set to the request length. */ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, @@ -1945,7 +1945,7 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, return ret; } - return 1; + return 0; } EXPORT_SYMBOL(generic_remap_file_range_prep); -- cgit