diff options
Diffstat (limited to 'fs/ext4/file.c')
-rw-r--r-- | fs/ext4/file.c | 532 |
1 files changed, 446 insertions, 86 deletions
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8d2bbcc2d813..0d624250a62b 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -29,10 +29,58 @@ #include <linux/pagevec.h> #include <linux/uio.h> #include <linux/mman.h> +#include <linux/backing-dev.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "truncate.h" + +static bool ext4_dio_supported(struct inode *inode) +{ + if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode)) + return false; + if (fsverity_active(inode)) + return false; + if (ext4_should_journal_data(inode)) + return false; + if (ext4_has_inline_data(inode)) + return false; + return true; +} + +static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret; + struct inode *inode = file_inode(iocb->ki_filp); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + + if (!ext4_dio_supported(inode)) { + inode_unlock_shared(inode); + /* + * Fallback to buffered I/O if the operation being performed on + * the inode is not supported by direct I/O. The IOCB_DIRECT + * flag needs to be cleared here in order to ensure that the + * direct I/O path within generic_file_read_iter() is not + * taken. + */ + iocb->ki_flags &= ~IOCB_DIRECT; + return generic_file_read_iter(iocb, to); + } + + ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, + is_sync_kiocb(iocb)); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -40,9 +88,10 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; - if (!inode_trylock_shared(inode)) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) return -EAGAIN; + } else { inode_lock_shared(inode); } /* @@ -64,16 +113,21 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { - if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb)))) + struct inode *inode = file_inode(iocb->ki_filp); + + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; if (!iov_iter_count(to)) return 0; /* skip atime */ #ifdef CONFIG_FS_DAX - if (IS_DAX(file_inode(iocb->ki_filp))) + if (IS_DAX(inode)) return ext4_dax_read_iter(iocb, to); #endif + if (iocb->ki_flags & IOCB_DIRECT) + return ext4_dio_read_iter(iocb, to); + return generic_file_read_iter(iocb, to); } @@ -103,13 +157,6 @@ static int ext4_release_file(struct inode *inode, struct file *filp) return 0; } -static void ext4_unwritten_wait(struct inode *inode) -{ - wait_queue_head_t *wq = ext4_ioend_wq(inode); - - wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)); -} - /* * This tests whether the IO in question is block-aligned or not. * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they @@ -119,19 +166,25 @@ static void ext4_unwritten_wait(struct inode *inode) * threads are at work on the same unwritten block, they must be synchronized * or one thread will zero the other's data, causing corruption. */ -static int -ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) +static bool +ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos) { struct super_block *sb = inode->i_sb; - int blockmask = sb->s_blocksize - 1; - - if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize)) - return 0; + unsigned long blockmask = sb->s_blocksize - 1; if ((pos | iov_iter_alignment(from)) & blockmask) - return 1; + return true; - return 0; + return false; +} + +static bool +ext4_extending_io(struct inode *inode, loff_t offset, size_t len) +{ + if (offset + len > i_size_read(inode) || + offset + len > EXT4_I(inode)->i_disksize) + return true; + return false; } /* Is IO overwriting allocated and initialized blocks? */ @@ -157,18 +210,19 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); } -static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +static ssize_t ext4_generic_write_checks(struct kiocb *iocb, + struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + ret = generic_write_checks(iocb, from); if (ret <= 0) return ret; - if (unlikely(IS_IMMUTABLE(inode))) - return -EPERM; - /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. @@ -180,110 +234,413 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) return -EFBIG; iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } + return iov_iter_count(from); } -#ifdef CONFIG_FS_DAX -static ssize_t -ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t ret, count; + + count = ext4_generic_write_checks(iocb, from); + if (count <= 0) + return count; + + ret = file_modified(iocb->ki_filp); + if (ret) + return ret; + return count; +} + +static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, + struct iov_iter *from) { - struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; + struct inode *inode = file_inode(iocb->ki_filp); - if (!inode_trylock(inode)) { - if (iocb->ki_flags & IOCB_NOWAIT) - return -EAGAIN; - inode_lock(inode); - } + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + + inode_lock(inode); ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; - ret = file_remove_privs(iocb->ki_filp); - if (ret) - goto out; - ret = file_update_time(iocb->ki_filp); - if (ret) - goto out; - ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); + current->backing_dev_info = inode_to_bdi(inode); + ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos); + current->backing_dev_info = NULL; + out: inode_unlock(inode); - if (ret > 0) + if (likely(ret > 0)) { + iocb->ki_pos += ret; ret = generic_write_sync(iocb, ret); + } + return ret; } -#endif -static ssize_t -ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, + ssize_t written, size_t count) { - struct inode *inode = file_inode(iocb->ki_filp); - int o_direct = iocb->ki_flags & IOCB_DIRECT; - int unaligned_aio = 0; - int overwrite = 0; - ssize_t ret; + handle_t *handle; + bool truncate = false; + u8 blkbits = inode->i_blkbits; + ext4_lblk_t written_blk, end_blk; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) - return -EIO; + /* + * Note that EXT4_I(inode)->i_disksize can get extended up to + * inode->i_size while the I/O was running due to writeback of delalloc + * blocks. But, the code in ext4_iomap_alloc() is careful to use + * zeroed/unwritten extents if this is possible; thus we won't leave + * uninitialized blocks in a file even if we didn't succeed in writing + * as much as we intended. + */ + WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize); + if (offset + count <= EXT4_I(inode)->i_disksize) { + /* + * We need to ensure that the inode is removed from the orphan + * list if it has been added prematurely, due to writeback of + * delalloc blocks. + */ + if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + return PTR_ERR(handle); + } + + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + } -#ifdef CONFIG_FS_DAX - if (IS_DAX(inode)) - return ext4_dax_write_iter(iocb, from); -#endif + return written; + } - if (!inode_trylock(inode)) { - if (iocb->ki_flags & IOCB_NOWAIT) - return -EAGAIN; - inode_lock(inode); + if (written < 0) + goto truncate; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + written = PTR_ERR(handle); + goto truncate; } - ret = ext4_write_checks(iocb, from); + if (ext4_update_inode_size(inode, offset + written)) + ext4_mark_inode_dirty(handle, inode); + + /* + * We may need to truncate allocated but not written blocks beyond EOF. + */ + written_blk = ALIGN(offset + written, 1 << blkbits); + end_blk = ALIGN(offset + count, 1 << blkbits); + if (written_blk < end_blk && ext4_can_truncate(inode)) + truncate = true; + + /* + * Remove the inode from the orphan list if it has been extended and + * everything went OK. + */ + if (!truncate && inode->i_nlink) + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + + if (truncate) { +truncate: + ext4_truncate_failed_write(inode); + /* + * If the truncate operation failed early, then the inode may + * still be on the orphan list. In that case, we need to try + * remove the inode from the in-memory linked list. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + return written; +} + +static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned int flags) +{ + loff_t offset = iocb->ki_pos; + struct inode *inode = file_inode(iocb->ki_filp); + + if (error) + return error; + + if (size && flags & IOMAP_DIO_UNWRITTEN) + return ext4_convert_unwritten_extents(NULL, inode, + offset, size); + + return 0; +} + +static const struct iomap_dio_ops ext4_dio_write_ops = { + .end_io = ext4_dio_write_end_io, +}; + +/* + * The intention here is to start with shared lock acquired then see if any + * condition requires an exclusive inode lock. If yes, then we restart the + * whole operation by releasing the shared lock and acquiring exclusive lock. + * + * - For unaligned_io we never take shared lock as it may cause data corruption + * when two unaligned IO tries to modify the same block e.g. while zeroing. + * + * - For extending writes case we don't take the shared lock, since it requires + * updating inode i_disksize and/or orphan handling with exclusive lock. + * + * - shared locking will only be true mostly with overwrites. Otherwise we will + * switch to exclusive i_rwsem lock. + */ +static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, + bool *ilock_shared, bool *extend) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + loff_t offset; + size_t count; + ssize_t ret; + +restart: + ret = ext4_generic_write_checks(iocb, from); if (ret <= 0) goto out; + offset = iocb->ki_pos; + count = ret; + if (ext4_extending_io(inode, offset, count)) + *extend = true; /* - * Unaligned direct AIO must be serialized among each other as zeroing - * of partial blocks of two competing unaligned AIOs can result in data - * corruption. + * Determine whether the IO operation will overwrite allocated + * and initialized blocks. + * We need exclusive i_rwsem for changing security info + * in file_modified(). */ - if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && - !is_sync_kiocb(iocb) && - ext4_unaligned_aio(inode, from, iocb->ki_pos)) { - unaligned_aio = 1; - ext4_unwritten_wait(inode); + if (*ilock_shared && (!IS_NOSEC(inode) || *extend || + !ext4_overwrite_io(inode, offset, count))) { + inode_unlock_shared(inode); + *ilock_shared = false; + inode_lock(inode); + goto restart; } - iocb->private = &overwrite; - /* Check whether we do a DIO overwrite or not */ - if (o_direct && !unaligned_aio) { - if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { - if (ext4_should_dioread_nolock(inode)) - overwrite = 1; - } else if (iocb->ki_flags & IOCB_NOWAIT) { - ret = -EAGAIN; - goto out; + ret = file_modified(file); + if (ret < 0) + goto out; + + return count; +out: + if (*ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); + return ret; +} + +static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t ret; + handle_t *handle; + struct inode *inode = file_inode(iocb->ki_filp); + loff_t offset = iocb->ki_pos; + size_t count = iov_iter_count(from); + const struct iomap_ops *iomap_ops = &ext4_iomap_ops; + bool extend = false, unaligned_io = false; + bool ilock_shared = true; + + /* + * We initially start with shared inode lock unless it is + * unaligned IO which needs exclusive lock anyways. + */ + if (ext4_unaligned_io(inode, from, offset)) { + unaligned_io = true; + ilock_shared = false; + } + /* + * Quick check here without any i_rwsem lock to see if it is extending + * IO. A more reliable check is done in ext4_dio_write_checks() with + * proper locking in place. + */ + if (offset + count > i_size_read(inode)) + ilock_shared = false; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (ilock_shared) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + if (!inode_trylock(inode)) + return -EAGAIN; } + } else { + if (ilock_shared) + inode_lock_shared(inode); + else + inode_lock(inode); + } + + /* Fallback to buffered I/O if the inode does not support direct I/O. */ + if (!ext4_dio_supported(inode)) { + if (ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); + return ext4_buffered_write_iter(iocb, from); } - ret = __generic_file_write_iter(iocb, from); + ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend); + if (ret <= 0) + return ret; + + offset = iocb->ki_pos; + count = ret; + /* - * Unaligned direct AIO must be the only IO in flight. Otherwise - * overlapping aligned IO after unaligned might result in data + * Unaligned direct IO must be serialized among each other as zeroing + * of partial blocks of two competing unaligned IOs can result in data * corruption. + * + * So we make sure we don't allow any unaligned IO in flight. + * For IOs where we need not wait (like unaligned non-AIO DIO), + * below inode_dio_wait() may anyway become a no-op, since we start + * with exclusive lock. */ - if (ret == -EIOCBQUEUED && unaligned_aio) - ext4_unwritten_wait(inode); - inode_unlock(inode); + if (unaligned_io) + inode_dio_wait(inode); - if (ret > 0) - ret = generic_write_sync(iocb, ret); + if (extend) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + + ext4_journal_stop(handle); + } + + if (ilock_shared) + iomap_ops = &ext4_iomap_overwrite_ops; + ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, + is_sync_kiocb(iocb) || unaligned_io || extend); + + if (extend) + ret = ext4_handle_inode_extension(inode, offset, ret, count); + +out: + if (ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); + + if (ret >= 0 && iov_iter_count(from)) { + ssize_t err; + loff_t endbyte; + + offset = iocb->ki_pos; + err = ext4_buffered_write_iter(iocb, from); + if (err < 0) + return err; + + /* + * We need to ensure that the pages within the page cache for + * the range covered by this I/O are written to disk and + * invalidated. This is in attempt to preserve the expected + * direct I/O semantics in the case we fallback to buffered I/O + * to complete off the I/O request. + */ + ret += err; + endbyte = offset + err - 1; + err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping, + offset, endbyte); + if (!err) + invalidate_mapping_pages(iocb->ki_filp->f_mapping, + offset >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); + } return ret; +} + +#ifdef CONFIG_FS_DAX +static ssize_t +ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t ret; + size_t count; + loff_t offset; + handle_t *handle; + bool extend = false; + struct inode *inode = file_inode(iocb->ki_filp); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + ret = ext4_write_checks(iocb, from); + if (ret <= 0) + goto out; + + offset = iocb->ki_pos; + count = iov_iter_count(from); + + if (offset + count > EXT4_I(inode)->i_disksize) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + + extend = true; + ext4_journal_stop(handle); + } + + ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); + if (extend) + ret = ext4_handle_inode_extension(inode, offset, ret, count); out: inode_unlock(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); return ret; } +#endif + +static ssize_t +ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + return ext4_dax_write_iter(iocb, from); +#endif + if (iocb->ki_flags & IOCB_DIRECT) + return ext4_dio_write_iter(iocb, from); + + return ext4_buffered_write_iter(iocb, from); +} #ifdef CONFIG_FS_DAX static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, @@ -494,12 +851,14 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) maxbytes, i_size_read(inode)); case SEEK_HOLE: inode_lock_shared(inode); - offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops); + offset = iomap_seek_hole(inode, offset, + &ext4_iomap_report_ops); inode_unlock_shared(inode); break; case SEEK_DATA: inode_lock_shared(inode); - offset = iomap_seek_data(inode, offset, &ext4_iomap_ops); + offset = iomap_seek_data(inode, offset, + &ext4_iomap_report_ops); inode_unlock_shared(inode); break; } @@ -513,6 +872,7 @@ const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, + .iopoll = iomap_dio_iopoll, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, |