diff options
Diffstat (limited to 'fs/direct-io.c')
-rw-r--r-- | fs/direct-io.c | 89 |
1 files changed, 59 insertions, 30 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c index 62cf812ed0e5..874607bb6e02 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -45,6 +45,12 @@ #define DIO_PAGES 64 /* + * Flags for dio_complete() + */ +#define DIO_COMPLETE_ASYNC 0x01 /* This is async IO */ +#define DIO_COMPLETE_INVALIDATE 0x02 /* Can invalidate pages */ + +/* * This code generally works in units of "dio_blocks". A dio_block is * somewhere between the hard sector size and the filesystem block size. it * is determined on a per-invocation basis. When talking to the filesystem @@ -213,6 +219,27 @@ static inline struct page *dio_get_page(struct dio *dio, return dio->pages[sdio->head]; } +/* + * Warn about a page cache invalidation failure during a direct io write. + */ +void dio_warn_stale_pagecache(struct file *filp) +{ + static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); + char pathname[128]; + struct inode *inode = file_inode(filp); + char *path; + + errseq_set(&inode->i_mapping->wb_err, -EIO); + if (__ratelimit(&_rs)) { + path = file_path(filp, pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n"); + pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, + current->comm); + } +} + /** * dio_complete() - called when all DIO BIO I/O has been completed * @offset: the byte offset in the file of the completed operation @@ -225,7 +252,7 @@ static inline struct page *dio_get_page(struct dio *dio, * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) +static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) { loff_t offset = dio->iocb->ki_pos; ssize_t transferred = 0; @@ -259,33 +286,38 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) if (ret == 0) ret = transferred; + if (dio->end_io) { + // XXX: ki_pos?? + err = dio->end_io(dio->iocb, offset, ret, dio->private); + if (err) + ret = err; + } + /* * Try again to invalidate clean pages which might have been cached by * non-direct readahead, or faulted in by get_user_pages() if the source * of the write was an mmap'ed region of the file we're writing. Either * one is a pretty crazy thing to do, so we don't support it 100%. If * this invalidation fails, tough, the write still worked... + * + * And this page cache invalidation has to be after dio->end_io(), as + * some filesystems convert unwritten extents to real allocations in + * end_io() when necessary, otherwise a racing buffer read would cache + * zeros from unwritten extents. */ - if (ret > 0 && dio->op == REQ_OP_WRITE && + if (flags & DIO_COMPLETE_INVALIDATE && + ret > 0 && dio->op == REQ_OP_WRITE && dio->inode->i_mapping->nrpages) { err = invalidate_inode_pages2_range(dio->inode->i_mapping, offset >> PAGE_SHIFT, (offset + ret - 1) >> PAGE_SHIFT); - WARN_ON_ONCE(err); - } - - if (dio->end_io) { - - // XXX: ki_pos?? - err = dio->end_io(dio->iocb, offset, ret, dio->private); if (err) - ret = err; + dio_warn_stale_pagecache(dio->iocb->ki_filp); } - if (!(dio->flags & DIO_SKIP_DIO_COUNT)) - inode_dio_end(dio->inode); + inode_dio_end(dio->inode); - if (is_async) { + if (flags & DIO_COMPLETE_ASYNC) { /* * generic_write_sync expects ki_pos to have been updated * already, but the submission path only does this for @@ -306,7 +338,7 @@ static void dio_aio_complete_work(struct work_struct *work) { struct dio *dio = container_of(work, struct dio, complete_work); - dio_complete(dio, 0, true); + dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE); } static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio); @@ -348,7 +380,7 @@ static void dio_bio_end_aio(struct bio *bio) queue_work(dio->inode->i_sb->s_dio_done_wq, &dio->complete_work); } else { - dio_complete(dio, 0, true); + dio_complete(dio, 0, DIO_COMPLETE_ASYNC); } } } @@ -486,7 +518,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie)) + !blk_poll(dio->bio_disk->queue, dio->bio_cookie)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); @@ -866,7 +898,8 @@ out: */ if (sdio->boundary) { ret = dio_send_cur_page(dio, sdio, map_bh); - dio_bio_submit(dio, sdio); + if (sdio->bio) + dio_bio_submit(dio, sdio); put_page(sdio->cur_page); sdio->cur_page = NULL; } @@ -1140,13 +1173,13 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) { - unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); + unsigned i_blkbits = READ_ONCE(inode->i_blkbits); unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; - size_t count = iov_iter_count(iter); + const size_t count = iov_iter_count(iter); loff_t offset = iocb->ki_pos; - loff_t end = offset + count; + const loff_t end = offset + count; struct dio *dio; struct dio_submit sdio = { 0, }; struct buffer_head map_bh = { 0, }; @@ -1167,7 +1200,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, } /* watch out for a 0 len io from a tricksy fs */ - if (iov_iter_rw(iter) == READ && !iov_iter_count(iter)) + if (iov_iter_rw(iter) == READ && !count) return 0; dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); @@ -1218,8 +1251,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, */ if (is_sync_kiocb(iocb)) dio->is_async = false; - else if (!(dio->flags & DIO_ASYNC_EXTEND) && - iov_iter_rw(iter) == WRITE && end > i_size_read(inode)) + else if (iov_iter_rw(iter) == WRITE && end > i_size_read(inode)) dio->is_async = false; else dio->is_async = true; @@ -1240,8 +1272,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, */ if (dio->is_async && iov_iter_rw(iter) == WRITE) { retval = 0; - if ((iocb->ki_filp->f_flags & O_DSYNC) || - IS_SYNC(iocb->ki_filp->f_mapping->host)) + if (iocb->ki_flags & IOCB_DSYNC) retval = dio_set_defer_completion(dio); else if (!dio->inode->i_sb->s_dio_done_wq) { /* @@ -1264,8 +1295,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, /* * Will be decremented at I/O completion time. */ - if (!(dio->flags & DIO_SKIP_DIO_COUNT)) - inode_dio_begin(inode); + inode_dio_begin(inode); retval = 0; sdio.blkbits = blkbits; @@ -1285,8 +1315,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->should_dirty = (iter->type == ITER_IOVEC); sdio.iter = iter; - sdio.final_block_in_request = - (offset + iov_iter_count(iter)) >> blkbits; + sdio.final_block_in_request = end >> blkbits; /* * In case of non-aligned buffers, we may need 2 more @@ -1359,7 +1388,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio_await_completion(dio); if (drop_refcount(dio) == 0) { - retval = dio_complete(dio, retval, false); + retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE); } else BUG_ON(retval != -EIOCBQUEUED); |