Diffstat (limited to 'fs/direct-io.c')
 -rw-r--r--  fs/direct-io.c  89
 1 file changed, 59 insertions(+), 30 deletions(-)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 62cf812ed0e5..874607bb6e02 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -45,6 +45,12 @@
#define DIO_PAGES 64
/*
+ * Flags for dio_complete()
+ */
+#define DIO_COMPLETE_ASYNC 0x01 /* This is async IO */
+#define DIO_COMPLETE_INVALIDATE 0x02 /* Can invalidate pages */
+
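These two flags replace the old "bool is_async" argument of dio_complete(). As the
later hunks show, DIO_COMPLETE_INVALIDATE is only requested from contexts that are
allowed to sleep, since invalidate_inode_pages2_range() may block; the bio
completion handler, which can run in interrupt context, asks for async completion
only:

    /* deferred to process context via the sb's s_dio_done_wq workqueue */
    dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE);

    /* called straight from dio_bio_end_aio(), possibly in interrupt context */
    dio_complete(dio, 0, DIO_COMPLETE_ASYNC);

    /* synchronous submitter completing in its own process context */
    retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE);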
+/*
* This code generally works in units of "dio_blocks". A dio_block is
* somewhere between the hard sector size and the filesystem block size. it
* is determined on a per-invocation basis. When talking to the filesystem
@@ -213,6 +219,27 @@ static inline struct page *dio_get_page(struct dio *dio,
return dio->pages[sdio->head];
}
+/*
+ * Warn about a page cache invalidation failure during a direct io write.
+ */
+void dio_warn_stale_pagecache(struct file *filp)
+{
+ static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
+ char pathname[128];
+ struct inode *inode = file_inode(filp);
+ char *path;
+
+ errseq_set(&inode->i_mapping->wb_err, -EIO);
+ if (__ratelimit(&_rs)) {
+ path = file_path(filp, pathname, sizeof(pathname));
+ if (IS_ERR(path))
+ path = "(unknown)";
+ pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n");
+ pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
+ current->comm);
+ }
+}
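Note that errseq_set() does more than accompany the rate-limited message: it marks
the mapping with -EIO, so the failure can still be observed by a later
data-integrity call on the file. A rough sketch of where that is picked up, using
the generic writeback-error helpers (this plumbing is not part of this patch, and
example_fsync is a made-up name):

    static int example_fsync(struct file *file, loff_t start, loff_t end,
                             int datasync)
    {
        /* file_write_and_wait_range() ends with file_check_and_advance_wb_err(),
         * which samples mapping->wb_err, so the -EIO recorded by
         * dio_warn_stale_pagecache() is reported to fsync() once. */
        return file_write_and_wait_range(file, start, end);
    }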
+
/**
* dio_complete() - called when all DIO BIO I/O has been completed
* @offset: the byte offset in the file of the completed operation
@@ -225,7 +252,7 @@ static inline struct page *dio_get_page(struct dio *dio,
* filesystems can use it to hold additional state between get_block calls and
* dio_complete.
*/
-static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
+static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
{
loff_t offset = dio->iocb->ki_pos;
ssize_t transferred = 0;
@@ -259,33 +286,38 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
if (ret == 0)
ret = transferred;
+ if (dio->end_io) {
+ // XXX: ki_pos??
+ err = dio->end_io(dio->iocb, offset, ret, dio->private);
+ if (err)
+ ret = err;
+ }
+
/*
* Try again to invalidate clean pages which might have been cached by
* non-direct readahead, or faulted in by get_user_pages() if the source
* of the write was an mmap'ed region of the file we're writing. Either
* one is a pretty crazy thing to do, so we don't support it 100%. If
* this invalidation fails, tough, the write still worked...
+ *
+ * And this page cache invalidation has to be after dio->end_io(), as
+ * some filesystems convert unwritten extents to real allocations in
+ * end_io() when necessary, otherwise a racing buffer read would cache
+ * zeros from unwritten extents.
*/
- if (ret > 0 && dio->op == REQ_OP_WRITE &&
+ if (flags & DIO_COMPLETE_INVALIDATE &&
+ ret > 0 && dio->op == REQ_OP_WRITE &&
dio->inode->i_mapping->nrpages) {
err = invalidate_inode_pages2_range(dio->inode->i_mapping,
offset >> PAGE_SHIFT,
(offset + ret - 1) >> PAGE_SHIFT);
- WARN_ON_ONCE(err);
- }
-
- if (dio->end_io) {
-
- // XXX: ki_pos??
- err = dio->end_io(dio->iocb, offset, ret, dio->private);
if (err)
- ret = err;
+ dio_warn_stale_pagecache(dio->iocb->ki_filp);
}
- if (!(dio->flags & DIO_SKIP_DIO_COUNT))
- inode_dio_end(dio->inode);
+ inode_dio_end(dio->inode);
- if (is_async) {
+ if (flags & DIO_COMPLETE_ASYNC) {
/*
* generic_write_sync expects ki_pos to have been updated
* already, but the submission path only does this for
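The "mmap'ed region of the file we're writing" case mentioned in the comment above
is easy to reproduce from userspace. A contrived sketch (file name and sizes are
made up; assumes "testfile" already exists, is at least 8 KiB long, and lives on a
filesystem that supports O_DIRECT):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("testfile", O_RDWR | O_DIRECT);
        if (fd < 0) { perror("open"); return 1; }

        /* map the first page of the file... */
        void *map = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
        if (map == MAP_FAILED) { perror("mmap"); return 1; }

        /* ...and use that mapping as the source buffer of a direct write to
         * the same file: get_user_pages() faults the mapped page into the
         * page cache, which is exactly what dio_complete() then has to try
         * to invalidate again. */
        if (pwrite(fd, map, 4096, 4096) < 0)
            perror("pwrite");

        munmap(map, 4096);
        close(fd);
        return 0;
    }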
@@ -306,7 +338,7 @@ static void dio_aio_complete_work(struct work_struct *work)
{
struct dio *dio = container_of(work, struct dio, complete_work);
- dio_complete(dio, 0, true);
+ dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE);
}
static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio);
@@ -348,7 +380,7 @@ static void dio_bio_end_aio(struct bio *bio)
queue_work(dio->inode->i_sb->s_dio_done_wq,
&dio->complete_work);
} else {
- dio_complete(dio, 0, true);
+ dio_complete(dio, 0, DIO_COMPLETE_ASYNC);
}
}
}
@@ -486,7 +518,7 @@ static struct bio *dio_await_one(struct dio *dio)
dio->waiter = current;
spin_unlock_irqrestore(&dio->bio_lock, flags);
if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
- !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie))
+ !blk_poll(dio->bio_disk->queue, dio->bio_cookie))
io_schedule();
/* wake up sets us TASK_RUNNING */
spin_lock_irqsave(&dio->bio_lock, flags);
@@ -866,7 +898,8 @@ out:
*/
if (sdio->boundary) {
ret = dio_send_cur_page(dio, sdio, map_bh);
- dio_bio_submit(dio, sdio);
+ if (sdio->bio)
+ dio_bio_submit(dio, sdio);
put_page(sdio->cur_page);
sdio->cur_page = NULL;
}
@@ -1140,13 +1173,13 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
get_block_t get_block, dio_iodone_t end_io,
dio_submit_t submit_io, int flags)
{
- unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
+ unsigned i_blkbits = READ_ONCE(inode->i_blkbits);
unsigned blkbits = i_blkbits;
unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL;
- size_t count = iov_iter_count(iter);
+ const size_t count = iov_iter_count(iter);
loff_t offset = iocb->ki_pos;
- loff_t end = offset + count;
+ const loff_t end = offset + count;
struct dio *dio;
struct dio_submit sdio = { 0, };
struct buffer_head map_bh = { 0, };
@@ -1167,7 +1200,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
}
/* watch out for a 0 len io from a tricksy fs */
- if (iov_iter_rw(iter) == READ && !iov_iter_count(iter))
+ if (iov_iter_rw(iter) == READ && !count)
return 0;
dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
@@ -1218,8 +1251,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
*/
if (is_sync_kiocb(iocb))
dio->is_async = false;
- else if (!(dio->flags & DIO_ASYNC_EXTEND) &&
- iov_iter_rw(iter) == WRITE && end > i_size_read(inode))
+ else if (iov_iter_rw(iter) == WRITE && end > i_size_read(inode))
dio->is_async = false;
else
dio->is_async = true;
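One visible effect of this hunk: an AIO O_DIRECT write that extends i_size is now
always completed synchronously in the submission path, with no DIO_ASYNC_EXTEND
opt-out for filesystems. A hypothetical userspace illustration with libaio (file
name and sizes made up; build with "gcc example.c -laio"):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <libaio.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        io_context_t ctx = 0;
        struct iocb cb, *cbs[1] = { &cb };
        struct io_event ev;
        void *buf;

        int fd = open("testfile", O_RDWR | O_CREAT | O_DIRECT, 0644);
        if (fd < 0 || io_setup(1, &ctx) < 0) {
            fprintf(stderr, "setup failed\n");
            return 1;
        }

        if (posix_memalign(&buf, 4096, 4096)) return 1;
        memset(buf, 'x', 4096);

        /* one block written past EOF of the (empty) file: end > i_size, so
         * the kernel falls back to synchronous completion for this iocb */
        io_prep_pwrite(&cb, fd, buf, 4096, 0);
        if (io_submit(ctx, 1, cbs) != 1) {
            fprintf(stderr, "io_submit failed\n");
            return 1;
        }

        /* the extending write is typically finished by the time io_submit()
         * returns, so the completion event is already waiting here */
        io_getevents(ctx, 1, 1, &ev, NULL);
        printf("res=%ld\n", (long)ev.res);

        io_destroy(ctx);
        close(fd);
        return 0;
    }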
@@ -1240,8 +1272,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
*/
if (dio->is_async && iov_iter_rw(iter) == WRITE) {
retval = 0;
- if ((iocb->ki_filp->f_flags & O_DSYNC) ||
- IS_SYNC(iocb->ki_filp->f_mapping->host))
+ if (iocb->ki_flags & IOCB_DSYNC)
retval = dio_set_defer_completion(dio);
else if (!dio->inode->i_sb->s_dio_done_wq) {
/*
@@ -1264,8 +1295,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
/*
* Will be decremented at I/O completion time.
*/
- if (!(dio->flags & DIO_SKIP_DIO_COUNT))
- inode_dio_begin(inode);
+ inode_dio_begin(inode);
retval = 0;
sdio.blkbits = blkbits;
@@ -1285,8 +1315,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
dio->should_dirty = (iter->type == ITER_IOVEC);
sdio.iter = iter;
- sdio.final_block_in_request =
- (offset + iov_iter_count(iter)) >> blkbits;
+ sdio.final_block_in_request = end >> blkbits;
/*
* In case of non-aligned buffers, we may need 2 more
@@ -1359,7 +1388,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
dio_await_completion(dio);
if (drop_refcount(dio) == 0) {
- retval = dio_complete(dio, retval, false);
+ retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE);
} else
BUG_ON(retval != -EIOCBQUEUED);
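For context, do_blockdev_direct_IO() is reached through the __blockdev_direct_IO()
wrapper: a block-device-backed filesystem's ->direct_IO method hands the iov_iter
to blockdev_direct_IO(), which fills in the bdev and the default flags. A sketch
along the lines of the in-tree ext2 implementation (the example_* names are
placeholders):

    static ssize_t example_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
    {
        struct inode *inode = file_inode(iocb->ki_filp);

        /* example_get_block is the filesystem's get_block_t callback; the
         * wrapper passes NULL end_io/submit_io and DIO_LOCKING | DIO_SKIP_HOLES */
        return blockdev_direct_IO(iocb, inode, iter, example_get_block);
    }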