Diffstat (limited to 'fs/direct-io.c')
| -rw-r--r-- | fs/direct-io.c | 677 |
1 file changed, 345 insertions(+), 332 deletions(-)
diff --git a/fs/direct-io.c b/fs/direct-io.c index 7ab90f5081ee..2267f5ae7f77 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * fs/direct-io.c * @@ -36,16 +37,22 @@ #include <linux/rwsem.h> #include <linux/uio.h> #include <linux/atomic.h> -#include <linux/prefetch.h> -#include <linux/aio.h> + +#include "internal.h" /* - * How many user pages to map in one call to get_user_pages(). This determines - * the size of a structure in the slab cache + * How many user pages to map in one call to iov_iter_extract_pages(). This + * determines the size of a structure in the slab cache */ #define DIO_PAGES 64 /* + * Flags for dio_complete() + */ +#define DIO_COMPLETE_ASYNC 0x01 /* This is async IO */ +#define DIO_COMPLETE_INVALIDATE 0x02 /* Can invalidate pages */ + +/* * This code generally works in units of "dio_blocks". A dio_block is * somewhere between the hard sector size and the filesystem block size. it * is determined on a per-invocation basis. When talking to the filesystem @@ -71,16 +78,13 @@ struct dio_submit { been performed at the start of a write */ int pages_in_io; /* approximate total IO pages */ - size_t size; /* total request size (doesn't change)*/ sector_t block_in_file; /* Current offset into the underlying file in dio_block units. */ unsigned blocks_available; /* At block_in_file. changes */ int reap_counter; /* rate limit reaping */ sector_t final_block_in_request;/* doesn't change */ - unsigned first_block_in_page; /* doesn't change, Used only once */ int boundary; /* prev block is at a boundary */ get_block_t *get_block; /* block mapping function */ - dio_submit_t *submit_io; /* IO submition function */ loff_t logical_offset_in_bio; /* current first logical block in bio */ sector_t final_block_in_bio; /* current final block in bio + 1 */ @@ -98,35 +102,34 @@ struct dio_submit { sector_t cur_page_block; /* Where it starts */ loff_t cur_page_fs_offset; /* Offset in file */ - /* - * Page fetching state. These variables belong to dio_refill_pages(). - */ - int curr_page; /* changes */ - int total_pages; /* doesn't change */ - unsigned long curr_user_address;/* changes */ - + struct iov_iter *iter; /* * Page queue. These variables belong to dio_refill_pages() and * dio_get_page(). */ unsigned head; /* next page to process */ unsigned tail; /* last valid page + 1 */ + size_t from, to; }; /* dio_state communicated between submission path and end_io */ struct dio { int flags; /* doesn't change */ - int rw; + blk_opf_t opf; /* request operation type and flags */ + struct gendisk *bio_disk; struct inode *inode; loff_t i_size; /* i_size when submitted */ dio_iodone_t *end_io; /* IO completion function */ + bool is_pinned; /* T if we have pins on the pages */ void *private; /* copy from map_bh.b_private */ /* BIO completion state */ spinlock_t bio_lock; /* protects BIO fields below */ - int page_errors; /* errno from get_user_pages() */ + int page_errors; /* err from iov_iter_extract_pages() */ int is_async; /* is IO async ? */ + bool defer_completion; /* defer AIO completion to workqueue? */ + bool should_dirty; /* if pages should be dirtied */ int io_error; /* IO error in completion path */ unsigned long refcount; /* direct_io_worker() and bios */ struct bio *bio_list; /* singly linked via bi_private */ @@ -141,10 +144,13 @@ struct dio { * allocation time. Don't add new fields after pages[] unless you * wish that they not be zeroed. 
*/ - struct page *pages[DIO_PAGES]; /* page buffer */ + union { + struct page *pages[DIO_PAGES]; /* page buffer */ + struct work_struct complete_work;/* deferred AIO completion */ + }; } ____cacheline_aligned_in_smp; -static struct kmem_cache *dio_cache __read_mostly; +static struct kmem_cache *dio_cache __ro_after_init; /* * How many pages are in the queue? @@ -159,18 +165,14 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio) */ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) { - int ret; - int nr_pages; + struct page **pages = dio->pages; + const enum req_op dio_op = dio->opf & REQ_OP_MASK; + ssize_t ret; - nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES); - ret = get_user_pages_fast( - sdio->curr_user_address, /* Where from? */ - nr_pages, /* How many pages? */ - dio->rw == READ, /* Write to memory? */ - &dio->pages[0]); /* Put results here */ + ret = iov_iter_extract_pages(sdio->iter, &pages, LONG_MAX, + DIO_PAGES, 0, &sdio->from); - if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { - struct page *page = ZERO_PAGE(0); + if (ret < 0 && sdio->blocks_available && dio_op == REQ_OP_WRITE) { /* * A memory fault, but the filesystem has some outstanding * mapped blocks. We need to use those blocks up to avoid @@ -178,33 +180,32 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) */ if (dio->page_errors == 0) dio->page_errors = ret; - page_cache_get(page); - dio->pages[0] = page; + dio->pages[0] = ZERO_PAGE(0); sdio->head = 0; sdio->tail = 1; - ret = 0; - goto out; + sdio->from = 0; + sdio->to = PAGE_SIZE; + return 0; } if (ret >= 0) { - sdio->curr_user_address += ret * PAGE_SIZE; - sdio->curr_page += ret; + ret += sdio->from; sdio->head = 0; - sdio->tail = ret; - ret = 0; + sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE; + sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1; + return 0; } -out: return ret; } /* * Get another userspace page. Returns an ERR_PTR on error. Pages are - * buffered inside the dio so that we can call get_user_pages() against a - * decent number of pages, less frequently. To provide nicer use of the - * L1 cache. + * buffered inside the dio so that we can call iov_iter_extract_pages() + * against a decent number of pages, less frequently. To provide nicer use of + * the L1 cache. */ static inline struct page *dio_get_page(struct dio *dio, - struct dio_submit *sdio) + struct dio_submit *sdio) { if (dio_pages_present(sdio) == 0) { int ret; @@ -214,25 +215,38 @@ static inline struct page *dio_get_page(struct dio *dio, return ERR_PTR(ret); BUG_ON(dio_pages_present(sdio) == 0); } - return dio->pages[sdio->head++]; + return dio->pages[sdio->head]; } -/** +static void dio_pin_page(struct dio *dio, struct page *page) +{ + if (dio->is_pinned) + folio_add_pin(page_folio(page)); +} + +static void dio_unpin_page(struct dio *dio, struct page *page) +{ + if (dio->is_pinned) + unpin_user_page(page); +} + +/* * dio_complete() - called when all DIO BIO I/O has been completed - * @offset: the byte offset in the file of the completed operation * - * This releases locks as dictated by the locking type, lets interested parties - * know that a DIO operation has completed, and calculates the resulting return - * code for the operation. + * This drops i_dio_count, lets interested parties know that a DIO operation + * has completed, and calculates the resulting return code for the operation. * * It lets the filesystem know if it registered an interest earlier via * get_block. 
Pass the private field of the map buffer_head so that * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async) +static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) { + const enum req_op dio_op = dio->opf & REQ_OP_MASK; + loff_t offset = dio->iocb->ki_pos; ssize_t transferred = 0; + int err; /* * AIO submission can race with bio completion to get here while @@ -247,8 +261,12 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is transferred = dio->result; /* Check for short read case */ - if ((dio->rw == READ) && ((offset + transferred) > dio->i_size)) + if (dio_op == REQ_OP_READ && + ((offset + transferred) > dio->i_size)) transferred = dio->i_size - offset; + /* ignore EFAULT if some IO has been done */ + if (unlikely(ret == -EFAULT) && transferred) + ret = 0; } if (ret == 0) @@ -258,27 +276,67 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is if (ret == 0) ret = transferred; - if (dio->end_io && dio->result) { - dio->end_io(dio->iocb, offset, transferred, - dio->private, ret, is_async); - } else { - inode_dio_done(dio->inode); - if (is_async) - aio_complete(dio->iocb, ret, 0); + if (dio->end_io) { + // XXX: ki_pos?? + err = dio->end_io(dio->iocb, offset, ret, dio->private); + if (err) + ret = err; } + /* + * Try again to invalidate clean pages which might have been cached by + * non-direct readahead, or faulted in by get_user_pages() if the source + * of the write was an mmap'ed region of the file we're writing. Either + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + * + * And this page cache invalidation has to be after dio->end_io(), as + * some filesystems convert unwritten extents to real allocations in + * end_io() when necessary, otherwise a racing buffer read would cache + * zeros from unwritten extents. + */ + if (flags & DIO_COMPLETE_INVALIDATE && + ret > 0 && dio_op == REQ_OP_WRITE) + kiocb_invalidate_post_direct_write(dio->iocb, ret); + + inode_dio_end(dio->inode); + + if (flags & DIO_COMPLETE_ASYNC) { + /* + * generic_write_sync expects ki_pos to have been updated + * already, but the submission path only does this for + * synchronous I/O. + */ + dio->iocb->ki_pos += transferred; + + if (ret > 0 && dio_op == REQ_OP_WRITE) + ret = generic_write_sync(dio->iocb, ret); + dio->iocb->ki_complete(dio->iocb, ret); + } + + kmem_cache_free(dio_cache, dio); return ret; } -static int dio_bio_complete(struct dio *dio, struct bio *bio); +static void dio_aio_complete_work(struct work_struct *work) +{ + struct dio *dio = container_of(work, struct dio, complete_work); + + dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE); +} + +static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio); + /* * Asynchronous IO callback. 
*/ -static void dio_bio_end_aio(struct bio *bio, int error) +static void dio_bio_end_aio(struct bio *bio) { struct dio *dio = bio->bi_private; + const enum req_op dio_op = dio->opf & REQ_OP_MASK; unsigned long remaining; unsigned long flags; + bool defer_completion = false; /* cleanup the bio */ dio_bio_complete(dio, bio); @@ -290,8 +348,25 @@ static void dio_bio_end_aio(struct bio *bio, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - dio_complete(dio, dio->iocb->ki_pos, 0, true); - kmem_cache_free(dio_cache, dio); + /* + * Defer completion when defer_completion is set or + * when the inode has pages mapped and this is AIO write. + * We need to invalidate those pages because there is a + * chance they contain stale data in the case buffered IO + * went in between AIO submission and completion into the + * same region. + */ + if (dio->result) + defer_completion = dio->defer_completion || + (dio_op == REQ_OP_WRITE && + dio->inode->i_mapping->nrpages); + if (defer_completion) { + INIT_WORK(&dio->complete_work, dio_aio_complete_work); + queue_work(dio->inode->i_sb->s_dio_done_wq, + &dio->complete_work); + } else { + dio_complete(dio, 0, DIO_COMPLETE_ASYNC); + } } } @@ -302,7 +377,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) * During I/O bi_private points at the dio. After I/O, bi_private is used to * implement a singly-linked list of completed BIOs, at dio->bio_list. */ -static void dio_bio_end_io(struct bio *bio, int error) +static void dio_bio_end_io(struct bio *bio) { struct dio *dio = bio->bi_private; unsigned long flags; @@ -315,26 +390,6 @@ static void dio_bio_end_io(struct bio *bio, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); } -/** - * dio_end_io - handle the end io action for the given bio - * @bio: The direct io bio thats being completed - * @error: Error if there was one - * - * This is meant to be called by any filesystem that uses their own dio_submit_t - * so that the DIO specific endio actions are dealt with after the filesystem - * has done it's completion work. - */ -void dio_end_io(struct bio *bio, int error) -{ - struct dio *dio = bio->bi_private; - - if (dio->is_async) - dio_bio_end_aio(bio, error); - else - dio_bio_end_io(bio, error); -} -EXPORT_SYMBOL_GPL(dio_end_io); - static inline void dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, struct block_device *bdev, @@ -343,17 +398,18 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, struct bio *bio; /* - * bio_alloc() is guaranteed to return a bio when called with - * __GFP_WAIT and we request a valid number of vectors. + * bio_alloc() is guaranteed to return a bio when allowed to sleep and + * we request a valid number of vectors. 
*/ - bio = bio_alloc(GFP_KERNEL, nr_vecs); - - bio->bi_bdev = bdev; - bio->bi_sector = first_sector; + bio = bio_alloc(bdev, nr_vecs, dio->opf, GFP_KERNEL); + bio->bi_iter.bi_sector = first_sector; if (dio->is_async) bio->bi_end_io = dio_bio_end_aio; else bio->bi_end_io = dio_bio_end_io; + if (dio->is_pinned) + bio_set_flag(bio, BIO_PAGE_PINNED); + bio->bi_write_hint = file_inode(dio->iocb->ki_filp)->i_write_hint; sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; @@ -368,6 +424,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, */ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) { + const enum req_op dio_op = dio->opf & REQ_OP_MASK; struct bio *bio = sdio->bio; unsigned long flags; @@ -377,14 +434,12 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) dio->refcount++; spin_unlock_irqrestore(&dio->bio_lock, flags); - if (dio->is_async && dio->rw == READ) + if (dio->is_async && dio_op == REQ_OP_READ && dio->should_dirty) bio_set_pages_dirty(bio); - if (sdio->submit_io) - sdio->submit_io(dio->rw, bio, dio->inode, - sdio->logical_offset_in_bio); - else - submit_bio(dio->rw, bio); + dio->bio_disk = bio->bi_bdev->bd_disk; + + submit_bio(bio); sdio->bio = NULL; sdio->boundary = 0; @@ -396,15 +451,17 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) */ static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) { - while (dio_pages_present(sdio)) - page_cache_release(dio_get_page(dio, sdio)); + if (dio->is_pinned) + unpin_user_pages(dio->pages + sdio->head, + sdio->tail - sdio->head); + sdio->head = sdio->tail; } /* * Wait for the next BIO to complete. Remove it and return it. NULL is * returned once all BIOs have been completed. This must only be called once * all bios have been issued so that dio->refcount can only decrease. This - * requires that that the caller hold a reference on the dio. + * requires that the caller hold a reference on the dio. */ static struct bio *dio_await_one(struct dio *dio) { @@ -423,7 +480,7 @@ static struct bio *dio_await_one(struct dio *dio) __set_current_state(TASK_UNINTERRUPTIBLE); dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); - io_schedule(); + blk_io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); dio->waiter = NULL; @@ -439,28 +496,26 @@ static struct bio *dio_await_one(struct dio *dio) /* * Process one completed BIO. No locks are held. */ -static int dio_bio_complete(struct dio *dio, struct bio *bio) +static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec; - unsigned i; - - if (!uptodate) - dio->io_error = -EIO; + blk_status_t err = bio->bi_status; + const enum req_op dio_op = dio->opf & REQ_OP_MASK; + bool should_dirty = dio_op == REQ_OP_READ && dio->should_dirty; + + if (err) { + if (err == BLK_STS_AGAIN && (bio->bi_opf & REQ_NOWAIT)) + dio->io_error = -EAGAIN; + else + dio->io_error = -EIO; + } - if (dio->is_async && dio->rw == READ) { + if (dio->is_async && should_dirty) { bio_check_pages_dirty(bio); /* transfers ownership */ } else { - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (dio->rw == READ && !PageCompound(page)) - set_page_dirty_lock(page); - page_cache_release(page); - } + bio_release_pages(bio, should_dirty); bio_put(bio); } - return uptodate ? 
0 : -EIO; + return err; } /* @@ -501,7 +556,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) bio = dio->bio_list; dio->bio_list = bio->bi_private; spin_unlock_irqrestore(&dio->bio_lock, flags); - ret2 = dio_bio_complete(dio, bio); + ret2 = blk_status_to_errno(dio_bio_complete(dio, bio)); if (ret == 0) ret = ret2; } @@ -510,10 +565,22 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) return ret; } +static int dio_set_defer_completion(struct dio *dio) +{ + struct super_block *sb = dio->inode->i_sb; + + if (dio->defer_completion) + return 0; + dio->defer_completion = true; + if (!sb->s_dio_done_wq) + return sb_init_dio_done_wq(sb); + return 0; +} + /* * Call into the fs to map some more disk blocks. We record the current number * of available blocks at sdio->blocks_available. These are in units of the - * fs blocksize, (1 << inode->i_blkbits). + * fs blocksize, i_blocksize(inode). * * The fs is allowed to map lots of blocks at once. If it wants to do that, * it uses the passed inode-relative block number as the file offset, as usual. @@ -536,12 +603,14 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, struct buffer_head *map_bh) { + const enum req_op dio_op = dio->opf & REQ_OP_MASK; int ret; sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ unsigned long fs_count; /* Number of filesystem-sized blocks */ int create; unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor; + loff_t i_size; /* * If there was a memory error and we've overwritten all the @@ -559,20 +628,20 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, map_bh->b_size = fs_count << i_blkbits; /* - * For writes inside i_size on a DIO_SKIP_HOLES filesystem we - * forbid block creations: only overwrites are permitted. - * We will return early to the caller once we see an - * unmapped buffer head returned, and the caller will fall - * back to buffered I/O. + * For writes that could fill holes inside i_size on a + * DIO_SKIP_HOLES filesystem we forbid block creations: only + * overwrites are permitted. We will return early to the caller + * once we see an unmapped buffer head returned, and the caller + * will fall back to buffered I/O. * * Otherwise the decision is left to the get_blocks method, * which may decide to handle it or also return an unmapped * buffer head. */ - create = dio->rw & WRITE; + create = dio_op == REQ_OP_WRITE; if (dio->flags & DIO_SKIP_HOLES) { - if (sdio->block_in_file < (i_size_read(dio->inode) >> - sdio->blkbits)) + i_size = i_size_read(dio->inode); + if (i_size && fs_startblk <= (i_size - 1) >> i_blkbits) create = 0; } @@ -581,6 +650,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, /* Store for completion */ dio->private = map_bh->b_private; + + if (ret == 0 && buffer_defer_completion(map_bh)) + ret = dio_set_defer_completion(dio); } return ret; } @@ -598,8 +670,7 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, if (ret) goto out; sector = start_sector << (sdio->blkbits - 9); - nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); - nr_pages = min(nr_pages, BIO_MAX_PAGES); + nr_pages = bio_max_segs(sdio->pages_in_io); BUG_ON(nr_pages <= 0); dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); sdio->boundary = 0; @@ -614,7 +685,7 @@ out: * * Return zero on success. 
Non-zero means the caller needs to start a new BIO. */ -static inline int dio_bio_add_page(struct dio_submit *sdio) +static inline int dio_bio_add_page(struct dio *dio, struct dio_submit *sdio) { int ret; @@ -626,7 +697,7 @@ static inline int dio_bio_add_page(struct dio_submit *sdio) */ if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE) sdio->pages_in_io--; - page_cache_get(sdio->cur_page); + dio_pin_page(dio, sdio->cur_page); sdio->final_block_in_bio = sdio->cur_page_block + (sdio->cur_page_len >> sdio->blkbits); ret = 0; @@ -654,7 +725,7 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, if (sdio->bio) { loff_t cur_offset = sdio->cur_page_fs_offset; loff_t bio_next_offset = sdio->logical_offset_in_bio + - sdio->bio->bi_size; + sdio->bio->bi_iter.bi_size; /* * See whether this new request is contiguous with the old. @@ -681,11 +752,11 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, goto out; } - if (dio_bio_add_page(sdio) != 0) { + if (dio_bio_add_page(dio, sdio) != 0) { dio_bio_submit(dio, sdio); ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); if (ret == 0) { - ret = dio_bio_add_page(sdio); + ret = dio_bio_add_page(dio, sdio); BUG_ON(ret != 0); } } @@ -715,9 +786,11 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, unsigned offset, unsigned len, sector_t blocknr, struct buffer_head *map_bh) { + const enum req_op dio_op = dio->opf & REQ_OP_MASK; int ret = 0; + int boundary = sdio->boundary; /* dio_send_cur_page may clear it */ - if (dio->rw & WRITE) { + if (dio_op == REQ_OP_WRITE) { /* * Read accounting is performed in submit_bio() */ @@ -740,13 +813,13 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, */ if (sdio->cur_page) { ret = dio_send_cur_page(dio, sdio, map_bh); - page_cache_release(sdio->cur_page); + dio_unpin_page(dio, sdio->cur_page); sdio->cur_page = NULL; if (ret) return ret; } - page_cache_get(page); /* It is in dio */ + dio_pin_page(dio, page); /* It is in dio */ sdio->cur_page = page; sdio->cur_page_offset = offset; sdio->cur_page_len = len; @@ -754,37 +827,20 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; out: /* - * If sdio->boundary then we want to schedule the IO now to + * If boundary then we want to schedule the IO now to * avoid metadata seeks. */ - if (sdio->boundary) { + if (boundary) { ret = dio_send_cur_page(dio, sdio, map_bh); - dio_bio_submit(dio, sdio); - page_cache_release(sdio->cur_page); + if (sdio->bio) + dio_bio_submit(dio, sdio); + dio_unpin_page(dio, sdio->cur_page); sdio->cur_page = NULL; } return ret; } /* - * Clean any dirty buffers in the blockdev mapping which alias newly-created - * file blocks. Only called for S_ISREG files - blockdevs do not set - * buffer_new - */ -static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh) -{ - unsigned i; - unsigned nblocks; - - nblocks = map_bh->b_size >> dio->inode->i_blkbits; - - for (i = 0; i < nblocks; i++) { - unmap_underlying_metadata(map_bh->b_bdev, - map_bh->b_blocknr + i); - } -} - -/* * If we are not writing the entire block and get_block() allocated * the block for us, we need to fill-in the unused portion of the * block with zeros. 
This happens only if user-buffer, fileoffset or @@ -847,24 +903,25 @@ static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio, static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, struct buffer_head *map_bh) { + const enum req_op dio_op = dio->opf & REQ_OP_MASK; const unsigned blkbits = sdio->blkbits; - const unsigned blocks_per_page = PAGE_SIZE >> blkbits; - struct page *page; - unsigned block_in_page; + const unsigned i_blkbits = blkbits + sdio->blkfactor; int ret = 0; - /* The I/O can start at any block offset within the first page */ - block_in_page = sdio->first_block_in_page; - while (sdio->block_in_file < sdio->final_block_in_request) { + struct page *page; + size_t from, to; + page = dio_get_page(dio, sdio); if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } + from = sdio->head ? 0 : sdio->from; + to = (sdio->head == sdio->tail - 1) ? sdio->to : PAGE_SIZE; + sdio->head++; - while (block_in_page < blocks_per_page) { - unsigned offset_in_page = block_in_page << blkbits; + while (from < to) { unsigned this_chunk_bytes; /* # of bytes mapped */ unsigned this_chunk_blocks; /* # of blocks */ unsigned u; @@ -878,18 +935,22 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, ret = get_more_blocks(dio, sdio, map_bh); if (ret) { - page_cache_release(page); + dio_unpin_page(dio, page); goto out; } if (!buffer_mapped(map_bh)) goto do_holes; sdio->blocks_available = - map_bh->b_size >> sdio->blkbits; + map_bh->b_size >> blkbits; sdio->next_block_for_io = map_bh->b_blocknr << sdio->blkfactor; - if (buffer_new(map_bh)) - clean_blockdev_aliases(dio, map_bh); + if (buffer_new(map_bh)) { + clean_bdev_aliases( + map_bh->b_bdev, + map_bh->b_blocknr, + map_bh->b_size >> i_blkbits); + } if (!sdio->blkfactor) goto do_holes; @@ -918,8 +979,8 @@ do_holes: loff_t i_size_aligned; /* AKPM: eargh, -ENOTBLK is a hack */ - if (dio->rw & WRITE) { - page_cache_release(page); + if (dio_op == REQ_OP_WRITE) { + dio_unpin_page(dio, page); return -ENOTBLK; } @@ -932,13 +993,13 @@ do_holes: if (sdio->block_in_file >= i_size_aligned >> blkbits) { /* We hit eof */ - page_cache_release(page); + dio_unpin_page(dio, page); goto out; } - zero_user(page, block_in_page << blkbits, - 1 << blkbits); + memzero_page(page, from, 1 << blkbits); sdio->block_in_file++; - block_in_page++; + from += 1 << blkbits; + dio->result += 1 << blkbits; goto next_block; } @@ -955,7 +1016,7 @@ do_holes: * can add to this page */ this_chunk_blocks = sdio->blocks_available; - u = (PAGE_SIZE - offset_in_page) >> blkbits; + u = (to - from) >> blkbits; if (this_chunk_blocks > u) this_chunk_blocks = u; u = sdio->final_block_in_request - sdio->block_in_file; @@ -967,18 +1028,19 @@ do_holes: if (this_chunk_blocks == sdio->blocks_available) sdio->boundary = buffer_boundary(map_bh); ret = submit_page_section(dio, sdio, page, - offset_in_page, + from, this_chunk_bytes, sdio->next_block_for_io, map_bh); if (ret) { - page_cache_release(page); + dio_unpin_page(dio, page); goto out; } sdio->next_block_for_io += this_chunk_blocks; sdio->block_in_file += this_chunk_blocks; - block_in_page += this_chunk_blocks; + from += this_chunk_bytes; + dio->result += this_chunk_bytes; sdio->blocks_available -= this_chunk_blocks; next_block: BUG_ON(sdio->block_in_file > sdio->final_block_in_request); @@ -986,9 +1048,8 @@ next_block: break; } - /* Drop the ref which was taken in get_user_pages() */ - page_cache_release(page); - block_in_page = 0; + /* Drop the pin which was taken in get_user_pages() */ + dio_unpin_page(dio, page); 
} out: return ret; @@ -1004,7 +1065,7 @@ static inline int drop_refcount(struct dio *dio) * operation. AIO can if it was a broken operation described above or * in fact if all the bios race to complete before we get here. In * that case dio_complete() translates the EIOCBQUEUED into the proper - * return code that the caller will hand to aio_complete(). + * return code that the caller will hand to ->complete(). * * This is managed by the bio_lock instead of being an atomic_t so that * completion paths can drop their ref and use the remaining count to @@ -1022,8 +1083,8 @@ static inline int drop_refcount(struct dio *dio) * The locking rules are governed by the flags parameter: * - if the flags value contains DIO_LOCKING we use a fancy locking * scheme for dumb filesystems. - * For writes this function is called under i_mutex and returns with - * i_mutex held, for reads, i_mutex is not held on entry, but it is + * For writes this function is called under i_rwsem and returns with + * i_rwsem held, for reads, i_rwsem is not held on entry, but it is * taken and dropped again before returning. * - if the flags value does NOT contain DIO_LOCKING we don't use any * internal locking but rather rely on the filesystem to synchronize @@ -1033,7 +1094,7 @@ static inline int drop_refcount(struct dio *dio) * counter before starting direct I/O, and decrement it once we are done. * Truncate can wait for it to reach zero to provide exclusion. It is * expected that filesystem provide exclusion between new direct I/O - * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, + * and truncates. For DIO_LOCKING filesystems this is done by i_rwsem, * but other filesystems need to take care of this on their own. * * NOTE: if you pass "sdio" to anything by pointer make sure that function @@ -1041,67 +1102,31 @@ static inline int drop_refcount(struct dio *dio) * individual fields and will generate much worse code. This is important * for the whole file. */ -static inline ssize_t -do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, - dio_submit_t submit_io, int flags) +ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, struct iov_iter *iter, + get_block_t get_block, dio_iodone_t end_io, + int flags) { - int seg; - size_t size; - unsigned long addr; - unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); + unsigned i_blkbits = READ_ONCE(inode->i_blkbits); unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; - loff_t end = offset; + const size_t count = iov_iter_count(iter); + loff_t offset = iocb->ki_pos; + const loff_t end = offset + count; struct dio *dio; - struct dio_submit sdio = { 0, }; - unsigned long user_addr; - size_t bytes; + struct dio_submit sdio = { NULL, }; struct buffer_head map_bh = { 0, }; struct blk_plug plug; - - if (rw & WRITE) - rw = WRITE_ODIRECT; - - /* - * Avoid references to bdev if not absolutely needed to give - * the early prefetch in the caller enough time. - */ - - if (offset & blocksize_mask) { - if (bdev) - blkbits = blksize_bits(bdev_logical_block_size(bdev)); - blocksize_mask = (1 << blkbits) - 1; - if (offset & blocksize_mask) - goto out; - } - - /* Check the memory alignment. 
Blocks cannot straddle pages */ - for (seg = 0; seg < nr_segs; seg++) { - addr = (unsigned long)iov[seg].iov_base; - size = iov[seg].iov_len; - end += size; - if (unlikely((addr & blocksize_mask) || - (size & blocksize_mask))) { - if (bdev) - blkbits = blksize_bits( - bdev_logical_block_size(bdev)); - blocksize_mask = (1 << blkbits) - 1; - if ((addr & blocksize_mask) || (size & blocksize_mask)) - goto out; - } - } + unsigned long align = offset | iov_iter_alignment(iter); /* watch out for a 0 len io from a tricksy fs */ - if (rw == READ && end == offset) + if (iov_iter_rw(iter) == READ && !count) return 0; dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); - retval = -ENOMEM; if (!dio) - goto out; + return -ENOMEM; /* * Believe it or not, zeroing out the page array caused a .5% * performance regression in a database benchmark. So, we take @@ -1110,58 +1135,100 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, memset(dio, 0, offsetof(struct dio, pages)); dio->flags = flags; - if (dio->flags & DIO_LOCKING) { - if (rw == READ) { - struct address_space *mapping = - iocb->ki_filp->f_mapping; - - /* will be released by direct_io_worker */ - mutex_lock(&inode->i_mutex); - - retval = filemap_write_and_wait_range(mapping, offset, - end - 1); - if (retval) { - mutex_unlock(&inode->i_mutex); - kmem_cache_free(dio_cache, dio); - goto out; - } - } + if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) { + /* will be released by direct_io_worker */ + inode_lock(inode); + } + dio->is_pinned = iov_iter_extract_will_pin(iter); + + /* Once we sampled i_size check for reads beyond EOF */ + dio->i_size = i_size_read(inode); + if (iov_iter_rw(iter) == READ && offset >= dio->i_size) { + retval = 0; + goto fail_dio; + } + + if (align & blocksize_mask) { + if (bdev) + blkbits = blksize_bits(bdev_logical_block_size(bdev)); + blocksize_mask = (1 << blkbits) - 1; + if (align & blocksize_mask) + goto fail_dio; + } + + if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) { + struct address_space *mapping = iocb->ki_filp->f_mapping; + + retval = filemap_write_and_wait_range(mapping, offset, end - 1); + if (retval) + goto fail_dio; } /* - * Will be decremented at I/O completion time. + * For file extending writes updating i_size before data writeouts + * complete can expose uninitialized blocks in dumb filesystems. + * In that case we need to wait for I/O completion even if asked + * for an asynchronous write. */ - atomic_inc(&inode->i_dio_count); + if (is_sync_kiocb(iocb)) + dio->is_async = false; + else if (iov_iter_rw(iter) == WRITE && end > i_size_read(inode)) + dio->is_async = false; + else + dio->is_async = true; + + dio->inode = inode; + if (iov_iter_rw(iter) == WRITE) { + dio->opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; + if (iocb->ki_flags & IOCB_NOWAIT) + dio->opf |= REQ_NOWAIT; + } else { + dio->opf = REQ_OP_READ; + } /* - * For file extending writes updating i_size before data - * writeouts complete can expose uninitialized blocks. So - * even for AIO, we need to wait for i/o to complete before - * returning in this case. + * For AIO O_(D)SYNC writes we need to defer completions to a workqueue + * so that we can call ->fsync. 
*/ - dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && - (end > i_size_read(inode))); + if (dio->is_async && iov_iter_rw(iter) == WRITE) { + retval = 0; + if (iocb_is_dsync(iocb)) + retval = dio_set_defer_completion(dio); + else if (!dio->inode->i_sb->s_dio_done_wq) { + /* + * In case of AIO write racing with buffered read we + * need to defer completion. We can't decide this now, + * however the workqueue needs to be initialized here. + */ + retval = sb_init_dio_done_wq(dio->inode->i_sb); + } + if (retval) + goto fail_dio; + } - retval = 0; + /* + * Will be decremented at I/O completion time. + */ + inode_dio_begin(inode); - dio->inode = inode; - dio->rw = rw; sdio.blkbits = blkbits; sdio.blkfactor = i_blkbits - blkbits; sdio.block_in_file = offset >> blkbits; sdio.get_block = get_block; dio->end_io = end_io; - sdio.submit_io = submit_io; sdio.final_block_in_bio = -1; sdio.next_block_for_io = -1; dio->iocb = iocb; - dio->i_size = i_size_read(inode); spin_lock_init(&dio->bio_lock); dio->refcount = 1; + dio->should_dirty = user_backed_iter(iter) && iov_iter_rw(iter) == READ; + sdio.iter = iter; + sdio.final_block_in_request = end >> blkbits; + /* * In case of non-aligned buffers, we may need 2 more * pages since we need to zero out first and last block. @@ -1169,52 +1236,18 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (unlikely(sdio.blkfactor)) sdio.pages_in_io = 2; - for (seg = 0; seg < nr_segs; seg++) { - user_addr = (unsigned long)iov[seg].iov_base; - sdio.pages_in_io += - ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) / - PAGE_SIZE - user_addr / PAGE_SIZE); - } + sdio.pages_in_io += iov_iter_npages(iter, INT_MAX); blk_start_plug(&plug); - for (seg = 0; seg < nr_segs; seg++) { - user_addr = (unsigned long)iov[seg].iov_base; - sdio.size += bytes = iov[seg].iov_len; - - /* Index into the first page of the first block */ - sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; - sdio.final_block_in_request = sdio.block_in_file + - (bytes >> blkbits); - /* Page fetching state */ - sdio.head = 0; - sdio.tail = 0; - sdio.curr_page = 0; - - sdio.total_pages = 0; - if (user_addr & (PAGE_SIZE-1)) { - sdio.total_pages++; - bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); - } - sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; - sdio.curr_user_address = user_addr; - - retval = do_direct_IO(dio, &sdio, &map_bh); - - dio->result += iov[seg].iov_len - - ((sdio.final_block_in_request - sdio.block_in_file) << - blkbits); - - if (retval) { - dio_cleanup(dio, &sdio); - break; - } - } /* end iovec loop */ + retval = do_direct_IO(dio, &sdio, &map_bh); + if (retval) + dio_cleanup(dio, &sdio); if (retval == -ENOTBLK) { /* * The remaining part of the request will be - * be handled by buffered I/O when we return + * handled by buffered I/O when we return */ retval = 0; } @@ -1230,7 +1263,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ret2 = dio_send_cur_page(dio, &sdio, &map_bh); if (retval == 0) retval = ret2; - page_cache_release(sdio.cur_page); + dio_unpin_page(dio, sdio.cur_page); sdio.cur_page = NULL; } if (sdio.bio) @@ -1246,11 +1279,11 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, /* * All block lookups have been performed. For READ requests - * we can let i_mutex go now that its achieved its purpose + * we can let i_rwsem go now that its achieved its purpose * of protecting us from looking up uninitialized blocks. 
*/ - if (rw == READ && (dio->flags & DIO_LOCKING)) - mutex_unlock(&dio->inode->i_mutex); + if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING)) + inode_unlock(dio->inode); /* * The only time we want to leave bios in flight is when a successful @@ -1261,45 +1294,25 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, */ BUG_ON(retval == -EIOCBQUEUED); if (dio->is_async && retval == 0 && dio->result && - ((rw == READ) || (dio->result == sdio.size))) + (iov_iter_rw(iter) == READ || dio->result == count)) retval = -EIOCBQUEUED; - - if (retval != -EIOCBQUEUED) + else dio_await_completion(dio); if (drop_refcount(dio) == 0) { - retval = dio_complete(dio, offset, retval, false); - kmem_cache_free(dio_cache, dio); + retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE); } else BUG_ON(retval != -EIOCBQUEUED); -out: return retval; -} -ssize_t -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, - dio_submit_t submit_io, int flags) -{ - /* - * The block device state is needed in the end to finally - * submit everything. Since it's likely to be cache cold - * prefetch it here as first thing to hide some of the - * latency. - * - * Attempt to prefetch the pieces we likely need later. - */ - prefetch(&bdev->bd_disk->part_tbl); - prefetch(bdev->bd_queue); - prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); +fail_dio: + if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) + inode_unlock(inode); - return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, - submit_io, flags); + kmem_cache_free(dio_cache, dio); + return retval; } - EXPORT_SYMBOL(__blockdev_direct_IO); static __init int dio_init(void) |
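The trickiest part of the conversion above is the new page-queue arithmetic: iov_iter_extract_pages() reports how many bytes it mapped plus the offset of the first byte within the first page, and dio_refill_pages() converts that into a page queue [head, tail) with `from`/`to` offsets that do_direct_IO() later turns into a per-page byte range. The following is a minimal, userspace-only sketch of that arithmetic (plain C, hypothetical helper names, PAGE_SIZE assumed to be 4096); it only mirrors the byte-to-page-range bookkeeping introduced by the patch and is not kernel code.

```c
/*
 * Userspace sketch (assumptions: PAGE_SIZE == 4096, helper names are
 * hypothetical) of the bookkeeping added in dio_refill_pages() and
 * consumed in do_direct_IO(): a byte count plus the offset into the
 * first page becomes a page queue [head, tail) with from/to offsets.
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

struct sketch_sdio {
	unsigned head;   /* next page to process */
	unsigned tail;   /* last valid page + 1 */
	size_t from, to; /* offset in first page / end offset in last page */
};

/* Mirrors the "ret >= 0" branch of the new dio_refill_pages(). */
static void fill_page_queue(struct sketch_sdio *sdio, size_t first_page_offset,
			    size_t bytes_extracted)
{
	size_t ret = bytes_extracted + first_page_offset;

	sdio->from = first_page_offset;
	sdio->head = 0;
	sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE;   /* pages in queue */
	sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1;     /* end in last page */
}

/* Mirrors how do_direct_IO() derives the byte range for the current page. */
static void page_range(const struct sketch_sdio *sdio, size_t *from, size_t *to)
{
	*from = sdio->head ? 0 : sdio->from;
	*to = (sdio->head == sdio->tail - 1) ? sdio->to : PAGE_SIZE;
}

int main(void)
{
	/* e.g. a 10000-byte extraction starting 512 bytes into the first page */
	struct sketch_sdio sdio;

	fill_page_queue(&sdio, 512, 10000);
	assert(sdio.tail == 3);	/* 512 + 10000 bytes span three pages */

	for (; sdio.head < sdio.tail; sdio.head++) {
		size_t from, to;

		page_range(&sdio, &from, &to);
		printf("page %u: bytes [%zu, %zu)\n", sdio.head, from, to);
	}
	return 0;
}
```

For the example input this prints [512, 4096), [0, 4096) and [0, 2320), which sums back to the 10000 extracted bytes, matching how the patched do_direct_IO() walks partial first and last pages without the old `first_block_in_page`/`block_in_page` state.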