Diffstat (limited to 'block/fops.c')
 block/fops.c | 560 ++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 421 insertions(+), 139 deletions(-)
diff --git a/block/fops.c b/block/fops.c
index b90742595317..4d32785b31d9 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/uio.h>
@@ -15,7 +16,9 @@
#include <linux/falloc.h>
#include <linux/suspend.h>
#include <linux/fs.h>
+#include <linux/iomap.h>
#include <linux/module.h>
+#include <linux/io_uring/cmd.h>
#include "blk.h"
static inline struct inode *bdev_file_inode(struct file *file)
@@ -23,15 +26,6 @@ static inline struct inode *bdev_file_inode(struct file *file)
return file->f_mapping->host;
}
-static int blkdev_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
-{
- bh->b_bdev = I_BDEV(inode);
- bh->b_blocknr = iblock;
- set_buffer_mapped(bh);
- return 0;
-}
-
static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
{
blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
@@ -42,28 +36,32 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
return opf;
}
-static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
- struct iov_iter *iter)
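+/*
+ * Direct I/O to a block device must be aligned to its logical block size:
+ * return true (invalid) if the position or the total length is misaligned.
+ * Per-segment alignment is checked later, while mapping the pages.
+ */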
+static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb,
+ struct iov_iter *iter)
+{
+ return (iocb->ki_pos | iov_iter_count(iter)) &
+ (bdev_logical_block_size(bdev) - 1);
+}
+
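+/*
+ * Wrapper around bio_iov_iter_get_pages() that passes the logical block
+ * size mask so that per-segment alignment is validated during mapping.
+ */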
+static inline int blkdev_iov_iter_get_pages(struct bio *bio,
+ struct iov_iter *iter, struct block_device *bdev)
{
- return pos & (bdev_logical_block_size(bdev) - 1) ||
- !bdev_iter_is_aligned(bdev, iter);
+ return bio_iov_iter_get_pages(bio, iter,
+ bdev_logical_block_size(bdev) - 1);
}
#define DIO_INLINE_BIO_VECS 4
static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
- struct iov_iter *iter, unsigned int nr_pages)
+ struct iov_iter *iter, struct block_device *bdev,
+ unsigned int nr_pages)
{
- struct block_device *bdev = iocb->ki_filp->private_data;
struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
loff_t pos = iocb->ki_pos;
bool should_dirty = false;
struct bio bio;
ssize_t ret;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
if (nr_pages <= DIO_INLINE_BIO_VECS)
vecs = inline_vecs;
else {
@@ -81,9 +79,13 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb));
}
bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
+ bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio.bi_write_stream = iocb->ki_write_stream;
bio.bi_ioprio = iocb->ki_ioprio;
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ bio.bi_opf |= REQ_ATOMIC;
- ret = bio_iov_iter_get_pages(&bio, iter);
+ ret = blkdev_iov_iter_get_pages(&bio, iter, bdev);
if (unlikely(ret))
goto out;
ret = bio.bi_iter.bi_size;
@@ -131,12 +133,16 @@ static void blkdev_bio_end_io(struct bio *bio)
{
struct blkdev_dio *dio = bio->bi_private;
bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;
+ bool is_sync = dio->flags & DIO_IS_SYNC;
if (bio->bi_status && !dio->bio.bi_status)
dio->bio.bi_status = bio->bi_status;
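+ /* Release user metadata pages that were mapped at submission time. */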
+ if (bio_integrity(bio))
+ bio_integrity_unmap_user(bio);
+
if (atomic_dec_and_test(&dio->ref)) {
- if (!(dio->flags & DIO_IS_SYNC)) {
+ if (!is_sync) {
struct kiocb *iocb = dio->iocb;
ssize_t ret;
@@ -168,9 +174,8 @@ static void blkdev_bio_end_io(struct bio *bio)
}
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- unsigned int nr_pages)
+ struct block_device *bdev, unsigned int nr_pages)
{
- struct block_device *bdev = iocb->ki_filp->private_data;
struct blk_plug plug;
struct blkdev_dio *dio;
struct bio *bio;
@@ -179,11 +184,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
loff_t pos = iocb->ki_pos;
int ret = 0;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
- if (iocb->ki_flags & IOCB_ALLOC_CACHE)
- opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
&blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio);
@@ -211,16 +211,38 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
for (;;) {
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
+ bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio->bi_write_stream = iocb->ki_write_stream;
bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio;
- ret = bio_iov_iter_get_pages(bio, iter);
+ ret = blkdev_iov_iter_get_pages(bio, iter, bdev);
if (unlikely(ret)) {
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
break;
}
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ /*
+ * This is nonblocking IO, and we need to allocate
+ * another bio if we have data left to map. As we
+ * cannot guarantee that one of the sub bios will not
+ * fail getting issued FOR NOWAIT and as error results
+ * are coalesced across all of them, be safe and ask for
+ * a retry of this from blocking context.
+ */
+ if (unlikely(iov_iter_count(iter))) {
+ ret = -EAGAIN;
+ goto fail;
+ }
+ bio->bi_opf |= REQ_NOWAIT;
+ }
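+ /*
+ * Map the user metadata buffer that io_uring passed in
+ * iocb->private into this bio.
+ */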
+ if (iocb->ki_flags & IOCB_HAS_METADATA) {
+ ret = bio_integrity_map_iter(bio, iocb->private);
+ if (unlikely(ret))
+ goto fail;
+ }
if (is_read) {
if (dio->flags & DIO_SHOULD_DIRTY)
@@ -228,9 +250,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
} else {
task_io_account_write(bio->bi_iter.bi_size);
}
- if (iocb->ki_flags & IOCB_NOWAIT)
- bio->bi_opf |= REQ_NOWAIT;
-
dio->size += bio->bi_iter.bi_size;
pos += bio->bi_iter.bi_size;
@@ -264,6 +283,12 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
bio_put(&dio->bio);
return ret;
+fail:
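+ /*
+ * This bio was never submitted: release its pages and drop it
+ * directly, then finish the plug before returning the error.
+ */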
+ bio_release_pages(bio, false);
+ bio_clear_flag(bio, BIO_REFFED);
+ bio_put(bio);
+ blk_finish_plug(&plug);
+ return ret;
}
static void blkdev_bio_end_io_async(struct bio *bio)
@@ -281,6 +306,9 @@ static void blkdev_bio_end_io_async(struct bio *bio)
ret = blk_status_to_errno(bio->bi_status);
}
+ if (bio_integrity(bio))
+ bio_integrity_unmap_user(bio);
+
iocb->ki_complete(iocb, ret);
if (dio->flags & DIO_SHOULD_DIRTY) {
@@ -293,9 +321,9 @@ static void blkdev_bio_end_io_async(struct bio *bio)
static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
struct iov_iter *iter,
+ struct block_device *bdev,
unsigned int nr_pages)
{
- struct block_device *bdev = iocb->ki_filp->private_data;
bool is_read = iov_iter_rw(iter) == READ;
blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
struct blkdev_dio *dio;
@@ -303,17 +331,14 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
loff_t pos = iocb->ki_pos;
int ret = 0;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
- if (iocb->ki_flags & IOCB_ALLOC_CACHE)
- opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
&blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio);
dio->flags = 0;
dio->iocb = iocb;
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
+ bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio->bi_write_stream = iocb->ki_write_stream;
bio->bi_end_io = blkdev_bio_end_io_async;
bio->bi_ioprio = iocb->ki_ioprio;
@@ -326,11 +351,9 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
*/
bio_iov_bvec_set(bio, iter);
} else {
- ret = bio_iov_iter_get_pages(bio, iter);
- if (unlikely(ret)) {
- bio_put(bio);
- return ret;
- }
+ ret = blkdev_iov_iter_get_pages(bio, iter, bdev);
+ if (unlikely(ret))
+ goto out_bio_put;
}
dio->size = bio->bi_iter.bi_size;
@@ -343,37 +366,128 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
task_io_account_write(bio->bi_iter.bi_size);
}
+ if (iocb->ki_flags & IOCB_HAS_METADATA) {
+ ret = bio_integrity_map_iter(bio, iocb->private);
+ WRITE_ONCE(iocb->private, NULL);
+ if (unlikely(ret))
+ goto out_bio_put;
+ }
+
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ bio->bi_opf |= REQ_ATOMIC;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ bio->bi_opf |= REQ_NOWAIT;
+
if (iocb->ki_flags & IOCB_HIPRI) {
- bio->bi_opf |= REQ_POLLED | REQ_NOWAIT;
+ bio->bi_opf |= REQ_POLLED;
submit_bio(bio);
WRITE_ONCE(iocb->private, bio);
} else {
- if (iocb->ki_flags & IOCB_NOWAIT)
- bio->bi_opf |= REQ_NOWAIT;
submit_bio(bio);
}
return -EIOCBQUEUED;
+
+out_bio_put:
+ bio_put(bio);
+ return ret;
}
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
+ struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
unsigned int nr_pages;
if (!iov_iter_count(iter))
return 0;
+ if (blkdev_dio_invalid(bdev, iocb, iter))
+ return -EINVAL;
+
+ if (iov_iter_rw(iter) == WRITE) {
+ u16 max_write_streams = bdev_max_write_streams(bdev);
+
+ if (iocb->ki_write_stream) {
+ if (iocb->ki_write_stream > max_write_streams)
+ return -EINVAL;
+ } else if (max_write_streams) {
+ enum rw_hint write_hint =
+ file_inode(iocb->ki_filp)->i_write_hint;
+
+ /*
+ * Just use the write hint as write stream for block
+ * device writes. This assumes no file system is
+ * mounted that would use the streams differently.
+ */
+ if (write_hint <= max_write_streams)
+ iocb->ki_write_stream = write_hint;
+ }
+ }
+
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
- if (likely(nr_pages <= BIO_MAX_VECS)) {
+ if (likely(nr_pages <= BIO_MAX_VECS &&
+ !(iocb->ki_flags & IOCB_HAS_METADATA))) {
if (is_sync_kiocb(iocb))
- return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
- return __blkdev_direct_IO_async(iocb, iter, nr_pages);
+ return __blkdev_direct_IO_simple(iocb, iter, bdev,
+ nr_pages);
+ return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
+ } else if (iocb->ki_flags & IOCB_ATOMIC) {
+ return -EINVAL;
}
- return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
+ return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
+}
+
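+/*
+ * A block device maps 1:1 from file offset to disk address, so any
+ * in-bounds offset simply maps from that offset to the end of the device.
+ */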
+static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
+{
+ struct block_device *bdev = I_BDEV(inode);
+ loff_t isize = i_size_read(inode);
+
+ if (offset >= isize)
+ return -EIO;
+
+ iomap->bdev = bdev;
+ iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = iomap->offset;
+ iomap->length = isize - iomap->offset;
+ iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */
+ return 0;
+}
+
+static const struct iomap_ops blkdev_iomap_ops = {
+ .iomap_begin = blkdev_iomap_begin,
+};
+
+#ifdef CONFIG_BUFFER_HEAD
+static int blkdev_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
+{
+ bh->b_bdev = I_BDEV(inode);
+ bh->b_blocknr = iblock;
+ set_buffer_mapped(bh);
+ return 0;
}
-static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
+/*
+ * We cannot call mpage_writepages() as it does not take the buffer lock.
+ * We must use block_write_full_folio() directly which holds the buffer
+ * lock. The buffer lock provides the synchronisation with writeback
+ * that filesystems rely on when they use the blockdev's mapping.
+ */
+static int blkdev_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return block_write_full_page(page, blkdev_get_block, wbc);
+ struct folio *folio = NULL;
+ struct blk_plug plug;
+ int err;
+
+ blk_start_plug(&plug);
+ while ((folio = writeback_iter(mapping, wbc, folio, &err)))
+ err = block_write_full_folio(folio, wbc, blkdev_get_block);
+ blk_finish_plug(&plug);
+
+ return err;
}
static int blkdev_read_folio(struct file *file, struct folio *folio)
@@ -386,44 +500,101 @@ static void blkdev_readahead(struct readahead_control *rac)
mpage_readahead(rac, blkdev_get_block);
}
-static int blkdev_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct page **pagep, void **fsdata)
+static int blkdev_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
+ unsigned len, struct folio **foliop,
+ void **fsdata)
{
- return block_write_begin(mapping, pos, len, pagep, blkdev_get_block);
+ return block_write_begin(mapping, pos, len, foliop, blkdev_get_block);
}
-static int blkdev_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied, struct page *page,
- void *fsdata)
+static int blkdev_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
int ret;
- ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+ ret = block_write_end(pos, len, copied, folio);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return ret;
}
-static int blkdev_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- return generic_writepages(mapping, wbc);
-}
-
const struct address_space_operations def_blk_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = blkdev_read_folio,
.readahead = blkdev_readahead,
- .writepage = blkdev_writepage,
+ .writepages = blkdev_writepages,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
- .writepages = blkdev_writepages,
- .direct_IO = blkdev_direct_IO,
.migrate_folio = buffer_migrate_folio_norefs,
.is_dirty_writeback = buffer_check_dirty_writeback,
};
+#else /* CONFIG_BUFFER_HEAD */
+static int blkdev_read_folio(struct file *file, struct folio *folio)
+{
+ iomap_bio_read_folio(folio, &blkdev_iomap_ops);
+ return 0;
+}
+
+static void blkdev_readahead(struct readahead_control *rac)
+{
+ iomap_bio_readahead(rac, &blkdev_iomap_ops);
+}
+
+static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 offset, unsigned int len, u64 end_pos)
+{
+ loff_t isize = i_size_read(wpc->inode);
+
+ if (WARN_ON_ONCE(offset >= isize))
+ return -EIO;
+
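+ /* Refresh the cached mapping if this range falls outside of it. */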
+ if (offset < wpc->iomap.offset ||
+ offset >= wpc->iomap.offset + wpc->iomap.length) {
+ int error;
+
+ error = blkdev_iomap_begin(wpc->inode, offset, isize - offset,
+ IOMAP_WRITE, &wpc->iomap, NULL);
+ if (error)
+ return error;
+ }
+
+ return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
+}
+
+static const struct iomap_writeback_ops blkdev_writeback_ops = {
+ .writeback_range = blkdev_writeback_range,
+ .writeback_submit = iomap_ioend_writeback_submit,
+};
+
+static int blkdev_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct iomap_writepage_ctx wpc = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .ops = &blkdev_writeback_ops
+ };
+
+ return iomap_writepages(&wpc);
+}
+
+const struct address_space_operations def_blk_aops = {
+ .dirty_folio = filemap_dirty_folio,
+ .release_folio = iomap_release_folio,
+ .invalidate_folio = iomap_invalidate_folio,
+ .read_folio = blkdev_read_folio,
+ .readahead = blkdev_readahead,
+ .writepages = blkdev_writepages,
+ .is_partially_uptodate = iomap_is_partially_uptodate,
+ .error_remove_folio = generic_error_remove_folio,
+ .migrate_folio = filemap_migrate_folio,
+};
+#endif /* CONFIG_BUFFER_HEAD */
/*
* for a block special file file_inode(file)->i_size is zero
@@ -443,7 +614,7 @@ static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence)
static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
int datasync)
{
- struct block_device *bdev = filp->private_data;
+ struct block_device *bdev = I_BDEV(filp->f_mapping->host);
int error;
error = file_write_and_wait_range(filp, start, end);
@@ -462,44 +633,110 @@ static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
return error;
}
-static int blkdev_open(struct inode *inode, struct file *filp)
+/**
+ * file_to_blk_mode - get block open flags from file flags
+ * @file: file whose open flags should be converted
+ *
+ * Look at file open flags and generate corresponding block open flags from
+ * them. The function works both for a file that is just being opened (e.g.
+ * during the ->open callback) and for a file that is already open. This is
+ * actually non-trivial (see the comment in the function).
+ */
+blk_mode_t file_to_blk_mode(struct file *file)
{
- struct block_device *bdev;
+ blk_mode_t mode = 0;
+ if (file->f_mode & FMODE_READ)
+ mode |= BLK_OPEN_READ;
+ if (file->f_mode & FMODE_WRITE)
+ mode |= BLK_OPEN_WRITE;
/*
- * Preserve backwards compatibility and allow large file access
- * even if userspace doesn't ask for it explicitly. Some mkfs
- * binary needs it. We might want to drop this workaround
- * during an unstable branch.
+ * do_dentry_open() clears O_EXCL from f_flags, use file->private_data
+ * to determine whether the open was exclusive for already open files.
*/
- filp->f_flags |= O_LARGEFILE;
- filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
-
- if (filp->f_flags & O_NDELAY)
- filp->f_mode |= FMODE_NDELAY;
- if (filp->f_flags & O_EXCL)
- filp->f_mode |= FMODE_EXCL;
- if ((filp->f_flags & O_ACCMODE) == 3)
- filp->f_mode |= FMODE_WRITE_IOCTL;
-
- bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
-
- filp->private_data = bdev;
- filp->f_mapping = bdev->bd_inode->i_mapping;
- filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
- return 0;
+ if (file->private_data)
+ mode |= BLK_OPEN_EXCL;
+ else if (file->f_flags & O_EXCL)
+ mode |= BLK_OPEN_EXCL;
+ if (file->f_flags & O_NDELAY)
+ mode |= BLK_OPEN_NDELAY;
+
+ /*
+ * If all bits in O_ACCMODE set (aka O_RDWR | O_WRONLY), the floppy
+ * driver has historically allowed ioctls as if the file was opened for
+ * writing, but does not allow and actual reads or writes.
+ */
+ if ((file->f_flags & O_ACCMODE) == (O_RDWR | O_WRONLY))
+ mode |= BLK_OPEN_WRITE_IOCTL;
+
+ return mode;
}
-static int blkdev_close(struct inode *inode, struct file *filp)
+static int blkdev_open(struct inode *inode, struct file *filp)
{
- struct block_device *bdev = filp->private_data;
+ struct block_device *bdev;
+ blk_mode_t mode;
+ int ret;
+
+ mode = file_to_blk_mode(filp);
+ /* Use the file as the holder. */
+ if (mode & BLK_OPEN_EXCL)
+ filp->private_data = filp;
+ ret = bdev_permission(inode->i_rdev, mode, filp->private_data);
+ if (ret)
+ return ret;
+
+ bdev = blkdev_get_no_open(inode->i_rdev, true);
+ if (!bdev)
+ return -ENXIO;
+
+ if (bdev_can_atomic_write(bdev))
+ filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
+ if (blk_get_integrity(bdev->bd_disk))
+ filp->f_mode |= FMODE_HAS_METADATA;
+
+ ret = bdev_open(bdev, mode, filp->private_data, NULL, filp);
+ if (ret)
+ blkdev_put_no_open(bdev);
+ return ret;
+}
- blkdev_put(bdev, filp->f_mode);
+static int blkdev_release(struct inode *inode, struct file *filp)
+{
+ bdev_release(filp);
return 0;
}
+static ssize_t
+blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ size_t count = iov_iter_count(from);
+ ssize_t written;
+
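+ /*
+ * Write back and invalidate any cached pages over the range first;
+ * -EBUSY means the caller should fall back to buffered I/O.
+ */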
+ written = kiocb_invalidate_pages(iocb, count);
+ if (written) {
+ if (written == -EBUSY)
+ return 0;
+ return written;
+ }
+
+ written = blkdev_direct_IO(iocb, from);
+ if (written > 0) {
+ kiocb_invalidate_post_direct_write(iocb, count);
+ iocb->ki_pos += written;
+ count -= written;
+ }
+ if (written != -EIOCBQUEUED)
+ iov_iter_revert(from, count - iov_iter_count(from));
+ return written;
+}
+
+static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL,
+ NULL);
+}
+
/*
* Write data to the block device. Only intended for the block device itself
* and the raw driver which basically is a fake block device.
@@ -509,10 +746,11 @@ static int blkdev_close(struct inode *inode, struct file *filp)
*/
static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- struct block_device *bdev = iocb->ki_filp->private_data;
- struct inode *bd_inode = bdev->bd_inode;
+ struct file *file = iocb->ki_filp;
+ struct inode *bd_inode = bdev_file_inode(file);
+ struct block_device *bdev = I_BDEV(bd_inode);
+ bool atomic = iocb->ki_flags & IOCB_ATOMIC;
loff_t size = bdev_nr_bytes(bdev);
- struct blk_plug plug;
size_t shorted = 0;
ssize_t ret;
@@ -531,24 +769,50 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
return -EOPNOTSUPP;
+ if (atomic) {
+ ret = generic_atomic_write_valid(iocb, from);
+ if (ret)
+ return ret;
+ }
+
size -= iocb->ki_pos;
if (iov_iter_count(from) > size) {
+ if (atomic)
+ return -EINVAL;
shorted = iov_iter_count(from) - size;
iov_iter_truncate(from, size);
}
- blk_start_plug(&plug);
- ret = __generic_file_write_iter(iocb, from);
+ ret = file_update_time(file);
+ if (ret)
+ return ret;
+
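+ /* Direct writes fall back to buffered I/O for any unwritten remainder. */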
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = blkdev_direct_write(iocb, from);
+ if (ret >= 0 && iov_iter_count(from))
+ ret = direct_write_fallback(iocb, from, ret,
+ blkdev_buffered_write(iocb, from));
+ } else {
+ /*
+ * Take i_rwsem and invalidate_lock to avoid racing with
+ * set_blocksize changing i_blkbits/folio order and punching
+ * out the pagecache.
+ */
+ inode_lock_shared(bd_inode);
+ ret = blkdev_buffered_write(iocb, from);
+ inode_unlock_shared(bd_inode);
+ }
+
if (ret > 0)
ret = generic_write_sync(iocb, ret);
iov_iter_reexpand(from, iov_iter_count(from) + shorted);
- blk_finish_plug(&plug);
return ret;
}
static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct block_device *bdev = iocb->ki_filp->private_data;
+ struct inode *bd_inode = bdev_file_inode(iocb->ki_filp);
+ struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
loff_t size = bdev_nr_bytes(bdev);
loff_t pos = iocb->ki_pos;
size_t shorted = 0;
@@ -568,34 +832,29 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
goto reexpand; /* skip atime */
if (iocb->ki_flags & IOCB_DIRECT) {
- struct address_space *mapping = iocb->ki_filp->f_mapping;
-
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (filemap_range_needs_writeback(mapping, pos,
- pos + count - 1)) {
- ret = -EAGAIN;
- goto reexpand;
- }
- } else {
- ret = filemap_write_and_wait_range(mapping, pos,
- pos + count - 1);
- if (ret < 0)
- goto reexpand;
- }
-
+ ret = kiocb_write_and_wait(iocb, count);
+ if (ret < 0)
+ goto reexpand;
file_accessed(iocb->ki_filp);
ret = blkdev_direct_IO(iocb, to);
- if (ret >= 0) {
+ if (ret > 0) {
iocb->ki_pos += ret;
count -= ret;
}
- iov_iter_revert(to, count - iov_iter_count(to));
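+ /* Leave the iter alone while queued async I/O is still in flight. */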
+ if (ret != -EIOCBQUEUED)
+ iov_iter_revert(to, count - iov_iter_count(to));
if (ret < 0 || !count)
goto reexpand;
}
+ /*
+ * Take i_rwsem and invalidate_lock to avoid racing with set_blocksize
+ * changing i_blkbits/folio order and punching out the pagecache.
+ */
+ inode_lock_shared(bd_inode);
ret = filemap_read(iocb, to, ret);
+ inode_unlock_shared(bd_inode);
reexpand:
if (unlikely(shorted))
@@ -605,7 +864,7 @@ reexpand:
#define BLKDEV_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
- FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_WRITE_ZEROES)
static long blkdev_fallocate(struct file *file, int mode, loff_t start,
loff_t len)
@@ -614,11 +873,19 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
struct block_device *bdev = I_BDEV(inode);
loff_t end = start + len - 1;
loff_t isize;
+ unsigned int flags;
int error;
/* Fail if we don't recognize the flags. */
if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
return -EOPNOTSUPP;
+ /*
+ * Don't allow writing zeroes if the device does not enable the
+ * unmap write zeroes operation.
+ */
+ if ((mode & FALLOC_FL_WRITE_ZEROES) &&
+ !bdev_write_zeroes_unmap_sectors(bdev))
+ return -EOPNOTSUPP;
/* Don't go off the end of the device. */
isize = bdev_nr_bytes(bdev);
@@ -638,54 +905,69 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
if ((start | len) & (bdev_logical_block_size(bdev) - 1))
return -EINVAL;
+ inode_lock(inode);
filemap_invalidate_lock(inode->i_mapping);
- /* Invalidate the page cache, including dirty pages. */
- error = truncate_bdev_range(bdev, file->f_mode, start, end);
- if (error)
- goto fail;
-
switch (mode) {
case FALLOC_FL_ZERO_RANGE:
case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
- error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
- len >> SECTOR_SHIFT, GFP_KERNEL,
- BLKDEV_ZERO_NOUNMAP);
+ flags = BLKDEV_ZERO_NOUNMAP;
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
- error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
- len >> SECTOR_SHIFT, GFP_KERNEL,
- BLKDEV_ZERO_NOFALLBACK);
+ flags = BLKDEV_ZERO_NOFALLBACK;
break;
- case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
- error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
- len >> SECTOR_SHIFT, GFP_KERNEL);
+ case FALLOC_FL_WRITE_ZEROES:
+ flags = 0;
break;
default:
error = -EOPNOTSUPP;
+ goto fail;
}
+ /*
+ * Invalidate the page cache, including dirty pages, for valid
+ * de-allocate mode calls to fallocate().
+ */
+ error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
+ if (error)
+ goto fail;
+
+ error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
+ len >> SECTOR_SHIFT, GFP_KERNEL, flags);
fail:
filemap_invalidate_unlock(inode->i_mapping);
+ inode_unlock(inode);
return error;
}
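+/* Disallow writable shared mappings of a read-only block device. */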
+static int blkdev_mmap_prepare(struct vm_area_desc *desc)
+{
+ struct file *file = desc->file;
+
+ if (bdev_read_only(I_BDEV(bdev_file_inode(file))))
+ return generic_file_readonly_mmap_prepare(desc);
+
+ return generic_file_mmap_prepare(desc);
+}
+
const struct file_operations def_blk_fops = {
.open = blkdev_open,
- .release = blkdev_close,
+ .release = blkdev_release,
.llseek = blkdev_llseek,
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
.iopoll = iocb_bio_iopoll,
- .mmap = generic_file_mmap,
+ .mmap_prepare = blkdev_mmap_prepare,
.fsync = blkdev_fsync,
.unlocked_ioctl = blkdev_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = compat_blkdev_ioctl,
#endif
- .splice_read = generic_file_splice_read,
+ .splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = blkdev_fallocate,
+ .uring_cmd = blkdev_uring_cmd,
+ .fop_flags = FOP_BUFFER_RASYNC,
};
static __init int blkdev_init(void)