summaryrefslogtreecommitdiff
path: root/block/fops.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/fops.c')
-rw-r--r--block/fops.c240
1 files changed, 169 insertions, 71 deletions
diff --git a/block/fops.c b/block/fops.c
index 2d01c9007681..4d32785b31d9 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/uio.h>
@@ -38,8 +39,15 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb,
struct iov_iter *iter)
{
- return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) ||
- !bdev_iter_is_aligned(bdev, iter);
+ return (iocb->ki_pos | iov_iter_count(iter)) &
+ (bdev_logical_block_size(bdev) - 1);
+}
+
+static inline int blkdev_iov_iter_get_pages(struct bio *bio,
+ struct iov_iter *iter, struct block_device *bdev)
+{
+ return bio_iov_iter_get_pages(bio, iter,
+ bdev_logical_block_size(bdev) - 1);
}
#define DIO_INLINE_BIO_VECS 4
@@ -72,11 +80,12 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
}
bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio.bi_write_stream = iocb->ki_write_stream;
bio.bi_ioprio = iocb->ki_ioprio;
if (iocb->ki_flags & IOCB_ATOMIC)
bio.bi_opf |= REQ_ATOMIC;
- ret = bio_iov_iter_get_pages(&bio, iter);
+ ret = blkdev_iov_iter_get_pages(&bio, iter, bdev);
if (unlikely(ret))
goto out;
ret = bio.bi_iter.bi_size;
@@ -124,12 +133,16 @@ static void blkdev_bio_end_io(struct bio *bio)
{
struct blkdev_dio *dio = bio->bi_private;
bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;
+ bool is_sync = dio->flags & DIO_IS_SYNC;
if (bio->bi_status && !dio->bio.bi_status)
dio->bio.bi_status = bio->bi_status;
+ if (bio_integrity(bio))
+ bio_integrity_unmap_user(bio);
+
if (atomic_dec_and_test(&dio->ref)) {
- if (!(dio->flags & DIO_IS_SYNC)) {
+ if (!is_sync) {
struct kiocb *iocb = dio->iocb;
ssize_t ret;
@@ -171,8 +184,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
loff_t pos = iocb->ki_pos;
int ret = 0;
- if (iocb->ki_flags & IOCB_ALLOC_CACHE)
- opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
&blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio);
@@ -201,11 +212,12 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
for (;;) {
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio->bi_write_stream = iocb->ki_write_stream;
bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio;
- ret = bio_iov_iter_get_pages(bio, iter);
+ ret = blkdev_iov_iter_get_pages(bio, iter, bdev);
if (unlikely(ret)) {
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
@@ -221,14 +233,16 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* a retry of this from blocking context.
*/
if (unlikely(iov_iter_count(iter))) {
- bio_release_pages(bio, false);
- bio_clear_flag(bio, BIO_REFFED);
- bio_put(bio);
- blk_finish_plug(&plug);
- return -EAGAIN;
+ ret = -EAGAIN;
+ goto fail;
}
bio->bi_opf |= REQ_NOWAIT;
}
+ if (iocb->ki_flags & IOCB_HAS_METADATA) {
+ ret = bio_integrity_map_iter(bio, iocb->private);
+ if (unlikely(ret))
+ goto fail;
+ }
if (is_read) {
if (dio->flags & DIO_SHOULD_DIRTY)
@@ -269,6 +283,12 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
bio_put(&dio->bio);
return ret;
+fail:
+ bio_release_pages(bio, false);
+ bio_clear_flag(bio, BIO_REFFED);
+ bio_put(bio);
+ blk_finish_plug(&plug);
+ return ret;
}
static void blkdev_bio_end_io_async(struct bio *bio)
@@ -286,6 +306,9 @@ static void blkdev_bio_end_io_async(struct bio *bio)
ret = blk_status_to_errno(bio->bi_status);
}
+ if (bio_integrity(bio))
+ bio_integrity_unmap_user(bio);
+
iocb->ki_complete(iocb, ret);
if (dio->flags & DIO_SHOULD_DIRTY) {
@@ -308,8 +331,6 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
loff_t pos = iocb->ki_pos;
int ret = 0;
- if (iocb->ki_flags & IOCB_ALLOC_CACHE)
- opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
&blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio);
@@ -317,6 +338,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
dio->iocb = iocb;
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio->bi_write_stream = iocb->ki_write_stream;
bio->bi_end_io = blkdev_bio_end_io_async;
bio->bi_ioprio = iocb->ki_ioprio;
@@ -329,11 +351,9 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
*/
bio_iov_bvec_set(bio, iter);
} else {
- ret = bio_iov_iter_get_pages(bio, iter);
- if (unlikely(ret)) {
- bio_put(bio);
- return ret;
- }
+ ret = blkdev_iov_iter_get_pages(bio, iter, bdev);
+ if (unlikely(ret))
+ goto out_bio_put;
}
dio->size = bio->bi_iter.bi_size;
@@ -346,6 +366,13 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
task_io_account_write(bio->bi_iter.bi_size);
}
+ if (iocb->ki_flags & IOCB_HAS_METADATA) {
+ ret = bio_integrity_map_iter(bio, iocb->private);
+ WRITE_ONCE(iocb->private, NULL);
+ if (unlikely(ret))
+ goto out_bio_put;
+ }
+
if (iocb->ki_flags & IOCB_ATOMIC)
bio->bi_opf |= REQ_ATOMIC;
@@ -360,6 +387,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
submit_bio(bio);
}
return -EIOCBQUEUED;
+
+out_bio_put:
+ bio_put(bio);
+ return ret;
}
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
@@ -373,8 +404,29 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (blkdev_dio_invalid(bdev, iocb, iter))
return -EINVAL;
+ if (iov_iter_rw(iter) == WRITE) {
+ u16 max_write_streams = bdev_max_write_streams(bdev);
+
+ if (iocb->ki_write_stream) {
+ if (iocb->ki_write_stream > max_write_streams)
+ return -EINVAL;
+ } else if (max_write_streams) {
+ enum rw_hint write_hint =
+ file_inode(iocb->ki_filp)->i_write_hint;
+
+ /*
+ * Just use the write hint as write stream for block
+ * device writes. This assumes no file system is
+ * mounted that would use the streams differently.
+ */
+ if (write_hint <= max_write_streams)
+ iocb->ki_write_stream = write_hint;
+ }
+ }
+
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
- if (likely(nr_pages <= BIO_MAX_VECS)) {
+ if (likely(nr_pages <= BIO_MAX_VECS &&
+ !(iocb->ki_flags & IOCB_HAS_METADATA))) {
if (is_sync_kiocb(iocb))
return __blkdev_direct_IO_simple(iocb, iter, bdev,
nr_pages);
@@ -426,12 +478,13 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock,
static int blkdev_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct folio *folio = NULL;
struct blk_plug plug;
int err;
blk_start_plug(&plug);
- err = write_cache_pages(mapping, wbc, block_write_full_folio,
- blkdev_get_block);
+ while ((folio = writeback_iter(mapping, wbc, folio, &err)))
+ err = block_write_full_folio(folio, wbc, blkdev_get_block);
blk_finish_plug(&plug);
return err;
@@ -447,18 +500,21 @@ static void blkdev_readahead(struct readahead_control *rac)
mpage_readahead(rac, blkdev_get_block);
}
-static int blkdev_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
+static int blkdev_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
+ unsigned len, struct folio **foliop,
+ void **fsdata)
{
return block_write_begin(mapping, pos, len, foliop, blkdev_get_block);
}
-static int blkdev_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied, struct folio *folio,
- void *fsdata)
+static int blkdev_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
int ret;
- ret = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ ret = block_write_end(pos, len, copied, folio);
folio_unlock(folio);
folio_put(folio);
@@ -480,38 +536,51 @@ const struct address_space_operations def_blk_aops = {
#else /* CONFIG_BUFFER_HEAD */
static int blkdev_read_folio(struct file *file, struct folio *folio)
{
- return iomap_read_folio(folio, &blkdev_iomap_ops);
+ iomap_bio_read_folio(folio, &blkdev_iomap_ops);
+ return 0;
}
static void blkdev_readahead(struct readahead_control *rac)
{
- iomap_readahead(rac, &blkdev_iomap_ops);
+ iomap_bio_readahead(rac, &blkdev_iomap_ops);
}
-static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc,
- struct inode *inode, loff_t offset, unsigned int len)
+static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 offset, unsigned int len, u64 end_pos)
{
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read(wpc->inode);
if (WARN_ON_ONCE(offset >= isize))
return -EIO;
- if (offset >= wpc->iomap.offset &&
- offset < wpc->iomap.offset + wpc->iomap.length)
- return 0;
- return blkdev_iomap_begin(inode, offset, isize - offset,
- IOMAP_WRITE, &wpc->iomap, NULL);
+
+ if (offset < wpc->iomap.offset ||
+ offset >= wpc->iomap.offset + wpc->iomap.length) {
+ int error;
+
+ error = blkdev_iomap_begin(wpc->inode, offset, isize - offset,
+ IOMAP_WRITE, &wpc->iomap, NULL);
+ if (error)
+ return error;
+ }
+
+ return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
}
static const struct iomap_writeback_ops blkdev_writeback_ops = {
- .map_blocks = blkdev_map_blocks,
+ .writeback_range = blkdev_writeback_range,
+ .writeback_submit = iomap_ioend_writeback_submit,
};
static int blkdev_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct iomap_writepage_ctx wpc = { };
+ struct iomap_writepage_ctx wpc = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .ops = &blkdev_writeback_ops
+ };
- return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops);
+ return iomap_writepages(&wpc);
}
const struct address_space_operations def_blk_aops = {
@@ -617,12 +686,14 @@ static int blkdev_open(struct inode *inode, struct file *filp)
if (ret)
return ret;
- bdev = blkdev_get_no_open(inode->i_rdev);
+ bdev = blkdev_get_no_open(inode->i_rdev, true);
if (!bdev)
return -ENXIO;
if (bdev_can_atomic_write(bdev))
filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
+ if (blk_get_integrity(bdev->bd_disk))
+ filp->f_mode |= FMODE_HAS_METADATA;
ret = bdev_open(bdev, mode, filp->private_data, NULL, filp);
if (ret)
@@ -662,7 +733,8 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
{
- return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL);
+ return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL,
+ NULL);
}
/*
@@ -677,6 +749,7 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct file *file = iocb->ki_filp;
struct inode *bd_inode = bdev_file_inode(file);
struct block_device *bdev = I_BDEV(bd_inode);
+ bool atomic = iocb->ki_flags & IOCB_ATOMIC;
loff_t size = bdev_nr_bytes(bdev);
size_t shorted = 0;
ssize_t ret;
@@ -696,7 +769,7 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
return -EOPNOTSUPP;
- if (iocb->ki_flags & IOCB_ATOMIC) {
+ if (atomic) {
ret = generic_atomic_write_valid(iocb, from);
if (ret)
return ret;
@@ -704,6 +777,8 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
size -= iocb->ki_pos;
if (iov_iter_count(from) > size) {
+ if (atomic)
+ return -EINVAL;
shorted = iov_iter_count(from) - size;
iov_iter_truncate(from, size);
}
@@ -718,7 +793,14 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = direct_write_fallback(iocb, from, ret,
blkdev_buffered_write(iocb, from));
} else {
+ /*
+ * Take i_rwsem and invalidate_lock to avoid racing with
+ * set_blocksize changing i_blkbits/folio order and punching
+ * out the pagecache.
+ */
+ inode_lock_shared(bd_inode);
ret = blkdev_buffered_write(iocb, from);
+ inode_unlock_shared(bd_inode);
}
if (ret > 0)
@@ -729,6 +811,7 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
+ struct inode *bd_inode = bdev_file_inode(iocb->ki_filp);
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
loff_t size = bdev_nr_bytes(bdev);
loff_t pos = iocb->ki_pos;
@@ -755,16 +838,23 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
file_accessed(iocb->ki_filp);
ret = blkdev_direct_IO(iocb, to);
- if (ret >= 0) {
+ if (ret > 0) {
iocb->ki_pos += ret;
count -= ret;
}
- iov_iter_revert(to, count - iov_iter_count(to));
+ if (ret != -EIOCBQUEUED)
+ iov_iter_revert(to, count - iov_iter_count(to));
if (ret < 0 || !count)
goto reexpand;
}
+ /*
+ * Take i_rwsem and invalidate_lock to avoid racing with set_blocksize
+ * changing i_blkbits/folio order and punching out the pagecache.
+ */
+ inode_lock_shared(bd_inode);
ret = filemap_read(iocb, to, ret);
+ inode_unlock_shared(bd_inode);
reexpand:
if (unlikely(shorted))
@@ -774,7 +864,7 @@ reexpand:
#define BLKDEV_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
- FALLOC_FL_ZERO_RANGE)
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_WRITE_ZEROES)
static long blkdev_fallocate(struct file *file, int mode, loff_t start,
loff_t len)
@@ -783,11 +873,19 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
struct block_device *bdev = I_BDEV(inode);
loff_t end = start + len - 1;
loff_t isize;
+ unsigned int flags;
int error;
/* Fail if we don't recognize the flags. */
if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
return -EOPNOTSUPP;
+ /*
+ * Don't allow writing zeroes if the device does not enable the
+ * unmap write zeroes operation.
+ */
+ if ((mode & FALLOC_FL_WRITE_ZEROES) &&
+ !bdev_write_zeroes_unmap_sectors(bdev))
+ return -EOPNOTSUPP;
/* Don't go off the end of the device. */
isize = bdev_nr_bytes(bdev);
@@ -807,49 +905,49 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
if ((start | len) & (bdev_logical_block_size(bdev) - 1))
return -EINVAL;
+ inode_lock(inode);
filemap_invalidate_lock(inode->i_mapping);
- /*
- * Invalidate the page cache, including dirty pages, for valid
- * de-allocate mode calls to fallocate().
- */
switch (mode) {
case FALLOC_FL_ZERO_RANGE:
case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
- error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
- if (error)
- goto fail;
-
- error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
- len >> SECTOR_SHIFT, GFP_KERNEL,
- BLKDEV_ZERO_NOUNMAP);
+ flags = BLKDEV_ZERO_NOUNMAP;
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
- error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
- if (error)
- goto fail;
-
- error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
- len >> SECTOR_SHIFT, GFP_KERNEL,
- BLKDEV_ZERO_NOFALLBACK);
+ flags = BLKDEV_ZERO_NOFALLBACK;
+ break;
+ case FALLOC_FL_WRITE_ZEROES:
+ flags = 0;
break;
default:
error = -EOPNOTSUPP;
+ goto fail;
}
+ /*
+ * Invalidate the page cache, including dirty pages, for valid
+ * de-allocate mode calls to fallocate().
+ */
+ error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
+ if (error)
+ goto fail;
+
+ error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
+ len >> SECTOR_SHIFT, GFP_KERNEL, flags);
fail:
filemap_invalidate_unlock(inode->i_mapping);
+ inode_unlock(inode);
return error;
}
-static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
+static int blkdev_mmap_prepare(struct vm_area_desc *desc)
{
- struct inode *bd_inode = bdev_file_inode(file);
+ struct file *file = desc->file;
- if (bdev_read_only(I_BDEV(bd_inode)))
- return generic_file_readonly_mmap(file, vma);
+ if (bdev_read_only(I_BDEV(bdev_file_inode(file))))
+ return generic_file_readonly_mmap_prepare(desc);
- return generic_file_mmap(file, vma);
+ return generic_file_mmap_prepare(desc);
}
const struct file_operations def_blk_fops = {
@@ -859,7 +957,7 @@ const struct file_operations def_blk_fops = {
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
.iopoll = iocb_bio_iopoll,
- .mmap = blkdev_mmap,
+ .mmap_prepare = blkdev_mmap_prepare,
.fsync = blkdev_fsync,
.unlocked_ioctl = blkdev_ioctl,
#ifdef CONFIG_COMPAT