diff options
Diffstat (limited to 'block')
81 files changed, 7581 insertions, 6561 deletions
diff --git a/block/Kconfig b/block/Kconfig index 1de4682d48cc..15027963472d 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -62,6 +62,8 @@ config BLK_DEV_BSGLIB config BLK_DEV_INTEGRITY bool "Block layer data integrity support" + select CRC_T10DIF + select CRC64 help Some storage devices allow extra information to be stored/retrieved to help protect the data. The block layer @@ -72,12 +74,6 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. -config BLK_DEV_INTEGRITY_T10 - tristate - depends on BLK_DEV_INTEGRITY - select CRC_T10DIF - select CRC64_ROCKSOFT - config BLK_DEV_WRITE_MOUNTED bool "Allow writing to mounted block devices" default y @@ -100,7 +96,6 @@ config BLK_DEV_WRITE_MOUNTED config BLK_DEV_ZONED bool "Zoned block device support" - select MQ_IOSCHED_DEADLINE help Block layer zoned block device support. This option enables support for ZAC/ZBC/ZNS host-managed and host-aware zoned block @@ -120,17 +115,6 @@ config BLK_DEV_THROTTLING See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information. -config BLK_DEV_THROTTLING_LOW - bool "Block throttling .low limit interface support (EXPERIMENTAL)" - depends on BLK_DEV_THROTTLING - help - Add .low limit interface for block throttling. The low limit is a best - effort limit to prioritize cgroups. Depending on the setting, the limit - can be used to protect cgroups in terms of bandwidth/iops and better - utilize disk resource. - - Note, this is an experimental interface and could be changed someday. - config BLK_WBT bool "Enable support for block device writeback throttling" help @@ -198,10 +182,6 @@ config BLK_DEBUG_FS Unless you are building a kernel for a tiny system, you should say Y here. -config BLK_DEBUG_FS_ZONED - bool - default BLK_DEBUG_FS && BLK_DEV_ZONED - config BLK_SED_OPAL bool "Logic for interfacing with Opal enabled SEDs" depends on KEYS @@ -231,14 +211,6 @@ config BLK_INLINE_ENCRYPTION_FALLBACK source "block/partitions/Kconfig" -config BLK_MQ_PCI - def_bool PCI - -config BLK_MQ_VIRTIO - bool - depends on VIRTIO - default y - config BLK_PM def_bool PM diff --git a/block/Makefile b/block/Makefile index 46ada9dc8bbf..c65f4da93702 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,13 +5,12 @@ obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ - blk-merge.o blk-timeout.o \ - blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ + blk-merge.o blk-timeout.o blk-lib.o blk-mq.o \ + blk-mq-tag.o blk-mq-dma.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \ disk-events.o blk-ia-ranges.o early-lookup.o -obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o @@ -26,14 +25,11 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o -obj-$(CONFIG_BLK_DEV_INTEGRITY_T10) += t10-pi.o -obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o -obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o \ + bio-integrity-auto.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o -obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \ diff --git a/block/badblocks.c b/block/badblocks.c index db4ec8b9b2a8..ece64e76fe8f 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -528,51 +528,6 @@ out: } /* - * Return 'true' if the range indicated by 'bad' can be backward merged - * with the bad range (from the bad table) index by 'behind'. - */ -static bool can_merge_behind(struct badblocks *bb, - struct badblocks_context *bad, int behind) -{ - sector_t sectors = bad->len; - sector_t s = bad->start; - u64 *p = bb->page; - - if ((s < BB_OFFSET(p[behind])) && - ((s + sectors) >= BB_OFFSET(p[behind])) && - ((BB_END(p[behind]) - s) <= BB_MAX_LEN) && - BB_ACK(p[behind]) == bad->ack) - return true; - return false; -} - -/* - * Do backward merge for range indicated by 'bad' and the bad range - * (from the bad table) indexed by 'behind'. The return value is merged - * sectors from bad->len. - */ -static int behind_merge(struct badblocks *bb, struct badblocks_context *bad, - int behind) -{ - sector_t sectors = bad->len; - sector_t s = bad->start; - u64 *p = bb->page; - int merged = 0; - - WARN_ON(s >= BB_OFFSET(p[behind])); - WARN_ON((s + sectors) < BB_OFFSET(p[behind])); - - if (s < BB_OFFSET(p[behind])) { - merged = BB_OFFSET(p[behind]) - s; - p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack); - - WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN); - } - - return merged; -} - -/* * Return 'true' if the range indicated by 'bad' can be forward * merged with the bad range (from the bad table) indexed by 'prev'. */ @@ -745,7 +700,7 @@ static bool can_front_overwrite(struct badblocks *bb, int prev, *extra = 2; } - if ((bb->count + (*extra)) >= MAX_BADBLOCKS) + if ((bb->count + (*extra)) > MAX_BADBLOCKS) return false; return true; @@ -855,40 +810,60 @@ static void badblocks_update_acked(struct badblocks *bb) bb->unacked_exist = 0; } +/* + * Return 'true' if the range indicated by 'bad' is exactly backward + * overlapped with the bad range (from bad table) indexed by 'behind'. + */ +static bool try_adjacent_combine(struct badblocks *bb, int prev) +{ + u64 *p = bb->page; + + if (prev >= 0 && (prev + 1) < bb->count && + BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && + (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && + BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), + BB_ACK(p[prev])); + + if ((prev + 2) < bb->count) + memmove(p + prev + 1, p + prev + 2, + (bb->count - (prev + 2)) * 8); + bb->count--; + return true; + } + return false; +} + /* Do exact work to set bad block range into the bad block table */ -static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) +static bool _badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors, + int acknowledged) { - int retried = 0, space_desired = 0; - int orig_len, len = 0, added = 0; + int len = 0, added = 0; struct badblocks_context bad; int prev = -1, hint = -1; - sector_t orig_start; unsigned long flags; - int rv = 0; u64 *p; if (bb->shift < 0) /* badblocks are disabled */ - return 1; + return false; if (sectors == 0) /* Invalid sectors number */ - return 1; + return false; if (bb->shift) { /* round the start down, and the end up */ sector_t next = s + sectors; - rounddown(s, bb->shift); - roundup(next, bb->shift); + rounddown(s, 1 << bb->shift); + roundup(next, 1 << bb->shift); sectors = next - s; } write_seqlock_irqsave(&bb->lock, flags); - orig_start = s; - orig_len = sectors; bad.ack = acknowledged; p = bb->page; @@ -897,6 +872,9 @@ re_insert: bad.len = sectors; len = 0; + if (badblocks_full(bb)) + goto out; + if (badblocks_empty(bb)) { len = insert_at(bb, 0, &bad); bb->count++; @@ -908,32 +886,14 @@ re_insert: /* start before all badblocks */ if (prev < 0) { - if (!badblocks_full(bb)) { - /* insert on the first */ - if (bad.len > (BB_OFFSET(p[0]) - bad.start)) - bad.len = BB_OFFSET(p[0]) - bad.start; - len = insert_at(bb, 0, &bad); - bb->count++; - added++; - hint = 0; - goto update_sectors; - } - - /* No sapce, try to merge */ - if (overlap_behind(bb, &bad, 0)) { - if (can_merge_behind(bb, &bad, 0)) { - len = behind_merge(bb, &bad, 0); - added++; - } else { - len = BB_OFFSET(p[0]) - s; - space_desired = 1; - } - hint = 0; - goto update_sectors; - } - - /* no table space and give up */ - goto out; + /* insert on the first */ + if (bad.len > (BB_OFFSET(p[0]) - bad.start)) + bad.len = BB_OFFSET(p[0]) - bad.start; + len = insert_at(bb, 0, &bad); + bb->count++; + added++; + hint = ++prev; + goto update_sectors; } /* in case p[prev-1] can be merged with p[prev] */ @@ -945,33 +905,6 @@ re_insert: goto update_sectors; } - if (overlap_front(bb, prev, &bad)) { - if (can_merge_front(bb, prev, &bad)) { - len = front_merge(bb, prev, &bad); - added++; - } else { - int extra = 0; - - if (!can_front_overwrite(bb, prev, &bad, &extra)) { - len = min_t(sector_t, - BB_END(p[prev]) - s, sectors); - hint = prev; - goto update_sectors; - } - - len = front_overwrite(bb, prev, &bad, extra); - added++; - bb->count += extra; - - if (can_combine_front(bb, prev, &bad)) { - front_combine(bb, prev); - bb->count--; - } - } - hint = prev; - goto update_sectors; - } - if (can_merge_front(bb, prev, &bad)) { len = front_merge(bb, prev, &bad); added++; @@ -979,21 +912,29 @@ re_insert: goto update_sectors; } - /* if no space in table, still try to merge in the covered range */ - if (badblocks_full(bb)) { - /* skip the cannot-merge range */ - if (((prev + 1) < bb->count) && - overlap_behind(bb, &bad, prev + 1) && - ((s + sectors) >= BB_END(p[prev + 1]))) { - len = BB_END(p[prev + 1]) - s; - hint = prev + 1; + if (overlap_front(bb, prev, &bad)) { + int extra = 0; + + if (!can_front_overwrite(bb, prev, &bad, &extra)) { + if (extra > 0) + goto out; + + len = min_t(sector_t, + BB_END(p[prev]) - s, sectors); + hint = prev; goto update_sectors; } - /* no retry any more */ - len = sectors; - space_desired = 1; - hint = -1; + len = front_overwrite(bb, prev, &bad, extra); + added++; + bb->count += extra; + + if (can_combine_front(bb, prev, &bad)) { + front_combine(bb, prev); + bb->count--; + } + + hint = prev; goto update_sectors; } @@ -1006,7 +947,7 @@ re_insert: len = insert_at(bb, prev + 1, &bad); bb->count++; added++; - hint = prev + 1; + hint = ++prev; update_sectors: s += len; @@ -1015,35 +956,12 @@ update_sectors: if (sectors > 0) goto re_insert; - WARN_ON(sectors < 0); - /* * Check whether the following already set range can be * merged. (prev < 0) condition is not handled here, * because it's already complicated enough. */ - if (prev >= 0 && - (prev + 1) < bb->count && - BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && - (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && - BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { - p[prev] = BB_MAKE(BB_OFFSET(p[prev]), - BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), - BB_ACK(p[prev])); - - if ((prev + 2) < bb->count) - memmove(p + prev + 1, p + prev + 2, - (bb->count - (prev + 2)) * 8); - bb->count--; - } - - if (space_desired && !badblocks_full(bb)) { - s = orig_start; - sectors = orig_len; - space_desired = 0; - if (retried++ < 3) - goto re_insert; - } + try_adjacent_combine(bb, prev); out: if (added) { @@ -1057,10 +975,7 @@ out: write_sequnlock_irqrestore(&bb->lock, flags); - if (!added) - rv = 1; - - return rv; + return sectors == 0; } /* @@ -1131,21 +1046,20 @@ static int front_splitting_clear(struct badblocks *bb, int prev, } /* Do the exact work to clear bad block range from the bad block table */ -static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +static bool _badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors) { struct badblocks_context bad; int prev = -1, hint = -1; int len = 0, cleared = 0; - int rv = 0; u64 *p; if (bb->shift < 0) /* badblocks are disabled */ - return 1; + return false; if (sectors == 0) /* Invalid sectors number */ - return 1; + return false; if (bb->shift) { sector_t target; @@ -1157,8 +1071,8 @@ static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) * isn't than to think a block is not bad when it is. */ target = s + sectors; - roundup(s, bb->shift); - rounddown(target, bb->shift); + roundup(s, 1 << bb->shift); + rounddown(target, 1 << bb->shift); sectors = target - s; } @@ -1214,7 +1128,7 @@ re_clear: if ((BB_OFFSET(p[prev]) < bad.start) && (BB_END(p[prev]) > (bad.start + bad.len))) { /* Splitting */ - if ((bb->count + 1) < MAX_BADBLOCKS) { + if ((bb->count + 1) <= MAX_BADBLOCKS) { len = front_splitting_clear(bb, prev, &bad); bb->count += 1; cleared++; @@ -1255,8 +1169,6 @@ update_sectors: if (sectors > 0) goto re_clear; - WARN_ON(sectors < 0); - if (cleared) { badblocks_update_acked(bb); set_changed(bb); @@ -1265,40 +1177,21 @@ update_sectors: write_sequnlock_irq(&bb->lock); if (!cleared) - rv = 1; + return false; - return rv; + return true; } /* Do the exact work to check bad blocks range from the bad block table */ -static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) +static int _badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors) { - int unacked_badblocks, acked_badblocks; int prev = -1, hint = -1, set = 0; struct badblocks_context bad; - unsigned int seq; + int unacked_badblocks = 0; + int acked_badblocks = 0; + u64 *p = bb->page; int len, rv; - u64 *p; - - WARN_ON(bb->shift < 0 || sectors == 0); - - if (bb->shift > 0) { - sector_t target; - - /* round the start down, and the end up */ - target = s + sectors; - rounddown(s, bb->shift); - roundup(target, bb->shift); - sectors = target - s; - } - -retry: - seq = read_seqbegin(&bb->lock); - - p = bb->page; - unacked_badblocks = 0; - acked_badblocks = 0; re_check: bad.start = s; @@ -1349,14 +1242,15 @@ re_check: len = sectors; update_sectors: + /* This situation should never happen */ + WARN_ON(sectors < len); + s += len; sectors -= len; if (sectors > 0) goto re_check; - WARN_ON(sectors < 0); - if (unacked_badblocks > 0) rv = -1; else if (acked_badblocks > 0) @@ -1364,9 +1258,6 @@ update_sectors: else rv = 0; - if (read_seqretry(&bb->lock, seq)) - goto retry; - return rv; } @@ -1404,10 +1295,30 @@ update_sectors: * -1: there are bad blocks which have not yet been acknowledged in metadata. * plus the start/length of the first bad section we overlap. */ -int badblocks_check(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) +int badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors) { - return _badblocks_check(bb, s, sectors, first_bad, bad_sectors); + unsigned int seq; + int rv; + + WARN_ON(bb->shift < 0 || sectors == 0); + + if (bb->shift > 0) { + /* round the start down, and the end up */ + sector_t target = s + sectors; + + rounddown(s, 1 << bb->shift); + roundup(target, 1 << bb->shift); + sectors = target - s; + } + +retry: + seq = read_seqbegin(&bb->lock); + rv = _badblocks_check(bb, s, sectors, first_bad, bad_sectors); + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return rv; } EXPORT_SYMBOL_GPL(badblocks_check); @@ -1423,11 +1334,12 @@ EXPORT_SYMBOL_GPL(badblocks_check); * decide how best to handle it. * * Return: - * 0: success - * 1: failed to set badblocks (out of space) + * true: success + * false: failed to set badblocks (out of space). Parital setting will be + * treated as failure. */ -int badblocks_set(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) +bool badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors, + int acknowledged) { return _badblocks_set(bb, s, sectors, acknowledged); } @@ -1444,10 +1356,10 @@ EXPORT_SYMBOL_GPL(badblocks_set); * drop the remove request. * * Return: - * 0: success - * 1: failed to clear badblocks + * true: success + * false: failed to clear badblocks */ -int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +bool badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors) { return _badblocks_clear(bb, s, sectors); } @@ -1479,6 +1391,11 @@ void ack_all_badblocks(struct badblocks *bb) p[i] = BB_MAKE(start, len, 1); } } + + for (i = 0; i < bb->count ; i++) + while (try_adjacent_combine(bb, i)) + ; + bb->unacked_exist = 0; } write_sequnlock_irq(&bb->lock); @@ -1564,10 +1481,10 @@ ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, return -EINVAL; } - if (badblocks_set(bb, sector, length, !unack)) + if (!badblocks_set(bb, sector, length, !unack)) return -ENOSPC; - else - return len; + + return len; } EXPORT_SYMBOL_GPL(badblocks_store); diff --git a/block/bdev.c b/block/bdev.c index da2a167a4d08..b77ddd12dc06 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -24,6 +24,7 @@ #include <linux/pseudo_fs.h> #include <linux/uio.h> #include <linux/namei.h> +#include <linux/security.h> #include <linux/part_stat.h> #include <linux/uaccess.h> #include <linux/stat.h> @@ -43,6 +44,11 @@ static inline struct bdev_inode *BDEV_I(struct inode *inode) return container_of(inode, struct bdev_inode, vfs_inode); } +static inline struct inode *BD_INODE(struct block_device *bdev) +{ + return &container_of(bdev, struct bdev_inode, bdev)->vfs_inode; +} + struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; @@ -57,7 +63,7 @@ EXPORT_SYMBOL(file_bdev); static void bdev_write_inode(struct block_device *bdev) { - struct inode *inode = bdev->bd_inode; + struct inode *inode = BD_INODE(bdev); int ret; spin_lock(&inode->i_lock); @@ -76,7 +82,7 @@ static void bdev_write_inode(struct block_device *bdev) /* Kill _all_ buffers and pagecache , dirty or not.. */ static void kill_bdev(struct block_device *bdev) { - struct address_space *mapping = bdev->bd_inode->i_mapping; + struct address_space *mapping = bdev->bd_mapping; if (mapping_empty(mapping)) return; @@ -88,7 +94,7 @@ static void kill_bdev(struct block_device *bdev) /* Invalidate clean unused buffers and pagecache. */ void invalidate_bdev(struct block_device *bdev) { - struct address_space *mapping = bdev->bd_inode->i_mapping; + struct address_space *mapping = bdev->bd_mapping; if (mapping->nrpages) { invalidate_bh_lrus(); @@ -116,7 +122,7 @@ int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, goto invalidate; } - truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); + truncate_inode_pages_range(bdev->bd_mapping, lstart, lend); if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(bdev, truncate_bdev_range); return 0; @@ -126,7 +132,7 @@ invalidate: * Someone else has handle exclusively open. Try invalidating instead. * The 'end' argument is inclusive so the rounding is safe. */ - return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, + return invalidate_inode_pages2_range(bdev->bd_mapping, lstart >> PAGE_SHIFT, lend >> PAGE_SHIFT); } @@ -134,31 +140,77 @@ invalidate: static void set_init_blocksize(struct block_device *bdev) { unsigned int bsize = bdev_logical_block_size(bdev); - loff_t size = i_size_read(bdev->bd_inode); + loff_t size = i_size_read(BD_INODE(bdev)); while (bsize < PAGE_SIZE) { if (size & bsize) break; bsize <<= 1; } - bdev->bd_inode->i_blkbits = blksize_bits(bsize); + BD_INODE(bdev)->i_blkbits = blksize_bits(bsize); + mapping_set_folio_min_order(BD_INODE(bdev)->i_mapping, + get_order(bsize)); } -int set_blocksize(struct block_device *bdev, int size) +/** + * bdev_validate_blocksize - check that this block size is acceptable + * @bdev: blockdevice to check + * @block_size: block size to check + * + * For block device users that do not use buffer heads or the block device + * page cache, make sure that this block size can be used with the device. + * + * Return: On success zero is returned, negative error code on failure. + */ +int bdev_validate_blocksize(struct block_device *bdev, int block_size) { - /* Size must be a power of two, and between 512 and PAGE_SIZE */ - if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) + if (blk_validate_block_size(block_size)) return -EINVAL; /* Size cannot be smaller than the size supported by the device */ - if (size < bdev_logical_block_size(bdev)) + if (block_size < bdev_logical_block_size(bdev)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(bdev_validate_blocksize); + +int set_blocksize(struct file *file, int size) +{ + struct inode *inode = file->f_mapping->host; + struct block_device *bdev = I_BDEV(inode); + int ret; + + ret = bdev_validate_blocksize(bdev, size); + if (ret) + return ret; + + if (!file->private_data) return -EINVAL; /* Don't change the size if it is same as current */ - if (bdev->bd_inode->i_blkbits != blksize_bits(size)) { + if (inode->i_blkbits != blksize_bits(size)) { + /* + * Flush and truncate the pagecache before we reconfigure the + * mapping geometry because folio sizes are variable now. If a + * reader has already allocated a folio whose size is smaller + * than the new min_order but invokes readahead after the new + * min_order becomes visible, readahead will think there are + * "zero" blocks per folio and crash. Take the inode and + * invalidation locks to avoid racing with + * read/write/fallocate. + */ + inode_lock(inode); + filemap_invalidate_lock(inode->i_mapping); + sync_blockdev(bdev); - bdev->bd_inode->i_blkbits = blksize_bits(size); kill_bdev(bdev); + + inode->i_blkbits = blksize_bits(size); + mapping_set_folio_min_order(inode->i_mapping, get_order(size)); + kill_bdev(bdev); + filemap_invalidate_unlock(inode->i_mapping); + inode_unlock(inode); } return 0; } @@ -167,10 +219,11 @@ EXPORT_SYMBOL(set_blocksize); int sb_set_blocksize(struct super_block *sb, int size) { - if (set_blocksize(sb->s_bdev, size)) + if (!(sb->s_type->fs_flags & FS_LBS) && size > PAGE_SIZE) return 0; - /* If we get here, we know size is power of two - * and it's value is between 512 and PAGE_SIZE */ + if (set_blocksize(sb->s_bdev_file, size)) + return 0; + /* If we get here, we know size is validated */ sb->s_blocksize = size; sb->s_blocksize_bits = blksize_bits(size); return sb->s_blocksize; @@ -192,7 +245,7 @@ int sync_blockdev_nowait(struct block_device *bdev) { if (!bdev) return 0; - return filemap_flush(bdev->bd_inode->i_mapping); + return filemap_flush(bdev->bd_mapping); } EXPORT_SYMBOL_GPL(sync_blockdev_nowait); @@ -204,13 +257,13 @@ int sync_blockdev(struct block_device *bdev) { if (!bdev) return 0; - return filemap_write_and_wait(bdev->bd_inode->i_mapping); + return filemap_write_and_wait(bdev->bd_mapping); } EXPORT_SYMBOL(sync_blockdev); int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend) { - return filemap_write_and_wait_range(bdev->bd_inode->i_mapping, + return filemap_write_and_wait_range(bdev->bd_mapping, lstart, lend); } EXPORT_SYMBOL(sync_blockdev_range); @@ -313,6 +366,11 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) if (!ei) return NULL; memset(&ei->bdev, 0, sizeof(ei->bdev)); + + if (security_bdev_alloc(&ei->bdev)) { + kmem_cache_free(bdev_cachep, ei); + return NULL; + } return &ei->vfs_inode; } @@ -322,6 +380,7 @@ static void bdev_free_inode(struct inode *inode) free_percpu(bdev->bd_stats); kfree(bdev->bd_meta_info); + security_bdev_free(bdev); if (!bdev_is_partition(bdev)) { if (bdev->bd_disk && bdev->bd_disk->bdi) @@ -374,7 +433,7 @@ static struct file_system_type bd_type = { }; struct super_block *blockdev_superblock __ro_after_init; -struct vfsmount *blockdev_mnt __ro_after_init; +static struct vfsmount *blockdev_mnt __ro_after_init; EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) @@ -411,13 +470,11 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) mutex_init(&bdev->bd_fsfreeze_mutex); spin_lock_init(&bdev->bd_size_lock); mutex_init(&bdev->bd_holder_lock); - bdev->bd_partno = partno; - bdev->bd_inode = inode; + atomic_set(&bdev->__bd_flags, partno); + bdev->bd_mapping = &inode->i_data; bdev->bd_queue = disk->queue; - if (partno) - bdev->bd_has_submit_bio = disk->part0->bd_has_submit_bio; - else - bdev->bd_has_submit_bio = false; + if (partno && bdev_test_flag(disk->part0, BD_HAS_SUBMIT_BIO)) + bdev_set_flag(bdev, BD_HAS_SUBMIT_BIO); bdev->bd_stats = alloc_percpu(struct disk_stats); if (!bdev->bd_stats) { iput(inode); @@ -430,19 +487,30 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) { spin_lock(&bdev->bd_size_lock); - i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + i_size_write(BD_INODE(bdev), (loff_t)sectors << SECTOR_SHIFT); bdev->bd_nr_sectors = sectors; spin_unlock(&bdev->bd_size_lock); } void bdev_add(struct block_device *bdev, dev_t dev) { + struct inode *inode = BD_INODE(bdev); if (bdev_stable_writes(bdev)) - mapping_set_stable_writes(bdev->bd_inode->i_mapping); + mapping_set_stable_writes(bdev->bd_mapping); bdev->bd_dev = dev; - bdev->bd_inode->i_rdev = dev; - bdev->bd_inode->i_ino = dev; - insert_inode_hash(bdev->bd_inode); + inode->i_rdev = dev; + inode->i_ino = dev; + insert_inode_hash(inode); +} + +void bdev_unhash(struct block_device *bdev) +{ + remove_inode_hash(BD_INODE(bdev)); +} + +void bdev_drop(struct block_device *bdev) +{ + iput(BD_INODE(bdev)); } long nr_blockdev_pages(void) @@ -528,7 +596,7 @@ retry: /* if claiming is already in progress, wait for it to finish */ if (whole->bd_claiming) { - wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); + wait_queue_head_t *wq = __var_waitqueue(&whole->bd_claiming); DEFINE_WAIT(wait); prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); @@ -551,7 +619,7 @@ static void bd_clear_claiming(struct block_device *whole, void *holder) /* tell others that we're done */ BUG_ON(whole->bd_claiming != holder); whole->bd_claiming = NULL; - wake_up_bit(&whole->bd_claiming, 0); + wake_up_var(&whole->bd_claiming); } /** @@ -620,7 +688,7 @@ static void bd_end_claim(struct block_device *bdev, void *holder) bdev->bd_holder = NULL; bdev->bd_holder_ops = NULL; mutex_unlock(&bdev->bd_holder_lock); - if (bdev->bd_write_holder) + if (bdev_test_flag(bdev, BD_WRITE_HOLDER)) unblock = true; } if (!whole->bd_holders) @@ -633,7 +701,7 @@ static void bd_end_claim(struct block_device *bdev, void *holder) */ if (unblock) { disk_unblock_events(bdev->bd_disk); - bdev->bd_write_holder = false; + bdev_clear_flag(bdev, BD_WRITE_HOLDER); } } @@ -747,13 +815,13 @@ static void blkdev_put_part(struct block_device *part) blkdev_put_whole(whole); } -struct block_device *blkdev_get_no_open(dev_t dev) +struct block_device *blkdev_get_no_open(dev_t dev, bool autoload) { struct block_device *bdev; struct inode *inode; inode = ilookup(blockdev_superblock, dev); - if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) { + if (!inode && autoload && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) { blk_request_module(dev); inode = ilookup(blockdev_superblock, dev); if (inode) @@ -900,9 +968,10 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, * writeable reference is too fragile given the way @mode is * used in blkdev_get/put(). */ - if ((mode & BLK_OPEN_WRITE) && !bdev->bd_write_holder && + if ((mode & BLK_OPEN_WRITE) && + !bdev_test_flag(bdev, BD_WRITE_HOLDER) && (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) { - bdev->bd_write_holder = true; + bdev_set_flag(bdev, BD_WRITE_HOLDER); unblock_events = false; } } @@ -912,12 +981,12 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, disk_unblock_events(disk); bdev_file->f_flags |= O_LARGEFILE; - bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; + bdev_file->f_mode |= FMODE_CAN_ODIRECT; if (bdev_nowait(bdev)) bdev_file->f_mode |= FMODE_NOWAIT; if (mode & BLK_OPEN_RESTRICT_WRITES) bdev_file->f_mode |= FMODE_WRITE_RESTRICTED; - bdev_file->f_mapping = bdev->bd_inode->i_mapping; + bdev_file->f_mapping = bdev->bd_mapping; bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping); bdev_file->private_data = holder; @@ -974,18 +1043,18 @@ struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, if (ret) return ERR_PTR(ret); - bdev = blkdev_get_no_open(dev); + bdev = blkdev_get_no_open(dev, true); if (!bdev) return ERR_PTR(-ENXIO); flags = blk_to_file_flags(mode); - bdev_file = alloc_file_pseudo_noaccount(bdev->bd_inode, + bdev_file = alloc_file_pseudo_noaccount(BD_INODE(bdev), blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops); if (IS_ERR(bdev_file)) { blkdev_put_no_open(bdev); return bdev_file; } - ihold(bdev->bd_inode); + ihold(BD_INODE(bdev)); ret = bdev_open(bdev, mode, holder, hops, bdev_file); if (ret) { @@ -1239,27 +1308,54 @@ void sync_bdevs(bool wait) } /* - * Handle STATX_DIOALIGN for block devices. - * - * Note that the inode passed to this is the inode of a block device node file, - * not the block device's internal inode. Therefore it is *not* valid to use - * I_BDEV() here; the block device has to be looked up by i_rdev instead. + * Handle STATX_{DIOALIGN, WRITE_ATOMIC} for block devices. */ -void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) +void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask) { struct block_device *bdev; - bdev = blkdev_get_no_open(inode->i_rdev); + /* + * Note that d_backing_inode() returns the block device node inode, not + * the block device's internal inode. Therefore it is *not* valid to + * use I_BDEV() here; the block device has to be looked up by i_rdev + * instead. + */ + bdev = blkdev_get_no_open(d_backing_inode(path->dentry)->i_rdev, false); if (!bdev) return; - stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; - stat->dio_offset_align = bdev_logical_block_size(bdev); - stat->result_mask |= STATX_DIOALIGN; + if (request_mask & STATX_DIOALIGN) { + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + stat->dio_offset_align = bdev_logical_block_size(bdev); + stat->result_mask |= STATX_DIOALIGN; + } + + if (request_mask & STATX_WRITE_ATOMIC && bdev_can_atomic_write(bdev)) { + struct request_queue *bd_queue = bdev->bd_queue; + + generic_fill_statx_atomic_writes(stat, + queue_atomic_write_unit_min_bytes(bd_queue), + queue_atomic_write_unit_max_bytes(bd_queue), + 0); + } + + stat->blksize = bdev_io_min(bdev); blkdev_put_no_open(bdev); } +bool disk_live(struct gendisk *disk) +{ + return !inode_unhashed(BD_INODE(disk->part0)); +} +EXPORT_SYMBOL_GPL(disk_live); + +unsigned int block_size(struct block_device *bdev) +{ + return 1 << BD_INODE(bdev)->i_blkbits; +} +EXPORT_SYMBOL_GPL(block_size); + static int __init setup_bdev_allow_write_mounted(char *str) { if (kstrtobool(str, &bdev_allow_write_mounted)) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index d442ee358fc2..9fb9f3533150 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -679,12 +679,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); bfqg_and_blkg_put(old_parent); - if (entity->parent && - entity->parent->last_bfqq_created == bfqq) - entity->parent->last_bfqq_created = NULL; - else if (bfqd->last_bfqq_created == bfqq) - bfqd->last_bfqq_created = NULL; - + bfq_reassign_last_bfqq(bfqq, NULL); entity->parent = bfqg->my_entity; entity->sched_data = &bfqg->sched_data; /* pin down bfqg and its associated blkg */ @@ -797,57 +792,6 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) */ bfq_link_bfqg(bfqd, bfqg); __bfq_bic_change_cgroup(bfqd, bic, bfqg); - /* - * Update blkg_path for bfq_log_* functions. We cache this - * path, and update it here, for the following - * reasons. Operations on blkg objects in blk-cgroup are - * protected with the request_queue lock, and not with the - * lock that protects the instances of this scheduler - * (bfqd->lock). This exposes BFQ to the following sort of - * race. - * - * The blkg_lookup performed in bfq_get_queue, protected - * through rcu, may happen to return the address of a copy of - * the original blkg. If this is the case, then the - * bfqg_and_blkg_get performed in bfq_get_queue, to pin down - * the blkg, is useless: it does not prevent blk-cgroup code - * from destroying both the original blkg and all objects - * directly or indirectly referred by the copy of the - * blkg. - * - * On the bright side, destroy operations on a blkg invoke, as - * a first step, hooks of the scheduler associated with the - * blkg. And these hooks are executed with bfqd->lock held for - * BFQ. As a consequence, for any blkg associated with the - * request queue this instance of the scheduler is attached - * to, we are guaranteed that such a blkg is not destroyed, and - * that all the pointers it contains are consistent, while we - * are holding bfqd->lock. A blkg_lookup performed with - * bfqd->lock held then returns a fully consistent blkg, which - * remains consistent until this lock is held. - * - * Thanks to the last fact, and to the fact that: (1) bfqg has - * been obtained through a blkg_lookup in the above - * assignment, and (2) bfqd->lock is being held, here we can - * safely use the policy data for the involved blkg (i.e., the - * field bfqg->pd) to get to the blkg associated with bfqg, - * and then we can safely use any field of blkg. After we - * release bfqd->lock, even just getting blkg through this - * bfqg may cause dangling references to be traversed, as - * bfqg->pd may not exist any more. - * - * In view of the above facts, here we cache, in the bfqg, any - * blkg data we may need for this bic, and for its associated - * bfq_queue. As of now, we need to cache only the path of the - * blkg, which is used in the bfq_log_* functions. - * - * Finally, note that bfqg itself needs to be protected from - * destruction on the blkg_free of the original blkg (which - * invokes bfq_pd_free). We use an additional private - * refcounter for bfqg, to let it disappear only after no - * bfq_queue refers to it any longer. - */ - blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); bic->blkcg_serial_nr = serial_nr; } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 4b88a54a9b76..0cb1e9873aab 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -582,23 +582,31 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, #define BFQ_LIMIT_INLINE_DEPTH 16 #ifdef CONFIG_BFQ_GROUP_IOSCHED -static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) +static bool bfqq_request_over_limit(struct bfq_data *bfqd, + struct bfq_io_cq *bic, blk_opf_t opf, + unsigned int act_idx, int limit) { - struct bfq_data *bfqd = bfqq->bfqd; - struct bfq_entity *entity = &bfqq->entity; struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH]; struct bfq_entity **entities = inline_entities; - int depth, level, alloc_depth = BFQ_LIMIT_INLINE_DEPTH; - int class_idx = bfqq->ioprio_class - 1; + int alloc_depth = BFQ_LIMIT_INLINE_DEPTH; struct bfq_sched_data *sched_data; + struct bfq_entity *entity; + struct bfq_queue *bfqq; unsigned long wsum; bool ret = false; - - if (!entity->on_st_or_in_serv) - return false; + int depth; + int level; retry: spin_lock_irq(&bfqd->lock); + bfqq = bic_to_bfqq(bic, op_is_sync(opf), act_idx); + if (!bfqq) + goto out; + + entity = &bfqq->entity; + if (!entity->on_st_or_in_serv) + goto out; + /* +1 for bfqq entity, root cgroup not included */ depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1; if (depth > alloc_depth) { @@ -643,7 +651,7 @@ retry: * class. */ wsum = 0; - for (i = 0; i <= class_idx; i++) { + for (i = 0; i <= bfqq->ioprio_class - 1; i++) { wsum = wsum * IOPRIO_BE_NR + sched_data->service_tree[i].wsum; } @@ -666,7 +674,9 @@ out: return ret; } #else -static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) +static bool bfqq_request_over_limit(struct bfq_data *bfqd, + struct bfq_io_cq *bic, blk_opf_t opf, + unsigned int act_idx, int limit) { return false; } @@ -704,8 +714,9 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) } for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) { - struct bfq_queue *bfqq = - bic_to_bfqq(bic, op_is_sync(opf), act_idx); + /* Fast path to check if bfqq is already allocated. */ + if (!bic_to_bfqq(bic, op_is_sync(opf), act_idx)) + continue; /* * Does queue (or any parent entity) exceed number of @@ -713,7 +724,7 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) * limit depth so that it cannot consume more * available requests and thus starve other entities. */ - if (bfqq && bfqq_request_over_limit(bfqq, limit)) { + if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) { depth = 1; break; } @@ -2911,8 +2922,12 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; /* if a merge has already been setup, then proceed with that first */ - if (bfqq->new_bfqq) - return bfqq->new_bfqq; + new_bfqq = bfqq->new_bfqq; + if (new_bfqq) { + while (new_bfqq->new_bfqq) + new_bfqq = new_bfqq->new_bfqq; + return new_bfqq; + } /* * Check delayed stable merge for rotational or non-queueing @@ -3093,8 +3108,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) } -static void -bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq) +void bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, + struct bfq_queue *new_bfqq) { if (cur_bfqq->entity.parent && cur_bfqq->entity.parent->last_bfqq_created == cur_bfqq) @@ -3125,10 +3140,12 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_put_queue(bfqq); } -static void -bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +static struct bfq_queue *bfq_merge_bfqqs(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct bfq_queue *bfqq) { + struct bfq_queue *new_bfqq = bfqq->new_bfqq; + bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", (unsigned long)new_bfqq->pid); /* Save weight raising and idle window of the merged queues */ @@ -3222,6 +3239,8 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, bfq_reassign_last_bfqq(bfqq, new_bfqq); bfq_release_process_ref(bfqd, bfqq); + + return new_bfqq; } static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, @@ -3257,14 +3276,8 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, * fulfilled, i.e., bic can be redirected to new_bfqq * and bfqq can be put. */ - bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq, - new_bfqq); - /* - * If we get here, bio will be queued into new_queue, - * so use new_bfqq to decide whether bio and rq can be - * merged. - */ - bfqq = new_bfqq; + while (bfqq != new_bfqq) + bfqq = bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq); /* * Change also bqfd->bio_bfqq, as @@ -5463,40 +5476,42 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, } } +static void _bfq_exit_icq(struct bfq_io_cq *bic, unsigned int num_actuators) +{ + struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data; + unsigned int act_idx; + + for (act_idx = 0; act_idx < num_actuators; act_idx++) { + if (bfqq_data[act_idx].stable_merge_bfqq) + bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq); + + bfq_exit_icq_bfqq(bic, true, act_idx); + bfq_exit_icq_bfqq(bic, false, act_idx); + } +} + static void bfq_exit_icq(struct io_cq *icq) { struct bfq_io_cq *bic = icq_to_bic(icq); struct bfq_data *bfqd = bic_to_bfqd(bic); unsigned long flags; - unsigned int act_idx; + /* * If bfqd and thus bfqd->num_actuators is not available any * longer, then cycle over all possible per-actuator bfqqs in * next loop. We rely on bic being zeroed on creation, and * therefore on its unused per-actuator fields being NULL. - */ - unsigned int num_actuators = BFQ_MAX_ACTUATORS; - struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data; - - /* + * * bfqd is NULL if scheduler already exited, and in that case * this is the last time these queues are accessed. */ if (bfqd) { spin_lock_irqsave(&bfqd->lock, flags); - num_actuators = bfqd->num_actuators; - } - - for (act_idx = 0; act_idx < num_actuators; act_idx++) { - if (bfqq_data[act_idx].stable_merge_bfqq) - bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq); - - bfq_exit_icq_bfqq(bic, true, act_idx); - bfq_exit_icq_bfqq(bic, false, act_idx); - } - - if (bfqd) + _bfq_exit_icq(bic, bfqd->num_actuators); spin_unlock_irqrestore(&bfqd->lock, flags); + } else { + _bfq_exit_icq(bic, BFQ_MAX_ACTUATORS); + } } /* @@ -5699,9 +5714,7 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, * state before killing it. */ bfqq->bic = bic; - bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); - - return new_bfqq; + return bfq_merge_bfqqs(bfqd, bic, bfqq); } /* @@ -6156,6 +6169,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) bool waiting, idle_timer_disabled = false; if (new_bfqq) { + struct bfq_queue *old_bfqq = bfqq; /* * Release the request's reference to the old bfqq * and make sure one is taken to the shared queue. @@ -6172,18 +6186,18 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) * new_bfqq. */ if (bic_to_bfqq(RQ_BIC(rq), true, - bfq_actuator_index(bfqd, rq->bio)) == bfqq) - bfq_merge_bfqqs(bfqd, RQ_BIC(rq), - bfqq, new_bfqq); + bfq_actuator_index(bfqd, rq->bio)) == bfqq) { + while (bfqq != new_bfqq) + bfqq = bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq); + } - bfq_clear_bfqq_just_created(bfqq); + bfq_clear_bfqq_just_created(old_bfqq); /* * rq is about to be enqueued into new_bfqq, * release rq reference on bfqq */ - bfq_put_queue(bfqq); + bfq_put_queue(old_bfqq); rq->elv.priv[1] = new_bfqq; - bfqq = new_bfqq; } bfq_update_io_thinktime(bfqd, bfqq); @@ -6721,7 +6735,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); - if (bfqq_process_refs(bfqq) == 1) { + if (bfqq_process_refs(bfqq) == 1 && !bfqq->new_bfqq) { bfqq->pid = current->pid; bfq_clear_bfqq_coop(bfqq); bfq_clear_bfqq_split_coop(bfqq); @@ -6736,11 +6750,10 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) return NULL; } -static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, - struct bfq_io_cq *bic, - struct bio *bio, - bool split, bool is_sync, - bool *new_queue) +static struct bfq_queue * +__bfq_get_bfqq_handle_split(struct bfq_data *bfqd, struct bfq_io_cq *bic, + struct bio *bio, bool split, bool is_sync, + bool *new_queue) { unsigned int act_idx = bfq_actuator_index(bfqd, bio); struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, act_idx); @@ -6819,6 +6832,92 @@ static void bfq_prepare_request(struct request *rq) rq->elv.priv[0] = rq->elv.priv[1] = NULL; } +static struct bfq_queue *bfq_waker_bfqq(struct bfq_queue *bfqq) +{ + struct bfq_queue *new_bfqq = bfqq->new_bfqq; + struct bfq_queue *waker_bfqq = bfqq->waker_bfqq; + + if (!waker_bfqq) + return NULL; + + while (new_bfqq) { + if (new_bfqq == waker_bfqq) { + /* + * If waker_bfqq is in the merge chain, and current + * is the only process, waker_bfqq can be freed. + */ + if (bfqq_process_refs(waker_bfqq) == 1) + return NULL; + + return waker_bfqq; + } + + new_bfqq = new_bfqq->new_bfqq; + } + + /* + * If waker_bfqq is not in the merge chain, and it's procress reference + * is 0, waker_bfqq can be freed. + */ + if (bfqq_process_refs(waker_bfqq) == 0) + return NULL; + + return waker_bfqq; +} + +static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct bio *bio, + unsigned int idx, + bool is_sync) +{ + struct bfq_queue *waker_bfqq; + struct bfq_queue *bfqq; + bool new_queue = false; + + bfqq = __bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, + &new_queue); + if (unlikely(new_queue)) + return bfqq; + + /* If the queue was seeky for too long, break it apart. */ + if (!bfq_bfqq_coop(bfqq) || !bfq_bfqq_split_coop(bfqq) || + bic->bfqq_data[idx].stably_merged) + return bfqq; + + waker_bfqq = bfq_waker_bfqq(bfqq); + + /* Update bic before losing reference to bfqq */ + if (bfq_bfqq_in_large_burst(bfqq)) + bic->bfqq_data[idx].saved_in_large_burst = true; + + bfqq = bfq_split_bfqq(bic, bfqq); + if (bfqq) { + bfq_bfqq_resume_state(bfqq, bfqd, bic, true); + return bfqq; + } + + bfqq = __bfq_get_bfqq_handle_split(bfqd, bic, bio, true, is_sync, NULL); + if (unlikely(bfqq == &bfqd->oom_bfqq)) + return bfqq; + + bfq_bfqq_resume_state(bfqq, bfqd, bic, false); + bfqq->waker_bfqq = waker_bfqq; + bfqq->tentative_waker_bfqq = NULL; + + /* + * If the waker queue disappears, then new_bfqq->waker_bfqq must be + * reset. So insert new_bfqq into the + * woken_list of the waker. See + * bfq_check_waker for details. + */ + if (waker_bfqq) + hlist_add_head(&bfqq->woken_list_node, + &bfqq->waker_bfqq->woken_list); + + return bfqq; +} + /* * If needed, init rq, allocate bfq data structures associated with * rq, and increment reference counters in the destination bfq_queue @@ -6850,8 +6949,6 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) struct bfq_io_cq *bic; const int is_sync = rq_is_sync(rq); struct bfq_queue *bfqq; - bool new_queue = false; - bool bfqq_already_existing = false, split = false; unsigned int a_idx = bfq_actuator_index(bfqd, bio); if (unlikely(!rq->elv.icq)) @@ -6868,54 +6965,9 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) return RQ_BFQQ(rq); bic = icq_to_bic(rq->elv.icq); - bfq_check_ioprio_change(bic, bio); - bfq_bic_update_cgroup(bic, bio); - - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, - &new_queue); - - if (likely(!new_queue)) { - /* If the queue was seeky for too long, break it apart. */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) && - !bic->bfqq_data[a_idx].stably_merged) { - struct bfq_queue *old_bfqq = bfqq; - - /* Update bic before losing reference to bfqq */ - if (bfq_bfqq_in_large_burst(bfqq)) - bic->bfqq_data[a_idx].saved_in_large_burst = - true; - - bfqq = bfq_split_bfqq(bic, bfqq); - split = true; - - if (!bfqq) { - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, - true, is_sync, - NULL); - if (unlikely(bfqq == &bfqd->oom_bfqq)) - bfqq_already_existing = true; - } else - bfqq_already_existing = true; - - if (!bfqq_already_existing) { - bfqq->waker_bfqq = old_bfqq->waker_bfqq; - bfqq->tentative_waker_bfqq = NULL; - - /* - * If the waker queue disappears, then - * new_bfqq->waker_bfqq must be - * reset. So insert new_bfqq into the - * woken_list of the waker. See - * bfq_check_waker for details. - */ - if (bfqq->waker_bfqq) - hlist_add_head(&bfqq->woken_list_node, - &bfqq->waker_bfqq->woken_list); - } - } - } + bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, a_idx, is_sync); bfqq_request_allocated(bfqq); bfqq->ref++; @@ -6932,18 +6984,9 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) * addition, if the queue has also just been split, we have to * resume its state. */ - if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { + if (likely(bfqq != &bfqd->oom_bfqq) && !bfqq->new_bfqq && + bfqq_process_refs(bfqq) == 1) bfqq->bic = bic; - if (split) { - /* - * The queue has just been split from a shared - * queue: restore the idle window and the - * possible weight raising period. - */ - bfq_bfqq_resume_state(bfqq, bfqd, bic, - bfqq_already_existing); - } - } /* * Consider bfqq as possibly belonging to a burst of newly @@ -7167,8 +7210,8 @@ static void bfq_exit_queue(struct elevator_queue *e) #endif blk_stat_disable_accounting(bfqd->queue); - clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags); - wbt_enable_default(bfqd->queue->disk); + blk_queue_flag_clear(QUEUE_FLAG_DISABLE_WBT_DEF, bfqd->queue); + set_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT, &e->flags); kfree(bfqd); } @@ -7272,9 +7315,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) INIT_LIST_HEAD(&bfqd->dispatch); - hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - bfqd->idle_slice_timer.function = bfq_idle_slice_timer; + hrtimer_setup(&bfqd->idle_slice_timer, bfq_idle_slice_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); bfqd->queue_weights_tree = RB_ROOT_CACHED; #ifdef CONFIG_BFQ_GROUP_IOSCHED @@ -7355,7 +7397,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); - set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags); + blk_queue_flag_set(QUEUE_FLAG_DISABLE_WBT_DEF, q); wbt_disable_default(q->disk); blk_stat_enable_accounting(q); @@ -7579,7 +7621,7 @@ static ssize_t bfq_low_latency_store(struct elevator_queue *e, #define BFQ_ATTR(name) \ __ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store) -static struct elv_fs_entry bfq_attrs[] = { +static const struct elv_fs_entry bfq_attrs[] = { BFQ_ATTR(fifo_expire_sync), BFQ_ATTR(fifo_expire_async), BFQ_ATTR(back_seek_max), diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 467e8cfc41a2..687a3a7ba784 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -1003,9 +1003,6 @@ struct bfq_group { /* must be the first member */ struct blkg_policy_data pd; - /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */ - char blkg_path[128]; - /* reference counter (see comments in bfq_bic_update_cgroup) */ refcount_t ref; @@ -1159,6 +1156,8 @@ void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration); void bfq_add_bfqq_busy(struct bfq_queue *bfqq); void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq); void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq); +void bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, + struct bfq_queue *new_bfqq); /* --------------- end of interface of B-WF2Q+ ---------------- */ @@ -1186,11 +1185,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq); "%s " fmt, pid_str, ##args); \ } while (0) -#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ - blk_add_cgroup_trace_msg((bfqd)->queue, \ - &bfqg_to_blkg(bfqg)->blkcg->css, fmt, ##args); \ -} while (0) - #else /* CONFIG_BFQ_GROUP_IOSCHED */ #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ @@ -1200,7 +1194,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq); bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args); \ } while (0) -#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) #endif /* CONFIG_BFQ_GROUP_IOSCHED */ diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c new file mode 100644 index 000000000000..9c6657664792 --- /dev/null +++ b/block/bio-integrity-auto.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007, 2008, 2009 Oracle Corporation + * Written by: Martin K. Petersen <martin.petersen@oracle.com> + * + * Automatically generate and verify integrity data on PI capable devices if the + * bio submitter didn't provide PI itself. This ensures that kernel verifies + * data integrity even if the file system (or other user of the block device) is + * not aware of PI. + */ +#include <linux/blk-integrity.h> +#include <linux/t10-pi.h> +#include <linux/workqueue.h> +#include "blk.h" + +struct bio_integrity_data { + struct bio *bio; + struct bvec_iter saved_bio_iter; + struct work_struct work; + struct bio_integrity_payload bip; + struct bio_vec bvec; +}; + +static struct kmem_cache *bid_slab; +static mempool_t bid_pool; +static struct workqueue_struct *kintegrityd_wq; + +static void bio_integrity_finish(struct bio_integrity_data *bid) +{ + bid->bio->bi_integrity = NULL; + bid->bio->bi_opf &= ~REQ_INTEGRITY; + kfree(bvec_virt(bid->bip.bip_vec)); + mempool_free(bid, &bid_pool); +} + +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_data *bid = + container_of(work, struct bio_integrity_data, work); + struct bio *bio = bid->bio; + + blk_integrity_verify_iter(bio, &bid->saved_bio_iter); + bio_integrity_finish(bid); + bio_endio(bio); +} + +#define BIP_CHECK_FLAGS (BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) +static bool bip_should_check(struct bio_integrity_payload *bip) +{ + return bip->bip_flags & BIP_CHECK_FLAGS; +} + +static bool bi_offload_capable(struct blk_integrity *bi) +{ + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + return bi->tuple_size == sizeof(struct crc64_pi_tuple); + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + return bi->tuple_size == sizeof(struct t10_pi_tuple); + default: + pr_warn_once("%s: unknown integrity checksum type:%d\n", + __func__, bi->csum_type); + fallthrough; + case BLK_INTEGRITY_CSUM_NONE: + return false; + } +} + +/** + * __bio_integrity_endio - Integrity I/O completion function + * @bio: Protected bio + * + * Normally I/O completion is done in interrupt context. However, verifying I/O + * integrity is a time-consuming task which must be run in process context. + * + * This function postpones completion accordingly. + */ +bool __bio_integrity_endio(struct bio *bio) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct bio_integrity_data *bid = + container_of(bip, struct bio_integrity_data, bip); + + if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && + bip_should_check(bip)) { + INIT_WORK(&bid->work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bid->work); + return false; + } + + bio_integrity_finish(bid); + return true; +} + +/** + * bio_integrity_prep - Prepare bio for integrity I/O + * @bio: bio to prepare + * + * Checks if the bio already has an integrity payload attached. If it does, the + * payload has been generated by another kernel subsystem, and we just pass it + * through. + * Otherwise allocates integrity payload and for writes the integrity metadata + * will be generated. For reads, the completion handler will verify the + * metadata. + */ +bool bio_integrity_prep(struct bio *bio) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_data *bid; + bool set_flags = true; + gfp_t gfp = GFP_NOIO; + unsigned int len; + void *buf; + + if (!bi) + return true; + + if (!bio_sectors(bio)) + return true; + + /* Already protected? */ + if (bio_integrity(bio)) + return true; + + switch (bio_op(bio)) { + case REQ_OP_READ: + if (bi->flags & BLK_INTEGRITY_NOVERIFY) { + if (bi_offload_capable(bi)) + return true; + set_flags = false; + } + break; + case REQ_OP_WRITE: + /* + * Zero the memory allocated to not leak uninitialized kernel + * memory to disk for non-integrity metadata where nothing else + * initializes the memory. + */ + if (bi->flags & BLK_INTEGRITY_NOGENERATE) { + if (bi_offload_capable(bi)) + return true; + set_flags = false; + gfp |= __GFP_ZERO; + } else if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) + gfp |= __GFP_ZERO; + break; + default: + return true; + } + + if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) + return true; + + /* Allocate kernel buffer for protection data */ + len = bio_integrity_bytes(bi, bio_sectors(bio)); + buf = kmalloc(len, gfp); + if (!buf) + goto err_end_io; + bid = mempool_alloc(&bid_pool, GFP_NOIO); + if (!bid) + goto err_free_buf; + bio_integrity_init(bio, &bid->bip, &bid->bvec, 1); + + bid->bio = bio; + + bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY; + bip_set_seed(&bid->bip, bio->bi_iter.bi_sector); + + if (set_flags) { + if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) + bid->bip.bip_flags |= BIP_IP_CHECKSUM; + if (bi->csum_type) + bid->bip.bip_flags |= BIP_CHECK_GUARD; + if (bi->flags & BLK_INTEGRITY_REF_TAG) + bid->bip.bip_flags |= BIP_CHECK_REFTAG; + } + + if (bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)) < len) + goto err_end_io; + + /* Auto-generate integrity metadata if this is a write */ + if (bio_data_dir(bio) == WRITE && bip_should_check(&bid->bip)) + blk_integrity_generate(bio); + else + bid->saved_bio_iter = bio->bi_iter; + return true; + +err_free_buf: + kfree(buf); +err_end_io: + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + return false; +} +EXPORT_SYMBOL(bio_integrity_prep); + +void blk_flush_integrity(void) +{ + flush_workqueue(kintegrityd_wq); +} + +static int __init blk_integrity_auto_init(void) +{ + bid_slab = kmem_cache_create("bio_integrity_data", + sizeof(struct bio_integrity_data), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + if (mempool_init_slab_pool(&bid_pool, BIO_POOL_SIZE, bid_slab)) + panic("bio: can't create integrity pool\n"); + + /* + * kintegrityd won't block much but may burn a lot of CPU cycles. + * Make it highpri CPU intensive wq with max concurrency of 1. + */ + kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); + if (!kintegrityd_wq) + panic("Failed to create kintegrityd\n"); + return 0; +} +subsys_initcall(blk_integrity_auto_init); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 2e3e8e04961e..cb94e9be26dc 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -7,32 +7,36 @@ */ #include <linux/blk-integrity.h> -#include <linux/mempool.h> -#include <linux/export.h> -#include <linux/bio.h> -#include <linux/workqueue.h> -#include <linux/slab.h> #include "blk.h" -static struct kmem_cache *bip_slab; -static struct workqueue_struct *kintegrityd_wq; +struct bio_integrity_alloc { + struct bio_integrity_payload bip; + struct bio_vec bvecs[]; +}; -void blk_flush_integrity(void) +/** + * bio_integrity_free - Free bio integrity payload + * @bio: bio containing bip to be freed + * + * Description: Free the integrity portion of a bio. + */ +void bio_integrity_free(struct bio *bio) { - flush_workqueue(kintegrityd_wq); + kfree(bio_integrity(bio)); + bio->bi_integrity = NULL; + bio->bi_opf &= ~REQ_INTEGRITY; } -static void __bio_integrity_free(struct bio_set *bs, - struct bio_integrity_payload *bip) +void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip, + struct bio_vec *bvecs, unsigned int nr_vecs) { - if (bs && mempool_initialized(&bs->bio_integrity_pool)) { - if (bip->bip_vec) - bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, - bip->bip_max_vcnt); - mempool_free(bip, &bs->bio_integrity_pool); - } else { - kfree(bip); - } + memset(bip, 0, sizeof(*bip)); + bip->bip_max_vcnt = nr_vecs; + if (nr_vecs) + bip->bip_vec = bvecs; + + bio->bi_integrity = bip; + bio->bi_opf |= REQ_INTEGRITY; } /** @@ -49,109 +53,61 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs) { - struct bio_integrity_payload *bip; - struct bio_set *bs = bio->bi_pool; - unsigned inline_vecs; + struct bio_integrity_alloc *bia; if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) return ERR_PTR(-EOPNOTSUPP); - if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) { - bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask); - inline_vecs = nr_vecs; - } else { - bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask); - inline_vecs = BIO_INLINE_VECS; - } - - if (unlikely(!bip)) + bia = kmalloc(struct_size(bia, bvecs, nr_vecs), gfp_mask); + if (unlikely(!bia)) return ERR_PTR(-ENOMEM); - - memset(bip, 0, sizeof(*bip)); - - /* always report as many vecs as asked explicitly, not inline vecs */ - bip->bip_max_vcnt = nr_vecs; - if (nr_vecs > inline_vecs) { - bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool, - &bip->bip_max_vcnt, gfp_mask); - if (!bip->bip_vec) - goto err; - } else { - bip->bip_vec = bip->bip_inline_vecs; - } - - bip->bip_bio = bio; - bio->bi_integrity = bip; - bio->bi_opf |= REQ_INTEGRITY; - - return bip; -err: - __bio_integrity_free(bs, bip); - return ERR_PTR(-ENOMEM); + bio_integrity_init(bio, &bia->bip, bia->bvecs, nr_vecs); + return &bia->bip; } EXPORT_SYMBOL(bio_integrity_alloc); -static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs, - bool dirty) +static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs) { int i; - for (i = 0; i < nr_vecs; i++) { - if (dirty && !PageCompound(bv[i].bv_page)) - set_page_dirty_lock(bv[i].bv_page); + for (i = 0; i < nr_vecs; i++) unpin_user_page(bv[i].bv_page); - } } static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip) { - unsigned short nr_vecs = bip->bip_max_vcnt - 1; - struct bio_vec *copy = &bip->bip_vec[1]; - size_t bytes = bip->bip_iter.bi_size; - struct iov_iter iter; + unsigned short orig_nr_vecs = bip->bip_max_vcnt - 1; + struct bio_vec *orig_bvecs = &bip->bip_vec[1]; + struct bio_vec *bounce_bvec = &bip->bip_vec[0]; + size_t bytes = bounce_bvec->bv_len; + struct iov_iter orig_iter; int ret; - iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes); - ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter); + iov_iter_bvec(&orig_iter, ITER_DEST, orig_bvecs, orig_nr_vecs, bytes); + ret = copy_to_iter(bvec_virt(bounce_bvec), bytes, &orig_iter); WARN_ON_ONCE(ret != bytes); - bio_integrity_unpin_bvec(copy, nr_vecs, true); -} - -static void bio_integrity_unmap_user(struct bio_integrity_payload *bip) -{ - bool dirty = bio_data_dir(bip->bip_bio) == READ; - - if (bip->bip_flags & BIP_COPY_USER) { - if (dirty) - bio_integrity_uncopy_user(bip); - kfree(bvec_virt(bip->bip_vec)); - return; - } - - bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt, dirty); + bio_integrity_unpin_bvec(orig_bvecs, orig_nr_vecs); } /** - * bio_integrity_free - Free bio integrity payload - * @bio: bio containing bip to be freed + * bio_integrity_unmap_user - Unmap user integrity payload + * @bio: bio containing bip to be unmapped * - * Description: Used to free the integrity portion of a bio. Usually - * called from bio_free(). + * Unmap the user mapped integrity portion of a bio. */ -void bio_integrity_free(struct bio *bio) +void bio_integrity_unmap_user(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct bio_set *bs = bio->bi_pool; - if (bip->bip_flags & BIP_BLOCK_INTEGRITY) + if (bip->bip_flags & BIP_COPY_USER) { + if (bio_data_dir(bio) == READ) + bio_integrity_uncopy_user(bip); kfree(bvec_virt(bip->bip_vec)); - else if (bip->bip_flags & BIP_INTEGRITY_USER) - bio_integrity_unmap_user(bip); + return; + } - __bio_integrity_free(bs, bip); - bio->bi_integrity = NULL; - bio->bi_opf &= ~REQ_INTEGRITY; + bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt); } /** @@ -169,16 +125,10 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct bio_integrity_payload *bip = bio_integrity(bio); - if (((bip->bip_iter.bi_size + len) >> SECTOR_SHIFT) > - queue_max_hw_sectors(q)) - return 0; - if (bip->bip_vcnt > 0) { struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1]; - bool same_page = false; - if (bvec_try_merge_hw_page(q, bv, page, len, offset, - &same_page)) { + if (bvec_try_merge_hw_page(q, bv, page, len, offset)) { bip->bip_iter.bi_size += len; return len; } @@ -205,7 +155,7 @@ EXPORT_SYMBOL(bio_integrity_add_page); static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec, int nr_vecs, unsigned int len, - unsigned int direction, u32 seed) + unsigned int direction) { bool write = direction == ITER_SOURCE; struct bio_integrity_payload *bip; @@ -241,7 +191,7 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec, } if (write) - bio_integrity_unpin_bvec(bvec, nr_vecs, false); + bio_integrity_unpin_bvec(bvec, nr_vecs); else memcpy(&bip->bip_vec[1], bvec, nr_vecs * sizeof(*bvec)); @@ -252,8 +202,8 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec, goto free_bip; } - bip->bip_flags |= BIP_INTEGRITY_USER | BIP_COPY_USER; - bip->bip_iter.bi_sector = seed; + bip->bip_flags |= BIP_COPY_USER; + bip->bip_vcnt = nr_vecs; return 0; free_bip: bio_integrity_free(bio); @@ -263,7 +213,7 @@ free_buf: } static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec, - int nr_vecs, unsigned int len, u32 seed) + int nr_vecs, unsigned int len) { struct bio_integrity_payload *bip; @@ -272,9 +222,8 @@ static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec, return PTR_ERR(bip); memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec)); - bip->bip_flags |= BIP_INTEGRITY_USER; - bip->bip_iter.bi_sector = seed; bip->bip_iter.bi_size = len; + bip->bip_vcnt = nr_vecs; return 0; } @@ -308,17 +257,15 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, return nr_bvecs; } -int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, - u32 seed) +int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - unsigned int align = q->dma_pad_mask | queue_dma_alignment(q); + unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits); struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages; struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; + size_t offset, bytes = iter->count; unsigned int direction, nr_bvecs; - struct iov_iter iter; int ret, nr_vecs; - size_t offset; bool copy; if (bio_integrity(bio)) @@ -331,8 +278,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, else direction = ITER_SOURCE; - iov_iter_ubuf(&iter, direction, ubuf, bytes); - nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1); + nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS + 1); if (nr_vecs > BIO_MAX_VECS) return -E2BIG; if (nr_vecs > UIO_FASTIOV) { @@ -342,8 +288,8 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, pages = NULL; } - copy = !iov_iter_is_aligned(&iter, align, align); - ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset); + copy = !iov_iter_is_aligned(iter, align, align); + ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset); if (unlikely(ret < 0)) goto free_bvec; @@ -355,9 +301,9 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, if (copy) ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes, - direction, seed); + direction); else - ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed); + ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes); if (ret) goto release_pages; if (bvec != stack_vec) @@ -366,208 +312,60 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, return 0; release_pages: - bio_integrity_unpin_bvec(bvec, nr_bvecs, false); + bio_integrity_unpin_bvec(bvec, nr_bvecs); free_bvec: if (bvec != stack_vec) kfree(bvec); return ret; } -EXPORT_SYMBOL_GPL(bio_integrity_map_user); -/** - * bio_integrity_process - Process integrity metadata for a bio - * @bio: bio to generate/verify integrity metadata for - * @proc_iter: iterator to process - * @proc_fn: Pointer to the relevant processing function - */ -static blk_status_t bio_integrity_process(struct bio *bio, - struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn) +static void bio_uio_meta_to_bip(struct bio *bio, struct uio_meta *meta) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - struct blk_integrity_iter iter; - struct bvec_iter bviter; - struct bio_vec bv; struct bio_integrity_payload *bip = bio_integrity(bio); - blk_status_t ret = BLK_STS_OK; - iter.disk_name = bio->bi_bdev->bd_disk->disk_name; - iter.interval = 1 << bi->interval_exp; - iter.tuple_size = bi->tuple_size; - iter.seed = proc_iter->bi_sector; - iter.prot_buf = bvec_virt(bip->bip_vec); - iter.pi_offset = bi->pi_offset; + if (meta->flags & IO_INTEGRITY_CHK_GUARD) + bip->bip_flags |= BIP_CHECK_GUARD; + if (meta->flags & IO_INTEGRITY_CHK_APPTAG) + bip->bip_flags |= BIP_CHECK_APPTAG; + if (meta->flags & IO_INTEGRITY_CHK_REFTAG) + bip->bip_flags |= BIP_CHECK_REFTAG; - __bio_for_each_segment(bv, bio, bviter, *proc_iter) { - void *kaddr = bvec_kmap_local(&bv); - - iter.data_buf = kaddr; - iter.data_size = bv.bv_len; - ret = proc_fn(&iter); - kunmap_local(kaddr); - - if (ret) - break; - - } - return ret; + bip->app_tag = meta->app_tag; } -/** - * bio_integrity_prep - Prepare bio for integrity I/O - * @bio: bio to prepare - * - * Description: Checks if the bio already has an integrity payload attached. - * If it does, the payload has been generated by another kernel subsystem, - * and we just pass it through. Otherwise allocates integrity payload. - * The bio must have data direction, target device and start sector set priot - * to calling. In the WRITE case, integrity metadata will be generated using - * the block device's integrity function. In the READ case, the buffer - * will be prepared for DMA and a suitable end_io handler set up. - */ -bool bio_integrity_prep(struct bio *bio) +int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) { - struct bio_integrity_payload *bip; struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - void *buf; - unsigned long start, end; - unsigned int len, nr_pages; - unsigned int bytes, offset, i; + unsigned int integrity_bytes; + int ret; + struct iov_iter it; if (!bi) - return true; - - if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE) - return true; - - if (!bio_sectors(bio)) - return true; - - /* Already protected? */ - if (bio_integrity(bio)) - return true; - - if (bio_data_dir(bio) == READ) { - if (!bi->profile->verify_fn || - !(bi->flags & BLK_INTEGRITY_VERIFY)) - return true; - } else { - if (!bi->profile->generate_fn || - !(bi->flags & BLK_INTEGRITY_GENERATE)) - return true; - } - - /* Allocate kernel buffer for protection data */ - len = bio_integrity_bytes(bi, bio_sectors(bio)); - buf = kmalloc(len, GFP_NOIO); - if (unlikely(buf == NULL)) { - printk(KERN_ERR "could not allocate integrity buffer\n"); - goto err_end_io; - } - - end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = ((unsigned long) buf) >> PAGE_SHIFT; - nr_pages = end - start; - - /* Allocate bio integrity payload and integrity vectors */ - bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); - if (IS_ERR(bip)) { - printk(KERN_ERR "could not allocate data integrity bioset\n"); - kfree(buf); - goto err_end_io; - } - - bip->bip_flags |= BIP_BLOCK_INTEGRITY; - bip_set_seed(bip, bio->bi_iter.bi_sector); - - if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM) - bip->bip_flags |= BIP_IP_CHECKSUM; - - /* Map it */ - offset = offset_in_page(buf); - for (i = 0; i < nr_pages && len > 0; i++) { - bytes = PAGE_SIZE - offset; - - if (bytes > len) - bytes = len; - - if (bio_integrity_add_page(bio, virt_to_page(buf), - bytes, offset) < bytes) { - printk(KERN_ERR "could not attach integrity payload\n"); - goto err_end_io; - } - - buf += bytes; - len -= bytes; - offset = 0; - } - - /* Auto-generate integrity metadata if this is a write */ - if (bio_data_dir(bio) == WRITE) { - bio_integrity_process(bio, &bio->bi_iter, - bi->profile->generate_fn); - } else { - bip->bio_iter = bio->bi_iter; - } - return true; - -err_end_io: - bio->bi_status = BLK_STS_RESOURCE; - bio_endio(bio); - return false; -} -EXPORT_SYMBOL(bio_integrity_prep); - -/** - * bio_integrity_verify_fn - Integrity I/O completion worker - * @work: Work struct stored in bio to be verified - * - * Description: This workqueue function is called to complete a READ - * request. The function verifies the transferred integrity metadata - * and then calls the original bio end_io function. - */ -static void bio_integrity_verify_fn(struct work_struct *work) -{ - struct bio_integrity_payload *bip = - container_of(work, struct bio_integrity_payload, bip_work); - struct bio *bio = bip->bip_bio; - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - + return -EINVAL; /* - * At the moment verify is called bio's iterator was advanced - * during split and completion, we need to rewind iterator to - * it's original position. + * original meta iterator can be bigger. + * process integrity info corresponding to current data buffer only. */ - bio->bi_status = bio_integrity_process(bio, &bip->bio_iter, - bi->profile->verify_fn); - bio_integrity_free(bio); - bio_endio(bio); -} + it = meta->iter; + integrity_bytes = bio_integrity_bytes(bi, bio_sectors(bio)); + if (it.count < integrity_bytes) + return -EINVAL; -/** - * __bio_integrity_endio - Integrity I/O completion function - * @bio: Protected bio - * - * Description: Completion for integrity I/O - * - * Normally I/O completion is done in interrupt context. However, - * verifying I/O integrity is a time-consuming task which must be run - * in process context. This function postpones completion - * accordingly. - */ -bool __bio_integrity_endio(struct bio *bio) -{ - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - struct bio_integrity_payload *bip = bio_integrity(bio); + /* should fit into two bytes */ + BUILD_BUG_ON(IO_INTEGRITY_VALID_FLAGS >= (1 << 16)); - if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && - (bip->bip_flags & BIP_BLOCK_INTEGRITY) && bi->profile->verify_fn) { - INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); - queue_work(kintegrityd_wq, &bip->bip_work); - return false; - } + if (meta->flags && (meta->flags & ~IO_INTEGRITY_VALID_FLAGS)) + return -EINVAL; - bio_integrity_free(bio); - return true; + it.count = integrity_bytes; + ret = bio_integrity_map_user(bio, &it); + if (!ret) { + bio_uio_meta_to_bip(bio, meta); + bip_set_seed(bio_integrity(bio), meta->seed); + iov_iter_advance(&meta->iter, integrity_bytes); + meta->seed += bio_integrity_intervals(bi, bio_sectors(bio)); + } + return ret; } /** @@ -620,57 +418,14 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, BUG_ON(bip_src == NULL); - bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); + bip = bio_integrity_alloc(bio, gfp_mask, 0); if (IS_ERR(bip)) return PTR_ERR(bip); - memcpy(bip->bip_vec, bip_src->bip_vec, - bip_src->bip_vcnt * sizeof(struct bio_vec)); - - bip->bip_vcnt = bip_src->bip_vcnt; + bip->bip_vec = bip_src->bip_vec; bip->bip_iter = bip_src->bip_iter; - bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY; + bip->bip_flags = bip_src->bip_flags & BIP_CLONE_FLAGS; + bip->app_tag = bip_src->app_tag; return 0; } - -int bioset_integrity_create(struct bio_set *bs, int pool_size) -{ - if (mempool_initialized(&bs->bio_integrity_pool)) - return 0; - - if (mempool_init_slab_pool(&bs->bio_integrity_pool, - pool_size, bip_slab)) - return -1; - - if (biovec_init_pool(&bs->bvec_integrity_pool, pool_size)) { - mempool_exit(&bs->bio_integrity_pool); - return -1; - } - - return 0; -} -EXPORT_SYMBOL(bioset_integrity_create); - -void bioset_integrity_free(struct bio_set *bs) -{ - mempool_exit(&bs->bio_integrity_pool); - mempool_exit(&bs->bvec_integrity_pool); -} - -void __init bio_integrity_init(void) -{ - /* - * kintegrityd won't block much but may burn a lot of CPU cycles. - * Make it highpri CPU intensive wq with max concurrency of 1. - */ - kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | - WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); - if (!kintegrityd_wq) - panic("Failed to create kintegrityd\n"); - - bip_slab = kmem_cache_create("bio_integrity_payload", - sizeof(struct bio_integrity_payload) + - sizeof(struct bio_vec) * BIO_INLINE_VECS, - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); -} diff --git a/block/bio.c b/block/bio.c index d24420ed1c4c..3c0a558c90f5 100644 --- a/block/bio.c +++ b/block/bio.c @@ -4,7 +4,7 @@ */ #include <linux/mm.h> #include <linux/swap.h> -#include <linux/bio.h> +#include <linux/bio-integrity.h> #include <linux/blkdev.h> #include <linux/uio.h> #include <linux/iocontext.h> @@ -77,7 +77,7 @@ struct bio_slab { struct kmem_cache *slab; unsigned int slab_ref; unsigned int slab_size; - char name[8]; + char name[12]; }; static DEFINE_MUTEX(bio_slab_lock); static DEFINE_XARRAY(bio_slabs); @@ -251,6 +251,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->bi_flags = 0; bio->bi_ioprio = 0; bio->bi_write_hint = 0; + bio->bi_write_stream = 0; bio->bi_status = 0; bio->bi_iter.bi_sector = 0; bio->bi_iter.bi_size = 0; @@ -345,18 +346,29 @@ void bio_chain(struct bio *bio, struct bio *parent) } EXPORT_SYMBOL(bio_chain); -struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, - unsigned int nr_pages, blk_opf_t opf, gfp_t gfp) +/** + * bio_chain_and_submit - submit a bio after chaining it to another one + * @prev: bio to chain and submit + * @new: bio to chain to + * + * If @prev is non-NULL, chain it to @new and submit it. + * + * Return: @new. + */ +struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new) { - struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp); - - if (bio) { - bio_chain(bio, new); - submit_bio(bio); + if (prev) { + bio_chain(prev, new); + submit_bio(prev); } - return new; } + +struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, + unsigned int nr_pages, blk_opf_t opf, gfp_t gfp) +{ + return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp)); +} EXPORT_SYMBOL_GPL(blk_next_bio); static void bio_alloc_rescue(struct work_struct *work) @@ -600,7 +612,7 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) { struct bio *bio; - if (nr_vecs > UIO_MAXIOV) + if (nr_vecs > BIO_MAX_INLINE_VECS) return NULL; return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask); } @@ -816,6 +828,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) bio_set_flag(bio, BIO_CLONED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; + bio->bi_write_stream = bio_src->bi_write_stream; bio->bi_iter = bio_src->bi_iter; if (bio->bi_bdev) { @@ -907,7 +920,7 @@ static inline bool bio_full(struct bio *bio, unsigned len) } static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, - unsigned int len, unsigned int off, bool *same_page) + unsigned int len, unsigned int off) { size_t bv_end = bv->bv_offset + bv->bv_len; phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; @@ -920,8 +933,7 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) return false; - *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); - if (!*same_page) { + if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) { if (IS_ENABLED(CONFIG_KMSAN)) return false; if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE) @@ -934,133 +946,25 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, /* * Try to merge a page into a segment, while obeying the hardware segment - * size limit. This is not for normal read/write bios, but for passthrough - * or Zone Append operations that we can't split. + * size limit. + * + * This is kept around for the integrity metadata, which is still tries + * to build the initial bio to the hardware limit and doesn't have proper + * helpers to split. Hopefully this will go away soon. */ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, - struct page *page, unsigned len, unsigned offset, - bool *same_page) + struct page *page, unsigned len, unsigned offset) { unsigned long mask = queue_segment_boundary(q); - phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset; + phys_addr_t addr1 = bvec_phys(bv); phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; if ((addr1 | mask) != (addr2 | mask)) return false; if (len > queue_max_segment_size(q) - bv->bv_len) return false; - return bvec_try_merge_page(bv, page, len, offset, same_page); -} - -/** - * bio_add_hw_page - attempt to add a page to a bio with hw constraints - * @q: the target queue - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * @max_sectors: maximum number of sectors that can be added - * @same_page: return if the segment has been merged inside the same page - * - * Add a page to a bio while respecting the hardware max_sectors, max_segment - * and gap limitations. - */ -int bio_add_hw_page(struct request_queue *q, struct bio *bio, - struct page *page, unsigned int len, unsigned int offset, - unsigned int max_sectors, bool *same_page) -{ - unsigned int max_size = max_sectors << SECTOR_SHIFT; - - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) - return 0; - - len = min3(len, max_size, queue_max_segment_size(q)); - if (len > max_size - bio->bi_iter.bi_size) - return 0; - - if (bio->bi_vcnt > 0) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; - - if (bvec_try_merge_hw_page(q, bv, page, len, offset, - same_page)) { - bio->bi_iter.bi_size += len; - return len; - } - - if (bio->bi_vcnt >= - min(bio->bi_max_vecs, queue_max_segments(q))) - return 0; - - /* - * If the queue doesn't support SG gaps and adding this segment - * would create a gap, disallow it. - */ - if (bvec_gap_to_prev(&q->limits, bv, offset)) - return 0; - } - - bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset); - bio->bi_vcnt++; - bio->bi_iter.bi_size += len; - return len; -} - -/** - * bio_add_pc_page - attempt to add page to passthrough bio - * @q: the target queue - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * - * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block device - * limitations. The target block device must allow bio's up to PAGE_SIZE, - * so it is always possible to add a single page to an empty bio. - * - * This should only be used by passthrough bios. - */ -int bio_add_pc_page(struct request_queue *q, struct bio *bio, - struct page *page, unsigned int len, unsigned int offset) -{ - bool same_page = false; - return bio_add_hw_page(q, bio, page, len, offset, - queue_max_hw_sectors(q), &same_page); + return bvec_try_merge_page(bv, page, len, offset); } -EXPORT_SYMBOL(bio_add_pc_page); - -/** - * bio_add_zone_append_page - attempt to add page to zone-append bio - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * - * Attempt to add a page to the bio_vec maplist of a bio that will be submitted - * for a zone-append request. This can fail for a number of reasons, such as the - * bio being full or the target block device is not a zoned block device or - * other limitations of the target block device. The target block device must - * allow bio's up to PAGE_SIZE, so it is always possible to add a single page - * to an empty bio. - * - * Returns: number of bytes added to the bio, or 0 in case of a failure. - */ -int bio_add_zone_append_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) -{ - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - bool same_page = false; - - if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND)) - return 0; - - if (WARN_ON_ONCE(!bdev_is_zoned(bio->bi_bdev))) - return 0; - - return bio_add_hw_page(q, bio, page, len, offset, - queue_max_zone_append_sectors(q), &same_page); -} -EXPORT_SYMBOL_GPL(bio_add_zone_append_page); /** * __bio_add_page - add page(s) to a bio in a new segment @@ -1085,6 +989,22 @@ void __bio_add_page(struct bio *bio, struct page *page, EXPORT_SYMBOL_GPL(__bio_add_page); /** + * bio_add_virt_nofail - add data in the direct kernel mapping to a bio + * @bio: destination bio + * @vaddr: data to add + * @len: length of the data to add, may cross pages + * + * Add the data at @vaddr to @bio. The caller must have ensure a segment + * is available for the added data. No merging into an existing segment + * will be performed. + */ +void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len) +{ + __bio_add_page(bio, virt_to_page(vaddr), len, offset_in_page(vaddr)); +} +EXPORT_SYMBOL_GPL(bio_add_virt_nofail); + +/** * bio_add_page - attempt to add page(s) to bio * @bio: destination bio * @page: start page to add @@ -1097,8 +1017,6 @@ EXPORT_SYMBOL_GPL(__bio_add_page); int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - bool same_page = false; - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; if (bio->bi_iter.bi_size > UINT_MAX - len) @@ -1106,7 +1024,7 @@ int bio_add_page(struct bio *bio, struct page *page, if (bio->bi_vcnt > 0 && bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], - page, len, offset, &same_page)) { + page, len, offset)) { bio->bi_iter.bi_size += len; return len; } @@ -1121,10 +1039,12 @@ EXPORT_SYMBOL(bio_add_page); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off) { + unsigned long nr = off / PAGE_SIZE; + WARN_ON_ONCE(len > UINT_MAX); - WARN_ON_ONCE(off > UINT_MAX); - __bio_add_page(bio, &folio->page, len, off); + __bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE); } +EXPORT_SYMBOL_GPL(bio_add_folio_nofail); /** * bio_add_folio - Attempt to add part of a folio to a bio. @@ -1143,18 +1063,74 @@ void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off) { - if (len > UINT_MAX || off > UINT_MAX) + unsigned long nr = off / PAGE_SIZE; + + if (len > UINT_MAX) return false; - return bio_add_page(bio, &folio->page, len, off) > 0; + return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0; } EXPORT_SYMBOL(bio_add_folio); +/** + * bio_add_vmalloc_chunk - add a vmalloc chunk to a bio + * @bio: destination bio + * @vaddr: vmalloc address to add + * @len: total length in bytes of the data to add + * + * Add data starting at @vaddr to @bio and return how many bytes were added. + * This may be less than the amount originally asked. Returns 0 if no data + * could be added to @bio. + * + * This helper calls flush_kernel_vmap_range() for the range added. For reads + * the caller still needs to manually call invalidate_kernel_vmap_range() in + * the completion handler. + */ +unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len) +{ + unsigned int offset = offset_in_page(vaddr); + + len = min(len, PAGE_SIZE - offset); + if (bio_add_page(bio, vmalloc_to_page(vaddr), len, offset) < len) + return 0; + if (op_is_write(bio_op(bio))) + flush_kernel_vmap_range(vaddr, len); + return len; +} +EXPORT_SYMBOL_GPL(bio_add_vmalloc_chunk); + +/** + * bio_add_vmalloc - add a vmalloc region to a bio + * @bio: destination bio + * @vaddr: vmalloc address to add + * @len: total length in bytes of the data to add + * + * Add data starting at @vaddr to @bio. Return %true on success or %false if + * @bio does not have enough space for the payload. + * + * This helper calls flush_kernel_vmap_range() for the range added. For reads + * the caller still needs to manually call invalidate_kernel_vmap_range() in + * the completion handler. + */ +bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len) +{ + do { + unsigned int added = bio_add_vmalloc_chunk(bio, vaddr, len); + + if (!added) + return false; + vaddr += added; + len -= added; + } while (len); + + return true; +} +EXPORT_SYMBOL_GPL(bio_add_vmalloc); + void __bio_release_pages(struct bio *bio, bool mark_dirty) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) { - struct page *page; size_t nr_pages; if (mark_dirty) { @@ -1162,68 +1138,52 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty) folio_mark_dirty(fi.folio); folio_unlock(fi.folio); } - page = folio_page(fi.folio, fi.offset / PAGE_SIZE); nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE - fi.offset / PAGE_SIZE + 1; - do { - bio_release_page(bio, page++); - } while (--nr_pages != 0); + unpin_user_folio(fi.folio, nr_pages); } } EXPORT_SYMBOL_GPL(__bio_release_pages); -void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) +void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter) { - size_t size = iov_iter_count(iter); - WARN_ON_ONCE(bio->bi_max_vecs); - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - size_t max_sectors = queue_max_zone_append_sectors(q); - - size = min(size, max_sectors << SECTOR_SHIFT); - } - bio->bi_vcnt = iter->nr_segs; bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_bvec_done = iter->iov_offset; - bio->bi_iter.bi_size = size; + bio->bi_iter.bi_size = iov_iter_count(iter); bio_set_flag(bio, BIO_CLONED); } -static int bio_iov_add_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) +static unsigned int get_contig_folio_len(unsigned int *num_pages, + struct page **pages, unsigned int i, + struct folio *folio, size_t left, + size_t offset) { - bool same_page = false; + size_t bytes = left; + size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes); + unsigned int j; - if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len)) - return -EIO; + /* + * We might COW a single page in the middle of + * a large folio, so we have to check that all + * pages belong to the same folio. + */ + bytes -= contig_sz; + for (j = i + 1; j < i + *num_pages; j++) { + size_t next = min_t(size_t, PAGE_SIZE, bytes); - if (bio->bi_vcnt > 0 && - bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], - page, len, offset, &same_page)) { - bio->bi_iter.bi_size += len; - if (same_page) - bio_release_page(bio, page); - return 0; + if (page_folio(pages[j]) != folio || + pages[j] != pages[j - 1] + 1) { + break; + } + contig_sz += next; + bytes -= next; } - __bio_add_page(bio, page, len, offset); - return 0; -} + *num_pages = j - i; -static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) -{ - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - bool same_page = false; - - if (bio_add_hw_page(q, bio, page, len, offset, - queue_max_zone_append_sectors(q), &same_page) != len) - return -EINVAL; - if (same_page) - bio_release_page(bio, page); - return 0; + return contig_sz; } #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) @@ -1245,9 +1205,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; - ssize_t size, left; - unsigned len, i = 0; - size_t offset; + ssize_t size; + unsigned int num_pages, i = 0; + size_t offset, folio_offset, left, len; int ret = 0; /* @@ -1287,18 +1247,39 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) goto out; } - for (left = size, i = 0; left > 0; left -= len, i++) { + for (left = size, i = 0; left > 0; left -= len, i += num_pages) { struct page *page = pages[i]; + struct folio *folio = page_folio(page); + unsigned int old_vcnt = bio->bi_vcnt; - len = min_t(size_t, PAGE_SIZE - offset, left); - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - ret = bio_iov_add_zone_append_page(bio, page, len, - offset); - if (ret) - break; - } else - bio_iov_add_page(bio, page, len, offset); + folio_offset = ((size_t)folio_page_idx(folio, page) << + PAGE_SHIFT) + offset; + len = min(folio_size(folio) - folio_offset, left); + + num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); + + if (num_pages > 1) + len = get_contig_folio_len(&num_pages, pages, i, + folio, left, offset); + + if (!bio_add_folio(bio, folio, len, folio_offset)) { + WARN_ON_ONCE(1); + ret = -EINVAL; + goto out; + } + + if (bio_flagged(bio, BIO_PAGE_PINNED)) { + /* + * We're adding another fragment of a page that already + * was part of the last segment. Undo our pin as the + * page was pinned when an earlier fragment of it was + * added to the bio and __bio_release_pages expects a + * single pin per page. + */ + if (offset && bio->bi_vcnt == old_vcnt) + unpin_user_folio(folio, 1); + } offset = 0; } @@ -1384,6 +1365,56 @@ int submit_bio_wait(struct bio *bio) } EXPORT_SYMBOL(submit_bio_wait); +/** + * bdev_rw_virt - synchronously read into / write from kernel mapping + * @bdev: block device to access + * @sector: sector to access + * @data: data to read/write + * @len: length in byte to read/write + * @op: operation (e.g. REQ_OP_READ/REQ_OP_WRITE) + * + * Performs synchronous I/O to @bdev for @data/@len. @data must be in + * the kernel direct mapping and not a vmalloc address. + */ +int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, + size_t len, enum req_op op) +{ + struct bio_vec bv; + struct bio bio; + int error; + + if (WARN_ON_ONCE(is_vmalloc_addr(data))) + return -EIO; + + bio_init(&bio, bdev, &bv, 1, op); + bio.bi_iter.bi_sector = sector; + bio_add_virt_nofail(&bio, data, len); + error = submit_bio_wait(&bio); + bio_uninit(&bio); + return error; +} +EXPORT_SYMBOL_GPL(bdev_rw_virt); + +static void bio_wait_end_io(struct bio *bio) +{ + complete(bio->bi_private); + bio_put(bio); +} + +/* + * bio_await_chain - ends @bio and waits for every chained bio to complete + */ +void bio_await_chain(struct bio *bio) +{ + DECLARE_COMPLETION_ONSTACK_MAP(done, + bio->bi_bdev->bd_disk->lockdep_map); + + bio->bi_private = &done; + bio->bi_end_io = bio_wait_end_io; + bio_endio(bio); + blk_wait_io(&done); +} + void __bio_advance(struct bio *bio, unsigned bytes) { if (bio_integrity(bio)) @@ -1576,6 +1607,8 @@ again: if (!bio_integrity_endio(bio)) return; + blk_zone_bio_endio(bio); + rq_qos_done_bio(bio); if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { @@ -1596,9 +1629,18 @@ again: goto again; } - blk_throtl_bio_endio(bio); - /* release cgroup info */ - bio_uninit(bio); +#ifdef CONFIG_BLK_CGROUP + /* + * Release cgroup info. We shouldn't have to do this here, but quite + * a few callers of bio_init fail to call bio_uninit, so we cover up + * for that here at least for now. + */ + if (bio->bi_blkg) { + blkg_put(bio->bi_blkg); + bio->bi_blkg = NULL; + } +#endif + if (bio->bi_end_io) bio->bi_end_io(bio); } @@ -1623,16 +1665,22 @@ struct bio *bio_split(struct bio *bio, int sectors, { struct bio *split; - BUG_ON(sectors <= 0); - BUG_ON(sectors >= bio_sectors(bio)); + if (WARN_ON_ONCE(sectors <= 0)) + return ERR_PTR(-EINVAL); + if (WARN_ON_ONCE(sectors >= bio_sectors(bio))) + return ERR_PTR(-EINVAL); /* Zone append commands cannot be split */ if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) - return NULL; + return ERR_PTR(-EINVAL); + + /* atomic writes cannot be split */ + if (bio->bi_opf & REQ_ATOMIC) + return ERR_PTR(-EINVAL); split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs); if (!split) - return NULL; + return ERR_PTR(-ENOMEM); split->bi_iter.bi_size = sectors << 9; @@ -1659,6 +1707,10 @@ EXPORT_SYMBOL(bio_split); */ void bio_trim(struct bio *bio, sector_t offset, sector_t size) { + /* We should never trim an atomic write */ + if (WARN_ON_ONCE(bio->bi_opf & REQ_ATOMIC && size)) + return; + if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS || offset + size > bio_sectors(bio))) return; @@ -1702,7 +1754,6 @@ void bioset_exit(struct bio_set *bs) mempool_exit(&bs->bio_pool); mempool_exit(&bs->bvec_pool); - bioset_integrity_free(bs); if (bs->bio_slab) bio_put_slab(bs); bs->bio_slab = NULL; @@ -1782,8 +1833,6 @@ static int __init init_bio(void) BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags)); - bio_integrity_init(); - for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { struct biovec_slab *bvs = bvec_slabs + i; @@ -1799,9 +1848,6 @@ static int __init init_bio(void) BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE)) panic("bio: can't allocate bios\n"); - if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE)) - panic("bio: can't create integrity pool\n"); - return 0; } subsys_initcall(init_bio); diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c index 3304e841df7c..a55fb0c53558 100644 --- a/block/blk-cgroup-rwstat.c +++ b/block/blk-cgroup-rwstat.c @@ -9,25 +9,19 @@ int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) { int i, ret; - for (i = 0; i < BLKG_RWSTAT_NR; i++) { - ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); - if (ret) { - while (--i >= 0) - percpu_counter_destroy(&rwstat->cpu_cnt[i]); - return ret; - } + ret = percpu_counter_init_many(rwstat->cpu_cnt, 0, gfp, BLKG_RWSTAT_NR); + if (ret) + return ret; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) atomic64_set(&rwstat->aux_cnt[i], 0); - } return 0; } EXPORT_SYMBOL_GPL(blkg_rwstat_init); void blkg_rwstat_exit(struct blkg_rwstat *rwstat) { - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - percpu_counter_destroy(&rwstat->cpu_cnt[i]); + percpu_counter_destroy_many(rwstat->cpu_cnt, BLKG_RWSTAT_NR); } EXPORT_SYMBOL_GPL(blkg_rwstat_exit); diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h index 022527b0b043..703a16fe1404 100644 --- a/block/blk-cgroup-rwstat.h +++ b/block/blk-cgroup-rwstat.h @@ -52,7 +52,7 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, /** * blkg_rwstat_add - add a value to a blkg_rwstat * @rwstat: target blkg_rwstat - * @op: REQ_OP and flags + * @opf: REQ_OP and flags * @val: value to add * * Add @val to @rwstat. The counters are chosen according to @rw. The @@ -83,8 +83,9 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, /** * blkg_rwstat_read - read the current values of a blkg_rwstat * @rwstat: blkg_rwstat to read + * @result: where to put the current values * - * Read the current snapshot of @rwstat and return it in the aux counts. + * Read the current snapshot of @rwstat and return it in the @result counts. */ static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat, struct blkg_rwstat_sample *result) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 059467086b13..5936db7f8475 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -218,8 +218,7 @@ static void blkg_async_bio_workfn(struct work_struct *work) /* as long as there are pending bios, @blkg can't go away */ spin_lock(&blkg->async_bio_lock); - bio_list_merge(&bios, &blkg->async_bios); - bio_list_init(&blkg->async_bios); + bio_list_merge_init(&bios, &blkg->async_bios); spin_unlock(&blkg->async_bio_lock); /* start plug only when bio_list contains at least 2 bios */ @@ -323,6 +322,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, blkg->q = disk->queue; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; + blkg->iostat.blkg = blkg; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO spin_lock_init(&blkg->async_bio_lock); bio_list_init(&blkg->async_bios); @@ -619,13 +619,47 @@ restart: spin_unlock_irq(&q->queue_lock); } +static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] = src->bytes[i]; + dst->ios[i] = src->ios[i]; + } +} + +static void __blkg_clear_stat(struct blkg_iostat_set *bis) +{ + struct blkg_iostat cur = {0}; + unsigned long flags; + + flags = u64_stats_update_begin_irqsave(&bis->sync); + blkg_iostat_set(&bis->cur, &cur); + blkg_iostat_set(&bis->last, &cur); + u64_stats_update_end_irqrestore(&bis->sync, flags); +} + +static void blkg_clear_stat(struct blkcg_gq *blkg) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct blkg_iostat_set *s = per_cpu_ptr(blkg->iostat_cpu, cpu); + + __blkg_clear_stat(s); + } + __blkg_clear_stat(&blkg->iostat); +} + static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct cftype *cftype, u64 val) { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; - int i, cpu; + int i; + pr_info_once("blkio.%s is deprecated\n", cftype->name); mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); @@ -635,18 +669,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { - for_each_possible_cpu(cpu) { - struct blkg_iostat_set *bis = - per_cpu_ptr(blkg->iostat_cpu, cpu); - memset(bis, 0, sizeof(*bis)); - - /* Re-initialize the cleared blkg_iostat_set */ - u64_stats_init(&bis->sync); - bis->blkg = blkg; - } - memset(&blkg->iostat, 0, sizeof(blkg->iostat)); - u64_stats_init(&blkg->iostat.sync); - + blkg_clear_stat(blkg); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -774,7 +797,7 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) return -EINVAL; input = skip_spaces(input); - bdev = blkdev_get_no_open(MKDEV(major, minor)); + bdev = blkdev_get_no_open(MKDEV(major, minor), false); if (!bdev) return -ENODEV; if (bdev_is_partition(bdev)) { @@ -793,6 +816,41 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) ctx->bdev = bdev; return 0; } +/* + * Similar to blkg_conf_open_bdev, but additionally freezes the queue, + * acquires q->elevator_lock, and ensures the correct locking order + * between q->elevator_lock and q->rq_qos_mutex. + * + * This function returns negative error on failure. On success it returns + * memflags which must be saved and later passed to blkg_conf_exit_frozen + * for restoring the memalloc scope. + */ +unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx) +{ + int ret; + unsigned long memflags; + + if (ctx->bdev) + return -EINVAL; + + ret = blkg_conf_open_bdev(ctx); + if (ret < 0) + return ret; + /* + * At this point, we haven’t started protecting anything related to QoS, + * so we release q->rq_qos_mutex here, which was first acquired in blkg_ + * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing + * the queue and acquiring q->elevator_lock to maintain the correct + * locking order. + */ + mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex); + + memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue); + mutex_lock(&ctx->bdev->bd_queue->elevator_lock); + mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex); + + return memflags; +} /** * blkg_conf_prep - parse and prepare for per-blkg config update @@ -949,13 +1007,19 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_exit); -static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) +/* + * Similar to blkg_conf_exit, but also unfreezes the queue and releases + * q->elevator_lock. Should be used when blkg_conf_open_bdev_frozen + * is used to open the bdev. + */ +void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags) { - int i; + if (ctx->bdev) { + struct request_queue *q = ctx->bdev->bd_queue; - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] = src->bytes[i]; - dst->ios[i] = src->ios[i]; + blkg_conf_exit(ctx); + mutex_unlock(&q->elevator_lock); + blk_mq_unfreeze_queue(q, memflags); } } @@ -1010,8 +1074,8 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) /* * For covering concurrent parent blkg update from blkg_release(). * - * When flushing from cgroup, cgroup_rstat_lock is always held, so - * this lock won't cause contention most of time. + * When flushing from cgroup, the subsystem rstat lock is always held, + * so this lock won't cause contention most of time. */ raw_spin_lock_irqsave(&blkg_stat_lock, flags); @@ -1024,7 +1088,19 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) struct blkg_iostat cur; unsigned int seq; + /* + * Order assignment of `next_bisc` from `bisc->lnode.next` in + * llist_for_each_entry_safe and clearing `bisc->lqueued` for + * avoiding to assign `next_bisc` with new next pointer added + * in blk_cgroup_bio_start() in case of re-ordering. + * + * The pair barrier is implied in llist_add() in blk_cgroup_bio_start(). + */ + smp_mb(); + WRITE_ONCE(bisc->lqueued, false); + if (bisc == &blkg->iostat) + goto propagate_up; /* propagate up to parent only */ /* fetch the current per-cpu values */ do { @@ -1034,10 +1110,24 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) blkcg_iostat_update(blkg, &cur, &bisc->last); +propagate_up: /* propagate global delta to parent (unless that's root) */ - if (parent && parent->parent) + if (parent && parent->parent) { blkcg_iostat_update(parent, &blkg->iostat.cur, &blkg->iostat.last); + /* + * Queue parent->iostat to its blkcg's lockless + * list to propagate up to the grandparent if the + * iostat hasn't been queued yet. + */ + if (!parent->iostat.lqueued) { + struct llist_head *plhead; + + plhead = per_cpu_ptr(parent->blkcg->lhead, cpu); + llist_add(&parent->iostat.lnode, plhead); + parent->iostat.lqueued = true; + } + } } raw_spin_unlock_irqrestore(&blkg_stat_lock, flags); out: @@ -1054,7 +1144,7 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) /* * We source root cgroup stats from the system-wide stats to avoid * tracking the same information twice and incurring overhead when no - * cgroups are defined. For that reason, cgroup_rstat_flush in + * cgroups are defined. For that reason, css_rstat_flush in * blkcg_print_stat does not actually fill out the iostat in the root * cgroup's blkcg_gq. * @@ -1100,6 +1190,7 @@ static void blkcg_fill_root_iostats(void) blkg_iostat_set(&blkg->iostat.cur, &tmp); u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } + class_dev_iter_exit(&iter); } static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) @@ -1162,7 +1253,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) if (!seq_css(sf)->parent) blkcg_fill_root_iostats(); else - cgroup_rstat_flush(blkcg->css.cgroup); + css_rstat_flush(&blkcg->css); rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { @@ -1286,10 +1377,14 @@ void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css) struct blkcg *blkcg = css_to_blkcg(blkcg_css); do { + struct blkcg *parent; + if (!refcount_dec_and_test(&blkcg->online_pin)) break; + + parent = blkcg_parent(blkcg); blkcg_destroy_blkgs(blkcg); - blkcg = blkcg_parent(blkcg); + blkcg = parent; } while (blkcg); } @@ -1420,7 +1515,6 @@ int blkcg_init_disk(struct gendisk *disk) struct request_queue *q = disk->queue; struct blkcg_gq *new_blkg, *blkg; bool preloaded; - int ret; new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) @@ -1440,21 +1534,8 @@ int blkcg_init_disk(struct gendisk *disk) if (preloaded) radix_tree_preload_end(); - ret = blk_ioprio_init(disk); - if (ret) - goto err_destroy_all; - - ret = blk_throtl_init(disk); - if (ret) - goto err_ioprio_exit; - return 0; -err_ioprio_exit: - blk_ioprio_exit(disk); -err_destroy_all: - blkg_destroy_all(disk); - return ret; err_unlock: spin_unlock_irq(&q->queue_lock); if (preloaded) @@ -1517,13 +1598,22 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) struct request_queue *q = disk->queue; struct blkg_policy_data *pd_prealloc = NULL; struct blkcg_gq *blkg, *pinned_blkg = NULL; + unsigned int memflags; int ret; if (blkcg_policy_enabled(q, pol)) return 0; + /* + * Policy is allowed to be registered without pd_alloc_fn/pd_free_fn, + * for example, ioprio. Such policy will work on blkcg level, not disk + * level, and don't need to be activated. + */ + if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn)) + return -EINVAL; + if (queue_is_mq(q)) - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); retry: spin_lock_irq(&q->queue_lock); @@ -1587,7 +1677,7 @@ retry: spin_unlock_irq(&q->queue_lock); out: if (queue_is_mq(q)) - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); if (pinned_blkg) blkg_put(pinned_blkg); if (pd_prealloc) @@ -1631,12 +1721,13 @@ void blkcg_deactivate_policy(struct gendisk *disk, { struct request_queue *q = disk->queue; struct blkcg_gq *blkg; + unsigned int memflags; if (!blkcg_policy_enabled(q, pol)) return; if (queue_is_mq(q)) - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); mutex_lock(&q->blkcg_mutex); spin_lock_irq(&q->queue_lock); @@ -1660,7 +1751,7 @@ void blkcg_deactivate_policy(struct gendisk *disk, mutex_unlock(&q->blkcg_mutex); if (queue_is_mq(q)) - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); @@ -1688,24 +1779,27 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg *blkcg; int i, ret; + /* + * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy + * without pd_alloc_fn/pd_free_fn can't be activated. + */ + if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || + (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) + return -EINVAL; + mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ - ret = -ENOSPC; for (i = 0; i < BLKCG_MAX_POLS; i++) if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) { pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n"); + ret = -ENOSPC; goto err_unlock; } - /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */ - if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || - (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) - goto err_unlock; - /* register @pol */ pol->plid = i; blkcg_policy[pol->plid] = pol; @@ -1716,8 +1810,10 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg_policy_data *cpd; cpd = pol->cpd_alloc_fn(GFP_KERNEL); - if (!cpd) + if (!cpd) { + ret = -ENOMEM; goto err_free_cpds; + } blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; @@ -1728,12 +1824,15 @@ int blkcg_policy_register(struct blkcg_policy *pol) mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ - if (pol->dfl_cftypes) + if (pol->dfl_cftypes == pol->legacy_cftypes) { + WARN_ON(cgroup_add_cftypes(&io_cgrp_subsys, + pol->dfl_cftypes)); + } else { WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, pol->dfl_cftypes)); - if (pol->legacy_cftypes) WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->legacy_cftypes)); + } mutex_unlock(&blkcg_pol_register_mutex); return 0; @@ -2144,18 +2243,19 @@ void blk_cgroup_bio_start(struct bio *bio) } u64_stats_update_end_irqrestore(&bis->sync, flags); - cgroup_rstat_updated(blkcg->css.cgroup, cpu); + css_rstat_updated(&blkcg->css, cpu); put_cpu(); } bool blk_cgroup_congested(void) { - struct cgroup_subsys_state *css; + struct blkcg *blkcg; bool ret = false; rcu_read_lock(); - for (css = blkcg_css(); css; css = css->parent) { - if (atomic_read(&css->cgroup->congestion_count)) { + for (blkcg = css_to_blkcg(blkcg_css()); blkcg; + blkcg = blkcg_parent(blkcg)) { + if (atomic_read(&blkcg->congestion_count)) { ret = true; break; } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 90b3959d88cf..81868ad86330 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -95,6 +95,8 @@ struct blkcg { struct cgroup_subsys_state css; spinlock_t lock; refcount_t online_pin; + /* If there is block congestion on this cgroup. */ + atomic_t congestion_count; struct radix_tree_root blkg_tree; struct blkcg_gq __rcu *blkg_hint; @@ -217,13 +219,17 @@ struct blkg_conf_ctx { void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); +unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx); void blkg_conf_exit(struct blkg_conf_ctx *ctx); +void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg - * @return: true if this bio needs to be submitted with the root blkg context. + * @bio: the target &bio + * + * Return: true if this bio needs to be submitted with the root blkg context. * * In order to avoid priority inversions we sometimes need to issue a bio as if * it were attached to the root blkg, and then backcharge to the actual owning @@ -243,7 +249,7 @@ static inline bool bio_issue_as_root_blkg(struct bio *bio) * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. - + * * Must be called in a RCU critical section. */ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, @@ -266,7 +272,7 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, } /** - * blkg_to_pdata - get policy private data + * blkg_to_pd - get policy private data * @blkg: blkg of interest * @pol: policy of interest * @@ -285,7 +291,7 @@ static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, } /** - * pdata_to_blkg - get blkg associated with policy private data + * pd_to_blkg - get blkg associated with policy private data * @pd: policy private data of interest * * @pd is policy private data. Determine the blkg it's associated with. @@ -301,19 +307,6 @@ static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) } /** - * blkg_path - format cgroup path of blkg - * @blkg: blkg of interest - * @buf: target buffer - * @buflen: target buffer length - * - * Format the path of the cgroup of @blkg into @buf. - */ -static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) -{ - return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); -} - -/** * blkg_get - get a blkg reference * @blkg: blkg to get * @@ -387,7 +380,7 @@ static inline void blkcg_use_delay(struct blkcg_gq *blkg) if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) return; if (atomic_add_return(1, &blkg->use_delay) == 1) - atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); + atomic_inc(&blkg->blkcg->congestion_count); } static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) @@ -412,7 +405,7 @@ static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) if (old == 0) return 0; if (old == 1) - atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); + atomic_dec(&blkg->blkcg->congestion_count); return 1; } @@ -431,7 +424,7 @@ static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) /* We only want 1 person setting the congestion count for this blkg. */ if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1)) - atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); + atomic_inc(&blkg->blkcg->congestion_count); atomic64_set(&blkg->delay_nsec, delay); } @@ -448,7 +441,7 @@ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) /* We only want 1 person clearing the congestion count for this blkg. */ if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0)) - atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); + atomic_dec(&blkg->blkcg->congestion_count); } /** @@ -496,7 +489,6 @@ static inline void blkcg_deactivate_policy(struct gendisk *disk, static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return NULL; } static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } -static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } static inline void blkcg_bio_issue_init(struct bio *bio) { } diff --git a/block/blk-core.c b/block/blk-core.c index b795ac177281..b862c66018f2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -94,20 +94,6 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) } EXPORT_SYMBOL(blk_queue_flag_clear); -/** - * blk_queue_flag_test_and_set - atomically test and set a queue flag - * @flag: flag to be set - * @q: request queue - * - * Returns the previous value of @flag - 0 if the flag was not set and 1 if - * the flag was already set. - */ -bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) -{ - return test_and_set_bit(flag, &q->queue_flags); -} -EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); - #define REQ_OP_NAME(name) [REQ_OP_##name] = #name static const char *const blk_op_name[] = { REQ_OP_NAME(READ), @@ -174,6 +160,8 @@ static const struct { /* Command duration limit device-side timeout */ [BLK_STS_DURATION_LIMIT] = { -ETIME, "duration limit exceeded" }, + [BLK_STS_INVAL] = { -EINVAL, "invalid" }, + /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; @@ -231,7 +219,7 @@ EXPORT_SYMBOL_GPL(blk_status_to_str); */ void blk_sync_queue(struct request_queue *q) { - del_timer_sync(&q->timeout); + timer_delete_sync(&q->timeout); cancel_work_sync(&q->timeout_work); } EXPORT_SYMBOL(blk_sync_queue); @@ -273,6 +261,8 @@ static void blk_free_queue(struct request_queue *q) blk_mq_release(q); ida_free(&blk_queue_ida, q->id); + lockdep_unregister_key(&q->io_lock_cls_key); + lockdep_unregister_key(&q->q_lock_cls_key); call_rcu(&q->rcu_head, blk_free_queue_rcu); } @@ -290,18 +280,20 @@ void blk_put_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_put_queue); -void blk_queue_start_drain(struct request_queue *q) +bool blk_queue_start_drain(struct request_queue *q) { /* * When queue DYING flag is set, we need to block new req * entering queue, so we call blk_freeze_queue_start() to * prevent I/O from crossing blk_queue_enter(). */ - blk_freeze_queue_start(q); + bool freeze = __blk_freeze_queue_start(q, current); if (queue_is_mq(q)) blk_mq_wake_waiters(q); /* Make blk_queue_enter() reexamine the DYING flag. */ wake_up_all(&q->mq_freeze_wq); + + return freeze; } /** @@ -333,6 +325,8 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) return -ENODEV; } + rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_); + rwsem_release(&q->q_lockdep_map, _RET_IP_); return 0; } @@ -364,6 +358,8 @@ int __bio_queue_enter(struct request_queue *q, struct bio *bio) goto dead; } + rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); + rwsem_release(&q->io_lockdep_map, _RET_IP_); return 0; dead: bio_io_error(bio); @@ -433,8 +429,8 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) refcount_set(&q->refs, 1); mutex_init(&q->debugfs_mutex); + mutex_init(&q->elevator_lock); mutex_init(&q->sysfs_lock); - mutex_init(&q->sysfs_dir_lock); mutex_init(&q->limits_lock); mutex_init(&q->rq_qos_mutex); spin_lock_init(&q->queue_lock); @@ -453,6 +449,18 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) PERCPU_REF_INIT_ATOMIC, GFP_KERNEL); if (error) goto fail_stats; + lockdep_register_key(&q->io_lock_cls_key); + lockdep_register_key(&q->q_lock_cls_key); + lockdep_init_map(&q->io_lockdep_map, "&q->q_usage_counter(io)", + &q->io_lock_cls_key, 0); + lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)", + &q->q_lock_cls_key, 0); + + /* Teach lockdep about lock ordering (reclaim WRT queue freeze lock). */ + fs_reclaim_acquire(GFP_KERNEL); + rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); + rwsem_release(&q->io_lockdep_map, _RET_IP_); + fs_reclaim_release(GFP_KERNEL); q->nr_requests = BLKDEV_DEFAULT_RQ; @@ -496,7 +504,8 @@ __setup("fail_make_request=", setup_fail_make_request); bool should_fail_request(struct block_device *part, unsigned int bytes) { - return part->bd_make_it_fail && should_fail(&fail_make_request, bytes); + return bdev_test_flag(part, BD_MAKE_IT_FAIL) && + should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) @@ -516,10 +525,11 @@ static inline void bio_check_ro(struct bio *bio) if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) return; - if (bio->bi_bdev->bd_ro_warned) + if (bdev_test_flag(bio->bi_bdev, BD_RO_WARNED)) return; - bio->bi_bdev->bd_ro_warned = true; + bdev_set_flag(bio->bi_bdev, BD_RO_WARNED); + /* * Use ioctl to set underlying disk of raid/dm to read-only * will trigger this. @@ -591,8 +601,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_NOTSUPP; /* The bio sector must point to the start of a sequential zone */ - if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) || - !bio_zone_is_seq(bio)) + if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector)) return BLK_STS_IOERR; /* @@ -614,17 +623,30 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, static void __submit_bio(struct bio *bio) { + /* If plug is not used, add new plug here to cache nsecs time. */ + struct blk_plug plug; + if (unlikely(!blk_crypto_bio_prep(&bio))) return; - if (!bio->bi_bdev->bd_has_submit_bio) { + blk_start_plug(&plug); + + if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) { blk_mq_submit_bio(bio); } else if (likely(bio_queue_enter(bio) == 0)) { struct gendisk *disk = bio->bi_bdev->bd_disk; - - disk->fops->submit_bio(bio); + + if ((bio->bi_opf & REQ_POLLED) && + !(disk->queue->limits.features & BLK_FEAT_POLL)) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + } else { + disk->fops->submit_bio(bio); + } blk_queue_exit(disk->queue); } + + blk_finish_plug(&plug); } /* @@ -725,12 +747,24 @@ void submit_bio_noacct_nocheck(struct bio *bio) */ if (current->bio_list) bio_list_add(¤t->bio_list[0], bio); - else if (!bio->bi_bdev->bd_has_submit_bio) + else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) __submit_bio_noacct_mq(bio); else __submit_bio_noacct(bio); } +static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, + struct bio *bio) +{ + if (bio->bi_iter.bi_size > queue_atomic_write_unit_max_bytes(q)) + return BLK_STS_INVAL; + + if (bio->bi_iter.bi_size % queue_atomic_write_unit_min_bytes(q)) + return BLK_STS_INVAL; + + return BLK_STS_OK; +} + /** * submit_bio_noacct - re-submit a bio to the block device layer for I/O * @bio: The bio describing the location in memory and on the device. @@ -761,7 +795,8 @@ void submit_bio_noacct(struct bio *bio) if (!bio_flagged(bio, BIO_REMAPPED)) { if (unlikely(bio_check_eod(bio))) goto end_io; - if (bdev->bd_partno && unlikely(blk_partition_remap(bio))) + if (bdev_is_partition(bdev) && + unlikely(blk_partition_remap(bio))) goto end_io; } @@ -773,7 +808,7 @@ void submit_bio_noacct(struct bio *bio) if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE && bio_op(bio) != REQ_OP_ZONE_APPEND)) goto end_io; - if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { + if (!bdev_write_cache(bdev)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!bio_sectors(bio)) { status = BLK_STS_OK; @@ -782,12 +817,15 @@ void submit_bio_noacct(struct bio *bio) } } - if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - bio_clear_polled(bio); - switch (bio_op(bio)) { case REQ_OP_READ: + break; case REQ_OP_WRITE: + if (bio->bi_opf & REQ_ATOMIC) { + status = blk_validate_atomic_write_op_size(q, bio); + if (status != BLK_STS_OK) + goto end_io; + } break; case REQ_OP_FLUSH: /* @@ -816,11 +854,8 @@ void submit_bio_noacct(struct bio *bio) case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: - if (!bdev_is_zoned(bio->bi_bdev)) - goto not_supported; - break; case REQ_OP_ZONE_RESET_ALL: - if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q)) + if (!bdev_is_zoned(bio->bi_bdev)) goto not_supported; break; case REQ_OP_DRV_IN: @@ -906,16 +941,9 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) return 0; q = bdev_get_queue(bdev); - if (cookie == BLK_QC_T_NONE || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + if (cookie == BLK_QC_T_NONE) return 0; - /* - * As the requests that require a zone lock are not plugged in the - * first place, directly accessing the plug instead of using - * blk_mq_plug() should not have any consequences during flushing for - * zoned devices. - */ blk_flush_plug(current->plug, false); /* @@ -934,7 +962,8 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) } else { struct gendisk *disk = q->disk; - if (disk && disk->fops->poll_bio) + if ((q->limits.features & BLK_FEAT_POLL) && disk && + disk->fops->poll_bio) ret = disk->fops->poll_bio(bio, iob, flags); } blk_queue_exit(q); @@ -987,11 +1016,12 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end) unsigned long stamp; again: stamp = READ_ONCE(part->bd_stamp); - if (unlikely(time_after(now, stamp))) { - if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now))) - __part_stat_add(part, io_ticks, end ? now - stamp : 1); - } - if (part->bd_partno) { + if (unlikely(time_after(now, stamp)) && + likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) && + (end || bdev_count_inflight(part))) + __part_stat_add(part, io_ticks, now - stamp); + + if (bdev_is_partition(part)) { part = bdev_whole(part); goto again; } @@ -1097,8 +1127,8 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) return; plug->cur_ktime = 0; - plug->mq_list = NULL; - plug->cached_rq = NULL; + rq_list_init(&plug->mq_list); + rq_list_init(&plug->cached_rqs); plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); plug->rq_count = 0; plug->multiple_queues = false; @@ -1194,7 +1224,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) * queue for cached requests, we don't want a blocked task holding * up a queue freeze/quiesce event. */ - if (unlikely(!rq_list_empty(plug->cached_rq))) + if (unlikely(!rq_list_empty(&plug->cached_rqs))) blk_mq_free_plug_rqs(plug); plug->cur_ktime = 0; diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index b1e7415f8439..005c9157ffb3 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -87,7 +87,7 @@ static struct bio_set crypto_bio_split; * This is the key we set when evicting a keyslot. This *should* be the all 0's * key, but AES-XTS rejects that key, so we use some random bytes instead. */ -static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE]; +static u8 blank_key[BLK_CRYPTO_MAX_RAW_KEY_SIZE]; static void blk_crypto_fallback_evict_keyslot(unsigned int slot) { @@ -119,7 +119,7 @@ blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile, blk_crypto_fallback_evict_keyslot(slot); slotp->crypto_mode = crypto_mode; - err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw, + err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->bytes, key->size); if (err) { blk_crypto_fallback_evict_keyslot(slot); @@ -173,6 +173,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; + bio->bi_write_stream = bio_src->bi_write_stream; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; @@ -226,7 +227,7 @@ static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr) split_bio = bio_split(bio, num_sectors, GFP_NOIO, &crypto_bio_split); - if (!split_bio) { + if (IS_ERR(split_bio)) { bio->bi_status = BLK_STS_RESOURCE; return false; } @@ -539,7 +540,7 @@ static int blk_crypto_fallback_init(void) if (blk_crypto_fallback_inited) return 0; - get_random_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE); + get_random_bytes(blank_key, sizeof(blank_key)); err = bioset_init(&crypto_bio_split, 64, 0, 0); if (err) @@ -561,6 +562,7 @@ static int blk_crypto_fallback_init(void) blk_crypto_fallback_profile->ll_ops = blk_crypto_fallback_ll_ops; blk_crypto_fallback_profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; + blk_crypto_fallback_profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_RAW; /* All blk-crypto modes have a crypto API fallback. */ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index 93a141979694..ccf6dff6ff6b 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -14,6 +14,7 @@ struct blk_crypto_mode { const char *name; /* name of this mode, shown in sysfs */ const char *cipher_str; /* crypto API name (for fallback case) */ unsigned int keysize; /* key size in bytes */ + unsigned int security_strength; /* security strength in bytes */ unsigned int ivsize; /* iv size in bytes */ }; @@ -82,6 +83,9 @@ int __blk_crypto_evict_key(struct blk_crypto_profile *profile, bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, const struct blk_crypto_config *cfg); +int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, + void __user *argp); + #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline int blk_crypto_sysfs_register(struct gendisk *disk) @@ -129,6 +133,12 @@ static inline bool blk_crypto_rq_has_keyslot(struct request *rq) return false; } +static inline int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, + void __user *argp) +{ + return -ENOTTY; +} + #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ void __bio_crypt_advance(struct bio *bio, unsigned int bytes); diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index 7fabc883e39f..94a155912bf1 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -352,6 +352,8 @@ bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, return false; if (profile->max_dun_bytes_supported < cfg->dun_bytes) return false; + if (!(profile->key_types_supported & cfg->key_type)) + return false; return true; } @@ -463,6 +465,99 @@ bool blk_crypto_register(struct blk_crypto_profile *profile, EXPORT_SYMBOL_GPL(blk_crypto_register); /** + * blk_crypto_derive_sw_secret() - Derive software secret from wrapped key + * @bdev: a block device that supports hardware-wrapped keys + * @eph_key: a hardware-wrapped key in ephemerally-wrapped form + * @eph_key_size: size of @eph_key in bytes + * @sw_secret: (output) the software secret + * + * Given a hardware-wrapped key in ephemerally-wrapped form (the same form that + * it is used for I/O), ask the hardware to derive the secret which software can + * use for cryptographic tasks other than inline encryption. This secret is + * guaranteed to be cryptographically isolated from the inline encryption key, + * i.e. derived with a different KDF context. + * + * Return: 0 on success, -EOPNOTSUPP if the block device doesn't support + * hardware-wrapped keys, -EBADMSG if the key isn't a valid + * ephemerally-wrapped key, or another -errno code. + */ +int blk_crypto_derive_sw_secret(struct block_device *bdev, + const u8 *eph_key, size_t eph_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) +{ + struct blk_crypto_profile *profile = + bdev_get_queue(bdev)->crypto_profile; + int err; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.derive_sw_secret) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + err = profile->ll_ops.derive_sw_secret(profile, eph_key, eph_key_size, + sw_secret); + blk_crypto_hw_exit(profile); + return err; +} + +int blk_crypto_import_key(struct blk_crypto_profile *profile, + const u8 *raw_key, size_t raw_key_size, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + int ret; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.import_key) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + ret = profile->ll_ops.import_key(profile, raw_key, raw_key_size, + lt_key); + blk_crypto_hw_exit(profile); + return ret; +} + +int blk_crypto_generate_key(struct blk_crypto_profile *profile, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + int ret; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.generate_key) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + ret = profile->ll_ops.generate_key(profile, lt_key); + blk_crypto_hw_exit(profile); + return ret; +} + +int blk_crypto_prepare_key(struct blk_crypto_profile *profile, + const u8 *lt_key, size_t lt_key_size, + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + int ret; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.prepare_key) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + ret = profile->ll_ops.prepare_key(profile, lt_key, lt_key_size, + eph_key); + blk_crypto_hw_exit(profile); + return ret; +} + +/** * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities * by child device * @parent: the crypto profile for the parent device @@ -485,10 +580,12 @@ void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent, child->max_dun_bytes_supported); for (i = 0; i < ARRAY_SIZE(child->modes_supported); i++) parent->modes_supported[i] &= child->modes_supported[i]; + parent->key_types_supported &= child->key_types_supported; } else { parent->max_dun_bytes_supported = 0; memset(parent->modes_supported, 0, sizeof(parent->modes_supported)); + parent->key_types_supported = 0; } } EXPORT_SYMBOL_GPL(blk_crypto_intersect_capabilities); @@ -521,6 +618,9 @@ bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target, target->max_dun_bytes_supported) return false; + if (reference->key_types_supported & ~target->key_types_supported) + return false; + return true; } EXPORT_SYMBOL_GPL(blk_crypto_has_capabilities); @@ -555,5 +655,6 @@ void blk_crypto_update_capabilities(struct blk_crypto_profile *dst, sizeof(dst->modes_supported)); dst->max_dun_bytes_supported = src->max_dun_bytes_supported; + dst->key_types_supported = src->key_types_supported; } EXPORT_SYMBOL_GPL(blk_crypto_update_capabilities); diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c index a304434489ba..e832f403f200 100644 --- a/block/blk-crypto-sysfs.c +++ b/block/blk-crypto-sysfs.c @@ -31,6 +31,13 @@ static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr) return container_of(attr, struct blk_crypto_attr, attr); } +static ssize_t hw_wrapped_keys_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + /* Always show supported, since the file doesn't exist otherwise. */ + return sysfs_emit(page, "supported\n"); +} + static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile, struct blk_crypto_attr *attr, char *page) { @@ -43,20 +50,48 @@ static ssize_t num_keyslots_show(struct blk_crypto_profile *profile, return sysfs_emit(page, "%u\n", profile->num_slots); } +static ssize_t raw_keys_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + /* Always show supported, since the file doesn't exist otherwise. */ + return sysfs_emit(page, "supported\n"); +} + #define BLK_CRYPTO_RO_ATTR(_name) \ static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name) +BLK_CRYPTO_RO_ATTR(hw_wrapped_keys); BLK_CRYPTO_RO_ATTR(max_dun_bits); BLK_CRYPTO_RO_ATTR(num_keyslots); +BLK_CRYPTO_RO_ATTR(raw_keys); + +static umode_t blk_crypto_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); + struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + + if (a == &hw_wrapped_keys_attr && + !(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return 0; + if (a == &raw_keys_attr && + !(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_RAW)) + return 0; + + return 0444; +} static struct attribute *blk_crypto_attrs[] = { + &hw_wrapped_keys_attr.attr, &max_dun_bits_attr.attr, &num_keyslots_attr.attr, + &raw_keys_attr.attr, NULL, }; static const struct attribute_group blk_crypto_attr_group = { .attrs = blk_crypto_attrs, + .is_visible = blk_crypto_is_visible, }; /* diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 4d760b092deb..4b1ad84d1b5a 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -23,24 +23,28 @@ const struct blk_crypto_mode blk_crypto_modes[] = { .name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, + .security_strength = 32, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = { .name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, + .security_strength = 16, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_ADIANTUM] = { .name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, + .security_strength = 32, .ivsize = 32, }, [BLK_ENCRYPTION_MODE_SM4_XTS] = { .name = "SM4-XTS", .cipher_str = "xts(sm4)", .keysize = 32, + .security_strength = 16, .ivsize = 16, }, }; @@ -76,9 +80,15 @@ static int __init bio_crypt_ctx_init(void) /* This is assumed in various places. */ BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0); - /* Sanity check that no algorithm exceeds the defined limits. */ + /* + * Validate the crypto mode properties. This ideally would be done with + * static assertions, but boot-time checks are the next best thing. + */ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) { - BUG_ON(blk_crypto_modes[i].keysize > BLK_CRYPTO_MAX_KEY_SIZE); + BUG_ON(blk_crypto_modes[i].keysize > + BLK_CRYPTO_MAX_RAW_KEY_SIZE); + BUG_ON(blk_crypto_modes[i].security_strength > + blk_crypto_modes[i].keysize); BUG_ON(blk_crypto_modes[i].ivsize > BLK_CRYPTO_MAX_IV_SIZE); } @@ -315,17 +325,20 @@ int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, /** * blk_crypto_init_key() - Prepare a key for use with blk-crypto * @blk_key: Pointer to the blk_crypto_key to initialize. - * @raw_key: Pointer to the raw key. Must be the correct length for the chosen - * @crypto_mode; see blk_crypto_modes[]. + * @key_bytes: the bytes of the key + * @key_size: size of the key in bytes + * @key_type: type of the key -- either raw or hardware-wrapped * @crypto_mode: identifier for the encryption algorithm to use * @dun_bytes: number of bytes that will be used to specify the DUN when this * key is used * @data_unit_size: the data unit size to use for en/decryption * * Return: 0 on success, -errno on failure. The caller is responsible for - * zeroizing both blk_key and raw_key when done with them. + * zeroizing both blk_key and key_bytes when done with them. */ -int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, +int blk_crypto_init_key(struct blk_crypto_key *blk_key, + const u8 *key_bytes, size_t key_size, + enum blk_crypto_key_type key_type, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, unsigned int data_unit_size) @@ -338,8 +351,19 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, return -EINVAL; mode = &blk_crypto_modes[crypto_mode]; - if (mode->keysize == 0) + switch (key_type) { + case BLK_CRYPTO_KEY_TYPE_RAW: + if (key_size != mode->keysize) + return -EINVAL; + break; + case BLK_CRYPTO_KEY_TYPE_HW_WRAPPED: + if (key_size < mode->security_strength || + key_size > BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE) + return -EINVAL; + break; + default: return -EINVAL; + } if (dun_bytes == 0 || dun_bytes > mode->ivsize) return -EINVAL; @@ -350,9 +374,10 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, blk_key->crypto_cfg.crypto_mode = crypto_mode; blk_key->crypto_cfg.dun_bytes = dun_bytes; blk_key->crypto_cfg.data_unit_size = data_unit_size; + blk_key->crypto_cfg.key_type = key_type; blk_key->data_unit_size_bits = ilog2(data_unit_size); - blk_key->size = mode->keysize; - memcpy(blk_key->raw, raw_key, mode->keysize); + blk_key->size = key_size; + memcpy(blk_key->bytes, key_bytes, key_size); return 0; } @@ -372,8 +397,10 @@ bool blk_crypto_config_supported_natively(struct block_device *bdev, bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg) { - return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - blk_crypto_config_supported_natively(bdev, cfg); + if (IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) && + cfg->key_type == BLK_CRYPTO_KEY_TYPE_RAW) + return true; + return blk_crypto_config_supported_natively(bdev, cfg); } /** @@ -387,15 +414,21 @@ bool blk_crypto_config_supported(struct block_device *bdev, * an skcipher, and *should not* be called from the data path, since that might * cause a deadlock * - * Return: 0 on success; -ENOPKG if the hardware doesn't support the key and - * blk-crypto-fallback is either disabled or the needed algorithm - * is disabled in the crypto API; or another -errno code. + * Return: 0 on success; -EOPNOTSUPP if the key is wrapped but the hardware does + * not support wrapped keys; -ENOPKG if the key is a raw key but the + * hardware does not support raw keys and blk-crypto-fallback is either + * disabled or the needed algorithm is disabled in the crypto API; or + * another -errno code if something else went wrong. */ int blk_crypto_start_using_key(struct block_device *bdev, const struct blk_crypto_key *key) { if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) return 0; + if (key->crypto_cfg.key_type != BLK_CRYPTO_KEY_TYPE_RAW) { + pr_warn_ratelimited("%pg: no support for wrapped keys\n", bdev); + return -EOPNOTSUPP; + } return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); } @@ -436,3 +469,146 @@ void blk_crypto_evict_key(struct block_device *bdev, pr_warn_ratelimited("%pg: error %d evicting key\n", bdev, err); } EXPORT_SYMBOL_GPL(blk_crypto_evict_key); + +static int blk_crypto_ioctl_import_key(struct blk_crypto_profile *profile, + void __user *argp) +{ + struct blk_crypto_import_key_arg arg; + u8 raw_key[BLK_CRYPTO_MAX_RAW_KEY_SIZE]; + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + int ret; + + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + + if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) + return -EINVAL; + + if (arg.raw_key_size < 16 || arg.raw_key_size > sizeof(raw_key)) + return -EINVAL; + + if (copy_from_user(raw_key, u64_to_user_ptr(arg.raw_key_ptr), + arg.raw_key_size)) { + ret = -EFAULT; + goto out; + } + ret = blk_crypto_import_key(profile, raw_key, arg.raw_key_size, lt_key); + if (ret < 0) + goto out; + if (ret > arg.lt_key_size) { + ret = -EOVERFLOW; + goto out; + } + arg.lt_key_size = ret; + if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key, + arg.lt_key_size) || + copy_to_user(argp, &arg, sizeof(arg))) { + ret = -EFAULT; + goto out; + } + ret = 0; + +out: + memzero_explicit(raw_key, sizeof(raw_key)); + memzero_explicit(lt_key, sizeof(lt_key)); + return ret; +} + +static int blk_crypto_ioctl_generate_key(struct blk_crypto_profile *profile, + void __user *argp) +{ + struct blk_crypto_generate_key_arg arg; + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + int ret; + + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + + if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) + return -EINVAL; + + ret = blk_crypto_generate_key(profile, lt_key); + if (ret < 0) + goto out; + if (ret > arg.lt_key_size) { + ret = -EOVERFLOW; + goto out; + } + arg.lt_key_size = ret; + if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key, + arg.lt_key_size) || + copy_to_user(argp, &arg, sizeof(arg))) { + ret = -EFAULT; + goto out; + } + ret = 0; + +out: + memzero_explicit(lt_key, sizeof(lt_key)); + return ret; +} + +static int blk_crypto_ioctl_prepare_key(struct blk_crypto_profile *profile, + void __user *argp) +{ + struct blk_crypto_prepare_key_arg arg; + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + int ret; + + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + + if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) + return -EINVAL; + + if (arg.lt_key_size > sizeof(lt_key)) + return -EINVAL; + + if (copy_from_user(lt_key, u64_to_user_ptr(arg.lt_key_ptr), + arg.lt_key_size)) { + ret = -EFAULT; + goto out; + } + ret = blk_crypto_prepare_key(profile, lt_key, arg.lt_key_size, eph_key); + if (ret < 0) + goto out; + if (ret > arg.eph_key_size) { + ret = -EOVERFLOW; + goto out; + } + arg.eph_key_size = ret; + if (copy_to_user(u64_to_user_ptr(arg.eph_key_ptr), eph_key, + arg.eph_key_size) || + copy_to_user(argp, &arg, sizeof(arg))) { + ret = -EFAULT; + goto out; + } + ret = 0; + +out: + memzero_explicit(lt_key, sizeof(lt_key)); + memzero_explicit(eph_key, sizeof(eph_key)); + return ret; +} + +int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, + void __user *argp) +{ + struct blk_crypto_profile *profile = + bdev_get_queue(bdev)->crypto_profile; + + if (!profile) + return -EOPNOTSUPP; + + switch (cmd) { + case BLKCRYPTOIMPORTKEY: + return blk_crypto_ioctl_import_key(profile, argp); + case BLKCRYPTOGENERATEKEY: + return blk_crypto_ioctl_generate_key(profile, argp); + case BLKCRYPTOPREPAREKEY: + return blk_crypto_ioctl_prepare_key(profile, argp); + default: + return -ENOTTY; + } +} diff --git a/block/blk-flush.c b/block/blk-flush.c index b0f314f4bc14..43d6152897a4 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -95,26 +95,9 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, blk_opf_t flags); static inline struct blk_flush_queue * -blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) +blk_get_flush_queue(struct blk_mq_ctx *ctx) { - return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; -} - -static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) -{ - unsigned int policy = 0; - - if (blk_rq_sectors(rq)) - policy |= REQ_FSEQ_DATA; - - if (fflags & (1UL << QUEUE_FLAG_WC)) { - if (rq->cmd_flags & REQ_PREFLUSH) - policy |= REQ_FSEQ_PREFLUSH; - if (!(fflags & (1UL << QUEUE_FLAG_FUA)) && - (rq->cmd_flags & REQ_FUA)) - policy |= REQ_FSEQ_POSTFLUSH; - } - return policy; + return blk_mq_map_queue(REQ_OP_FLUSH, ctx)->fq; } static unsigned int blk_flush_cur_seq(struct request *rq) @@ -130,6 +113,8 @@ static void blk_flush_restore_request(struct request *rq) * original @rq->bio. Restore it. */ rq->bio = rq->biotail; + if (rq->bio) + rq->__sector = rq->bio->bi_iter.bi_sector; /* make @rq a normal request */ rq->rq_flags &= ~RQF_FLUSH_SEQ; @@ -183,7 +168,7 @@ static void blk_flush_complete_seq(struct request *rq, /* queue for flush */ if (list_empty(pending)) fq->flush_pending_since = jiffies; - list_move_tail(&rq->queuelist, pending); + list_add_tail(&rq->queuelist, pending); break; case REQ_FSEQ_DATA: @@ -220,7 +205,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq, struct list_head *running; struct request *rq, *n; unsigned long flags = 0; - struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); + struct blk_flush_queue *fq = blk_get_flush_queue(flush_rq->mq_ctx); /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); @@ -261,6 +246,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq, unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); + list_del_init(&rq->queuelist); blk_flush_complete_seq(rq, fq, seq, error); } @@ -355,7 +341,7 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_mq_ctx *ctx = rq->mq_ctx; unsigned long flags; - struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); + struct blk_flush_queue *fq = blk_get_flush_queue(ctx); if (q->elevator) { WARN_ON(rq->tag < 0); @@ -396,19 +382,32 @@ static void blk_rq_init_flush(struct request *rq) bool blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; - unsigned long fflags = q->queue_flags; /* may change, cache */ - unsigned int policy = blk_flush_policy(fflags, rq); - struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); + struct blk_flush_queue *fq = blk_get_flush_queue(rq->mq_ctx); + bool supports_fua = q->limits.features & BLK_FEAT_FUA; + unsigned int policy = 0; /* FLUSH/FUA request must never be merged */ WARN_ON_ONCE(rq->bio != rq->biotail); + if (blk_rq_sectors(rq)) + policy |= REQ_FSEQ_DATA; + + /* + * Check which flushes we need to sequence for this operation. + */ + if (blk_queue_write_cache(q)) { + if (rq->cmd_flags & REQ_PREFLUSH) + policy |= REQ_FSEQ_PREFLUSH; + if ((rq->cmd_flags & REQ_FUA) && !supports_fua) + policy |= REQ_FSEQ_POSTFLUSH; + } + /* * @policy now records what operations need to be done. Adjust * REQ_PREFLUSH and FUA for the driver. */ rq->cmd_flags &= ~REQ_PREFLUSH; - if (!(fflags & (1UL << QUEUE_FLAG_FUA))) + if (!supports_fua) rq->cmd_flags &= ~REQ_FUA; /* diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c index c9eb4241e048..d479f5481b66 100644 --- a/block/blk-ia-ranges.c +++ b/block/blk-ia-ranges.c @@ -111,7 +111,6 @@ int disk_register_independent_access_ranges(struct gendisk *disk) struct request_queue *q = disk->queue; int i, ret; - lockdep_assert_held(&q->sysfs_dir_lock); lockdep_assert_held(&q->sysfs_lock); if (!iars) @@ -155,7 +154,6 @@ void disk_unregister_independent_access_ranges(struct gendisk *disk) struct blk_independent_access_ranges *iars = disk->ia_ranges; int i; - lockdep_assert_held(&q->sysfs_dir_lock); lockdep_assert_held(&q->sysfs_lock); if (!iars) @@ -289,7 +287,6 @@ void disk_set_independent_access_ranges(struct gendisk *disk, { struct request_queue *q = disk->queue; - mutex_lock(&q->sysfs_dir_lock); mutex_lock(&q->sysfs_lock); if (iars && !disk_check_ia_ranges(disk, iars)) { kfree(iars); @@ -313,6 +310,5 @@ void disk_set_independent_access_ranges(struct gendisk *disk, disk_register_independent_access_ranges(disk); unlock: mutex_unlock(&q->sysfs_lock); - mutex_unlock(&q->sysfs_dir_lock); } EXPORT_SYMBOL_GPL(disk_set_independent_access_ranges); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index ccbeb6dfa87a..a1678f0a9f81 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -53,29 +53,28 @@ new_segment: return segments; } -EXPORT_SYMBOL(blk_rq_count_integrity_sg); /** * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist - * @q: request queue - * @bio: bio with integrity metadata attached + * @rq: request to map * @sglist: target scatterlist * * Description: Map the integrity vectors in request into a * scatterlist. The scatterlist must be big enough to hold all - * elements. I.e. sized using blk_rq_count_integrity_sg(). + * elements. I.e. sized using blk_rq_count_integrity_sg() or + * rq->nr_integrity_segments. */ -int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, - struct scatterlist *sglist) +int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) { struct bio_vec iv, ivprv = { NULL }; + struct request_queue *q = rq->q; struct scatterlist *sg = NULL; + struct bio *bio = rq->bio; unsigned int segments = 0; struct bvec_iter iter; int prev = 0; bio_for_each_integrity_vec(iv, bio, iter) { - if (prev) { if (!biovec_phys_mergeable(q, &ivprv, &iv)) goto new_segment; @@ -103,63 +102,37 @@ new_segment: if (sg) sg_mark_end(sg); + /* + * Something must have been wrong if the figured number of segment + * is bigger than number of req's physical integrity segments + */ + BUG_ON(segments > rq->nr_integrity_segments); + BUG_ON(segments > queue_max_integrity_segments(q)); return segments; } EXPORT_SYMBOL(blk_rq_map_integrity_sg); -/** - * blk_integrity_compare - Compare integrity profile of two disks - * @gd1: Disk to compare - * @gd2: Disk to compare - * - * Description: Meta-devices like DM and MD need to verify that all - * sub-devices use the same integrity format before advertising to - * upper layers that they can send/receive integrity metadata. This - * function can be used to check whether two gendisk devices have - * compatible integrity formats. - */ -int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) +int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, + ssize_t bytes) { - struct blk_integrity *b1 = &gd1->queue->integrity; - struct blk_integrity *b2 = &gd2->queue->integrity; - - if (!b1->profile && !b2->profile) - return 0; - - if (!b1->profile || !b2->profile) - return -1; - - if (b1->interval_exp != b2->interval_exp) { - pr_err("%s: %s/%s protection interval %u != %u\n", - __func__, gd1->disk_name, gd2->disk_name, - 1 << b1->interval_exp, 1 << b2->interval_exp); - return -1; - } - - if (b1->tuple_size != b2->tuple_size) { - pr_err("%s: %s/%s tuple sz %u != %u\n", __func__, - gd1->disk_name, gd2->disk_name, - b1->tuple_size, b2->tuple_size); - return -1; - } - - if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) { - pr_err("%s: %s/%s tag sz %u != %u\n", __func__, - gd1->disk_name, gd2->disk_name, - b1->tag_size, b2->tag_size); - return -1; - } - - if (b1->profile != b2->profile) { - pr_err("%s: %s/%s type %s != %s\n", __func__, - gd1->disk_name, gd2->disk_name, - b1->profile->name, b2->profile->name); - return -1; - } + int ret; + struct iov_iter iter; + unsigned int direction; + if (op_is_write(req_op(rq))) + direction = ITER_DEST; + else + direction = ITER_SOURCE; + iov_iter_ubuf(&iter, direction, ubuf, bytes); + ret = bio_integrity_map_user(rq->bio, &iter); + if (ret) + return ret; + + rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, rq->bio); + rq->cmd_flags |= REQ_INTEGRITY; return 0; } -EXPORT_SYMBOL(blk_integrity_compare); +EXPORT_SYMBOL_GPL(blk_rq_integrity_map_user); bool blk_integrity_merge_rq(struct request_queue *q, struct request *req, struct request *next) @@ -188,7 +161,6 @@ bool blk_integrity_merge_bio(struct request_queue *q, struct request *req, struct bio *bio) { int nr_integrity_segs; - struct bio *next = bio->bi_next; if (blk_integrity_rq(req) == 0 && bio_integrity(bio) == NULL) return true; @@ -199,22 +171,72 @@ bool blk_integrity_merge_bio(struct request_queue *q, struct request *req, if (bio_integrity(req->bio)->bip_flags != bio_integrity(bio)->bip_flags) return false; - bio->bi_next = NULL; nr_integrity_segs = blk_rq_count_integrity_sg(q, bio); - bio->bi_next = next; - if (req->nr_integrity_segments + nr_integrity_segs > q->limits.max_integrity_segments) return false; - req->nr_integrity_segments += nr_integrity_segs; - return true; } static inline struct blk_integrity *dev_to_bi(struct device *dev) { - return &dev_to_disk(dev)->queue->integrity; + return &dev_to_disk(dev)->queue->limits.integrity; +} + +const char *blk_integrity_profile_name(struct blk_integrity *bi) +{ + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_IP: + if (bi->flags & BLK_INTEGRITY_REF_TAG) + return "T10-DIF-TYPE1-IP"; + return "T10-DIF-TYPE3-IP"; + case BLK_INTEGRITY_CSUM_CRC: + if (bi->flags & BLK_INTEGRITY_REF_TAG) + return "T10-DIF-TYPE1-CRC"; + return "T10-DIF-TYPE3-CRC"; + case BLK_INTEGRITY_CSUM_CRC64: + if (bi->flags & BLK_INTEGRITY_REF_TAG) + return "EXT-DIF-TYPE1-CRC64"; + return "EXT-DIF-TYPE3-CRC64"; + case BLK_INTEGRITY_CSUM_NONE: + break; + } + + return "nop"; +} +EXPORT_SYMBOL_GPL(blk_integrity_profile_name); + +static ssize_t flag_store(struct device *dev, const char *page, size_t count, + unsigned char flag) +{ + struct request_queue *q = dev_to_disk(dev)->queue; + struct queue_limits lim; + unsigned long val; + int err; + + err = kstrtoul(page, 10, &val); + if (err) + return err; + + /* note that the flags are inverted vs the values in the sysfs files */ + lim = queue_limits_start_update(q); + if (val) + lim.integrity.flags &= ~flag; + else + lim.integrity.flags |= flag; + + err = queue_limits_commit_update_frozen(q, &lim); + if (err) + return err; + return count; +} + +static ssize_t flag_show(struct device *dev, char *page, unsigned char flag) +{ + struct blk_integrity *bi = dev_to_bi(dev); + + return sysfs_emit(page, "%d\n", !(bi->flags & flag)); } static ssize_t format_show(struct device *dev, struct device_attribute *attr, @@ -222,9 +244,9 @@ static ssize_t format_show(struct device *dev, struct device_attribute *attr, { struct blk_integrity *bi = dev_to_bi(dev); - if (bi->profile && bi->profile->name) - return sysfs_emit(page, "%s\n", bi->profile->name); - return sysfs_emit(page, "none\n"); + if (!bi->tuple_size) + return sysfs_emit(page, "none\n"); + return sysfs_emit(page, "%s\n", blk_integrity_profile_name(bi)); } static ssize_t tag_size_show(struct device *dev, struct device_attribute *attr, @@ -249,49 +271,26 @@ static ssize_t read_verify_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count) { - struct blk_integrity *bi = dev_to_bi(dev); - char *p = (char *) page; - unsigned long val = simple_strtoul(p, &p, 10); - - if (val) - bi->flags |= BLK_INTEGRITY_VERIFY; - else - bi->flags &= ~BLK_INTEGRITY_VERIFY; - - return count; + return flag_store(dev, page, count, BLK_INTEGRITY_NOVERIFY); } static ssize_t read_verify_show(struct device *dev, struct device_attribute *attr, char *page) { - struct blk_integrity *bi = dev_to_bi(dev); - - return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_VERIFY)); + return flag_show(dev, page, BLK_INTEGRITY_NOVERIFY); } static ssize_t write_generate_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count) { - struct blk_integrity *bi = dev_to_bi(dev); - - char *p = (char *) page; - unsigned long val = simple_strtoul(p, &p, 10); - - if (val) - bi->flags |= BLK_INTEGRITY_GENERATE; - else - bi->flags &= ~BLK_INTEGRITY_GENERATE; - - return count; + return flag_store(dev, page, count, BLK_INTEGRITY_NOGENERATE); } static ssize_t write_generate_show(struct device *dev, struct device_attribute *attr, char *page) { - struct blk_integrity *bi = dev_to_bi(dev); - - return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_GENERATE)); + return flag_show(dev, page, BLK_INTEGRITY_NOGENERATE); } static ssize_t device_is_integrity_capable_show(struct device *dev, @@ -325,81 +324,3 @@ const struct attribute_group blk_integrity_attr_group = { .name = "integrity", .attrs = integrity_attrs, }; - -static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter) -{ - return BLK_STS_OK; -} - -static void blk_integrity_nop_prepare(struct request *rq) -{ -} - -static void blk_integrity_nop_complete(struct request *rq, - unsigned int nr_bytes) -{ -} - -static const struct blk_integrity_profile nop_profile = { - .name = "nop", - .generate_fn = blk_integrity_nop_fn, - .verify_fn = blk_integrity_nop_fn, - .prepare_fn = blk_integrity_nop_prepare, - .complete_fn = blk_integrity_nop_complete, -}; - -/** - * blk_integrity_register - Register a gendisk as being integrity-capable - * @disk: struct gendisk pointer to make integrity-aware - * @template: block integrity profile to register - * - * Description: When a device needs to advertise itself as being able to - * send/receive integrity metadata it must use this function to register - * the capability with the block layer. The template is a blk_integrity - * struct with values appropriate for the underlying hardware. See - * Documentation/block/data-integrity.rst. - */ -void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) -{ - struct blk_integrity *bi = &disk->queue->integrity; - - bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE | - template->flags; - bi->interval_exp = template->interval_exp ? : - ilog2(queue_logical_block_size(disk->queue)); - bi->profile = template->profile ? template->profile : &nop_profile; - bi->tuple_size = template->tuple_size; - bi->tag_size = template->tag_size; - bi->pi_offset = template->pi_offset; - - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); - -#ifdef CONFIG_BLK_INLINE_ENCRYPTION - if (disk->queue->crypto_profile) { - pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); - disk->queue->crypto_profile = NULL; - } -#endif -} -EXPORT_SYMBOL(blk_integrity_register); - -/** - * blk_integrity_unregister - Unregister block integrity profile - * @disk: disk whose integrity profile to unregister - * - * Description: This function unregisters the integrity capability from - * a block device. - */ -void blk_integrity_unregister(struct gendisk *disk) -{ - struct blk_integrity *bi = &disk->queue->integrity; - - if (!bi->profile) - return; - - /* ensure all bios are off the integrity workqueue */ - blk_flush_integrity(); - blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue); - memset(bi, 0, sizeof(*bi)); -} -EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 25dd4db11121..ce82770c72ab 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -32,13 +32,6 @@ static void get_io_context(struct io_context *ioc) atomic_long_inc(&ioc->refcount); } -static void icq_free_icq_rcu(struct rcu_head *head) -{ - struct io_cq *icq = container_of(head, struct io_cq, __rcu_head); - - kmem_cache_free(icq->__rcu_icq_cache, icq); -} - /* * Exit an icq. Called with ioc locked for blk-mq, and with both ioc * and queue locked for legacy. @@ -102,7 +95,7 @@ static void ioc_destroy_icq(struct io_cq *icq) */ icq->__rcu_icq_cache = et->icq_cache; icq->flags |= ICQ_DESTROYED; - call_rcu(&icq->__rcu_head, icq_free_icq_rcu); + kfree_rcu(icq, __rcu_head); } /* diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 690ca99dfaca..5bfd70311359 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -648,7 +648,7 @@ static const struct ioc_params autop[] = { * vrate adjust percentages indexed by ioc->busy_level. We adjust up on * vtime credit shortage and down on device saturation. */ -static u32 vrate_adj_pct[] = +static const u32 vrate_adj_pct[] = { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, @@ -1098,7 +1098,14 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum, iocg->child_active_sum); } else { - inuse = clamp_t(u32, inuse, 1, active); + /* + * It may be tempting to turn this into a clamp expression with + * a lower limit of 1 but active may be 0, which cannot be used + * as an upper limit in that situation. This expression allows + * active to clamp inuse unless it is 0, in which case inuse + * becomes 1. + */ + inuse = min(inuse, active) ?: 1; } iocg->last_inuse = iocg->inuse; @@ -2076,7 +2083,7 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, struct ioc_now *now) { struct ioc_gq *iocg; - u64 dur, usage_pct, nr_cycles; + u64 dur, usage_pct, nr_cycles, nr_cycles_shift; /* if no debtor, reset the cycle */ if (!nr_debtors) { @@ -2138,10 +2145,12 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, old_debt = iocg->abs_vdebt; old_delay = iocg->delay; + nr_cycles_shift = min_t(u64, nr_cycles, BITS_PER_LONG - 1); if (iocg->abs_vdebt) - iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1; + iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles_shift ?: 1; + if (iocg->delay) - iocg->delay = iocg->delay >> nr_cycles ?: 1; + iocg->delay = iocg->delay >> nr_cycles_shift ?: 1; iocg_kick_waitq(iocg, true, now); @@ -2709,8 +2718,7 @@ retry_lock: * All waiters are on iocg->waitq and the wait states are * synchronized using waitq.lock. */ - init_waitqueue_func_entry(&wait.wait, iocg_wake_fn); - wait.wait.private = current; + init_wait_func(&wait.wait, iocg_wake_fn); wait.bio = bio; wait.abs_cost = abs_cost; wait.committed = false; /* will be set true by waker */ @@ -2995,8 +3003,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd) iocg->hweight_inuse = WEIGHT_ONE; init_waitqueue_head(&iocg->waitq); - hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - iocg->waitq_timer.function = iocg_waitq_timer_fn; + hrtimer_setup(&iocg->waitq_timer, iocg_waitq_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); iocg->level = blkg->blkcg->css.cgroup->level; @@ -3164,7 +3171,7 @@ static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, if (!dname) return 0; - spin_lock_irq(&ioc->lock); + spin_lock(&ioc->lock); seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n", dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto", ioc->params.qos[QOS_RPPM] / 10000, @@ -3177,7 +3184,7 @@ static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, ioc->params.qos[QOS_MIN] % 10000 / 100, ioc->params.qos[QOS_MAX] / 10000, ioc->params.qos[QOS_MAX] % 10000 / 100); - spin_unlock_irq(&ioc->lock); + spin_unlock(&ioc->lock); return 0; } @@ -3215,13 +3222,16 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, u32 qos[NR_QOS_PARAMS]; bool enable, user; char *body, *p; + unsigned long memflags; int ret; blkg_conf_init(&ctx, input); - ret = blkg_conf_open_bdev(&ctx); - if (ret) + memflags = blkg_conf_open_bdev_frozen(&ctx); + if (IS_ERR_VALUE(memflags)) { + ret = memflags; goto err; + } body = ctx.body; disk = ctx.bdev->bd_disk; @@ -3238,7 +3248,6 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc = q_to_ioc(disk->queue); } - blk_mq_freeze_queue(disk->queue); blk_mq_quiesce_queue(disk->queue); spin_lock_irq(&ioc->lock); @@ -3338,19 +3347,15 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, wbt_enable_default(disk); blk_mq_unquiesce_queue(disk->queue); - blk_mq_unfreeze_queue(disk->queue); - blkg_conf_exit(&ctx); + blkg_conf_exit_frozen(&ctx, memflags); return nbytes; einval: spin_unlock_irq(&ioc->lock); - blk_mq_unquiesce_queue(disk->queue); - blk_mq_unfreeze_queue(disk->queue); - ret = -EINVAL; err: - blkg_conf_exit(&ctx); + blkg_conf_exit_frozen(&ctx, memflags); return ret; } @@ -3364,14 +3369,14 @@ static u64 ioc_cost_model_prfill(struct seq_file *sf, if (!dname) return 0; - spin_lock_irq(&ioc->lock); + spin_lock(&ioc->lock); seq_printf(sf, "%s ctrl=%s model=linear " "rbps=%llu rseqiops=%llu rrandiops=%llu " "wbps=%llu wseqiops=%llu wrandiops=%llu\n", dname, ioc->user_cost_model ? "user" : "auto", u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]); - spin_unlock_irq(&ioc->lock); + spin_unlock(&ioc->lock); return 0; } @@ -3405,6 +3410,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, { struct blkg_conf_ctx ctx; struct request_queue *q; + unsigned int memflags; struct ioc *ioc; u64 u[NR_I_LCOEFS]; bool user; @@ -3432,7 +3438,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, ioc = q_to_ioc(q); } - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); spin_lock_irq(&ioc->lock); @@ -3484,7 +3490,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); blkg_conf_exit(&ctx); return nbytes; @@ -3493,7 +3499,7 @@ einval: spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); ret = -EINVAL; err: diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index ebb522788d97..42c1e0b9a68f 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -749,9 +749,11 @@ static void blkiolatency_enable_work_fn(struct work_struct *work) */ enabled = atomic_read(&blkiolat->enable_cnt); if (enabled != blkiolat->enabled) { - blk_mq_freeze_queue(blkiolat->rqos.disk->queue); + unsigned int memflags; + + memflags = blk_mq_freeze_queue(blkiolat->rqos.disk->queue); blkiolat->enabled = enabled; - blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue); + blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue, memflags); } } diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index 4051fada01f1..13659dc15c3f 100644 --- a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -50,14 +50,6 @@ static const char *policy_name[] = { static struct blkcg_policy ioprio_policy; /** - * struct ioprio_blkg - Per (cgroup, request queue) data. - * @pd: blkg_policy_data structure. - */ -struct ioprio_blkg { - struct blkg_policy_data pd; -}; - -/** * struct ioprio_blkcg - Per cgroup data. * @cpd: blkcg_policy_data structure. * @prio_policy: One of the IOPRIO_CLASS_* values. See also <linux/ioprio.h>. @@ -67,11 +59,6 @@ struct ioprio_blkcg { enum prio_policy prio_policy; }; -static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd) -{ - return pd ? container_of(pd, struct ioprio_blkg, pd) : NULL; -} - static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg) { return container_of(blkcg_to_cpd(blkcg, &ioprio_policy), @@ -84,16 +71,6 @@ ioprio_blkcg_from_css(struct cgroup_subsys_state *css) return blkcg_to_ioprio_blkcg(css_to_blkcg(css)); } -static struct ioprio_blkcg *ioprio_blkcg_from_bio(struct bio *bio) -{ - struct blkg_policy_data *pd = blkg_to_pd(bio->bi_blkg, &ioprio_policy); - - if (!pd) - return NULL; - - return blkcg_to_ioprio_blkcg(pd->blkg->blkcg); -} - static int ioprio_show_prio_policy(struct seq_file *sf, void *v) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf)); @@ -118,25 +95,6 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, return nbytes; } -static struct blkg_policy_data * -ioprio_alloc_pd(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) -{ - struct ioprio_blkg *ioprio_blkg; - - ioprio_blkg = kzalloc(sizeof(*ioprio_blkg), gfp); - if (!ioprio_blkg) - return NULL; - - return &ioprio_blkg->pd; -} - -static void ioprio_free_pd(struct blkg_policy_data *pd) -{ - struct ioprio_blkg *ioprio_blkg = pd_to_ioprio(pd); - - kfree(ioprio_blkg); -} - static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp) { struct ioprio_blkcg *blkcg; @@ -155,38 +113,26 @@ static void ioprio_free_cpd(struct blkcg_policy_data *cpd) kfree(blkcg); } -#define IOPRIO_ATTRS \ - { \ - .name = "prio.class", \ - .seq_show = ioprio_show_prio_policy, \ - .write = ioprio_set_prio_policy, \ - }, \ - { } /* sentinel */ - -/* cgroup v2 attributes */ static struct cftype ioprio_files[] = { - IOPRIO_ATTRS -}; - -/* cgroup v1 attributes */ -static struct cftype ioprio_legacy_files[] = { - IOPRIO_ATTRS + { + .name = "prio.class", + .seq_show = ioprio_show_prio_policy, + .write = ioprio_set_prio_policy, + }, + { } /* sentinel */ }; static struct blkcg_policy ioprio_policy = { .dfl_cftypes = ioprio_files, - .legacy_cftypes = ioprio_legacy_files, + .legacy_cftypes = ioprio_files, .cpd_alloc_fn = ioprio_alloc_cpd, .cpd_free_fn = ioprio_free_cpd, - - .pd_alloc_fn = ioprio_alloc_pd, - .pd_free_fn = ioprio_free_pd, }; void blkcg_set_ioprio(struct bio *bio) { - struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); + struct ioprio_blkcg *blkcg = blkcg_to_ioprio_blkcg(bio->bi_blkg->blkcg); u16 prio; if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE) @@ -219,16 +165,6 @@ void blkcg_set_ioprio(struct bio *bio) bio->bi_ioprio = prio; } -void blk_ioprio_exit(struct gendisk *disk) -{ - blkcg_deactivate_policy(disk, &ioprio_policy); -} - -int blk_ioprio_init(struct gendisk *disk) -{ - return blkcg_activate_policy(disk, &ioprio_policy); -} - static int __init ioprio_init(void) { return blkcg_policy_register(&ioprio_policy); diff --git a/block/blk-ioprio.h b/block/blk-ioprio.h index b6afb8e80de0..9265143f9bc9 100644 --- a/block/blk-ioprio.h +++ b/block/blk-ioprio.h @@ -9,17 +9,8 @@ struct request_queue; struct bio; #ifdef CONFIG_BLK_CGROUP_IOPRIO -int blk_ioprio_init(struct gendisk *disk); -void blk_ioprio_exit(struct gendisk *disk); void blkcg_set_ioprio(struct bio *bio); #else -static inline int blk_ioprio_init(struct gendisk *disk) -{ - return 0; -} -static inline void blk_ioprio_exit(struct gendisk *disk) -{ -} static inline void blkcg_set_ioprio(struct bio *bio) { } diff --git a/block/blk-lib.c b/block/blk-lib.c index a6954eafb8c8..4c9f20a689f7 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -35,51 +35,39 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector) return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT; } -int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) +struct bio *blk_alloc_discard_bio(struct block_device *bdev, + sector_t *sector, sector_t *nr_sects, gfp_t gfp_mask) { - struct bio *bio = *biop; - sector_t bs_mask; - - if (bdev_read_only(bdev)) - return -EPERM; - if (!bdev_max_discard_sectors(bdev)) - return -EOPNOTSUPP; - - /* In case the discard granularity isn't set by buggy device driver */ - if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) { - pr_err_ratelimited("%pg: Error: discard_granularity is 0.\n", - bdev); - return -EOPNOTSUPP; - } - - bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; - if ((sector | nr_sects) & bs_mask) - return -EINVAL; + sector_t bio_sects = min(*nr_sects, bio_discard_limit(bdev, *sector)); + struct bio *bio; - if (!nr_sects) - return -EINVAL; + if (!bio_sects) + return NULL; - while (nr_sects) { - sector_t req_sects = - min(nr_sects, bio_discard_limit(bdev, sector)); + bio = bio_alloc(bdev, 0, REQ_OP_DISCARD, gfp_mask); + if (!bio) + return NULL; + bio->bi_iter.bi_sector = *sector; + bio->bi_iter.bi_size = bio_sects << SECTOR_SHIFT; + *sector += bio_sects; + *nr_sects -= bio_sects; + /* + * We can loop for a long time in here if someone does full device + * discards (like mkfs). Be nice and allow us to schedule out to avoid + * softlocking if preempt is disabled. + */ + cond_resched(); + return bio; +} - bio = blk_next_bio(bio, bdev, 0, REQ_OP_DISCARD, gfp_mask); - bio->bi_iter.bi_sector = sector; - bio->bi_iter.bi_size = req_sects << 9; - sector += req_sects; - nr_sects -= req_sects; - - /* - * We can loop for a long time in here, if someone does - * full device discards (like mkfs). Be nice and allow - * us to schedule out to avoid softlocking if preempt - * is disabled. - */ - cond_resched(); - } +int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) +{ + struct bio *bio; - *biop = bio; + while ((bio = blk_alloc_discard_bio(bdev, §or, &nr_sects, + gfp_mask))) + *biop = bio_chain_and_submit(*biop, bio); return 0; } EXPORT_SYMBOL(__blkdev_issue_discard); @@ -115,38 +103,80 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL(blkdev_issue_discard); -static int __blkdev_issue_write_zeroes(struct block_device *bdev, - sector_t sector, sector_t nr_sects, gfp_t gfp_mask, - struct bio **biop, unsigned flags) +static sector_t bio_write_zeroes_limit(struct block_device *bdev) { - struct bio *bio = *biop; - unsigned int max_sectors; - - if (bdev_read_only(bdev)) - return -EPERM; + sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; - /* Ensure that max_sectors doesn't overflow bi_size */ - max_sectors = bdev_write_zeroes_sectors(bdev); + return min(bdev_write_zeroes_sectors(bdev), + (UINT_MAX >> SECTOR_SHIFT) & ~bs_mask); +} - if (max_sectors == 0) - return -EOPNOTSUPP; +/* + * There is no reliable way for the SCSI subsystem to determine whether a + * device supports a WRITE SAME operation without actually performing a write + * to media. As a result, write_zeroes is enabled by default and will be + * disabled if a zeroing operation subsequently fails. This means that this + * queue limit is likely to change at runtime. + */ +static void __blkdev_issue_write_zeroes(struct block_device *bdev, + sector_t sector, sector_t nr_sects, gfp_t gfp_mask, + struct bio **biop, unsigned flags, sector_t limit) +{ while (nr_sects) { - unsigned int len = min_t(sector_t, nr_sects, max_sectors); + unsigned int len = min(nr_sects, limit); + struct bio *bio; + + if ((flags & BLKDEV_ZERO_KILLABLE) && + fatal_signal_pending(current)) + break; - bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask); + bio = bio_alloc(bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask); bio->bi_iter.bi_sector = sector; if (flags & BLKDEV_ZERO_NOUNMAP) bio->bi_opf |= REQ_NOUNMAP; bio->bi_iter.bi_size = len << SECTOR_SHIFT; + *biop = bio_chain_and_submit(*biop, bio); + nr_sects -= len; sector += len; cond_resched(); } +} - *biop = bio; - return 0; +static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp, unsigned flags) +{ + sector_t limit = bio_write_zeroes_limit(bdev); + struct bio *bio = NULL; + struct blk_plug plug; + int ret = 0; + + blk_start_plug(&plug); + __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp, &bio, + flags, limit); + if (bio) { + if ((flags & BLKDEV_ZERO_KILLABLE) && + fatal_signal_pending(current)) { + bio_await_chain(bio); + blk_finish_plug(&plug); + return -EINTR; + } + ret = submit_bio_wait(bio); + bio_put(bio); + } + blk_finish_plug(&plug); + + /* + * For some devices there is no non-destructive way to verify whether + * WRITE ZEROES is actually supported. These will clear the capability + * on an I/O error, in which case we'll turn any error into + * "not supported" here. + */ + if (ret && !bdev_write_zeroes_sectors(bdev)) + return -EOPNOTSUPP; + return ret; } /* @@ -162,35 +192,63 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) return min(pages, (sector_t)BIO_MAX_VECS); } -static int __blkdev_issue_zero_pages(struct block_device *bdev, +static void __blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, - struct bio **biop) + struct bio **biop, unsigned int flags) { - struct bio *bio = *biop; - int bi_size = 0; - unsigned int sz; - - if (bdev_read_only(bdev)) - return -EPERM; + while (nr_sects) { + unsigned int nr_vecs = __blkdev_sectors_to_bio_pages(nr_sects); + struct bio *bio; - while (nr_sects != 0) { - bio = blk_next_bio(bio, bdev, __blkdev_sectors_to_bio_pages(nr_sects), - REQ_OP_WRITE, gfp_mask); + bio = bio_alloc(bdev, nr_vecs, REQ_OP_WRITE, gfp_mask); bio->bi_iter.bi_sector = sector; - while (nr_sects != 0) { - sz = min((sector_t) PAGE_SIZE, nr_sects << 9); - bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0); - nr_sects -= bi_size >> 9; - sector += bi_size >> 9; - if (bi_size < sz) + if ((flags & BLKDEV_ZERO_KILLABLE) && + fatal_signal_pending(current)) + break; + + do { + unsigned int len, added; + + len = min_t(sector_t, + PAGE_SIZE, nr_sects << SECTOR_SHIFT); + added = bio_add_page(bio, ZERO_PAGE(0), len, 0); + if (added < len) break; - } + nr_sects -= added >> SECTOR_SHIFT; + sector += added >> SECTOR_SHIFT; + } while (nr_sects); + + *biop = bio_chain_and_submit(*biop, bio); cond_resched(); } +} - *biop = bio; - return 0; +static int blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp, unsigned flags) +{ + struct bio *bio = NULL; + struct blk_plug plug; + int ret = 0; + + if (flags & BLKDEV_ZERO_NOFALLBACK) + return -EOPNOTSUPP; + + blk_start_plug(&plug); + __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp, &bio, flags); + if (bio) { + if ((flags & BLKDEV_ZERO_KILLABLE) && + fatal_signal_pending(current)) { + bio_await_chain(bio); + blk_finish_plug(&plug); + return -EINTR; + } + ret = submit_bio_wait(bio); + bio_put(bio); + } + blk_finish_plug(&plug); + + return ret; } /** @@ -216,20 +274,21 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags) { - int ret; - sector_t bs_mask; + sector_t limit = bio_write_zeroes_limit(bdev); - bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; - if ((sector | nr_sects) & bs_mask) - return -EINVAL; - - ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, - biop, flags); - if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK)) - return ret; + if (bdev_read_only(bdev)) + return -EPERM; - return __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, - biop); + if (limit) { + __blkdev_issue_write_zeroes(bdev, sector, nr_sects, + gfp_mask, biop, flags, limit); + } else { + if (flags & BLKDEV_ZERO_NOFALLBACK) + return -EOPNOTSUPP; + __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, + biop, flags); + } + return 0; } EXPORT_SYMBOL(__blkdev_issue_zeroout); @@ -249,51 +308,21 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout); int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned flags) { - int ret = 0; - sector_t bs_mask; - struct bio *bio; - struct blk_plug plug; - bool try_write_zeroes = !!bdev_write_zeroes_sectors(bdev); + int ret; - bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; - if ((sector | nr_sects) & bs_mask) + if ((sector | nr_sects) & ((bdev_logical_block_size(bdev) >> 9) - 1)) return -EINVAL; + if (bdev_read_only(bdev)) + return -EPERM; -retry: - bio = NULL; - blk_start_plug(&plug); - if (try_write_zeroes) { - ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, - gfp_mask, &bio, flags); - } else if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { - ret = __blkdev_issue_zero_pages(bdev, sector, nr_sects, - gfp_mask, &bio); - } else { - /* No zeroing offload support */ - ret = -EOPNOTSUPP; - } - if (ret == 0 && bio) { - ret = submit_bio_wait(bio); - bio_put(bio); - } - blk_finish_plug(&plug); - if (ret && try_write_zeroes) { - if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { - try_write_zeroes = false; - goto retry; - } - if (!bdev_write_zeroes_sectors(bdev)) { - /* - * Zeroing offload support was indicated, but the - * device reported ILLEGAL REQUEST (for some devices - * there is no non-destructive way to verify whether - * WRITE ZEROES is actually supported). - */ - ret = -EOPNOTSUPP; - } + if (bdev_write_zeroes_sectors(bdev)) { + ret = blkdev_issue_write_zeroes(bdev, sector, nr_sects, + gfp_mask, flags); + if (ret != -EOPNOTSUPP) + return ret; } - return ret; + return blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, flags); } EXPORT_SYMBOL(blkdev_issue_zeroout); diff --git a/block/blk-map.c b/block/blk-map.c index 71210cdb3442..23e5d5ebe59e 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -189,7 +189,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, } } - if (bio_add_pc_page(rq->q, bio, page, bytes, offset) < bytes) { + if (bio_add_page(bio, page, bytes, offset) < bytes) { if (!map_data) __free_page(page); break; @@ -272,86 +272,27 @@ static struct bio *blk_rq_map_bio_alloc(struct request *rq, static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, gfp_t gfp_mask) { - iov_iter_extraction_t extraction_flags = 0; - unsigned int max_sectors = queue_max_hw_sectors(rq->q); unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS); struct bio *bio; int ret; - int j; if (!iov_iter_count(iter)) return -EINVAL; bio = blk_rq_map_bio_alloc(rq, nr_vecs, gfp_mask); - if (bio == NULL) + if (!bio) return -ENOMEM; - - if (blk_queue_pci_p2pdma(rq->q)) - extraction_flags |= ITER_ALLOW_P2PDMA; - if (iov_iter_extract_will_pin(iter)) - bio_set_flag(bio, BIO_PAGE_PINNED); - - while (iov_iter_count(iter)) { - struct page *stack_pages[UIO_FASTIOV]; - struct page **pages = stack_pages; - ssize_t bytes; - size_t offs; - int npages; - - if (nr_vecs > ARRAY_SIZE(stack_pages)) - pages = NULL; - - bytes = iov_iter_extract_pages(iter, &pages, LONG_MAX, - nr_vecs, extraction_flags, &offs); - if (unlikely(bytes <= 0)) { - ret = bytes ? bytes : -EFAULT; - goto out_unmap; - } - - npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); - - if (unlikely(offs & queue_dma_alignment(rq->q))) - j = 0; - else { - for (j = 0; j < npages; j++) { - struct page *page = pages[j]; - unsigned int n = PAGE_SIZE - offs; - bool same_page = false; - - if (n > bytes) - n = bytes; - - if (!bio_add_hw_page(rq->q, bio, page, n, offs, - max_sectors, &same_page)) - break; - - if (same_page) - bio_release_page(bio, page); - bytes -= n; - offs = 0; - } - } - /* - * release the pages we didn't map into the bio, if any - */ - while (j < npages) - bio_release_page(bio, pages[j++]); - if (pages != stack_pages) - kvfree(pages); - /* couldn't stuff something into bio? */ - if (bytes) { - iov_iter_revert(iter, bytes); - break; - } - } - + ret = bio_iov_iter_get_pages(bio, iter); + if (ret) + goto out_put; ret = blk_rq_append_bio(rq, bio); if (ret) - goto out_unmap; + goto out_release; return 0; - out_unmap: +out_release: bio_release_pages(bio, false); +out_put: blk_mq_map_bio_put(bio); return ret; } @@ -376,65 +317,26 @@ static void bio_map_kern_endio(struct bio *bio) kfree(bio); } -/** - * bio_map_kern - map kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to map - * @len: length in bytes - * @gfp_mask: allocation flags for bio allocation - * - * Map the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -static struct bio *bio_map_kern(struct request_queue *q, void *data, - unsigned int len, gfp_t gfp_mask) +static struct bio *bio_map_kern(void *data, unsigned int len, enum req_op op, + gfp_t gfp_mask) { - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int nr_pages = end - start; - bool is_vmalloc = is_vmalloc_addr(data); - struct page *page; - int offset, i; + unsigned int nr_vecs = bio_add_max_vecs(data, len); struct bio *bio; - bio = bio_kmalloc(nr_pages, gfp_mask); + bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); - bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0); - - if (is_vmalloc) { - flush_kernel_vmap_range(data, len); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, op); + if (is_vmalloc_addr(data)) { bio->bi_private = data; - } - - offset = offset_in_page(kaddr); - for (i = 0; i < nr_pages; i++) { - unsigned int bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - if (!is_vmalloc) - page = virt_to_page(data); - else - page = vmalloc_to_page(data); - if (bio_add_pc_page(q, bio, page, bytes, - offset) < bytes) { - /* we don't support partial mappings */ + if (!bio_add_vmalloc(bio, data, len)) { bio_uninit(bio); kfree(bio); return ERR_PTR(-EINVAL); } - - data += bytes; - len -= bytes; - offset = 0; + } else { + bio_add_virt_nofail(bio, data, len); } - bio->bi_end_io = bio_map_kern_endio; return bio; } @@ -462,17 +364,16 @@ static void bio_copy_kern_endio_read(struct bio *bio) /** * bio_copy_kern - copy kernel address into bio - * @q: the struct request_queue for the bio * @data: pointer to buffer to copy * @len: length in bytes + * @op: bio/request operation * @gfp_mask: allocation flags for bio and page allocation - * @reading: data direction is READ * * copy the kernel address into a bio suitable for io to a block * device. Returns an error pointer in case of error. */ -static struct bio *bio_copy_kern(struct request_queue *q, void *data, - unsigned int len, gfp_t gfp_mask, int reading) +static struct bio *bio_copy_kern(void *data, unsigned int len, enum req_op op, + gfp_t gfp_mask) { unsigned long kaddr = (unsigned long)data; unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -491,7 +392,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); - bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, op); while (len) { struct page *page; @@ -504,21 +405,21 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, if (!page) goto cleanup; - if (!reading) + if (op_is_write(op)) memcpy(page_address(page), p, bytes); - if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) + if (bio_add_page(bio, page, bytes, 0) < bytes) break; len -= bytes; p += bytes; } - if (reading) { + if (op_is_write(op)) { + bio->bi_end_io = bio_copy_kern_endio; + } else { bio->bi_end_io = bio_copy_kern_endio_read; bio->bi_private = data; - } else { - bio->bi_end_io = bio_copy_kern_endio; } return bio; @@ -536,24 +437,33 @@ cleanup: */ int blk_rq_append_bio(struct request *rq, struct bio *bio) { - struct bvec_iter iter; - struct bio_vec bv; + const struct queue_limits *lim = &rq->q->limits; + unsigned int max_bytes = lim->max_hw_sectors << SECTOR_SHIFT; unsigned int nr_segs = 0; + int ret; - bio_for_each_bvec(bv, bio, iter) - nr_segs++; + /* check that the data layout matches the hardware restrictions */ + ret = bio_split_rw_at(bio, lim, &nr_segs, max_bytes); + if (ret) { + /* if we would have to split the bio, copy instead */ + if (ret > 0) + ret = -EREMOTEIO; + return ret; + } - if (!rq->bio) { - blk_rq_bio_prep(rq, bio, nr_segs); - } else { + if (rq->bio) { if (!ll_back_merge_fn(rq, bio, nr_segs)) return -EINVAL; rq->biotail->bi_next = bio; rq->biotail = bio; - rq->__data_len += (bio)->bi_iter.bi_size; + rq->__data_len += bio->bi_iter.bi_size; bio_crypt_free_ctx(bio); + return 0; } + rq->nr_phys_segments = nr_segs; + rq->bio = rq->biotail = bio; + rq->__data_len = bio->bi_iter.bi_size; return 0; } EXPORT_SYMBOL(blk_rq_append_bio); @@ -561,57 +471,23 @@ EXPORT_SYMBOL(blk_rq_append_bio); /* Prepare bio for passthrough IO given ITER_BVEC iter */ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) { - struct request_queue *q = rq->q; - size_t nr_iter = iov_iter_count(iter); - size_t nr_segs = iter->nr_segs; - struct bio_vec *bvecs, *bvprvp = NULL; - const struct queue_limits *lim = &q->limits; - unsigned int nsegs = 0, bytes = 0; + unsigned int max_bytes = rq->q->limits.max_hw_sectors << SECTOR_SHIFT; struct bio *bio; - size_t i; + int ret; - if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q)) - return -EINVAL; - if (nr_segs > queue_max_segments(q)) + if (!iov_iter_count(iter) || iov_iter_count(iter) > max_bytes) return -EINVAL; - /* no iovecs to alloc, as we already have a BVEC iterator */ + /* reuse the bvecs from the iterator instead of allocating new ones */ bio = blk_rq_map_bio_alloc(rq, 0, GFP_KERNEL); - if (bio == NULL) + if (!bio) return -ENOMEM; + bio_iov_bvec_set(bio, iter); - bio_iov_bvec_set(bio, (struct iov_iter *)iter); - blk_rq_bio_prep(rq, bio, nr_segs); - - /* loop to perform a bunch of sanity checks */ - bvecs = (struct bio_vec *)iter->bvec; - for (i = 0; i < nr_segs; i++) { - struct bio_vec *bv = &bvecs[i]; - - /* - * If the queue doesn't support SG gaps and adding this - * offset would create a gap, fallback to copy. - */ - if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv->bv_offset)) { - blk_mq_map_bio_put(bio); - return -EREMOTEIO; - } - /* check full condition */ - if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len) - goto put_bio; - if (bytes + bv->bv_len > nr_iter) - goto put_bio; - if (bv->bv_offset + bv->bv_len > PAGE_SIZE) - goto put_bio; - - nsegs++; - bytes += bv->bv_len; - bvprvp = bv; - } - return 0; -put_bio: - blk_mq_map_bio_put(bio); - return -EINVAL; + ret = blk_rq_append_bio(rq, bio); + if (ret) + blk_mq_map_bio_put(bio); + return ret; } /** @@ -634,15 +510,13 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, const struct iov_iter *iter, gfp_t gfp_mask) { bool copy = false, map_bvec = false; - unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); + unsigned long align = blk_lim_dma_alignment_and_pad(&q->limits); struct bio *bio = NULL; struct iov_iter i; int ret = -EINVAL; if (map_data) copy = true; - else if (blk_queue_may_bounce(q)) - copy = true; else if (iov_iter_alignment(iter) & align) copy = true; else if (iov_iter_is_bvec(iter)) @@ -668,8 +542,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, ret = bio_copy_user_iov(rq, map_data, &i, gfp_mask); else ret = bio_map_user_iov(rq, &i, gfp_mask); - if (ret) + if (ret) { + if (ret == -EREMOTEIO) + ret = -EINVAL; goto unmap_rq; + } if (!bio) bio = rq->bio; } while (iov_iter_count(&i)); @@ -757,6 +634,9 @@ int blk_rq_unmap_user(struct bio *bio) bio_release_pages(bio, bio_data_dir(bio) == READ); } + if (bio_integrity(bio)) + bio_integrity_unmap_user(bio); + next_bio = bio; bio = bio->bi_next; blk_mq_map_bio_put(next_bio); @@ -768,7 +648,6 @@ EXPORT_SYMBOL(blk_rq_unmap_user); /** * blk_rq_map_kern - map kernel data to a request, for passthrough requests - * @q: request queue where request should be inserted * @rq: request to fill * @kbuf: the kernel buffer * @len: length of user data @@ -779,31 +658,26 @@ EXPORT_SYMBOL(blk_rq_unmap_user); * buffer is used. Can be called multiple times to append multiple * buffers. */ -int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, - unsigned int len, gfp_t gfp_mask) +int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len, + gfp_t gfp_mask) { - int reading = rq_data_dir(rq) == READ; unsigned long addr = (unsigned long) kbuf; struct bio *bio; int ret; - if (len > (queue_max_hw_sectors(q) << 9)) + if (len > (queue_max_hw_sectors(rq->q) << SECTOR_SHIFT)) return -EINVAL; if (!len || !kbuf) return -EINVAL; - if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf) || - blk_queue_may_bounce(q)) - bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); + if (!blk_rq_aligned(rq->q, addr, len) || object_is_on_stack(kbuf)) + bio = bio_copy_kern(kbuf, len, req_op(rq), gfp_mask); else - bio = bio_map_kern(q, kbuf, len, gfp_mask); + bio = bio_map_kern(kbuf, len, req_op(rq), gfp_mask); if (IS_ERR(bio)) return PTR_ERR(bio); - bio->bi_opf &= ~REQ_OP_MASK; - bio->bi_opf |= req_op(rq); - ret = blk_rq_append_bio(rq, bio); if (unlikely(ret)) { bio_uninit(bio); diff --git a/block/blk-merge.c b/block/blk-merge.c index 4e3483a16b75..3af1d284add5 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -7,7 +7,6 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> -#include <linux/scatterlist.h> #include <linux/part_stat.h> #include <linux/blk-cgroup.h> @@ -105,9 +104,38 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim) return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT; } -static struct bio *bio_split_discard(struct bio *bio, - const struct queue_limits *lim, - unsigned *nsegs, struct bio_set *bs) +static struct bio *bio_submit_split(struct bio *bio, int split_sectors) +{ + if (unlikely(split_sectors < 0)) + goto error; + + if (split_sectors) { + struct bio *split; + + split = bio_split(bio, split_sectors, GFP_NOIO, + &bio->bi_bdev->bd_disk->bio_split); + if (IS_ERR(split)) { + split_sectors = PTR_ERR(split); + goto error; + } + split->bi_opf |= REQ_NOMERGE; + blkcg_bio_issue_init(split); + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); + WARN_ON_ONCE(bio_zone_write_plugging(bio)); + submit_bio_noacct(bio); + return split; + } + + return bio; +error: + bio->bi_status = errno_to_blk_status(split_sectors); + bio_endio(bio); + return NULL; +} + +struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, + unsigned *nsegs) { unsigned int max_discard_sectors, granularity; sector_t tmp; @@ -121,10 +149,10 @@ static struct bio *bio_split_discard(struct bio *bio, min(lim->max_discard_sectors, bio_allowed_max_sectors(lim)); max_discard_sectors -= max_discard_sectors % granularity; if (unlikely(!max_discard_sectors)) - return NULL; + return bio; if (bio_sectors(bio) <= max_discard_sectors) - return NULL; + return bio; split_sectors = max_discard_sectors; @@ -139,19 +167,20 @@ static struct bio *bio_split_discard(struct bio *bio, if (split_sectors > tmp) split_sectors -= tmp; - return bio_split(bio, split_sectors, GFP_NOIO, bs); + return bio_submit_split(bio, split_sectors); } -static struct bio *bio_split_write_zeroes(struct bio *bio, - const struct queue_limits *lim, - unsigned *nsegs, struct bio_set *bs) +static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim, + bool is_atomic) { - *nsegs = 0; - if (!lim->max_write_zeroes_sectors) - return NULL; - if (bio_sectors(bio) <= lim->max_write_zeroes_sectors) - return NULL; - return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs); + /* + * chunk_sectors must be a multiple of atomic_write_boundary_sectors if + * both non-zero. + */ + if (is_atomic && lim->atomic_write_boundary_sectors) + return lim->atomic_write_boundary_sectors; + + return lim->chunk_sectors; } /* @@ -167,12 +196,25 @@ static inline unsigned get_max_io_size(struct bio *bio, { unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT; unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT; - unsigned max_sectors = lim->max_sectors, start, end; + bool is_atomic = bio->bi_opf & REQ_ATOMIC; + unsigned boundary_sectors = blk_boundary_sectors(lim, is_atomic); + unsigned max_sectors, start, end; + + /* + * We ignore lim->max_sectors for atomic writes because it may less + * than the actual bio size, which we cannot tolerate. + */ + if (bio_op(bio) == REQ_OP_WRITE_ZEROES) + max_sectors = lim->max_write_zeroes_sectors; + else if (is_atomic) + max_sectors = lim->atomic_write_max_sectors; + else + max_sectors = lim->max_sectors; - if (lim->chunk_sectors) { + if (boundary_sectors) { max_sectors = min(max_sectors, - blk_chunk_sectors_left(bio->bi_iter.bi_sector, - lim->chunk_sectors)); + blk_boundary_sectors_left(bio->bi_iter.bi_sector, + boundary_sectors)); } start = bio->bi_iter.bi_sector & (pbs - 1); @@ -183,28 +225,6 @@ static inline unsigned get_max_io_size(struct bio *bio, } /** - * get_max_segment_size() - maximum number of bytes to add as a single segment - * @lim: Request queue limits. - * @start_page: See below. - * @offset: Offset from @start_page where to add a segment. - * - * Returns the maximum number of bytes that can be added as a single segment. - */ -static inline unsigned get_max_segment_size(const struct queue_limits *lim, - struct page *start_page, unsigned long offset) -{ - unsigned long mask = lim->seg_boundary_mask; - - offset = mask & (page_to_phys(start_page) + offset); - - /* - * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1 - * after having calculated the minimum. - */ - return min(mask - offset, (unsigned long)lim->max_segment_size - 1) + 1; -} - -/** * bvec_split_segs - verify whether or not a bvec should be split in the middle * @lim: [in] queue limits to split based on * @bv: [in] bvec to examine @@ -228,15 +248,13 @@ static bool bvec_split_segs(const struct queue_limits *lim, const struct bio_vec *bv, unsigned *nsegs, unsigned *bytes, unsigned max_segs, unsigned max_bytes) { - unsigned max_len = min(max_bytes, UINT_MAX) - *bytes; + unsigned max_len = max_bytes - *bytes; unsigned len = min(bv->bv_len, max_len); unsigned total_len = 0; unsigned seg_size = 0; while (len && *nsegs < max_segs) { - seg_size = get_max_segment_size(lim, bv->bv_page, - bv->bv_offset + total_len); - seg_size = min(seg_size, len); + seg_size = get_max_segment_size(lim, bvec_phys(bv) + total_len, len); (*nsegs)++; total_len += seg_size; @@ -252,28 +270,28 @@ static bool bvec_split_segs(const struct queue_limits *lim, return len > 0 || bv->bv_len > max_len; } +static unsigned int bio_split_alignment(struct bio *bio, + const struct queue_limits *lim) +{ + if (op_is_write(bio_op(bio)) && lim->zone_write_granularity) + return lim->zone_write_granularity; + return lim->logical_block_size; +} + /** - * bio_split_rw - split a bio in two bios + * bio_split_rw_at - check if and where to split a read/write bio * @bio: [in] bio to be split * @lim: [in] queue limits to split based on * @segs: [out] number of segments in the bio with the first half of the sectors - * @bs: [in] bio set to allocate the clone from * @max_bytes: [in] maximum number of bytes per bio * - * Clone @bio, update the bi_iter of the clone to represent the first sectors - * of @bio and update @bio->bi_iter to represent the remaining sectors. The - * following is guaranteed for the cloned bio: - * - That it has at most @max_bytes worth of data - * - That it has at most queue_max_segments(@q) segments. - * - * Except for discard requests the cloned bio will point at the bi_io_vec of - * the original bio. It is the responsibility of the caller to ensure that the - * original bio is not freed before the cloned bio. The caller is also - * responsible for ensuring that @bs is only destroyed after processing of the - * split bio has finished. + * Find out if @bio needs to be split to fit the queue limits in @lim and a + * maximum size of @max_bytes. Returns a negative error number if @bio can't be + * split, 0 if the bio doesn't have to be split, or a positive sector offset if + * @bio needs to be split. */ -struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, struct bio_set *bs, unsigned max_bytes) +int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes) { struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; @@ -289,7 +307,7 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, if (nsegs < lim->max_segments && bytes + bv.bv_len <= max_bytes && - bv.bv_offset + bv.bv_len <= PAGE_SIZE) { + bv.bv_offset + bv.bv_len <= lim->min_segment_size) { nsegs++; bytes += bv.bv_len; } else { @@ -303,17 +321,17 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, } *segs = nsegs; - return NULL; + return 0; split: + if (bio->bi_opf & REQ_ATOMIC) + return -EINVAL; + /* * We can't sanely support splitting for a REQ_NOWAIT bio. End it * with EAGAIN if splitting is required and return an error pointer. */ - if (bio->bi_opf & REQ_NOWAIT) { - bio->bi_status = BLK_STS_AGAIN; - bio_endio(bio); - return ERR_PTR(-EAGAIN); - } + if (bio->bi_opf & REQ_NOWAIT) + return -EAGAIN; *segs = nsegs; @@ -322,7 +340,7 @@ split: * split size so that each bio is properly block size aligned, even if * we do not use the full hardware limits. */ - bytes = ALIGN_DOWN(bytes, lim->logical_block_size); + bytes = ALIGN_DOWN(bytes, bio_split_alignment(bio, lim)); /* * Bio splitting may cause subtle trouble such as hang when doing sync @@ -330,57 +348,55 @@ split: * big IO can be trival, disable iopoll when split needed. */ bio_clear_polled(bio); - return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs); + return bytes >> SECTOR_SHIFT; } -EXPORT_SYMBOL_GPL(bio_split_rw); +EXPORT_SYMBOL_GPL(bio_split_rw_at); -/** - * __bio_split_to_limits - split a bio to fit the queue limits - * @bio: bio to be split - * @lim: queue limits to split based on - * @nr_segs: returns the number of segments in the returned bio - * - * Check if @bio needs splitting based on the queue limits, and if so split off - * a bio fitting the limits from the beginning of @bio and return it. @bio is - * shortened to the remainder and re-submitted. +struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, + unsigned *nr_segs) +{ + return bio_submit_split(bio, + bio_split_rw_at(bio, lim, nr_segs, + get_max_io_size(bio, lim) << SECTOR_SHIFT)); +} + +/* + * REQ_OP_ZONE_APPEND bios must never be split by the block layer. * - * The split bio is allocated from @q->bio_split, which is provided by the - * block layer. + * But we want the nr_segs calculation provided by bio_split_rw_at, and having + * a good sanity check that the submitter built the bio correctly is nice to + * have as well. */ -struct bio *__bio_split_to_limits(struct bio *bio, - const struct queue_limits *lim, - unsigned int *nr_segs) +struct bio *bio_split_zone_append(struct bio *bio, + const struct queue_limits *lim, unsigned *nr_segs) { - struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split; - struct bio *split; + int split_sectors; - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - split = bio_split_discard(bio, lim, nr_segs, bs); - break; - case REQ_OP_WRITE_ZEROES: - split = bio_split_write_zeroes(bio, lim, nr_segs, bs); - break; - default: - split = bio_split_rw(bio, lim, nr_segs, bs, - get_max_io_size(bio, lim) << SECTOR_SHIFT); - if (IS_ERR(split)) - return NULL; - break; - } + split_sectors = bio_split_rw_at(bio, lim, nr_segs, + lim->max_zone_append_sectors << SECTOR_SHIFT); + if (WARN_ON_ONCE(split_sectors > 0)) + split_sectors = -EINVAL; + return bio_submit_split(bio, split_sectors); +} - if (split) { - /* there isn't chance to merge the split bio */ - split->bi_opf |= REQ_NOMERGE; +struct bio *bio_split_write_zeroes(struct bio *bio, + const struct queue_limits *lim, unsigned *nsegs) +{ + unsigned int max_sectors = get_max_io_size(bio, lim); - blkcg_bio_issue_init(split); - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - submit_bio_noacct(bio); - return split; - } - return bio; + *nsegs = 0; + + /* + * An unset limit should normally not happen, as bio submission is keyed + * off having a non-zero limit. But SCSI can clear the limit in the + * I/O completion handler, and we can race and see this. Splitting to a + * zero limit obviously doesn't make sense, so band-aid it here. + */ + if (!max_sectors) + return bio; + if (bio_sectors(bio) <= max_sectors) + return bio; + return bio_submit_split(bio, max_sectors); } /** @@ -396,12 +412,9 @@ struct bio *__bio_split_to_limits(struct bio *bio, */ struct bio *bio_split_to_limits(struct bio *bio) { - const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits; unsigned int nr_segs; - if (bio_may_exceed_limits(bio, lim)) - return __bio_split_to_limits(bio, lim, &nr_segs); - return bio; + return __bio_split_to_limits(bio, bdev_limits(bio->bi_bdev), &nr_segs); } EXPORT_SYMBOL(bio_split_to_limits); @@ -438,167 +451,26 @@ unsigned int blk_recalc_rq_segments(struct request *rq) return nr_phys_segs; } -static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, - struct scatterlist *sglist) -{ - if (!*sg) - return sglist; - - /* - * If the driver previously mapped a shorter list, we could see a - * termination bit prematurely unless it fully inits the sg table - * on each mapping. We KNOW that there must be more entries here - * or the driver would be buggy, so force clear the termination bit - * to avoid doing a full sg_init_table() in drivers for each command. - */ - sg_unmark_end(*sg); - return sg_next(*sg); -} - -static unsigned blk_bvec_map_sg(struct request_queue *q, - struct bio_vec *bvec, struct scatterlist *sglist, - struct scatterlist **sg) -{ - unsigned nbytes = bvec->bv_len; - unsigned nsegs = 0, total = 0; - - while (nbytes > 0) { - unsigned offset = bvec->bv_offset + total; - unsigned len = min(get_max_segment_size(&q->limits, - bvec->bv_page, offset), nbytes); - struct page *page = bvec->bv_page; - - /* - * Unfortunately a fair number of drivers barf on scatterlists - * that have an offset larger than PAGE_SIZE, despite other - * subsystems dealing with that invariant just fine. For now - * stick to the legacy format where we never present those from - * the block layer, but the code below should be removed once - * these offenders (mostly MMC/SD drivers) are fixed. - */ - page += (offset >> PAGE_SHIFT); - offset &= ~PAGE_MASK; - - *sg = blk_next_sg(sg, sglist); - sg_set_page(*sg, page, len, offset); - - total += len; - nbytes -= len; - nsegs++; - } - - return nsegs; -} - -static inline int __blk_bvec_map_sg(struct bio_vec bv, - struct scatterlist *sglist, struct scatterlist **sg) -{ - *sg = blk_next_sg(sg, sglist); - sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset); - return 1; -} - -/* only try to merge bvecs into one sg if they are from two bios */ -static inline bool -__blk_segment_map_sg_merge(struct request_queue *q, struct bio_vec *bvec, - struct bio_vec *bvprv, struct scatterlist **sg) -{ - - int nbytes = bvec->bv_len; - - if (!*sg) - return false; - - if ((*sg)->length + nbytes > queue_max_segment_size(q)) - return false; - - if (!biovec_phys_mergeable(q, bvprv, bvec)) - return false; - - (*sg)->length += nbytes; - - return true; -} - -static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, - struct scatterlist *sglist, - struct scatterlist **sg) -{ - struct bio_vec bvec, bvprv = { NULL }; - struct bvec_iter iter; - int nsegs = 0; - bool new_bio = false; - - for_each_bio(bio) { - bio_for_each_bvec(bvec, bio, iter) { - /* - * Only try to merge bvecs from two bios given we - * have done bio internal merge when adding pages - * to bio - */ - if (new_bio && - __blk_segment_map_sg_merge(q, &bvec, &bvprv, sg)) - goto next_bvec; - - if (bvec.bv_offset + bvec.bv_len <= PAGE_SIZE) - nsegs += __blk_bvec_map_sg(bvec, sglist, sg); - else - nsegs += blk_bvec_map_sg(q, &bvec, sglist, sg); - next_bvec: - new_bio = false; - } - if (likely(bio->bi_iter.bi_size)) { - bvprv = bvec; - new_bio = true; - } - } - - return nsegs; -} - -/* - * map a request to scatterlist, return number of sg entries setup. Caller - * must make sure sg can hold rq->nr_phys_segments entries - */ -int __blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist, struct scatterlist **last_sg) -{ - int nsegs = 0; - - if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) - nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg); - else if (rq->bio) - nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg); - - if (*last_sg) - sg_mark_end(*last_sg); - - /* - * Something must have been wrong if the figured number of - * segment is bigger than number of req's physical segments - */ - WARN_ON(nsegs > blk_rq_nr_phys_segments(rq)); - - return nsegs; -} -EXPORT_SYMBOL(__blk_rq_map_sg); - static inline unsigned int blk_rq_get_max_sectors(struct request *rq, sector_t offset) { struct request_queue *q = rq->q; - unsigned int max_sectors; + struct queue_limits *lim = &q->limits; + unsigned int max_sectors, boundary_sectors; + bool is_atomic = rq->cmd_flags & REQ_ATOMIC; if (blk_rq_is_passthrough(rq)) return q->limits.max_hw_sectors; - max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); - if (!q->limits.chunk_sectors || + boundary_sectors = blk_boundary_sectors(lim, is_atomic); + max_sectors = blk_queue_get_max_sectors(rq); + + if (!boundary_sectors || req_op(rq) == REQ_OP_DISCARD || req_op(rq) == REQ_OP_SECURE_ERASE) return max_sectors; return min(max_sectors, - blk_chunk_sectors_left(offset, q->limits.chunk_sectors)); + blk_boundary_sectors_left(offset, boundary_sectors)); } static inline int ll_new_hw_segment(struct request *req, struct bio *bio, @@ -622,6 +494,9 @@ static inline int ll_new_hw_segment(struct request *req, struct bio *bio, * counters. */ req->nr_phys_segments += nr_phys_segs; + if (bio_integrity(bio)) + req->nr_integrity_segments += blk_rq_count_integrity_sg(req->q, + bio); return 1; no_merge: @@ -714,6 +589,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, /* Merge is OK... */ req->nr_phys_segments = total_phys_segments; + req->nr_integrity_segments += next->nr_integrity_segments; return 1; } @@ -776,9 +652,11 @@ static inline void blk_update_mixed_merge(struct request *req, static void blk_account_io_merge_request(struct request *req) { - if (blk_do_io_stat(req)) { + if (req->rq_flags & RQF_IO_STAT) { part_stat_lock(); part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); + part_stat_local_dec(req->part, + in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } @@ -794,6 +672,18 @@ static enum elv_merge blk_try_req_merge(struct request *req, return ELEVATOR_NO_MERGE; } +static bool blk_atomic_write_mergeable_rq_bio(struct request *rq, + struct bio *bio) +{ + return (rq->cmd_flags & REQ_ATOMIC) == (bio->bi_opf & REQ_ATOMIC); +} + +static bool blk_atomic_write_mergeable_rqs(struct request *rq, + struct request *next) +{ + return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC); +} + /* * For non-mq, this has to be called with the request spinlock acquired. * For mq with scheduling, the appropriate queue wide lock should be held. @@ -807,14 +697,13 @@ static struct request *attempt_merge(struct request_queue *q, if (req_op(req) != req_op(next)) return NULL; - if (rq_data_dir(req) != rq_data_dir(next)) + if (req->bio->bi_write_hint != next->bio->bi_write_hint) return NULL; - - /* Don't merge requests with different write hints. */ - if (req->write_hint != next->write_hint) + if (req->bio->bi_write_stream != next->bio->bi_write_stream) return NULL; - - if (req->ioprio != next->ioprio) + if (req->bio->bi_ioprio != next->bio->bi_ioprio) + return NULL; + if (!blk_atomic_write_mergeable_rqs(req, next)) return NULL; /* @@ -925,27 +814,19 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (req_op(rq) != bio_op(bio)) return false; - /* different data direction or already started, don't merge */ - if (bio_data_dir(bio) != rq_data_dir(rq)) - return false; - - /* don't merge across cgroup boundaries */ if (!blk_cgroup_mergeable(rq, bio)) return false; - - /* only merge integrity protected bio into ditto rq */ if (blk_integrity_merge_bio(rq->q, rq, bio) == false) return false; - - /* Only merge if the crypt contexts are compatible */ if (!bio_crypt_rq_ctx_compatible(rq, bio)) return false; - - /* Don't merge requests with different write hints. */ - if (rq->write_hint != bio->bi_write_hint) + if (rq->bio->bi_write_hint != bio->bi_write_hint) return false; - - if (rq->ioprio != bio_prio(bio)) + if (rq->bio->bi_write_stream != bio->bi_write_stream) + return false; + if (rq->bio->bi_ioprio != bio->bi_ioprio) + return false; + if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false) return false; return true; @@ -964,21 +845,14 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio) static void blk_account_io_merge_bio(struct request *req) { - if (!blk_do_io_stat(req)) - return; - - part_stat_lock(); - part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); - part_stat_unlock(); + if (req->rq_flags & RQF_IO_STAT) { + part_stat_lock(); + part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); + part_stat_unlock(); + } } -enum bio_merge_status { - BIO_MERGE_OK, - BIO_MERGE_NONE, - BIO_MERGE_FAILED, -}; - -static enum bio_merge_status bio_attempt_back_merge(struct request *req, +enum bio_merge_status bio_attempt_back_merge(struct request *req, struct bio *bio, unsigned int nr_segs) { const blk_opf_t ff = bio_failfast(bio); @@ -994,6 +868,9 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req, blk_update_mixed_merge(req, bio, false); + if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING) + blk_zone_write_plug_bio_merged(bio); + req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; @@ -1009,6 +886,14 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req, { const blk_opf_t ff = bio_failfast(bio); + /* + * A front merge for writes to sequential zones of a zoned block device + * can happen only if the user submitted writes out of order. Do not + * merge such write to let it fail. + */ + if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING) + return BIO_MERGE_FAILED; + if (!ll_front_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; @@ -1107,11 +992,10 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { - struct blk_plug *plug; + struct blk_plug *plug = current->plug; struct request *rq; - plug = blk_mq_plug(bio); - if (!plug || rq_list_empty(plug->mq_list)) + if (!plug || rq_list_empty(&plug->mq_list)) return false; rq_list_for_each(&plug->mq_list, rq) { diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 9638b25fd521..444798c5374f 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -11,6 +11,7 @@ #include <linux/smp.h> #include <linux/cpu.h> #include <linux/group_cpus.h> +#include <linux/device/bus.h> #include "blk.h" #include "blk-mq.h" @@ -54,3 +55,38 @@ int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index) return NUMA_NO_NODE; } + +/** + * blk_mq_map_hw_queues - Create CPU to hardware queue mapping + * @qmap: CPU to hardware queue map + * @dev: The device to map queues + * @offset: Queue offset to use for the device + * + * Create a CPU to hardware queue mapping in @qmap. The struct bus_type + * irq_get_affinity callback will be used to retrieve the affinity. + */ +void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap, + struct device *dev, unsigned int offset) + +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + if (!dev->bus->irq_get_affinity) + goto fallback; + + for (queue = 0; queue < qmap->nr_queues; queue++) { + mask = dev->bus->irq_get_affinity(dev, queue + offset); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + qmap->mq_map[cpu] = qmap->queue_offset + queue; + } + + return; + +fallback: + blk_mq_map_queues(qmap); +} +EXPORT_SYMBOL_GPL(blk_mq_map_hw_queues); diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c deleted file mode 100644 index a77b099c34b7..000000000000 --- a/block/blk-mq-debugfs-zoned.c +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2017 Western Digital Corporation or its affiliates. - */ - -#include <linux/blkdev.h> -#include "blk-mq-debugfs.h" - -int queue_zone_wlock_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - unsigned int i; - - if (!q->disk->seq_zones_wlock) - return 0; - - for (i = 0; i < q->disk->nr_zones; i++) - if (test_bit(i, q->disk->seq_zones_wlock)) - seq_printf(m, "%u\n", i); - - return 0; -} diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 94668e72ab09..29b3540dd180 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -5,6 +5,7 @@ #include <linux/kernel.h> #include <linux/blkdev.h> +#include <linux/build_bug.h> #include <linux/debugfs.h> #include "blk.h" @@ -79,33 +80,21 @@ static int queue_pm_only_show(void *data, struct seq_file *m) #define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name static const char *const blk_queue_flag_name[] = { - QUEUE_FLAG_NAME(STOPPED), QUEUE_FLAG_NAME(DYING), QUEUE_FLAG_NAME(NOMERGES), QUEUE_FLAG_NAME(SAME_COMP), QUEUE_FLAG_NAME(FAIL_IO), - QUEUE_FLAG_NAME(NONROT), - QUEUE_FLAG_NAME(IO_STAT), QUEUE_FLAG_NAME(NOXMERGES), - QUEUE_FLAG_NAME(ADD_RANDOM), - QUEUE_FLAG_NAME(SYNCHRONOUS), QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(INIT_DONE), - QUEUE_FLAG_NAME(STABLE_WRITES), - QUEUE_FLAG_NAME(POLL), - QUEUE_FLAG_NAME(WC), - QUEUE_FLAG_NAME(FUA), - QUEUE_FLAG_NAME(DAX), QUEUE_FLAG_NAME(STATS), QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(QUIESCED), - QUEUE_FLAG_NAME(PCI_P2PDMA), - QUEUE_FLAG_NAME(ZONE_RESETALL), QUEUE_FLAG_NAME(RQ_ALLOC_TIME), QUEUE_FLAG_NAME(HCTX_ACTIVE), - QUEUE_FLAG_NAME(NOWAIT), QUEUE_FLAG_NAME(SQ_SCHED), - QUEUE_FLAG_NAME(SKIP_TAGSET_QUIESCE), + QUEUE_FLAG_NAME(DISABLE_WBT_DEF), + QUEUE_FLAG_NAME(NO_ELV_SWITCH), }; #undef QUEUE_FLAG_NAME @@ -113,6 +102,7 @@ static int queue_state_show(void *data, struct seq_file *m) { struct request_queue *q = data; + BUILD_BUG_ON(ARRAY_SIZE(blk_queue_flag_name) != QUEUE_FLAG_MAX); blk_flags_show(m, q->queue_flags, blk_queue_flag_name, ARRAY_SIZE(blk_queue_flag_name)); seq_puts(m, "\n"); @@ -160,7 +150,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, { "pm_only", 0600, queue_pm_only_show, NULL }, { "state", 0600, queue_state_show, queue_state_write }, - { "zone_wlock", 0400, queue_zone_wlock_show, NULL }, + { "zone_wplugs", 0400, queue_zone_wplugs_show, NULL }, { }, }; @@ -177,45 +167,32 @@ static int hctx_state_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; + BUILD_BUG_ON(ARRAY_SIZE(hctx_state_name) != BLK_MQ_S_MAX); blk_flags_show(m, hctx->state, hctx_state_name, ARRAY_SIZE(hctx_state_name)); seq_puts(m, "\n"); return 0; } -#define BLK_TAG_ALLOC_NAME(name) [BLK_TAG_ALLOC_##name] = #name -static const char *const alloc_policy_name[] = { - BLK_TAG_ALLOC_NAME(FIFO), - BLK_TAG_ALLOC_NAME(RR), -}; -#undef BLK_TAG_ALLOC_NAME - #define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name static const char *const hctx_flag_name[] = { - HCTX_FLAG_NAME(SHOULD_MERGE), HCTX_FLAG_NAME(TAG_QUEUE_SHARED), - HCTX_FLAG_NAME(BLOCKING), - HCTX_FLAG_NAME(NO_SCHED), HCTX_FLAG_NAME(STACKING), HCTX_FLAG_NAME(TAG_HCTX_SHARED), + HCTX_FLAG_NAME(BLOCKING), + HCTX_FLAG_NAME(TAG_RR), + HCTX_FLAG_NAME(NO_SCHED_BY_DEFAULT), }; #undef HCTX_FLAG_NAME static int hctx_flags_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; - const int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(hctx->flags); - seq_puts(m, "alloc_policy="); - if (alloc_policy < ARRAY_SIZE(alloc_policy_name) && - alloc_policy_name[alloc_policy]) - seq_puts(m, alloc_policy_name[alloc_policy]); - else - seq_printf(m, "%d", alloc_policy); - seq_puts(m, " "); - blk_flags_show(m, - hctx->flags ^ BLK_ALLOC_POLICY_TO_MQ_FLAG(alloc_policy), - hctx_flag_name, ARRAY_SIZE(hctx_flag_name)); + BUILD_BUG_ON(ARRAY_SIZE(hctx_flag_name) != ilog2(BLK_MQ_F_MAX)); + + blk_flags_show(m, hctx->flags, hctx_flag_name, + ARRAY_SIZE(hctx_flag_name)); seq_puts(m, "\n"); return 0; } @@ -236,12 +213,17 @@ static const char *const cmd_flag_name[] = { CMD_FLAG_NAME(RAHEAD), CMD_FLAG_NAME(BACKGROUND), CMD_FLAG_NAME(NOWAIT), - CMD_FLAG_NAME(NOUNMAP), CMD_FLAG_NAME(POLLED), + CMD_FLAG_NAME(ALLOC_CACHE), + CMD_FLAG_NAME(SWAP), + CMD_FLAG_NAME(DRV), + CMD_FLAG_NAME(FS_PRIVATE), + CMD_FLAG_NAME(ATOMIC), + CMD_FLAG_NAME(NOUNMAP), }; #undef CMD_FLAG_NAME -#define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name +#define RQF_NAME(name) [__RQF_##name] = #name static const char *const rqf_name[] = { RQF_NAME(STARTED), RQF_NAME(FLUSH_SEQ), @@ -256,7 +238,7 @@ static const char *const rqf_name[] = { RQF_NAME(HASHED), RQF_NAME(STATS), RQF_NAME(SPECIAL_PAYLOAD), - RQF_NAME(ZONE_WRITE_LOCKED), + RQF_NAME(ZONE_WRITE_PLUGGING), RQF_NAME(TIMED_OUT), RQF_NAME(RESV), }; @@ -282,6 +264,9 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) const enum req_op op = req_op(rq); const char *op_str = blk_op_str(op); + BUILD_BUG_ON(ARRAY_SIZE(cmd_flag_name) != __REQ_NR_BITS); + BUILD_BUG_ON(ARRAY_SIZE(rqf_name) != __RQF_BITS); + seq_printf(m, "%p {.op=", rq); if (strcmp(op_str, "UNKNOWN") == 0) seq_printf(m, "%u", op); @@ -364,9 +349,14 @@ static int hctx_busy_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct show_busy_params params = { .m = m, .hctx = hctx }; + int res; + res = mutex_lock_interruptible(&hctx->queue->elevator_lock); + if (res) + return res; blk_mq_tagset_busy_iter(hctx->queue->tag_set, hctx_show_busy_rq, ¶ms); + mutex_unlock(&hctx->queue->elevator_lock); return 0; } @@ -417,15 +407,14 @@ static int hctx_tags_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) - goto out; + return res; if (hctx->tags) blk_mq_debugfs_tags_show(m, hctx->tags); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); -out: - return res; + return 0; } static int hctx_tags_bitmap_show(void *data, struct seq_file *m) @@ -434,15 +423,14 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) - goto out; + return res; if (hctx->tags) sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); -out: - return res; + return 0; } static int hctx_sched_tags_show(void *data, struct seq_file *m) @@ -451,15 +439,14 @@ static int hctx_sched_tags_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) - goto out; + return res; if (hctx->sched_tags) blk_mq_debugfs_tags_show(m, hctx->sched_tags); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); -out: - return res; + return 0; } static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) @@ -468,15 +455,14 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) - goto out; + return res; if (hctx->sched_tags) sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); -out: - return res; + return 0; } static int hctx_active_show(void *data, struct seq_file *m) @@ -640,20 +626,9 @@ void blk_mq_debugfs_register(struct request_queue *q) debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); - /* - * blk_mq_init_sched() attempted to do this already, but q->debugfs_dir - * didn't exist yet (because we don't know what to name the directory - * until the queue is registered to a gendisk). - */ - if (q->elevator && !q->sched_debugfs_dir) - blk_mq_debugfs_register_sched(q); - - /* Similarly, blk_mq_init_hctx() couldn't do this previously. */ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->debugfs_dir) blk_mq_debugfs_register_hctx(q, hctx); - if (q->elevator && !hctx->sched_debugfs_dir) - blk_mq_debugfs_register_sched_hctx(q, hctx); } if (q->rq_qos) { diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index 9c7d4b6117d4..c80e453e3014 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -83,10 +83,10 @@ static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) } #endif -#ifdef CONFIG_BLK_DEBUG_FS_ZONED -int queue_zone_wlock_show(void *data, struct seq_file *m); +#if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS) +int queue_zone_wplugs_show(void *data, struct seq_file *m); #else -static inline int queue_zone_wlock_show(void *data, struct seq_file *m) +static inline int queue_zone_wplugs_show(void *data, struct seq_file *m) { return 0; } diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c new file mode 100644 index 000000000000..82bae475dfa4 --- /dev/null +++ b/block/blk-mq-dma.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Christoph Hellwig + */ +#include "blk.h" + +struct phys_vec { + phys_addr_t paddr; + u32 len; +}; + +static bool blk_map_iter_next(struct request *req, struct req_iterator *iter, + struct phys_vec *vec) +{ + unsigned int max_size; + struct bio_vec bv; + + if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { + if (!iter->bio) + return false; + vec->paddr = bvec_phys(&req->special_vec); + vec->len = req->special_vec.bv_len; + iter->bio = NULL; + return true; + } + + if (!iter->iter.bi_size) + return false; + + bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + vec->paddr = bvec_phys(&bv); + max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX); + bv.bv_len = min(bv.bv_len, max_size); + bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len); + + /* + * If we are entirely done with this bi_io_vec entry, check if the next + * one could be merged into it. This typically happens when moving to + * the next bio, but some callers also don't pack bvecs tight. + */ + while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) { + struct bio_vec next; + + if (!iter->iter.bi_size) { + if (!iter->bio->bi_next) + break; + iter->bio = iter->bio->bi_next; + iter->iter = iter->bio->bi_iter; + } + + next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + if (bv.bv_len + next.bv_len > max_size || + !biovec_phys_mergeable(req->q, &bv, &next)) + break; + + bv.bv_len += next.bv_len; + bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len); + } + + vec->len = bv.bv_len; + return true; +} + +static inline struct scatterlist * +blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) +{ + if (!*sg) + return sglist; + + /* + * If the driver previously mapped a shorter list, we could see a + * termination bit prematurely unless it fully inits the sg table + * on each mapping. We KNOW that there must be more entries here + * or the driver would be buggy, so force clear the termination bit + * to avoid doing a full sg_init_table() in drivers for each command. + */ + sg_unmark_end(*sg); + return sg_next(*sg); +} + +/* + * Map a request to scatterlist, return number of sg entries setup. Caller + * must make sure sg can hold rq->nr_phys_segments entries. + */ +int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, + struct scatterlist **last_sg) +{ + struct req_iterator iter = { + .bio = rq->bio, + }; + struct phys_vec vec; + int nsegs = 0; + + /* the internal flush request may not have bio attached */ + if (iter.bio) + iter.iter = iter.bio->bi_iter; + + while (blk_map_iter_next(rq, &iter, &vec)) { + *last_sg = blk_next_sg(last_sg, sglist); + sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, + offset_in_page(vec.paddr)); + nsegs++; + } + + if (*last_sg) + sg_mark_end(*last_sg); + + /* + * Something must have been wrong if the figured number of + * segment is bigger than number of req's physical segments + */ + WARN_ON(nsegs > blk_rq_nr_phys_segments(rq)); + + return nsegs; +} +EXPORT_SYMBOL(__blk_rq_map_sg); diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c deleted file mode 100644 index d47b5c73c9eb..000000000000 --- a/block/blk-mq-pci.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2016 Christoph Hellwig. - */ -#include <linux/kobject.h> -#include <linux/blkdev.h> -#include <linux/blk-mq-pci.h> -#include <linux/pci.h> -#include <linux/module.h> - -#include "blk-mq.h" - -/** - * blk_mq_pci_map_queues - provide a default queue mapping for PCI device - * @qmap: CPU to hardware queue map. - * @pdev: PCI device associated with @set. - * @offset: Offset to use for the pci irq vector - * - * This function assumes the PCI device @pdev has at least as many available - * interrupt vectors as @set has queues. It will then query the vector - * corresponding to each queue for it's affinity mask and built queue mapping - * that maps a queue to the CPUs that have irq affinity for the corresponding - * vector. - */ -void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, - int offset) -{ - const struct cpumask *mask; - unsigned int queue, cpu; - - for (queue = 0; queue < qmap->nr_queues; queue++) { - mask = pci_irq_get_affinity(pdev, queue + offset); - if (!mask) - goto fallback; - - for_each_cpu(cpu, mask) - qmap->mq_map[cpu] = qmap->queue_offset + queue; - } - - return; - -fallback: - WARN_ON_ONCE(qmap->nr_queues > 1); - blk_mq_clear_mq_map(qmap); -} -EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 451a2c1f1f32..55a0fd105147 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -59,19 +59,17 @@ static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list) list_first_entry(rq_list, struct request, queuelist)->mq_hctx; struct request *rq; LIST_HEAD(hctx_list); - unsigned int count = 0; list_for_each_entry(rq, rq_list, queuelist) { if (rq->mq_hctx != hctx) { list_cut_before(&hctx_list, rq_list, &rq->queuelist); goto dispatch; } - count++; } list_splice_tail_init(rq_list, &hctx_list); dispatch: - return blk_mq_dispatch_rq_list(hctx, &hctx_list, count); + return blk_mq_dispatch_rq_list(hctx, &hctx_list, false); } #define BLK_MQ_BUDGET_DELAY 3 /* ms units */ @@ -167,7 +165,7 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) dispatched |= blk_mq_dispatch_hctx_list(&rq_list); } while (!list_empty(&rq_list)); } else { - dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count); + dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, false); } if (busy) @@ -261,7 +259,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) /* round robin for fair dispatch */ ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); - } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1)); + } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, false)); WRITE_ONCE(hctx->dispatch_from, ctx); return ret; @@ -298,7 +296,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) + if (!blk_mq_dispatch_rq_list(hctx, &rq_list, true)) return 0; need_dispatch = true; } else { @@ -312,7 +310,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) if (need_dispatch) return blk_mq_do_dispatch_ctx(hctx); blk_mq_flush_busy_ctxs(hctx, &rq_list); - blk_mq_dispatch_rq_list(hctx, &rq_list, 0); + blk_mq_dispatch_rq_list(hctx, &rq_list, true); return 0; } @@ -349,10 +347,9 @@ bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, } ctx = blk_mq_get_ctx(q); - hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); + hctx = blk_mq_map_queue(bio->bi_opf, ctx); type = hctx->type; - if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) || - list_empty_careful(&ctx->rq_lists[type])) + if (list_empty_careful(&ctx->rq_lists[type])) goto out_put; /* default per sw-queue merge */ @@ -437,6 +434,30 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue) return 0; } +void blk_mq_sched_reg_debugfs(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned long i; + + mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_register_sched(q); + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_debugfs_register_sched_hctx(q, hctx); + mutex_unlock(&q->debugfs_mutex); +} + +void blk_mq_sched_unreg_debugfs(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned long i; + + mutex_lock(&q->debugfs_mutex); + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_debugfs_unregister_sched_hctx(hctx); + blk_mq_debugfs_unregister_sched(q); + mutex_unlock(&q->debugfs_mutex); +} + /* caller must have a reference to @e, will grab another one if successful */ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { @@ -470,10 +491,6 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) if (ret) goto err_free_map_and_rqs; - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_register_sched(q); - mutex_unlock(&q->debugfs_mutex); - queue_for_each_hw_ctx(q, hctx, i) { if (e->ops.init_hctx) { ret = e->ops.init_hctx(hctx, i); @@ -485,11 +502,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return ret; } } - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_register_sched_hctx(q, hctx); - mutex_unlock(&q->debugfs_mutex); } - return 0; err_free_map_and_rqs: @@ -528,10 +541,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) unsigned int flags = 0; queue_for_each_hw_ctx(q, hctx, i) { - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_unregister_sched_hctx(hctx); - mutex_unlock(&q->debugfs_mutex); - if (e->type->ops.exit_hctx && hctx->sched_data) { e->type->ops.exit_hctx(hctx, i); hctx->sched_data = NULL; @@ -539,12 +548,9 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) flags = hctx->flags; } - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_unregister_sched(q); - mutex_unlock(&q->debugfs_mutex); - if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q, flags); + set_bit(ELEVATOR_FLAG_DYING, &q->elevator->flags); q->elevator = NULL; } diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 156e9bb07abf..24656980f443 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -61,9 +61,9 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, if (!entry->show) return -EIO; - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); res = entry->show(hctx, page); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); return res; } @@ -223,30 +223,27 @@ int blk_mq_sysfs_register(struct gendisk *disk) unsigned long i, j; int ret; - lockdep_assert_held(&q->sysfs_dir_lock); - ret = kobject_add(q->mq_kobj, &disk_to_dev(disk)->kobj, "mq"); if (ret < 0) - goto out; + return ret; kobject_uevent(q->mq_kobj, KOBJ_ADD); + mutex_lock(&q->tag_set->tag_list_lock); queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) - goto unreg; + goto out_unreg; } + mutex_unlock(&q->tag_set->tag_list_lock); + return 0; - q->mq_sysfs_init_done = true; - -out: - return ret; - -unreg: +out_unreg: queue_for_each_hw_ctx(q, hctx, j) { if (j < i) blk_mq_unregister_hctx(hctx); } + mutex_unlock(&q->tag_set->tag_list_lock); kobject_uevent(q->mq_kobj, KOBJ_REMOVE); kobject_del(q->mq_kobj); @@ -259,15 +256,13 @@ void blk_mq_sysfs_unregister(struct gendisk *disk) struct blk_mq_hw_ctx *hctx; unsigned long i; - lockdep_assert_held(&q->sysfs_dir_lock); - + mutex_lock(&q->tag_set->tag_list_lock); queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); + mutex_unlock(&q->tag_set->tag_list_lock); kobject_uevent(q->mq_kobj, KOBJ_REMOVE); kobject_del(q->mq_kobj); - - q->mq_sysfs_init_done = false; } void blk_mq_sysfs_unregister_hctxs(struct request_queue *q) @@ -275,15 +270,11 @@ void blk_mq_sysfs_unregister_hctxs(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned long i; - mutex_lock(&q->sysfs_dir_lock); - if (!q->mq_sysfs_init_done) - goto unlock; + if (!blk_queue_registered(q)) + return; queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); - -unlock: - mutex_unlock(&q->sysfs_dir_lock); } int blk_mq_sysfs_register_hctxs(struct request_queue *q) @@ -292,9 +283,8 @@ int blk_mq_sysfs_register_hctxs(struct request_queue *q) unsigned long i; int ret = 0; - mutex_lock(&q->sysfs_dir_lock); - if (!q->mq_sysfs_init_done) - goto unlock; + if (!blk_queue_registered(q)) + goto out; queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); @@ -302,8 +292,6 @@ int blk_mq_sysfs_register_hctxs(struct request_queue *q) break; } -unlock: - mutex_unlock(&q->sysfs_dir_lock); - +out: return ret; } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index cc57e2dd9a0b..d880c50629d6 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -38,6 +38,7 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { unsigned int users; + unsigned long flags; struct blk_mq_tags *tags = hctx->tags; /* @@ -56,11 +57,11 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) return; } - spin_lock_irq(&tags->lock); + spin_lock_irqsave(&tags->lock, flags); users = tags->active_queues + 1; WRITE_ONCE(tags->active_queues, users); blk_mq_update_wake_batch(tags, users); - spin_unlock_irq(&tags->lock); + spin_unlock_irqrestore(&tags->lock, flags); } /* @@ -189,8 +190,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) sbitmap_finish_wait(bt, ws, &wait); data->ctx = blk_mq_get_ctx(data->q); - data->hctx = blk_mq_map_queue(data->q, data->cmd_flags, - data->ctx); + data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx); tags = blk_mq_tags_from_data(data); if (data->flags & BLK_MQ_REQ_RESERVED) bt = &tags->breserved_tags; @@ -543,30 +543,11 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, node); } -int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, - struct sbitmap_queue *breserved_tags, - unsigned int queue_depth, unsigned int reserved, - int node, int alloc_policy) -{ - unsigned int depth = queue_depth - reserved; - bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - - if (bt_alloc(bitmap_tags, depth, round_robin, node)) - return -ENOMEM; - if (bt_alloc(breserved_tags, reserved, round_robin, node)) - goto free_bitmap_tags; - - return 0; - -free_bitmap_tags: - sbitmap_queue_free(bitmap_tags); - return -ENOMEM; -} - struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, - unsigned int reserved_tags, - int node, int alloc_policy) + unsigned int reserved_tags, unsigned int flags, int node) { + unsigned int depth = total_tags - reserved_tags; + bool round_robin = flags & BLK_MQ_F_TAG_RR; struct blk_mq_tags *tags; if (total_tags > BLK_MQ_TAG_MAX) { @@ -581,14 +562,18 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock); + if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) + goto out_free_tags; + if (bt_alloc(&tags->breserved_tags, reserved_tags, round_robin, node)) + goto out_free_bitmap_tags; - if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags, - total_tags, reserved_tags, node, - alloc_policy) < 0) { - kfree(tags); - return NULL; - } return tags; + +out_free_bitmap_tags: + sbitmap_queue_free(&tags->bitmap_tags); +out_free_tags: + kfree(tags); + return NULL; } void blk_mq_free_tags(struct blk_mq_tags *tags) diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c deleted file mode 100644 index 68d0945c0b08..000000000000 --- a/block/blk-mq-virtio.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2016 Christoph Hellwig. - */ -#include <linux/device.h> -#include <linux/blk-mq-virtio.h> -#include <linux/virtio_config.h> -#include <linux/module.h> -#include "blk-mq.h" - -/** - * blk_mq_virtio_map_queues - provide a default queue mapping for virtio device - * @qmap: CPU to hardware queue map. - * @vdev: virtio device to provide a mapping for. - * @first_vec: first interrupt vectors to use for queues (usually 0) - * - * This function assumes the virtio device @vdev has at least as many available - * interrupt vectors as @set has queues. It will then query the vector - * corresponding to each queue for it's affinity mask and built queue mapping - * that maps a queue to the CPUs that have irq affinity for the corresponding - * vector. - */ -void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, - struct virtio_device *vdev, int first_vec) -{ - const struct cpumask *mask; - unsigned int queue, cpu; - - if (!vdev->config->get_vq_affinity) - goto fallback; - - for (queue = 0; queue < qmap->nr_queues; queue++) { - mask = vdev->config->get_vq_affinity(vdev, first_vec + queue); - if (!mask) - goto fallback; - - for_each_cpu(cpu, mask) - qmap->mq_map[cpu] = qmap->queue_offset + queue; - } - - return; - -fallback: - blk_mq_map_queues(qmap); -} -EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues); diff --git a/block/blk-mq.c b/block/blk-mq.c index 32afb87efbd0..4806b867e37d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -28,6 +28,7 @@ #include <linux/prefetch.h> #include <linux/blk-crypto.h> #include <linux/part_stat.h> +#include <linux/sched/isolation.h> #include <trace/events/block.h> @@ -42,6 +43,7 @@ static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); +static DEFINE_MUTEX(blk_mq_cpuhp_lock); static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); static void blk_mq_request_bypass_insert(struct request *rq, @@ -87,41 +89,83 @@ struct mq_inflight { unsigned int inflight[2]; }; -static bool blk_mq_check_inflight(struct request *rq, void *priv) +static bool blk_mq_check_in_driver(struct request *rq, void *priv) { struct mq_inflight *mi = priv; - if (rq->part && blk_do_io_stat(rq) && - (!mi->part->bd_partno || rq->part == mi->part) && + if (rq->rq_flags & RQF_IO_STAT && + (!bdev_is_partition(mi->part) || rq->part == mi->part) && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) mi->inflight[rq_data_dir(rq)]++; return true; } -unsigned int blk_mq_in_flight(struct request_queue *q, - struct block_device *part) +void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]) { struct mq_inflight mi = { .part = part }; - blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); + blk_mq_queue_tag_busy_iter(bdev_get_queue(part), blk_mq_check_in_driver, + &mi); + inflight[READ] = mi.inflight[READ]; + inflight[WRITE] = mi.inflight[WRITE]; +} + +#ifdef CONFIG_LOCKDEP +static bool blk_freeze_set_owner(struct request_queue *q, + struct task_struct *owner) +{ + if (!owner) + return false; - return mi.inflight[0] + mi.inflight[1]; + if (!q->mq_freeze_depth) { + q->mq_freeze_owner = owner; + q->mq_freeze_owner_depth = 1; + q->mq_freeze_disk_dead = !q->disk || + test_bit(GD_DEAD, &q->disk->state) || + !blk_queue_registered(q); + q->mq_freeze_queue_dying = blk_queue_dying(q); + return true; + } + + if (owner == q->mq_freeze_owner) + q->mq_freeze_owner_depth += 1; + return false; } -void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, - unsigned int inflight[2]) +/* verify the last unfreeze in owner context */ +static bool blk_unfreeze_check_owner(struct request_queue *q) { - struct mq_inflight mi = { .part = part }; + if (q->mq_freeze_owner != current) + return false; + if (--q->mq_freeze_owner_depth == 0) { + q->mq_freeze_owner = NULL; + return true; + } + return false; +} - blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); - inflight[0] = mi.inflight[0]; - inflight[1] = mi.inflight[1]; +#else + +static bool blk_freeze_set_owner(struct request_queue *q, + struct task_struct *owner) +{ + return false; } -void blk_freeze_queue_start(struct request_queue *q) +static bool blk_unfreeze_check_owner(struct request_queue *q) { + return false; +} +#endif + +bool __blk_freeze_queue_start(struct request_queue *q, + struct task_struct *owner) +{ + bool freeze; + mutex_lock(&q->mq_freeze_lock); + freeze = blk_freeze_set_owner(q, owner); if (++q->mq_freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); mutex_unlock(&q->mq_freeze_lock); @@ -130,6 +174,14 @@ void blk_freeze_queue_start(struct request_queue *q) } else { mutex_unlock(&q->mq_freeze_lock); } + + return freeze; +} + +void blk_freeze_queue_start(struct request_queue *q) +{ + if (__blk_freeze_queue_start(q, current)) + blk_freeze_acquire_lock(q); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); @@ -148,35 +200,17 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); -/* - * Guarantee no request is in use, so we can change any data structure of - * the queue afterward. - */ -void blk_freeze_queue(struct request_queue *q) +void blk_mq_freeze_queue_nomemsave(struct request_queue *q) { - /* - * In the !blk_mq case we are only calling this to kill the - * q_usage_counter, otherwise this increases the freeze depth - * and waits for it to return to zero. For this reason there is - * no blk_unfreeze_queue(), and blk_freeze_queue() is not - * exported to drivers as the only user for unfreeze is blk_mq. - */ blk_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_nomemsave); -void blk_mq_freeze_queue(struct request_queue *q) +bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) { - /* - * ...just an alias to keep freeze and unfreeze actions balanced - * in the blk_mq_* namespace - */ - blk_freeze_queue(q); -} -EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); + bool unfreeze; -void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) -{ mutex_lock(&q->mq_freeze_lock); if (force_atomic) q->q_usage_counter.data->force_atomic = true; @@ -186,14 +220,38 @@ void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) percpu_ref_resurrect(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); } + unfreeze = blk_unfreeze_check_owner(q); mutex_unlock(&q->mq_freeze_lock); + + return unfreeze; } -void blk_mq_unfreeze_queue(struct request_queue *q) +void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q) +{ + if (__blk_mq_unfreeze_queue(q, false)) + blk_unfreeze_release_lock(q); +} +EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_nomemrestore); + +/* + * non_owner variant of blk_freeze_queue_start + * + * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen + * by the same task. This is fragile and should not be used if at all + * possible. + */ +void blk_freeze_queue_start_non_owner(struct request_queue *q) +{ + __blk_freeze_queue_start(q, NULL); +} +EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner); + +/* non_owner variant of blk_mq_unfreeze_queue */ +void blk_mq_unfreeze_queue_non_owner(struct request_queue *q) { __blk_mq_unfreeze_queue(q, false); } -EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); +EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner); /* * FIXME: replace the scsi_internal_device_*block_nowait() calls in the @@ -282,8 +340,9 @@ void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) if (!blk_queue_skip_tagset_quiesce(q)) blk_mq_quiesce_queue_nowait(q); } - blk_mq_wait_quiesce_done(set); mutex_unlock(&set->tag_list_lock); + + blk_mq_wait_quiesce_done(set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); @@ -322,7 +381,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = BLK_MQ_NO_TAG; rq->start_time_ns = blk_time_get_ns(); - rq->part = NULL; blk_crypto_rq_set_defaults(rq); } EXPORT_SYMBOL(blk_rq_init); @@ -330,14 +388,9 @@ EXPORT_SYMBOL(blk_rq_init); /* Set start and alloc time when the allocated request is actually used */ static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) { - if (blk_mq_need_time_stamp(rq)) - rq->start_time_ns = blk_time_get_ns(); - else - rq->start_time_ns = 0; - #ifdef CONFIG_BLK_RQ_ALLOC_TIME if (blk_queue_rq_alloc_time(rq->q)) - rq->alloc_time_ns = alloc_time_ns ?: rq->start_time_ns; + rq->alloc_time_ns = alloc_time_ns; else rq->alloc_time_ns = 0; #endif @@ -358,8 +411,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, if (data->flags & BLK_MQ_REQ_PM) data->rq_flags |= RQF_PM; - if (blk_queue_io_stat(q)) - data->rq_flags |= RQF_IO_STAT; rq->rq_flags = data->rq_flags; if (data->rq_flags & RQF_SCHED_TAGS) { @@ -375,9 +426,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; -#if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; -#endif rq->end_io = NULL; rq->end_io_data = NULL; @@ -421,7 +470,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) prefetch(tags->static_rqs[tag]); tag_mask &= ~(1UL << i); rq = blk_mq_rq_ctx_init(data, tags, tag); - rq_list_add(data->cached_rq, rq); + rq_list_add_head(data->cached_rqs, rq); nr++; } if (!(data->rq_flags & RQF_SCHED_TAGS)) @@ -430,7 +479,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); data->nr_tags -= nr; - return rq_list_pop(data->cached_rq); + return rq_list_pop(data->cached_rqs); } static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) @@ -447,6 +496,10 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; +retry: + data->ctx = blk_mq_get_ctx(q); + data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx); + if (q->elevator) { /* * All requests use scheduler tags when an I/O scheduler is @@ -468,13 +521,9 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) if (ops->limit_depth) ops->limit_depth(data->cmd_flags, data); } - } - -retry: - data->ctx = blk_mq_get_ctx(q); - data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); - if (!(data->rq_flags & RQF_SCHED_TAGS)) + } else { blk_mq_tag_busy(data->hctx); + } if (data->flags & BLK_MQ_REQ_RESERVED) data->rq_flags |= RQF_RESV; @@ -525,9 +574,13 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q, struct blk_mq_alloc_data data = { .q = q, .flags = flags, + .shallow_depth = 0, .cmd_flags = opf, + .rq_flags = 0, .nr_tags = plug->nr_ios, - .cached_rq = &plug->cached_rq, + .cached_rqs = &plug->cached_rqs, + .ctx = NULL, + .hctx = NULL }; struct request *rq; @@ -552,14 +605,14 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q, if (!plug) return NULL; - if (rq_list_empty(plug->cached_rq)) { + if (rq_list_empty(&plug->cached_rqs)) { if (plug->nr_ios == 1) return NULL; rq = blk_mq_rq_cache_fill(q, plug, opf, flags); if (!rq) return NULL; } else { - rq = rq_list_peek(&plug->cached_rq); + rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; @@ -568,8 +621,8 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q, if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) return NULL; - plug->cached_rq = rq_list_next(rq); - blk_mq_rq_time_init(rq, 0); + rq_list_pop(&plug->cached_rqs); + blk_mq_rq_time_init(rq, blk_time_get_ns()); } rq->cmd_flags = opf; @@ -587,8 +640,13 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, struct blk_mq_alloc_data data = { .q = q, .flags = flags, + .shallow_depth = 0, .cmd_flags = opf, + .rq_flags = 0, .nr_tags = 1, + .cached_rqs = NULL, + .ctx = NULL, + .hctx = NULL }; int ret; @@ -616,8 +674,13 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, struct blk_mq_alloc_data data = { .q = q, .flags = flags, + .shallow_depth = 0, .cmd_flags = opf, + .rq_flags = 0, .nr_tags = 1, + .cached_rqs = NULL, + .ctx = NULL, + .hctx = NULL }; u64 alloc_time_ns = 0; struct request *rq; @@ -690,6 +753,8 @@ static void blk_mq_finish_request(struct request *rq) { struct request_queue *q = rq->q; + blk_zone_finish_request(rq); + if (rq->rq_flags & RQF_USE_SCHED) { q->elevator->type->ops.finish_request(rq); /* @@ -743,7 +808,7 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug) { struct request *rq; - while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) + while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL) blk_mq_free_request(rq); } @@ -761,34 +826,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg) } EXPORT_SYMBOL(blk_dump_rq_flags); -static void req_bio_endio(struct request *rq, struct bio *bio, - unsigned int nbytes, blk_status_t error) -{ - if (unlikely(error)) { - bio->bi_status = error; - } else if (req_op(rq) == REQ_OP_ZONE_APPEND) { - /* - * Partial zone append completions cannot be supported as the - * BIO fragments may end up not being written sequentially. - */ - if (bio->bi_iter.bi_size != nbytes) - bio->bi_status = BLK_STS_IOERR; - else - bio->bi_iter.bi_sector = rq->__sector; - } - - bio_advance(bio, nbytes); - - if (unlikely(rq->rq_flags & RQF_QUIET)) - bio_set_flag(bio, BIO_QUIET); - /* don't actually finish bio if it's part of flush sequence */ - if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) - bio_endio(bio); -} - static void blk_account_io_completion(struct request *req, unsigned int bytes) { - if (req->part && blk_do_io_stat(req)) { + if (req->rq_flags & RQF_IO_STAT) { const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); @@ -808,7 +848,7 @@ static void blk_print_req_error(struct request *req, blk_status_t status) blk_op_str(req_op(req)), (__force u32)(req->cmd_flags & ~REQ_OP_MASK), req->nr_phys_segments, - IOPRIO_PRIO_CLASS(req->ioprio)); + IOPRIO_PRIO_CLASS(req_get_ioprio(req))); } /* @@ -826,10 +866,8 @@ static void blk_complete_request(struct request *req) if (!bio) return; -#ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ) - req->q->integrity.profile->complete_fn(req, total_bytes); -#endif + blk_integrity_complete(req, total_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last @@ -845,8 +883,7 @@ static void blk_complete_request(struct request *req) /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); - if (req_op(req) == REQ_OP_ZONE_APPEND) - bio->bi_iter.bi_sector = req->__sector; + blk_zone_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); @@ -889,6 +926,8 @@ static void blk_complete_request(struct request *req) bool blk_update_request(struct request *req, blk_status_t error, unsigned int nr_bytes) { + bool is_flush = req->rq_flags & RQF_FLUSH_SEQ; + bool quiet = req->rq_flags & RQF_QUIET; int total_bytes; trace_block_rq_complete(req, error, nr_bytes); @@ -896,11 +935,9 @@ bool blk_update_request(struct request *req, blk_status_t error, if (!req->bio) return false; -#ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && error == BLK_STS_OK) - req->q->integrity.profile->complete_fn(req, nr_bytes); -#endif + blk_integrity_complete(req, nr_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last @@ -909,9 +946,8 @@ bool blk_update_request(struct request *req, blk_status_t error, if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req)) __blk_crypto_rq_put_keyslot(req); - if (unlikely(error && !blk_rq_is_passthrough(req) && - !(req->rq_flags & RQF_QUIET)) && - !test_bit(GD_DEAD, &req->q->disk->state)) { + if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) && + !test_bit(GD_DEAD, &req->q->disk->state)) { blk_print_req_error(req, error); trace_block_rq_error(req, error, nr_bytes); } @@ -923,12 +959,33 @@ bool blk_update_request(struct request *req, blk_status_t error, struct bio *bio = req->bio; unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); - if (bio_bytes == bio->bi_iter.bi_size) + if (unlikely(error)) + bio->bi_status = error; + + if (bio_bytes == bio->bi_iter.bi_size) { req->bio = bio->bi_next; + } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) { + /* + * Partial zone append completions cannot be supported + * as the BIO fragments may end up not being written + * sequentially. + */ + bio->bi_status = BLK_STS_IOERR; + } /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); - req_bio_endio(req, bio, bio_bytes, error); + if (unlikely(quiet)) + bio_set_flag(bio, BIO_QUIET); + + bio_advance(bio, bio_bytes); + + /* Don't actually finish bio if it's part of flush sequence */ + if (!bio->bi_iter.bi_size) { + blk_zone_update_request_bio(req, bio); + if (!is_flush) + bio_endio(bio); + } total_bytes += bio_bytes; nr_bytes -= bio_bytes; @@ -989,38 +1046,76 @@ static inline void blk_account_io_done(struct request *req, u64 now) * normal IO on queueing nor completion. Accounting the * containing request is enough. */ - if (blk_do_io_stat(req) && req->part && - !(req->rq_flags & RQF_FLUSH_SEQ)) { + if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) { const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); update_io_ticks(req->part, jiffies, true); part_stat_inc(req->part, ios[sgrp]); part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); + part_stat_local_dec(req->part, + in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } +static inline bool blk_rq_passthrough_stats(struct request *req) +{ + struct bio *bio = req->bio; + + if (!blk_queue_passthrough_stat(req->q)) + return false; + + /* Requests without a bio do not transfer data. */ + if (!bio) + return false; + + /* + * Stats are accumulated in the bdev, so must have one attached to a + * bio to track stats. Most drivers do not set the bdev for passthrough + * requests, but nvme is one that will set it. + */ + if (!bio->bi_bdev) + return false; + + /* + * We don't know what a passthrough command does, but we know the + * payload size and data direction. Ensuring the size is aligned to the + * block size filters out most commands with payloads that don't + * represent sector access. + */ + if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1)) + return false; + return true; +} + static inline void blk_account_io_start(struct request *req) { trace_block_io_start(req); - if (blk_do_io_stat(req)) { - /* - * All non-passthrough requests are created from a bio with one - * exception: when a flush command that is part of a flush sequence - * generated by the state machine in blk-flush.c is cloned onto the - * lower device by dm-multipath we can get here without a bio. - */ - if (req->bio) - req->part = req->bio->bi_bdev; - else - req->part = req->q->disk->part0; + if (!blk_queue_io_stat(req->q)) + return; + if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req)) + return; - part_stat_lock(); - update_io_ticks(req->part, jiffies, false); - part_stat_unlock(); - } + req->rq_flags |= RQF_IO_STAT; + req->start_time_ns = blk_time_get_ns(); + + /* + * All non-passthrough requests are created from a bio with one + * exception: when a flush command that is part of a flush sequence + * generated by the state machine in blk-flush.c is cloned onto the + * lower device by dm-multipath we can get here without a bio. + */ + if (req->bio) + req->part = req->bio->bi_bdev; + else + req->part = req->q->disk->part0; + + part_stat_lock(); + update_io_ticks(req->part, jiffies, false); + part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]); + part_stat_unlock(); } static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) @@ -1129,7 +1224,7 @@ static void blk_complete_reqs(struct llist_head *list) rq->q->mq_ops->complete(rq); } -static __latent_entropy void blk_done_softirq(struct softirq_action *h) +static __latent_entropy void blk_done_softirq(void) { blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); } @@ -1261,10 +1356,9 @@ void blk_mq_start_request(struct request *rq) WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); rq->mq_hctx->tags->rqs[rq->tag] = rq; -#ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) - q->integrity.profile->prepare_fn(rq); -#endif + blk_integrity_prepare(rq); + if (rq->bio && rq->bio->bi_opf & REQ_POLLED) WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num); } @@ -1304,8 +1398,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) */ if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) plug->has_elevator = true; - rq->rq_next = NULL; - rq_list_add(&plug->mq_list, rq); + rq_list_add_tail(&plug->mq_list, rq); plug->rq_count++; } @@ -1330,11 +1423,6 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head) blk_account_io_start(rq); - /* - * As plugging can be enabled for passthrough requests on a zoned - * device, directly accessing the plug instead of using blk_mq_plug() - * should not have any consequences. - */ if (current->plug && !at_head) { blk_add_rq_to_plug(current->plug, rq); return; @@ -1462,19 +1550,17 @@ static void blk_mq_requeue_work(struct work_struct *work) while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); + list_del_init(&rq->queuelist); /* - * If RQF_DONTPREP ist set, the request has been started by the + * If RQF_DONTPREP is set, the request has been started by the * driver already and might have driver-specific data allocated * already. Insert it into the hctx dispatch list to avoid * block layer merges for the request. */ - if (rq->rq_flags & RQF_DONTPREP) { - list_del_init(&rq->queuelist); + if (rq->rq_flags & RQF_DONTPREP) blk_mq_request_bypass_insert(rq, 0); - } else { - list_del_init(&rq->queuelist); + else blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD); - } } while (!list_empty(&flush_list)) { @@ -1707,7 +1793,6 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); } -EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); struct dispatch_rq_data { struct blk_mq_hw_ctx *hctx; @@ -1921,19 +2006,6 @@ static void blk_mq_handle_dev_resource(struct request *rq, __blk_mq_requeue_request(rq); } -static void blk_mq_handle_zone_resource(struct request *rq, - struct list_head *zone_list) -{ - /* - * If we end up here it is because we cannot dispatch a request to a - * specific zone due to LLD level zone-write locking or other zone - * related resource not being available. In this case, set the request - * aside in zone_list for retrying it later. - */ - list_add(&rq->queuelist, zone_list); - __blk_mq_requeue_request(rq); -} - enum prep_dispatch { PREP_DISPATCH_OK, PREP_DISPATCH_NO_TAG, @@ -2012,14 +2084,13 @@ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued, * Returns true if we did some work AND can potentially do more. */ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, - unsigned int nr_budgets) + bool get_budget) { enum prep_dispatch prep; struct request_queue *q = hctx->queue; struct request *rq; int queued; blk_status_t ret = BLK_STS_OK; - LIST_HEAD(zone_list); bool needs_resource = false; if (list_empty(list)) @@ -2035,7 +2106,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); WARN_ON_ONCE(hctx != rq->mq_hctx); - prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets); + prep = blk_mq_prep_dispatch_rq(rq, get_budget); if (prep != PREP_DISPATCH_OK) break; @@ -2044,12 +2115,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, bd.rq = rq; bd.last = list_empty(list); - /* - * once the request is queued to lld, no need to cover the - * budget any more - */ - if (nr_budgets) - nr_budgets--; ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: @@ -2061,23 +2126,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, case BLK_STS_DEV_RESOURCE: blk_mq_handle_dev_resource(rq, list); goto out; - case BLK_STS_ZONE_RESOURCE: - /* - * Move the request to zone_list and keep going through - * the dispatch list to find more requests the drive can - * accept. - */ - blk_mq_handle_zone_resource(rq, &zone_list); - needs_resource = true; - break; default: blk_mq_end_request(rq, ret); } } while (!list_empty(list)); out: - if (!list_empty(&zone_list)) - list_splice_tail_init(&zone_list, list); - /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie. */ @@ -2095,7 +2148,11 @@ out: ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) || blk_mq_is_shared_tags(hctx->flags)); - if (nr_budgets) + /* + * If the caller allocated budgets, free the budgets of the + * requests that have not yet been passed to the block driver. + */ + if (!get_budget) blk_mq_release_budgets(q, list); spin_lock(&hctx->lock); @@ -2164,6 +2221,15 @@ static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) } /* + * ->next_cpu is always calculated from hctx->cpumask, so simply use + * it for speeding up the check + */ +static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx) +{ + return hctx->next_cpu >= nr_cpu_ids; +} + +/* * It'd be great if the workqueue API had a way to pass * in a mask and had some smarts for more clever placement. * For now we just round-robin here, switching for every @@ -2174,7 +2240,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) bool tried = false; int next_cpu = hctx->next_cpu; - if (hctx->queue->nr_hw_queues == 1) + /* Switch to unbound if no allowable CPUs in this hctx */ + if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx)) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { @@ -2225,6 +2292,24 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); +static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx) +{ + bool need_run; + + /* + * When queue is quiesced, we may be switching io scheduler, or + * updating nr_hw_queues, or other things, and we can't run queue + * any more, even blk_mq_hctx_has_pending() can't be called safely. + * + * And queue will be rerun in blk_mq_unquiesce_queue() if it is + * quiesced. + */ + __blk_mq_run_dispatch_ops(hctx->queue, false, + need_run = !blk_queue_quiesced(hctx->queue) && + blk_mq_hctx_has_pending(hctx)); + return need_run; +} + /** * blk_mq_run_hw_queue - Start to run a hardware queue. * @hctx: Pointer to the hardware queue to run. @@ -2245,20 +2330,23 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); - /* - * When queue is quiesced, we may be switching io scheduler, or - * updating nr_hw_queues, or other things, and we can't run queue - * any more, even __blk_mq_hctx_has_pending() can't be called safely. - * - * And queue will be rerun in blk_mq_unquiesce_queue() if it is - * quiesced. - */ - __blk_mq_run_dispatch_ops(hctx->queue, false, - need_run = !blk_queue_quiesced(hctx->queue) && - blk_mq_hctx_has_pending(hctx)); + need_run = blk_mq_hw_queue_need_run(hctx); + if (!need_run) { + unsigned long flags; - if (!need_run) - return; + /* + * Synchronize with blk_mq_unquiesce_queue(), because we check + * if hw queue is quiesced locklessly above, we need the use + * ->queue_lock to make sure we see the up-to-date status to + * not miss rerunning the hw queue. + */ + spin_lock_irqsave(&hctx->queue->queue_lock, flags); + need_run = blk_mq_hw_queue_need_run(hctx); + spin_unlock_irqrestore(&hctx->queue->queue_lock, flags); + + if (!need_run) + return; + } if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { blk_mq_delay_run_hw_queue(hctx, 0); @@ -2415,6 +2503,12 @@ void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) return; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + /* + * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the + * clearing of BLK_MQ_S_STOPPED above and the checking of dispatch + * list in the subsequent routine. + */ + smp_mb__after_atomic(); blk_mq_run_hw_queue(hctx, async); } EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); @@ -2566,9 +2660,13 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, if (bio->bi_opf & REQ_RAHEAD) rq->cmd_flags |= REQ_FAILFAST_MASK; + rq->bio = rq->biotail = bio; rq->__sector = bio->bi_iter.bi_sector; - rq->write_hint = bio->bi_write_hint; - blk_rq_bio_prep(rq, bio, nr_segs); + rq->__data_len = bio->bi_iter.bi_size; + rq->nr_phys_segments = nr_segs; + if (bio_integrity(bio)) + rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, + bio); /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */ err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); @@ -2642,6 +2740,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); + blk_mq_run_hw_queue(hctx, false); return; } @@ -2672,6 +2771,7 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); + blk_mq_run_hw_queue(hctx, false); return BLK_STS_OK; } @@ -2680,15 +2780,15 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) return __blk_mq_issue_directly(hctx, rq, last); } -static void blk_mq_plug_issue_direct(struct blk_plug *plug) +static void blk_mq_issue_direct(struct rq_list *rqs) { struct blk_mq_hw_ctx *hctx = NULL; struct request *rq; int queued = 0; blk_status_t ret = BLK_STS_OK; - while ((rq = rq_list_pop(&plug->mq_list))) { - bool last = rq_list_empty(plug->mq_list); + while ((rq = rq_list_pop(rqs))) { + bool last = rq_list_empty(rqs); if (hctx != rq->mq_hctx) { if (hctx) { @@ -2719,26 +2819,74 @@ out: blk_mq_commit_rqs(hctx, queued, false); } -static void __blk_mq_flush_plug_list(struct request_queue *q, - struct blk_plug *plug) +static void __blk_mq_flush_list(struct request_queue *q, struct rq_list *rqs) { if (blk_queue_quiesced(q)) return; - q->mq_ops->queue_rqs(&plug->mq_list); + q->mq_ops->queue_rqs(rqs); +} + +static unsigned blk_mq_extract_queue_requests(struct rq_list *rqs, + struct rq_list *queue_rqs) +{ + struct request *rq = rq_list_pop(rqs); + struct request_queue *this_q = rq->q; + struct request **prev = &rqs->head; + struct rq_list matched_rqs = {}; + struct request *last = NULL; + unsigned depth = 1; + + rq_list_add_tail(&matched_rqs, rq); + while ((rq = *prev)) { + if (rq->q == this_q) { + /* move rq from rqs to matched_rqs */ + *prev = rq->rq_next; + rq_list_add_tail(&matched_rqs, rq); + depth++; + } else { + /* leave rq in rqs */ + prev = &rq->rq_next; + last = rq; + } + } + + rqs->tail = last; + *queue_rqs = matched_rqs; + return depth; } -static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) +static void blk_mq_dispatch_queue_requests(struct rq_list *rqs, unsigned depth) +{ + struct request_queue *q = rq_list_peek(rqs)->q; + + trace_block_unplug(q, depth, true); + + /* + * Peek first request and see if we have a ->queue_rqs() hook. + * If we do, we can dispatch the whole list in one go. + * We already know at this point that all requests belong to the + * same queue, caller must ensure that's the case. + */ + if (q->mq_ops->queue_rqs) { + blk_mq_run_dispatch_ops(q, __blk_mq_flush_list(q, rqs)); + if (rq_list_empty(rqs)) + return; + } + + blk_mq_run_dispatch_ops(q, blk_mq_issue_direct(rqs)); +} + +static void blk_mq_dispatch_list(struct rq_list *rqs, bool from_sched) { struct blk_mq_hw_ctx *this_hctx = NULL; struct blk_mq_ctx *this_ctx = NULL; - struct request *requeue_list = NULL; - struct request **requeue_lastp = &requeue_list; + struct rq_list requeue_list = {}; unsigned int depth = 0; bool is_passthrough = false; LIST_HEAD(list); do { - struct request *rq = rq_list_pop(&plug->mq_list); + struct request *rq = rq_list_pop(rqs); if (!this_hctx) { this_hctx = rq->mq_hctx; @@ -2746,14 +2894,14 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) is_passthrough = blk_rq_is_passthrough(rq); } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx || is_passthrough != blk_rq_is_passthrough(rq)) { - rq_list_add_tail(&requeue_lastp, rq); + rq_list_add_tail(&requeue_list, rq); continue; } - list_add(&rq->queuelist, &list); + list_add_tail(&rq->queuelist, &list); depth++; - } while (!rq_list_empty(plug->mq_list)); + } while (!rq_list_empty(rqs)); - plug->mq_list = requeue_list; + *rqs = requeue_list; trace_block_unplug(this_hctx->queue, depth, !from_sched); percpu_ref_get(&this_hctx->queue->q_usage_counter); @@ -2773,9 +2921,22 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) percpu_ref_put(&this_hctx->queue->q_usage_counter); } +static void blk_mq_dispatch_multiple_queue_requests(struct rq_list *rqs) +{ + do { + struct rq_list queue_rqs; + unsigned depth; + + depth = blk_mq_extract_queue_requests(rqs, &queue_rqs); + blk_mq_dispatch_queue_requests(&queue_rqs, depth); + while (!rq_list_empty(&queue_rqs)) + blk_mq_dispatch_list(&queue_rqs, false); + } while (!rq_list_empty(rqs)); +} + void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { - struct request *rq; + unsigned int depth; /* * We may have been called recursively midway through handling @@ -2786,36 +2947,23 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) */ if (plug->rq_count == 0) return; + depth = plug->rq_count; plug->rq_count = 0; - if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { - struct request_queue *q; - - rq = rq_list_peek(&plug->mq_list); - q = rq->q; - - /* - * Peek first request and see if we have a ->queue_rqs() hook. - * If we do, we can dispatch the whole plug list in one go. We - * already know at this point that all requests belong to the - * same queue, caller must ensure that's the case. - */ - if (q->mq_ops->queue_rqs) { - blk_mq_run_dispatch_ops(q, - __blk_mq_flush_plug_list(q, plug)); - if (rq_list_empty(plug->mq_list)) - return; + if (!plug->has_elevator && !from_schedule) { + if (plug->multiple_queues) { + blk_mq_dispatch_multiple_queue_requests(&plug->mq_list); + return; } - blk_mq_run_dispatch_ops(q, - blk_mq_plug_issue_direct(plug)); - if (rq_list_empty(plug->mq_list)) + blk_mq_dispatch_queue_requests(&plug->mq_list, depth); + if (rq_list_empty(&plug->mq_list)) return; } do { - blk_mq_dispatch_plug_list(plug, from_schedule); - } while (!rq_list_empty(plug->mq_list)); + blk_mq_dispatch_list(&plug->mq_list, from_schedule); + } while (!rq_list_empty(&plug->mq_list)); } static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, @@ -2865,13 +3013,18 @@ static bool blk_mq_attempt_bio_merge(struct request_queue *q, static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, - struct bio *bio, - unsigned int nsegs) + struct bio *bio) { struct blk_mq_alloc_data data = { .q = q, - .nr_tags = 1, + .flags = 0, + .shallow_depth = 0, .cmd_flags = bio->bi_opf, + .rq_flags = 0, + .nr_tags = 1, + .cached_rqs = NULL, + .ctx = NULL, + .hctx = NULL }; struct request *rq; @@ -2880,16 +3033,13 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; - data.cached_rq = &plug->cached_rq; + data.cached_rqs = &plug->cached_rqs; } rq = __blk_mq_alloc_requests(&data); - if (rq) - return rq; - rq_qos_cleanup(q, bio); - if (bio->bi_opf & REQ_NOWAIT) - bio_wouldblock_error(bio); - return NULL; + if (unlikely(!rq)) + rq_qos_cleanup(q, bio); + return rq; } /* @@ -2903,7 +3053,7 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, if (!plug) return NULL; - rq = rq_list_peek(&plug->cached_rq); + rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; if (type != rq->mq_hctx->type && @@ -2917,21 +3067,32 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, struct bio *bio) { - WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); + if (rq_list_pop(&plug->cached_rqs) != rq) + WARN_ON_ONCE(1); /* * If any qos ->throttle() end up blocking, we will have flushed the * plug and hence killed the cached_rq list as well. Pop this entry * before we throttle. */ - plug->cached_rq = rq_list_next(rq); rq_qos_throttle(rq->q, bio); - blk_mq_rq_time_init(rq, 0); + blk_mq_rq_time_init(rq, blk_time_get_ns()); rq->cmd_flags = bio->bi_opf; INIT_LIST_HEAD(&rq->queuelist); } +static bool bio_unaligned(const struct bio *bio, struct request_queue *q) +{ + unsigned int bs_mask = queue_logical_block_size(q) - 1; + + /* .bi_sector of any zero sized bio need to be initialized */ + if ((bio->bi_iter.bi_size & bs_mask) || + ((bio->bi_iter.bi_sector << SECTOR_SHIFT) & bs_mask)) + return true; + return false; +} + /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. @@ -2948,44 +3109,79 @@ static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, void blk_mq_submit_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - struct blk_plug *plug = blk_mq_plug(bio); + struct blk_plug *plug = current->plug; const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; - unsigned int nr_segs = 1; + unsigned int nr_segs; struct request *rq; blk_status_t ret; - bio = blk_queue_bounce(bio, q); + /* + * If the plug has a cached request for this queue, try to use it. + */ + rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); + + /* + * A BIO that was released from a zone write plug has already been + * through the preparation in this function, already holds a reference + * on the queue usage counter, and is the only write BIO in-flight for + * the target zone. Go straight to preparing a request for it. + */ + if (bio_zone_write_plugging(bio)) { + nr_segs = bio->__bi_nr_segments; + if (rq) + blk_queue_exit(q); + goto new_request; + } /* - * If the plug has a cached request for this queue, try use it. - * * The cached request already holds a q_usage_counter reference and we * don't have to acquire a new one if we use it. */ - rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); if (!rq) { if (unlikely(bio_queue_enter(bio))) return; } - if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); - if (!bio) - goto queue_exit; + /* + * Device reconfiguration may change logical block size or reduce the + * number of poll queues, so the checks for alignment and poll support + * have to be done with queue usage counter held. + */ + if (unlikely(bio_unaligned(bio, q))) { + bio_io_error(bio); + goto queue_exit; } + + if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + goto queue_exit; + } + + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + goto queue_exit; + if (!bio_integrity_prep(bio)) goto queue_exit; if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; - if (!rq) { - rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); - if (unlikely(!rq)) - goto queue_exit; - } else { + if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs)) + goto queue_exit; + +new_request: + if (rq) { blk_mq_use_cached_rq(rq, plug, bio); + } else { + rq = blk_mq_get_new_requests(q, plug, bio); + if (unlikely(!rq)) { + if (bio->bi_opf & REQ_NOWAIT) + bio_wouldblock_error(bio); + goto queue_exit; + } } trace_block_getrq(bio); @@ -3002,6 +3198,9 @@ void blk_mq_submit_bio(struct bio *bio) return; } + if (bio_zone_write_plugging(bio)) + blk_zone_write_plug_init_request(rq); + if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) return; @@ -3037,7 +3236,7 @@ queue_exit: blk_status_t blk_insert_cloned_request(struct request *rq) { struct request_queue *q = rq->q; - unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + unsigned int max_sectors = blk_queue_get_max_sectors(rq); unsigned int max_segments = blk_rq_get_max_segments(rq); blk_status_t ret; @@ -3134,19 +3333,21 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data) { - struct bio *bio, *bio_src; + struct bio *bio_src; if (!bs) bs = &fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { - bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask, - bs); + struct bio *bio = bio_alloc_clone(rq->q->disk->part0, bio_src, + gfp_mask, bs); if (!bio) goto free_and_out; - if (bio_ctr && bio_ctr(bio, bio_src, data)) + if (bio_ctr && bio_ctr(bio, bio_src, data)) { + bio_put(bio); goto free_and_out; + } if (rq->bio) { rq->biotail->bi_next = bio; @@ -3154,7 +3355,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, } else { rq->bio = rq->biotail = bio; } - bio = NULL; } /* Copy attributes of the original request to the clone request. */ @@ -3165,8 +3365,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, rq->special_vec = rq_src->special_vec; } rq->nr_phys_segments = rq_src->nr_phys_segments; - rq->ioprio = rq_src->ioprio; - rq->write_hint = rq_src->write_hint; + rq->nr_integrity_segments = rq_src->nr_integrity_segments; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; @@ -3174,8 +3373,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, return 0; free_and_out: - if (bio) - bio_put(bio); blk_rq_unprep_clone(rq); return -ENOMEM; @@ -3338,8 +3535,7 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, if (node == NUMA_NO_NODE) node = set->numa_node; - tags = blk_mq_init_tags(nr_tags, reserved_tags, node, - BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); + tags = blk_mq_init_tags(nr_tags, reserved_tags, set->flags, node); if (!tags) return NULL; @@ -3483,14 +3679,30 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) return data.has_rq; } -static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, - struct blk_mq_hw_ctx *hctx) +static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx, + unsigned int this_cpu) { - if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu) - return false; - if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) - return false; - return true; + enum hctx_type type = hctx->type; + int cpu; + + /* + * hctx->cpumask has to rule out isolated CPUs, but userspace still + * might submit IOs on these isolated CPUs, so use the queue map to + * check if all CPUs mapped to this hctx are offline + */ + for_each_online_cpu(cpu) { + struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue, + type, cpu); + + if (h != hctx) + continue; + + /* this hctx has at least one online CPU */ + if (this_cpu != cpu) + return true; + } + + return false; } static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) @@ -3498,8 +3710,7 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online); - if (!cpumask_test_cpu(cpu, hctx->cpumask) || - !blk_mq_last_cpu_in_hctx(cpu, hctx)) + if (blk_mq_hctx_has_online_cpu(hctx, cpu)) return 0; /* @@ -3526,12 +3737,28 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) return 0; } +/* + * Check if one CPU is mapped to the specified hctx + * + * Isolated CPUs have been ruled out from hctx->cpumask, which is supposed + * to be used for scheduling kworker only. For other usage, please call this + * helper for checking if one CPU belongs to the specified hctx + */ +static bool blk_mq_cpu_mapped_to_hctx(unsigned int cpu, + const struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_hw_ctx *mapped_hctx = blk_mq_map_queue_type(hctx->queue, + hctx->type, cpu); + + return mapped_hctx == hctx; +} + static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online); - if (cpumask_test_cpu(cpu, hctx->cpumask)) + if (blk_mq_cpu_mapped_to_hctx(cpu, hctx)) clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); return 0; } @@ -3549,7 +3776,7 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) enum hctx_type type; hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); - if (!cpumask_test_cpu(cpu, hctx->cpumask)) + if (!blk_mq_cpu_mapped_to_hctx(cpu, hctx)) return 0; ctx = __blk_mq_get_ctx(hctx->queue, cpu); @@ -3573,13 +3800,91 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) return 0; } -static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) +static void __blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { - if (!(hctx->flags & BLK_MQ_F_STACKING)) + lockdep_assert_held(&blk_mq_cpuhp_lock); + + if (!(hctx->flags & BLK_MQ_F_STACKING) && + !hlist_unhashed(&hctx->cpuhp_online)) { cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, &hctx->cpuhp_online); - cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, - &hctx->cpuhp_dead); + INIT_HLIST_NODE(&hctx->cpuhp_online); + } + + if (!hlist_unhashed(&hctx->cpuhp_dead)) { + cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, + &hctx->cpuhp_dead); + INIT_HLIST_NODE(&hctx->cpuhp_dead); + } +} + +static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) +{ + mutex_lock(&blk_mq_cpuhp_lock); + __blk_mq_remove_cpuhp(hctx); + mutex_unlock(&blk_mq_cpuhp_lock); +} + +static void __blk_mq_add_cpuhp(struct blk_mq_hw_ctx *hctx) +{ + lockdep_assert_held(&blk_mq_cpuhp_lock); + + if (!(hctx->flags & BLK_MQ_F_STACKING) && + hlist_unhashed(&hctx->cpuhp_online)) + cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, + &hctx->cpuhp_online); + + if (hlist_unhashed(&hctx->cpuhp_dead)) + cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, + &hctx->cpuhp_dead); +} + +static void __blk_mq_remove_cpuhp_list(struct list_head *head) +{ + struct blk_mq_hw_ctx *hctx; + + lockdep_assert_held(&blk_mq_cpuhp_lock); + + list_for_each_entry(hctx, head, hctx_list) + __blk_mq_remove_cpuhp(hctx); +} + +/* + * Unregister cpuhp callbacks from exited hw queues + * + * Safe to call if this `request_queue` is live + */ +static void blk_mq_remove_hw_queues_cpuhp(struct request_queue *q) +{ + LIST_HEAD(hctx_list); + + spin_lock(&q->unused_hctx_lock); + list_splice_init(&q->unused_hctx_list, &hctx_list); + spin_unlock(&q->unused_hctx_lock); + + mutex_lock(&blk_mq_cpuhp_lock); + __blk_mq_remove_cpuhp_list(&hctx_list); + mutex_unlock(&blk_mq_cpuhp_lock); + + spin_lock(&q->unused_hctx_lock); + list_splice(&hctx_list, &q->unused_hctx_list); + spin_unlock(&q->unused_hctx_lock); +} + +/* + * Register cpuhp callbacks from all hw queues + * + * Safe to call if this `request_queue` is live + */ +static void blk_mq_add_hw_queues_cpuhp(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned long i; + + mutex_lock(&blk_mq_cpuhp_lock); + queue_for_each_hw_ctx(q, hctx, i) + __blk_mq_add_cpuhp(hctx); + mutex_unlock(&blk_mq_cpuhp_lock); } /* @@ -3630,8 +3935,6 @@ static void blk_mq_exit_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); - blk_mq_remove_cpuhp(hctx); - xa_erase(&q->hctx_table, hctx_idx); spin_lock(&q->unused_hctx_lock); @@ -3648,6 +3951,7 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; + blk_mq_remove_cpuhp(hctx); blk_mq_exit_hctx(q, set, hctx, i); } } @@ -3658,16 +3962,11 @@ static int blk_mq_init_hctx(struct request_queue *q, { hctx->queue_num = hctx_idx; - if (!(hctx->flags & BLK_MQ_F_STACKING)) - cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, - &hctx->cpuhp_online); - cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); - hctx->tags = set->tags[hctx_idx]; if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) - goto unregister_cpu_notifier; + goto fail; if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, hctx->numa_node)) @@ -3684,8 +3983,7 @@ static int blk_mq_init_hctx(struct request_queue *q, exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); - unregister_cpu_notifier: - blk_mq_remove_cpuhp(hctx); + fail: return -1; } @@ -3711,6 +4009,8 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); + INIT_HLIST_NODE(&hctx->cpuhp_dead); + INIT_HLIST_NODE(&hctx->cpuhp_online); hctx->queue = q; hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; @@ -3907,6 +4207,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) } queue_for_each_hw_ctx(q, hctx, i) { + int cpu; + /* * If no software queues are mapped to this hardware queue, * disable it and free the request entries. @@ -3934,6 +4236,15 @@ static void blk_mq_map_swqueue(struct request_queue *q) sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); /* + * Rule out isolated CPUs from hctx->cpumask to avoid + * running block kworker on isolated CPUs + */ + for_each_cpu(cpu, hctx->cpumask) { + if (cpu_is_isolated(cpu)) + cpumask_clear_cpu(cpu, hctx->cpumask); + } + + /* * Initialize batch roundrobin counts */ hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); @@ -3964,13 +4275,14 @@ static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set, bool shared) { struct request_queue *q; + unsigned int memflags; lockdep_assert_held(&set->tag_list_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); queue_set_hctx_shared(q, shared); - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); } } @@ -4075,7 +4387,13 @@ struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, struct request_queue *q; int ret; - q = blk_alloc_queue(lim ? lim : &default_lim, set->numa_node); + if (!lim) + lim = &default_lim; + lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; + if (set->nr_maps > HCTX_TYPE_POLL) + lim->features |= BLK_FEAT_POLL; + + q = blk_alloc_queue(lim, set->numa_node); if (IS_ERR(q)) return q; q->queuedata = queuedata; @@ -4151,6 +4469,15 @@ struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, } EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue); +/* + * Only hctx removed from cpuhp list can be reused + */ +static bool blk_mq_hctx_is_reusable(struct blk_mq_hw_ctx *hctx) +{ + return hlist_unhashed(&hctx->cpuhp_online) && + hlist_unhashed(&hctx->cpuhp_dead); +} + static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, int hctx_idx, int node) @@ -4160,7 +4487,7 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( /* reuse dead hctx first */ spin_lock(&q->unused_hctx_lock); list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { - if (tmp->numa_node == node) { + if (tmp->numa_node == node && blk_mq_hctx_is_reusable(tmp)) { hctx = tmp; break; } @@ -4185,14 +4512,12 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( return NULL; } -static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, - struct request_queue *q) +static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, + struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i, j; - /* protect against switching io scheduler */ - mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); @@ -4225,18 +4550,18 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, xa_for_each_start(&q->hctx_table, j, hctx, j) blk_mq_exit_hctx(q, set, hctx, j); - mutex_unlock(&q->sysfs_lock); } -static void blk_mq_update_poll_flag(struct request_queue *q) +static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, + struct request_queue *q) { - struct blk_mq_tag_set *set = q->tag_set; + __blk_mq_realloc_hw_ctxs(set, q); - if (set->nr_maps > HCTX_TYPE_POLL && - set->map[HCTX_TYPE_POLL].nr_queues) - blk_queue_flag_set(QUEUE_FLAG_POLL, q); - else - blk_queue_flag_clear(QUEUE_FLAG_POLL, q); + /* unregister cpuhp callbacks for exited hctxs */ + blk_mq_remove_hw_queues_cpuhp(q); + + /* register cpuhp for new initialized hctxs */ + blk_mq_add_hw_queues_cpuhp(q); } int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, @@ -4245,6 +4570,12 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* mark the queue as mq asap */ q->mq_ops = set->ops; + /* + * ->tag_set has to be setup before initialize hctx, which cpuphp + * handler needs it for checking queue mapping + */ + q->tag_set = set; + if (blk_mq_alloc_ctxs(q)) goto err_exit; @@ -4263,10 +4594,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); - q->tag_set = set; - q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; - blk_mq_update_poll_flag(q); INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->flush_list); @@ -4276,8 +4604,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->nr_requests = set->queue_depth; blk_mq_init_cpu_queues(q, set->nr_hw_queues); - blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); + blk_mq_add_queue_tag_set(set, q); return 0; err_hctxs: @@ -4497,6 +4825,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) goto out_free_srcu; } + init_rwsem(&set->update_nr_hwq_lock); + ret = -ENOMEM; set->tags = kcalloc_node(set->nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, @@ -4590,13 +4920,15 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) int ret; unsigned long i; + if (WARN_ON_ONCE(!q->mq_freeze_depth)) + return -EINVAL; + if (!set) return -EINVAL; if (q->nr_requests == nr) return 0; - blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); ret = 0; @@ -4630,95 +4962,16 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) } blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); return ret; } -/* - * request_queue and elevator_type pair. - * It is just used by __blk_mq_update_nr_hw_queues to cache - * the elevator_type associated with a request_queue. - */ -struct blk_mq_qe_pair { - struct list_head node; - struct request_queue *q; - struct elevator_type *type; -}; - -/* - * Cache the elevator_type in qe pair list and switch the - * io scheduler to 'none' - */ -static bool blk_mq_elv_switch_none(struct list_head *head, - struct request_queue *q) -{ - struct blk_mq_qe_pair *qe; - - qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); - if (!qe) - return false; - - /* q->elevator needs protection from ->sysfs_lock */ - mutex_lock(&q->sysfs_lock); - - /* the check has to be done with holding sysfs_lock */ - if (!q->elevator) { - kfree(qe); - goto unlock; - } - - INIT_LIST_HEAD(&qe->node); - qe->q = q; - qe->type = q->elevator->type; - /* keep a reference to the elevator module as we'll switch back */ - __elevator_get(qe->type); - list_add(&qe->node, head); - elevator_disable(q); -unlock: - mutex_unlock(&q->sysfs_lock); - - return true; -} - -static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head, - struct request_queue *q) -{ - struct blk_mq_qe_pair *qe; - - list_for_each_entry(qe, head, node) - if (qe->q == q) - return qe; - - return NULL; -} - -static void blk_mq_elv_switch_back(struct list_head *head, - struct request_queue *q) -{ - struct blk_mq_qe_pair *qe; - struct elevator_type *t; - - qe = blk_lookup_qe_pair(head, q); - if (!qe) - return; - t = qe->type; - list_del(&qe->node); - kfree(qe); - - mutex_lock(&q->sysfs_lock); - elevator_switch(q, t); - /* drop the reference acquired in blk_mq_elv_switch_none */ - elevator_put(t); - mutex_unlock(&q->sysfs_lock); -} - static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { struct request_queue *q; - LIST_HEAD(head); int prev_nr_hw_queues = set->nr_hw_queues; + unsigned int memflags; int i; lockdep_assert_held(&set->tag_list_lock); @@ -4730,30 +4983,26 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) return; - list_for_each_entry(q, &set->tag_list, tag_set_list) - blk_mq_freeze_queue(q); - /* - * Switch IO scheduler to 'none', cleaning up the data associated - * with the previous scheduler. We will switch back once we are done - * updating the new sw to hw queue mappings. - */ - list_for_each_entry(q, &set->tag_list, tag_set_list) - if (!blk_mq_elv_switch_none(&head, q)) - goto switch_back; - + memflags = memalloc_noio_save(); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_debugfs_unregister_hctxs(q); blk_mq_sysfs_unregister_hctxs(q); } - if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_freeze_queue_nomemsave(q); + + if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) { + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_unfreeze_queue_nomemrestore(q); goto reregister; + } fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { - blk_mq_realloc_hw_ctxs(set, q); - blk_mq_update_poll_flag(q); + __blk_mq_realloc_hw_ctxs(set, q); + if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues; @@ -4768,18 +5017,19 @@ fallback: blk_mq_map_swqueue(q); } + /* elv_update_nr_hw_queues() unfreeze queue for us */ + list_for_each_entry(q, &set->tag_list, tag_set_list) + elv_update_nr_hw_queues(q); + reregister: list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register_hctxs(q); blk_mq_debugfs_register_hctxs(q); - } -switch_back: - list_for_each_entry(q, &set->tag_list, tag_set_list) - blk_mq_elv_switch_back(&head, q); - - list_for_each_entry(q, &set->tag_list, tag_set_list) - blk_mq_unfreeze_queue(q); + blk_mq_remove_hw_queues_cpuhp(q); + blk_mq_add_hw_queues_cpuhp(q); + } + memalloc_noio_restore(memflags); /* Free the excess tags when nr_hw_queues shrink. */ for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++) @@ -4788,9 +5038,11 @@ switch_back: void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { + down_write(&set->update_nr_hwq_lock); mutex_lock(&set->tag_list_lock); __blk_mq_update_nr_hw_queues(set, nr_hw_queues); mutex_unlock(&set->tag_list_lock); + up_write(&set->update_nr_hwq_lock); } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); @@ -4824,9 +5076,9 @@ static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags) { - struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie); - - return blk_hctx_poll(q, hctx, iob, flags); + if (!blk_mq_can_poll(q)) + return 0; + return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags); } int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, diff --git a/block/blk-mq.h b/block/blk-mq.h index f75a9ecfebde..affb2e14b56e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -36,6 +36,8 @@ enum { BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, }; +#define BLK_MQ_CPU_WORK_BATCH (8) + typedef unsigned int __bitwise blk_insert_t; #define BLK_MQ_INSERT_AT_HEAD ((__force blk_insert_t)0x01) @@ -46,7 +48,7 @@ void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, - unsigned int); + bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); @@ -98,12 +100,10 @@ static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf) /* * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue - * @q: request queue * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED). * @ctx: software queue cpu ctx */ -static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, - blk_opf_t opf, +static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf, struct blk_mq_ctx *ctx) { return ctx->hctxs[blk_mq_get_hctx_type(opf)]; @@ -153,7 +153,7 @@ struct blk_mq_alloc_data { /* allocate multiple requests/tags in one go */ unsigned int nr_tags; - struct request **cached_rq; + struct rq_list *cached_rqs; /* input & output parameter */ struct blk_mq_ctx *ctx; @@ -161,11 +161,8 @@ struct blk_mq_alloc_data { }; struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, - unsigned int reserved_tags, int node, int alloc_policy); + unsigned int reserved_tags, unsigned int flags, int node); void blk_mq_free_tags(struct blk_mq_tags *tags); -int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, - struct sbitmap_queue *breserved_tags, unsigned int queue_depth, - unsigned int reserved, int node, int alloc_policy); unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, @@ -228,6 +225,19 @@ static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) { + /* Fast path: hardware queue is not stopped most of the time. */ + if (likely(!test_bit(BLK_MQ_S_STOPPED, &hctx->state))) + return false; + + /* + * This barrier is used to order adding of dispatch list before and + * the test of BLK_MQ_S_STOPPED below. Pairs with the memory barrier + * in blk_mq_start_stopped_hw_queue() so that dispatch code could + * either see BLK_MQ_S_STOPPED is cleared or dispatch list is not + * empty to avoid missing dispatching requests. + */ + smp_mb(); + return test_bit(BLK_MQ_S_STOPPED, &hctx->state); } @@ -236,10 +246,7 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) return hctx->nr_ctx && hctx->tags; } -unsigned int blk_mq_in_flight(struct request_queue *q, - struct block_device *part); -void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, - unsigned int inflight[2]); +void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]); static inline void blk_mq_put_dispatch_budget(struct request_queue *q, int budget_token) @@ -365,37 +372,6 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) qmap->mq_map[cpu] = 0; } -/* - * blk_mq_plug() - Get caller context plug - * @bio : the bio being submitted by the caller context - * - * Plugging, by design, may delay the insertion of BIOs into the elevator in - * order to increase BIO merging opportunities. This however can cause BIO - * insertion order to change from the order in which submit_bio() is being - * executed in the case of multiple contexts concurrently issuing BIOs to a - * device, even if these context are synchronized to tightly control BIO issuing - * order. While this is not a problem with regular block devices, this ordering - * change can cause write BIO failures with zoned block devices as these - * require sequential write patterns to zones. Prevent this from happening by - * ignoring the plug state of a BIO issuing context if it is for a zoned block - * device and the BIO to plug is a write operation. - * - * Return current->plug if the bio can be plugged and NULL otherwise - */ -static inline struct blk_plug *blk_mq_plug( struct bio *bio) -{ - /* Zoned block device write operation case: do not plug the BIO */ - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && - bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio))) - return NULL; - - /* - * For regular block devices or read operations, use the context plug - * which may be NULL if blk_start_plug() was not executed. - */ - return current->plug; -} - /* Free all requests on the list */ static inline void blk_mq_free_requests(struct list_head *list) { @@ -467,4 +443,10 @@ do { \ #define blk_mq_run_dispatch_ops(q, dispatch_ops) \ __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \ +static inline bool blk_mq_can_poll(struct request_queue *q) +{ + return (q->limits.features & BLK_FEAT_POLL) && + q->tag_set->map[HCTX_TYPE_POLL].nr_queues; +} + #endif diff --git a/block/blk-pm.c b/block/blk-pm.c index 42e842074715..8d3e052f91da 100644 --- a/block/blk-pm.c +++ b/block/blk-pm.c @@ -89,7 +89,7 @@ int blk_pre_runtime_suspend(struct request_queue *q) if (percpu_ref_is_zero(&q->q_usage_counter)) ret = 0; /* Switch q_usage_counter back to per-cpu mode. */ - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue_nomemrestore(q); if (ret < 0) { spin_lock_irq(&q->queue_lock); diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index dd7310c94713..848591fb3c57 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -2,6 +2,8 @@ #include "blk-rq-qos.h" +__read_mostly DEFINE_STATIC_KEY_FALSE(block_rq_qos); + /* * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, * false if 'v' + 1 would be bigger than 'below'. @@ -196,7 +198,6 @@ bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) struct rq_qos_wait_data { struct wait_queue_entry wq; - struct task_struct *task; struct rq_wait *rqw; acquire_inflight_cb_t *cb; void *private_data; @@ -218,9 +219,21 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr, return -1; data->got_token = true; - smp_wmb(); - list_del_init(&curr->entry); - wake_up_process(data->task); + /* + * autoremove_wake_function() removes the wait entry only when it + * actually changed the task state. We want the wait always removed. + * Remove explicitly and use default_wake_function(). + */ + default_wake_function(curr, mode, wake_flags, key); + /* + * Note that the order of operations is important as finish_wait() + * tests whether @curr is removed without grabbing the lock. This + * should be the last thing to do to make sure we will not have a + * UAF access to @data. And the semantics of memory barrier in it + * also make sure the waiter will see the latest @data->got_token + * once list_empty_careful() in finish_wait() returns true. + */ + list_del_init_careful(&curr->entry); return 1; } @@ -245,42 +258,55 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, cleanup_cb_t *cleanup_cb) { struct rq_qos_wait_data data = { - .wq = { - .func = rq_qos_wake_function, - .entry = LIST_HEAD_INIT(data.wq.entry), - }, - .task = current, - .rqw = rqw, - .cb = acquire_inflight_cb, - .private_data = private_data, + .rqw = rqw, + .cb = acquire_inflight_cb, + .private_data = private_data, + .got_token = false, }; - bool has_sleeper; + bool first_waiter; - has_sleeper = wq_has_sleeper(&rqw->wait); - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) + /* + * If there are no waiters in the waiting queue, try to increase the + * inflight counter if we can. Otherwise, prepare for adding ourselves + * to the waiting queue. + */ + if (!waitqueue_active(&rqw->wait) && acquire_inflight_cb(rqw, private_data)) return; - has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq, + init_wait_func(&data.wq, rq_qos_wake_function); + first_waiter = prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); + /* + * Make sure there is at least one inflight process; otherwise, waiters + * will never be woken up. Since there may be no inflight process before + * adding ourselves to the waiting queue above, we need to try to + * increase the inflight counter for ourselves. And it is sufficient to + * guarantee that at least the first waiter to enter the waiting queue + * will re-check the waiting condition before going to sleep, thus + * ensuring forward progress. + */ + if (!data.got_token && first_waiter && acquire_inflight_cb(rqw, private_data)) { + finish_wait(&rqw->wait, &data.wq); + /* + * We raced with rq_qos_wake_function() getting a token, + * which means we now have two. Put our local token + * and wake anyone else potentially waiting for one. + * + * Enough memory barrier in list_empty_careful() in + * finish_wait() is paired with list_del_init_careful() + * in rq_qos_wake_function() to make sure we will see + * the latest @data->got_token. + */ + if (data.got_token) + cleanup_cb(rqw, private_data); + return; + } + + /* we are now relying on the waker to increase our inflight counter. */ do { - /* The memory barrier in set_task_state saves us here. */ if (data.got_token) break; - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { - finish_wait(&rqw->wait, &data.wq); - - /* - * We raced with rq_qos_wake_function() getting a token, - * which means we now have two. Put our local token - * and wake anyone else potentially waiting for one. - */ - smp_rmb(); - if (data.got_token) - cleanup_cb(rqw, private_data); - break; - } io_schedule(); - has_sleeper = true; set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(&rqw->wait, &data.wq); @@ -293,6 +319,7 @@ void rq_qos_exit(struct request_queue *q) struct rq_qos *rqos = q->rq_qos; q->rq_qos = rqos->next; rqos->ops->exit(rqos); + static_branch_dec(&block_rq_qos); } mutex_unlock(&q->rq_qos_mutex); } @@ -301,6 +328,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, const struct rq_qos_ops *ops) { struct request_queue *q = disk->queue; + unsigned int memflags; lockdep_assert_held(&q->rq_qos_mutex); @@ -312,14 +340,15 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, * No IO can be in-flight when adding rqos, so freeze queue, which * is fine since we only support rq_qos for blk-mq queue. */ - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); if (rq_qos_id(q, rqos->id)) goto ebusy; rqos->next = q->rq_qos; q->rq_qos = rqos; + static_branch_inc(&block_rq_qos); - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); if (rqos->ops->debugfs_attrs) { mutex_lock(&q->debugfs_mutex); @@ -329,7 +358,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, return 0; ebusy: - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); return -EBUSY; } @@ -337,17 +366,18 @@ void rq_qos_del(struct rq_qos *rqos) { struct request_queue *q = rqos->disk->queue; struct rq_qos **cur; + unsigned int memflags; lockdep_assert_held(&q->rq_qos_mutex); - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { if (*cur == rqos) { *cur = rqos->next; break; } } - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_rqos(rqos); diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 37245c97ee61..39749f4066fb 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -12,6 +12,7 @@ #include "blk-mq-debugfs.h" struct blk_mq_debugfs_attr; +extern struct static_key_false block_rq_qos; enum rq_qos_id { RQ_QOS_WBT, @@ -112,31 +113,33 @@ void __rq_qos_queue_depth_changed(struct rq_qos *rqos); static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) { - if (q->rq_qos) + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_cleanup(q->rq_qos, bio); } static inline void rq_qos_done(struct request_queue *q, struct request *rq) { - if (q->rq_qos && !blk_rq_is_passthrough(rq)) + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos && + !blk_rq_is_passthrough(rq)) __rq_qos_done(q->rq_qos, rq); } static inline void rq_qos_issue(struct request_queue *q, struct request *rq) { - if (q->rq_qos) + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_issue(q->rq_qos, rq); } static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) { - if (q->rq_qos) + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_requeue(q->rq_qos, rq); } static inline void rq_qos_done_bio(struct bio *bio) { - if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) || + if (static_branch_unlikely(&block_rq_qos) && + bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) || bio_flagged(bio, BIO_QOS_MERGED))) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); if (q->rq_qos) @@ -146,7 +149,7 @@ static inline void rq_qos_done_bio(struct bio *bio) static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) { - if (q->rq_qos) { + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) { bio_set_flag(bio, BIO_QOS_THROTTLED); __rq_qos_throttle(q->rq_qos, bio); } @@ -155,14 +158,14 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) static inline void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) { - if (q->rq_qos) + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_track(q->rq_qos, rq, bio); } static inline void rq_qos_merge(struct request_queue *q, struct request *rq, struct bio *bio) { - if (q->rq_qos) { + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) { bio_set_flag(bio, BIO_QOS_MERGED); __rq_qos_merge(q->rq_qos, rq, bio); } @@ -170,7 +173,7 @@ static inline void rq_qos_merge(struct request_queue *q, struct request *rq, static inline void rq_qos_queue_depth_changed(struct request_queue *q) { - if (q->rq_qos) + if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) __rq_qos_queue_depth_changed(q->rq_qos); } diff --git a/block/blk-settings.c b/block/blk-settings.c index d2731843f2fc..a000daafbfb4 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -6,7 +6,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/bio.h> -#include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/pagemap.h> #include <linux/backing-dev-defs.h> #include <linux/gcd.h> @@ -21,7 +21,7 @@ void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) { - q->rq_timeout = timeout; + WRITE_ONCE(q->rq_timeout, timeout); } EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); @@ -50,25 +50,31 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_sectors = UINT_MAX; lim->max_dev_sectors = UINT_MAX; lim->max_write_zeroes_sectors = UINT_MAX; - lim->max_zone_append_sectors = UINT_MAX; + lim->max_hw_zone_append_sectors = UINT_MAX; lim->max_user_discard_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); -static void blk_apply_bdi_limits(struct backing_dev_info *bdi, +void blk_apply_bdi_limits(struct backing_dev_info *bdi, struct queue_limits *lim) { /* * For read-ahead of large files to be effective, we need to read ahead * at least twice the optimal I/O size. + * + * There is no hardware limitation for the read-ahead size and the user + * might have increased the read-ahead size through sysfs, so don't ever + * decrease it. */ - bdi->ra_pages = max(lim->io_opt * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); + bdi->ra_pages = max3(bdi->ra_pages, + lim->io_opt * 2 / PAGE_SIZE, + VM_READAHEAD_PAGES); bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT; } static int blk_validate_zoned_limits(struct queue_limits *lim) { - if (!lim->zoned) { + if (!(lim->features & BLK_FEAT_ZONED)) { if (WARN_ON_ONCE(lim->max_open_zones) || WARN_ON_ONCE(lim->max_active_zones) || WARN_ON_ONCE(lim->zone_write_granularity) || @@ -80,30 +86,175 @@ static int blk_validate_zoned_limits(struct queue_limits *lim) if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED))) return -EINVAL; + /* + * Given that active zones include open zones, the maximum number of + * open zones cannot be larger than the maximum number of active zones. + */ + if (lim->max_active_zones && + lim->max_open_zones > lim->max_active_zones) + return -EINVAL; + if (lim->zone_write_granularity < lim->logical_block_size) lim->zone_write_granularity = lim->logical_block_size; - if (lim->max_zone_append_sectors) { + /* + * The Zone Append size is limited by the maximum I/O size and the zone + * size given that it can't span zones. + * + * If no max_hw_zone_append_sectors limit is provided, the block layer + * will emulated it, else we're also bound by the hardware limit. + */ + lim->max_zone_append_sectors = + min_not_zero(lim->max_hw_zone_append_sectors, + min(lim->chunk_sectors, lim->max_hw_sectors)); + return 0; +} + +static int blk_validate_integrity_limits(struct queue_limits *lim) +{ + struct blk_integrity *bi = &lim->integrity; + + if (!bi->tuple_size) { + if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE || + bi->tag_size || ((bi->flags & BLK_INTEGRITY_REF_TAG))) { + pr_warn("invalid PI settings.\n"); + return -EINVAL; + } + bi->flags |= BLK_INTEGRITY_NOGENERATE | BLK_INTEGRITY_NOVERIFY; + return 0; + } + + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { + pr_warn("integrity support disabled.\n"); + return -EINVAL; + } + + if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE && + (bi->flags & BLK_INTEGRITY_REF_TAG)) { + pr_warn("ref tag not support without checksum.\n"); + return -EINVAL; + } + + if (!bi->interval_exp) + bi->interval_exp = ilog2(lim->logical_block_size); + + return 0; +} + +/* + * Returns max guaranteed bytes which we can fit in a bio. + * + * We request that an atomic_write is ITER_UBUF iov_iter (so a single vector), + * so we assume that we can fit in at least PAGE_SIZE in a segment, apart from + * the first and last segments. + */ +static unsigned int blk_queue_max_guaranteed_bio(struct queue_limits *lim) +{ + unsigned int max_segments = min(BIO_MAX_VECS, lim->max_segments); + unsigned int length; + + length = min(max_segments, 2) * lim->logical_block_size; + if (max_segments > 2) + length += (max_segments - 2) * PAGE_SIZE; + + return length; +} + +static void blk_atomic_writes_update_limits(struct queue_limits *lim) +{ + unsigned int unit_limit = min(lim->max_hw_sectors << SECTOR_SHIFT, + blk_queue_max_guaranteed_bio(lim)); + + unit_limit = rounddown_pow_of_two(unit_limit); + + lim->atomic_write_max_sectors = + min(lim->atomic_write_hw_max >> SECTOR_SHIFT, + lim->max_hw_sectors); + lim->atomic_write_unit_min = + min(lim->atomic_write_hw_unit_min, unit_limit); + lim->atomic_write_unit_max = + min(lim->atomic_write_hw_unit_max, unit_limit); + lim->atomic_write_boundary_sectors = + lim->atomic_write_hw_boundary >> SECTOR_SHIFT; +} + +static void blk_validate_atomic_write_limits(struct queue_limits *lim) +{ + unsigned int boundary_sectors; + + if (!(lim->features & BLK_FEAT_ATOMIC_WRITES)) + goto unsupported; + + if (!lim->atomic_write_hw_max) + goto unsupported; + + if (WARN_ON_ONCE(!is_power_of_2(lim->atomic_write_hw_unit_min))) + goto unsupported; + + if (WARN_ON_ONCE(!is_power_of_2(lim->atomic_write_hw_unit_max))) + goto unsupported; + + if (WARN_ON_ONCE(lim->atomic_write_hw_unit_min > + lim->atomic_write_hw_unit_max)) + goto unsupported; + + if (WARN_ON_ONCE(lim->atomic_write_hw_unit_max > + lim->atomic_write_hw_max)) + goto unsupported; + + boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT; + + if (boundary_sectors) { + if (WARN_ON_ONCE(lim->atomic_write_hw_max > + lim->atomic_write_hw_boundary)) + goto unsupported; /* - * The Zone Append size is limited by the maximum I/O size - * and the zone size given that it can't span zones. + * A feature of boundary support is that it disallows bios to + * be merged which would result in a merged request which + * crosses either a chunk sector or atomic write HW boundary, + * even though chunk sectors may be just set for performance. + * For simplicity, disallow atomic writes for a chunk sector + * which is non-zero and smaller than atomic write HW boundary. + * Furthermore, chunk sectors must be a multiple of atomic + * write HW boundary. Otherwise boundary support becomes + * complicated. + * Devices which do not conform to these rules can be dealt + * with if and when they show up. */ - lim->max_zone_append_sectors = - min3(lim->max_hw_sectors, - lim->max_zone_append_sectors, - lim->chunk_sectors); + if (WARN_ON_ONCE(lim->chunk_sectors % boundary_sectors)) + goto unsupported; + + /* + * The boundary size just needs to be a multiple of unit_max + * (and not necessarily a power-of-2), so this following check + * could be relaxed in future. + * Furthermore, if needed, unit_max could even be reduced so + * that it is compliant with a !power-of-2 boundary. + */ + if (!is_power_of_2(boundary_sectors)) + goto unsupported; } - return 0; + blk_atomic_writes_update_limits(lim); + return; + +unsupported: + lim->atomic_write_max_sectors = 0; + lim->atomic_write_boundary_sectors = 0; + lim->atomic_write_unit_min = 0; + lim->atomic_write_unit_max = 0; } /* * Check that the limits in lim are valid, initialize defaults for unset * values, and cap values based on others where needed. */ -static int blk_validate_limits(struct queue_limits *lim) +int blk_validate_limits(struct queue_limits *lim) { unsigned int max_hw_sectors; + unsigned int logical_block_sectors; + unsigned long seg_size; + int err; /* * Unless otherwise specified, default to 512 byte logical blocks and a @@ -111,6 +262,10 @@ static int blk_validate_limits(struct queue_limits *lim) */ if (!lim->logical_block_size) lim->logical_block_size = SECTOR_SIZE; + else if (blk_validate_block_size(lim->logical_block_size)) { + pr_warn("Invalid logical block size (%d)\n", lim->logical_block_size); + return -EINVAL; + } if (lim->physical_block_size < lim->logical_block_size) lim->physical_block_size = lim->logical_block_size; @@ -122,6 +277,13 @@ static int blk_validate_limits(struct queue_limits *lim) lim->io_min = lim->physical_block_size; /* + * The optimal I/O size may not be aligned to physical block size + * (because it may be limited by dma engines which have no clue about + * block size of the disks attached to them), so we round it down here. + */ + lim->io_opt = round_down(lim->io_opt, lim->physical_block_size); + + /* * max_hw_sectors has a somewhat weird default for historical reason, * but driver really should set their own instead of relying on this * value. @@ -134,8 +296,11 @@ static int blk_validate_limits(struct queue_limits *lim) lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SECTORS)) return -EINVAL; + logical_block_sectors = lim->logical_block_size >> SECTOR_SHIFT; + if (WARN_ON_ONCE(logical_block_sectors > lim->max_hw_sectors)) + return -EINVAL; lim->max_hw_sectors = round_down(lim->max_hw_sectors, - lim->logical_block_size >> SECTOR_SHIFT); + logical_block_sectors); /* * The actual max_sectors value is a complex beast and also takes the @@ -146,14 +311,20 @@ static int blk_validate_limits(struct queue_limits *lim) max_hw_sectors = min_not_zero(lim->max_hw_sectors, lim->max_dev_sectors); if (lim->max_user_sectors) { - if (lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE) + if (lim->max_user_sectors < BLK_MIN_SEGMENT_SIZE / SECTOR_SIZE) return -EINVAL; lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors); + } else if (lim->io_opt > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) { + lim->max_sectors = + min(max_hw_sectors, lim->io_opt >> SECTOR_SHIFT); + } else if (lim->io_min > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) { + lim->max_sectors = + min(max_hw_sectors, lim->io_min >> SECTOR_SHIFT); } else { lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP); } lim->max_sectors = round_down(lim->max_sectors, - lim->logical_block_size >> SECTOR_SHIFT); + logical_block_sectors); /* * Random default for the maximum number of segments. Driver should not @@ -178,7 +349,7 @@ static int blk_validate_limits(struct queue_limits *lim) */ if (!lim->seg_boundary_mask) lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; - if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1)) + if (WARN_ON_ONCE(lim->seg_boundary_mask < BLK_MIN_SEGMENT_SIZE - 1)) return -EINVAL; /* @@ -188,7 +359,10 @@ static int blk_validate_limits(struct queue_limits *lim) * bvec and lower layer bio splitting is supposed to handle the two * correctly. */ - if (!lim->virt_boundary_mask) { + if (lim->virt_boundary_mask) { + if (!lim->max_segment_size) + lim->max_segment_size = UINT_MAX; + } else { /* * The maximum segment size has an odd historic 64k default that * drivers probably should override. Just like the I/O size we @@ -196,10 +370,17 @@ static int blk_validate_limits(struct queue_limits *lim) */ if (!lim->max_segment_size) lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; - if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE)) + if (WARN_ON_ONCE(lim->max_segment_size < BLK_MIN_SEGMENT_SIZE)) return -EINVAL; } + /* setup min segment size for building new segment in fast path */ + if (lim->seg_boundary_mask > lim->max_segment_size - 1) + seg_size = lim->max_segment_size; + else + seg_size = lim->seg_boundary_mask + 1; + lim->min_segment_size = min_t(unsigned int, seg_size, PAGE_SIZE); + /* * We require drivers to at least do logical block aligned I/O, but * historically could not check for that due to the separate calls @@ -213,11 +394,20 @@ static int blk_validate_limits(struct queue_limits *lim) if (lim->alignment_offset) { lim->alignment_offset &= (lim->physical_block_size - 1); - lim->misaligned = 0; + lim->flags &= ~BLK_FLAG_MISALIGNED; } + if (!(lim->features & BLK_FEAT_WRITE_CACHE)) + lim->features &= ~BLK_FEAT_FUA; + + blk_validate_atomic_write_limits(lim); + + err = blk_validate_integrity_limits(lim); + if (err) + return err; return blk_validate_zoned_limits(lim); } +EXPORT_SYMBOL_GPL(blk_validate_limits); /* * Set the default limits for a newly allocated queue. @lim contains the @@ -241,27 +431,63 @@ int blk_set_default_limits(struct queue_limits *lim) * @lim: limits to apply * * Apply the limits in @lim that were obtained from queue_limits_start_update() - * and updated by the caller to @q. + * and updated by the caller to @q. The caller must have frozen the queue or + * ensure that there are no outstanding I/Os by other means. * * Returns 0 if successful, else a negative error code. */ int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim) - __releases(q->limits_lock) { - int error = blk_validate_limits(lim); + int error; + + error = blk_validate_limits(lim); + if (error) + goto out_unlock; - if (!error) { - q->limits = *lim; - if (q->disk) - blk_apply_bdi_limits(q->disk->bdi, lim); +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + if (q->crypto_profile && lim->integrity.tag_size) { + pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together.\n"); + error = -EINVAL; + goto out_unlock; } +#endif + + q->limits = *lim; + if (q->disk) + blk_apply_bdi_limits(q->disk->bdi, lim); +out_unlock: mutex_unlock(&q->limits_lock); return error; } EXPORT_SYMBOL_GPL(queue_limits_commit_update); /** + * queue_limits_commit_update_frozen - commit an atomic update of queue limits + * @q: queue to update + * @lim: limits to apply + * + * Apply the limits in @lim that were obtained from queue_limits_start_update() + * and updated with the new values by the caller to @q. Freezes the queue + * before the update and unfreezes it after. + * + * Returns 0 if successful, else a negative error code. + */ +int queue_limits_commit_update_frozen(struct request_queue *q, + struct queue_limits *lim) +{ + unsigned int memflags; + int ret; + + memflags = blk_mq_freeze_queue(q); + ret = queue_limits_commit_update(q, lim); + blk_mq_unfreeze_queue(q, memflags); + + return ret; +} +EXPORT_SYMBOL_GPL(queue_limits_commit_update_frozen); + +/** * queue_limits_set - apply queue limits to queue * @q: queue to update * @lim: limits to apply @@ -279,446 +505,159 @@ int queue_limits_set(struct request_queue *q, struct queue_limits *lim) } EXPORT_SYMBOL_GPL(queue_limits_set); -/** - * blk_queue_bounce_limit - set bounce buffer limit for queue - * @q: the request queue for the device - * @bounce: bounce limit to enforce - * - * Description: - * Force bouncing for ISA DMA ranges or highmem. - * - * DEPRECATED, don't use in new code. - **/ -void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce bounce) +static int queue_limit_alignment_offset(const struct queue_limits *lim, + sector_t sector) { - q->limits.bounce = bounce; + unsigned int granularity = max(lim->physical_block_size, lim->io_min); + unsigned int alignment = sector_div(sector, granularity >> SECTOR_SHIFT) + << SECTOR_SHIFT; + + return (granularity + lim->alignment_offset - alignment) % granularity; } -EXPORT_SYMBOL(blk_queue_bounce_limit); -/** - * blk_queue_max_hw_sectors - set max sectors for a request for this queue - * @q: the request queue for the device - * @max_hw_sectors: max hardware sectors in the usual 512b unit - * - * Description: - * Enables a low level driver to set a hard upper limit, - * max_hw_sectors, on the size of requests. max_hw_sectors is set by - * the device driver based upon the capabilities of the I/O - * controller. - * - * max_dev_sectors is a hard limit imposed by the storage device for - * READ/WRITE requests. It is set by the disk driver. - * - * max_sectors is a soft limit imposed by the block layer for - * filesystem type requests. This value can be overridden on a - * per-device basis in /sys/block/<device>/queue/max_sectors_kb. - * The soft limit can not exceed max_hw_sectors. - **/ -void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) +static unsigned int queue_limit_discard_alignment( + const struct queue_limits *lim, sector_t sector) { - struct queue_limits *limits = &q->limits; - unsigned int max_sectors; - - if ((max_hw_sectors << 9) < PAGE_SIZE) { - max_hw_sectors = 1 << (PAGE_SHIFT - 9); - pr_info("%s: set to minimum %u\n", __func__, max_hw_sectors); - } + unsigned int alignment, granularity, offset; - max_hw_sectors = round_down(max_hw_sectors, - limits->logical_block_size >> SECTOR_SHIFT); - limits->max_hw_sectors = max_hw_sectors; + if (!lim->max_discard_sectors) + return 0; - max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); + /* Why are these in bytes, not sectors? */ + alignment = lim->discard_alignment >> SECTOR_SHIFT; + granularity = lim->discard_granularity >> SECTOR_SHIFT; - if (limits->max_user_sectors) - max_sectors = min(max_sectors, limits->max_user_sectors); - else - max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS_CAP); + /* Offset of the partition start in 'granularity' sectors */ + offset = sector_div(sector, granularity); - max_sectors = round_down(max_sectors, - limits->logical_block_size >> SECTOR_SHIFT); - limits->max_sectors = max_sectors; + /* And why do we do this modulus *again* in blkdev_issue_discard()? */ + offset = (granularity + alignment - offset) % granularity; - if (!q->disk) - return; - q->disk->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9); + /* Turn it back into bytes, gaah */ + return offset << SECTOR_SHIFT; } -EXPORT_SYMBOL(blk_queue_max_hw_sectors); -/** - * blk_queue_chunk_sectors - set size of the chunk for this queue - * @q: the request queue for the device - * @chunk_sectors: chunk sectors in the usual 512b unit - * - * Description: - * If a driver doesn't want IOs to cross a given chunk size, it can set - * this limit and prevent merging across chunks. Note that the block layer - * must accept a page worth of data at any offset. So if the crossing of - * chunks is a hard limitation in the driver, it must still be prepared - * to split single page bios. - **/ -void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors) +static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lbs) { - q->limits.chunk_sectors = chunk_sectors; + sectors = round_down(sectors, lbs >> SECTOR_SHIFT); + if (sectors < PAGE_SIZE >> SECTOR_SHIFT) + sectors = PAGE_SIZE >> SECTOR_SHIFT; + return sectors; } -EXPORT_SYMBOL(blk_queue_chunk_sectors); -/** - * blk_queue_max_discard_sectors - set max sectors for a single discard - * @q: the request queue for the device - * @max_discard_sectors: maximum number of sectors to discard - **/ -void blk_queue_max_discard_sectors(struct request_queue *q, - unsigned int max_discard_sectors) +/* Check if second and later bottom devices are compliant */ +static bool blk_stack_atomic_writes_tail(struct queue_limits *t, + struct queue_limits *b) { - struct queue_limits *lim = &q->limits; + /* We're not going to support different boundary sizes.. yet */ + if (t->atomic_write_hw_boundary != b->atomic_write_hw_boundary) + return false; - lim->max_hw_discard_sectors = max_discard_sectors; - lim->max_discard_sectors = - min(max_discard_sectors, lim->max_user_discard_sectors); -} -EXPORT_SYMBOL(blk_queue_max_discard_sectors); + /* Can't support this */ + if (t->atomic_write_hw_unit_min > b->atomic_write_hw_unit_max) + return false; -/** - * blk_queue_max_secure_erase_sectors - set max sectors for a secure erase - * @q: the request queue for the device - * @max_sectors: maximum number of sectors to secure_erase - **/ -void blk_queue_max_secure_erase_sectors(struct request_queue *q, - unsigned int max_sectors) -{ - q->limits.max_secure_erase_sectors = max_sectors; -} -EXPORT_SYMBOL(blk_queue_max_secure_erase_sectors); + /* Or this */ + if (t->atomic_write_hw_unit_max < b->atomic_write_hw_unit_min) + return false; -/** - * blk_queue_max_write_zeroes_sectors - set max sectors for a single - * write zeroes - * @q: the request queue for the device - * @max_write_zeroes_sectors: maximum number of sectors to write per command - **/ -void blk_queue_max_write_zeroes_sectors(struct request_queue *q, - unsigned int max_write_zeroes_sectors) -{ - q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors; + t->atomic_write_hw_max = min(t->atomic_write_hw_max, + b->atomic_write_hw_max); + t->atomic_write_hw_unit_min = max(t->atomic_write_hw_unit_min, + b->atomic_write_hw_unit_min); + t->atomic_write_hw_unit_max = min(t->atomic_write_hw_unit_max, + b->atomic_write_hw_unit_max); + return true; } -EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); -/** - * blk_queue_max_zone_append_sectors - set max sectors for a single zone append - * @q: the request queue for the device - * @max_zone_append_sectors: maximum number of sectors to write per command - **/ -void blk_queue_max_zone_append_sectors(struct request_queue *q, - unsigned int max_zone_append_sectors) +/* Check for valid boundary of first bottom device */ +static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t, + struct queue_limits *b) { - unsigned int max_sectors; - - if (WARN_ON(!blk_queue_is_zoned(q))) - return; - - max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors); - max_sectors = min(q->limits.chunk_sectors, max_sectors); - /* - * Signal eventual driver bugs resulting in the max_zone_append sectors limit - * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set, - * or the max_hw_sectors limit not set. + * Ensure atomic write boundary is aligned with chunk sectors. Stacked + * devices store chunk sectors in t->io_min. */ - WARN_ON(!max_sectors); + if (b->atomic_write_hw_boundary > t->io_min && + b->atomic_write_hw_boundary % t->io_min) + return false; + if (t->io_min > b->atomic_write_hw_boundary && + t->io_min % b->atomic_write_hw_boundary) + return false; - q->limits.max_zone_append_sectors = max_sectors; + t->atomic_write_hw_boundary = b->atomic_write_hw_boundary; + return true; } -EXPORT_SYMBOL_GPL(blk_queue_max_zone_append_sectors); -/** - * blk_queue_max_segments - set max hw segments for a request for this queue - * @q: the request queue for the device - * @max_segments: max number of segments - * - * Description: - * Enables a low level driver to set an upper limit on the number of - * hw data segments in a request. - **/ -void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments) -{ - if (!max_segments) { - max_segments = 1; - pr_info("%s: set to minimum %u\n", __func__, max_segments); - } - - q->limits.max_segments = max_segments; -} -EXPORT_SYMBOL(blk_queue_max_segments); -/** - * blk_queue_max_discard_segments - set max segments for discard requests - * @q: the request queue for the device - * @max_segments: max number of segments - * - * Description: - * Enables a low level driver to set an upper limit on the number of - * segments in a discard request. - **/ -void blk_queue_max_discard_segments(struct request_queue *q, - unsigned short max_segments) +/* Check stacking of first bottom device */ +static bool blk_stack_atomic_writes_head(struct queue_limits *t, + struct queue_limits *b) { - q->limits.max_discard_segments = max_segments; -} -EXPORT_SYMBOL_GPL(blk_queue_max_discard_segments); + if (b->atomic_write_hw_boundary && + !blk_stack_atomic_writes_boundary_head(t, b)) + return false; -/** - * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg - * @q: the request queue for the device - * @max_size: max size of segment in bytes - * - * Description: - * Enables a low level driver to set an upper limit on the size of a - * coalesced segment - **/ -void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) -{ - if (max_size < PAGE_SIZE) { - max_size = PAGE_SIZE; - pr_info("%s: set to minimum %u\n", __func__, max_size); + if (t->io_min <= SECTOR_SIZE) { + /* No chunk sectors, so use bottom device values directly */ + t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; + t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min; + t->atomic_write_hw_max = b->atomic_write_hw_max; + return true; } - /* see blk_queue_virt_boundary() for the explanation */ - WARN_ON_ONCE(q->limits.virt_boundary_mask); - - q->limits.max_segment_size = max_size; -} -EXPORT_SYMBOL(blk_queue_max_segment_size); - -/** - * blk_queue_logical_block_size - set logical block size for the queue - * @q: the request queue for the device - * @size: the logical block size, in bytes - * - * Description: - * This should be set to the lowest possible block size that the - * storage device can address. The default of 512 covers most - * hardware. - **/ -void blk_queue_logical_block_size(struct request_queue *q, unsigned int size) -{ - struct queue_limits *limits = &q->limits; - - limits->logical_block_size = size; - - if (limits->discard_granularity < limits->logical_block_size) - limits->discard_granularity = limits->logical_block_size; - - if (limits->physical_block_size < size) - limits->physical_block_size = size; - - if (limits->io_min < limits->physical_block_size) - limits->io_min = limits->physical_block_size; - - limits->max_hw_sectors = - round_down(limits->max_hw_sectors, size >> SECTOR_SHIFT); - limits->max_sectors = - round_down(limits->max_sectors, size >> SECTOR_SHIFT); -} -EXPORT_SYMBOL(blk_queue_logical_block_size); - -/** - * blk_queue_physical_block_size - set physical block size for the queue - * @q: the request queue for the device - * @size: the physical block size, in bytes - * - * Description: - * This should be set to the lowest possible sector size that the - * hardware can operate on without reverting to read-modify-write - * operations. - */ -void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) -{ - q->limits.physical_block_size = size; - - if (q->limits.physical_block_size < q->limits.logical_block_size) - q->limits.physical_block_size = q->limits.logical_block_size; - - if (q->limits.discard_granularity < q->limits.physical_block_size) - q->limits.discard_granularity = q->limits.physical_block_size; - - if (q->limits.io_min < q->limits.physical_block_size) - q->limits.io_min = q->limits.physical_block_size; -} -EXPORT_SYMBOL(blk_queue_physical_block_size); - -/** - * blk_queue_zone_write_granularity - set zone write granularity for the queue - * @q: the request queue for the zoned device - * @size: the zone write granularity size, in bytes - * - * Description: - * This should be set to the lowest possible size allowing to write in - * sequential zones of a zoned block device. - */ -void blk_queue_zone_write_granularity(struct request_queue *q, - unsigned int size) -{ - if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) - return; - - q->limits.zone_write_granularity = size; - - if (q->limits.zone_write_granularity < q->limits.logical_block_size) - q->limits.zone_write_granularity = q->limits.logical_block_size; -} -EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity); + /* + * Find values for limits which work for chunk size. + * b->atomic_write_hw_unit_{min, max} may not be aligned with chunk + * size (t->io_min), as chunk size is not restricted to a power-of-2. + * So we need to find highest power-of-2 which works for the chunk + * size. + * As an example scenario, we could have b->unit_max = 16K and + * t->io_min = 24K. For this case, reduce t->unit_max to a value + * aligned with both limits, i.e. 8K in this example. + */ + t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; + while (t->io_min % t->atomic_write_hw_unit_max) + t->atomic_write_hw_unit_max /= 2; -/** - * blk_queue_alignment_offset - set physical block alignment offset - * @q: the request queue for the device - * @offset: alignment offset in bytes - * - * Description: - * Some devices are naturally misaligned to compensate for things like - * the legacy DOS partition table 63-sector offset. Low-level drivers - * should call this function for devices whose first sector is not - * naturally aligned. - */ -void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) -{ - q->limits.alignment_offset = - offset & (q->limits.physical_block_size - 1); - q->limits.misaligned = 0; -} -EXPORT_SYMBOL(blk_queue_alignment_offset); + t->atomic_write_hw_unit_min = min(b->atomic_write_hw_unit_min, + t->atomic_write_hw_unit_max); + t->atomic_write_hw_max = min(b->atomic_write_hw_max, t->io_min); -void disk_update_readahead(struct gendisk *disk) -{ - blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); + return true; } -EXPORT_SYMBOL_GPL(disk_update_readahead); -/** - * blk_limits_io_min - set minimum request size for a device - * @limits: the queue limits - * @min: smallest I/O size in bytes - * - * Description: - * Some devices have an internal block size bigger than the reported - * hardware sector size. This function can be used to signal the - * smallest I/O the device can perform without incurring a performance - * penalty. - */ -void blk_limits_io_min(struct queue_limits *limits, unsigned int min) +static void blk_stack_atomic_writes_limits(struct queue_limits *t, + struct queue_limits *b, sector_t start) { - limits->io_min = min; + if (!(b->features & BLK_FEAT_ATOMIC_WRITES)) + goto unsupported; - if (limits->io_min < limits->logical_block_size) - limits->io_min = limits->logical_block_size; + if (!b->atomic_write_hw_unit_min) + goto unsupported; - if (limits->io_min < limits->physical_block_size) - limits->io_min = limits->physical_block_size; -} -EXPORT_SYMBOL(blk_limits_io_min); + if (!blk_atomic_write_start_sect_aligned(start, b)) + goto unsupported; -/** - * blk_queue_io_min - set minimum request size for the queue - * @q: the request queue for the device - * @min: smallest I/O size in bytes - * - * Description: - * Storage devices may report a granularity or preferred minimum I/O - * size which is the smallest request the device can perform without - * incurring a performance penalty. For disk drives this is often the - * physical block size. For RAID arrays it is often the stripe chunk - * size. A properly aligned multiple of minimum_io_size is the - * preferred request size for workloads where a high number of I/O - * operations is desired. - */ -void blk_queue_io_min(struct request_queue *q, unsigned int min) -{ - blk_limits_io_min(&q->limits, min); -} -EXPORT_SYMBOL(blk_queue_io_min); - -/** - * blk_limits_io_opt - set optimal request size for a device - * @limits: the queue limits - * @opt: smallest I/O size in bytes - * - * Description: - * Storage devices may report an optimal I/O size, which is the - * device's preferred unit for sustained I/O. This is rarely reported - * for disk drives. For RAID arrays it is usually the stripe width or - * the internal track size. A properly aligned multiple of - * optimal_io_size is the preferred request size for workloads where - * sustained throughput is desired. - */ -void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt) -{ - limits->io_opt = opt; -} -EXPORT_SYMBOL(blk_limits_io_opt); - -/** - * blk_queue_io_opt - set optimal request size for the queue - * @q: the request queue for the device - * @opt: optimal request size in bytes - * - * Description: - * Storage devices may report an optimal I/O size, which is the - * device's preferred unit for sustained I/O. This is rarely reported - * for disk drives. For RAID arrays it is usually the stripe width or - * the internal track size. A properly aligned multiple of - * optimal_io_size is the preferred request size for workloads where - * sustained throughput is desired. - */ -void blk_queue_io_opt(struct request_queue *q, unsigned int opt) -{ - blk_limits_io_opt(&q->limits, opt); - if (!q->disk) + /* + * If atomic_write_hw_max is set, we have already stacked 1x bottom + * device, so check for compliance. + */ + if (t->atomic_write_hw_max) { + if (!blk_stack_atomic_writes_tail(t, b)) + goto unsupported; return; - q->disk->bdi->ra_pages = - max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); -} -EXPORT_SYMBOL(blk_queue_io_opt); - -static int queue_limit_alignment_offset(const struct queue_limits *lim, - sector_t sector) -{ - unsigned int granularity = max(lim->physical_block_size, lim->io_min); - unsigned int alignment = sector_div(sector, granularity >> SECTOR_SHIFT) - << SECTOR_SHIFT; - - return (granularity + lim->alignment_offset - alignment) % granularity; -} - -static unsigned int queue_limit_discard_alignment( - const struct queue_limits *lim, sector_t sector) -{ - unsigned int alignment, granularity, offset; - - if (!lim->max_discard_sectors) - return 0; - - /* Why are these in bytes, not sectors? */ - alignment = lim->discard_alignment >> SECTOR_SHIFT; - granularity = lim->discard_granularity >> SECTOR_SHIFT; - if (!granularity) - return 0; - - /* Offset of the partition start in 'granularity' sectors */ - offset = sector_div(sector, granularity); - - /* And why do we do this modulus *again* in blkdev_issue_discard()? */ - offset = (granularity + alignment - offset) % granularity; + } - /* Turn it back into bytes, gaah */ - return offset << SECTOR_SHIFT; -} + if (!blk_stack_atomic_writes_head(t, b)) + goto unsupported; + return; -static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lbs) -{ - sectors = round_down(sectors, lbs >> SECTOR_SHIFT); - if (sectors < PAGE_SIZE >> SECTOR_SHIFT) - sectors = PAGE_SIZE >> SECTOR_SHIFT; - return sectors; +unsupported: + t->atomic_write_hw_max = 0; + t->atomic_write_hw_unit_max = 0; + t->atomic_write_hw_unit_min = 0; + t->atomic_write_hw_boundary = 0; } /** @@ -747,14 +686,30 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, { unsigned int top, bottom, alignment, ret = 0; + t->features |= (b->features & BLK_FEAT_INHERIT_MASK); + + /* + * Some feaures need to be supported both by the stacking driver and all + * underlying devices. The stacking driver sets these flags before + * stacking the limits, and this will clear the flags if any of the + * underlying devices does not support it. + */ + if (!(b->features & BLK_FEAT_NOWAIT)) + t->features &= ~BLK_FEAT_NOWAIT; + if (!(b->features & BLK_FEAT_POLL)) + t->features &= ~BLK_FEAT_POLL; + + t->flags |= (b->flags & BLK_FLAG_MISALIGNED); + t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); + t->max_user_sectors = min_not_zero(t->max_user_sectors, + b->max_user_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, b->max_write_zeroes_sectors); - t->max_zone_append_sectors = min(t->max_zone_append_sectors, - b->max_zone_append_sectors); - t->bounce = max(t->bounce, b->bounce); + t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors, + b->max_hw_zone_append_sectors); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); @@ -770,8 +725,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); - t->misaligned |= b->misaligned; - alignment = queue_limit_alignment_offset(b, start); /* Bottom device has different alignment. Check that it is @@ -785,7 +738,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, /* Verify that top and bottom intervals line up */ if (max(top, bottom) % min(top, bottom)) { - t->misaligned = 1; + t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } } @@ -807,42 +760,38 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, /* Physical block size a multiple of the logical block size? */ if (t->physical_block_size & (t->logical_block_size - 1)) { t->physical_block_size = t->logical_block_size; - t->misaligned = 1; + t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } /* Minimum I/O a multiple of the physical block size? */ if (t->io_min & (t->physical_block_size - 1)) { t->io_min = t->physical_block_size; - t->misaligned = 1; + t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } /* Optimal I/O a multiple of the physical block size? */ if (t->io_opt & (t->physical_block_size - 1)) { t->io_opt = 0; - t->misaligned = 1; + t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } /* chunk_sectors a multiple of the physical block size? */ if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) { t->chunk_sectors = 0; - t->misaligned = 1; + t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } - t->raid_partial_stripes_expensive = - max(t->raid_partial_stripes_expensive, - b->raid_partial_stripes_expensive); - /* Find lowest common alignment_offset */ t->alignment_offset = lcm_not_zero(t->alignment_offset, alignment) % max(t->physical_block_size, t->io_min); /* Verify that new alignment_offset is on a logical block boundary */ if (t->alignment_offset & (t->logical_block_size - 1)) { - t->misaligned = 1; + t->flags |= BLK_FLAG_MISALIGNED; ret = -1; } @@ -854,16 +803,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, if (b->discard_granularity) { alignment = queue_limit_discard_alignment(b, start); - if (t->discard_granularity != 0 && - t->discard_alignment != alignment) { - top = t->discard_granularity + t->discard_alignment; - bottom = b->discard_granularity + alignment; - - /* Verify that top and bottom intervals line up */ - if ((max(top, bottom) % min(top, bottom)) != 0) - t->discard_misaligned = 1; - } - t->max_discard_sectors = min_not_zero(t->max_discard_sectors, b->max_discard_sectors); t->max_hw_discard_sectors = min_not_zero(t->max_hw_discard_sectors, @@ -877,11 +816,12 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->max_secure_erase_sectors); t->zone_write_granularity = max(t->zone_write_granularity, b->zone_write_granularity); - t->zoned = max(t->zoned, b->zoned); - if (!t->zoned) { + if (!(t->features & BLK_FEAT_ZONED)) { t->zone_write_granularity = 0; t->max_zone_append_sectors = 0; } + blk_stack_atomic_writes_limits(t, b, start); + return ret; } EXPORT_SYMBOL(blk_stack_limits); @@ -904,7 +844,7 @@ EXPORT_SYMBOL(blk_stack_limits); void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, sector_t offset, const char *pfx) { - if (blk_stack_limits(t, &bdev_get_queue(bdev)->limits, + if (blk_stack_limits(t, bdev_limits(bdev), get_start_sect(bdev) + offset)) pr_notice("%s: Warning: Device %pg is misaligned\n", pfx, bdev); @@ -912,96 +852,57 @@ void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, EXPORT_SYMBOL_GPL(queue_limits_stack_bdev); /** - * blk_queue_update_dma_pad - update pad mask - * @q: the request queue for the device - * @mask: pad mask - * - * Update dma pad mask. - * - * Appending pad buffer to a request modifies the last entry of a - * scatter list such that it includes the pad buffer. - **/ -void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask) -{ - if (mask > q->dma_pad_mask) - q->dma_pad_mask = mask; -} -EXPORT_SYMBOL(blk_queue_update_dma_pad); - -/** - * blk_queue_segment_boundary - set boundary rules for segment merging - * @q: the request queue for the device - * @mask: the memory boundary mask - **/ -void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) -{ - if (mask < PAGE_SIZE - 1) { - mask = PAGE_SIZE - 1; - pr_info("%s: set to minimum %lx\n", __func__, mask); - } - - q->limits.seg_boundary_mask = mask; -} -EXPORT_SYMBOL(blk_queue_segment_boundary); - -/** - * blk_queue_virt_boundary - set boundary rules for bio merging - * @q: the request queue for the device - * @mask: the memory boundary mask - **/ -void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask) -{ - q->limits.virt_boundary_mask = mask; - - /* - * Devices that require a virtual boundary do not support scatter/gather - * I/O natively, but instead require a descriptor list entry for each - * page (which might not be idential to the Linux PAGE_SIZE). Because - * of that they are not limited by our notion of "segment size". - */ - if (mask) - q->limits.max_segment_size = UINT_MAX; -} -EXPORT_SYMBOL(blk_queue_virt_boundary); - -/** - * blk_queue_dma_alignment - set dma length and memory alignment - * @q: the request queue for the device - * @mask: alignment mask + * queue_limits_stack_integrity - stack integrity profile + * @t: target queue limits + * @b: base queue limits * - * description: - * set required memory and length alignment for direct dma transactions. - * this is used when building direct io requests for the queue. + * Check if the integrity profile in the @b can be stacked into the + * target @t. Stacking is possible if either: * - **/ -void blk_queue_dma_alignment(struct request_queue *q, int mask) -{ - q->limits.dma_alignment = mask; -} -EXPORT_SYMBOL(blk_queue_dma_alignment); - -/** - * blk_queue_update_dma_alignment - update dma length and memory alignment - * @q: the request queue for the device - * @mask: alignment mask + * a) does not have any integrity information stacked into it yet + * b) the integrity profile in @b is identical to the one in @t * - * description: - * update required memory and length alignment for direct dma transactions. - * If the requested alignment is larger than the current alignment, then - * the current queue alignment is updated to the new value, otherwise it - * is left alone. The design of this is to allow multiple objects - * (driver, device, transport etc) to set their respective - * alignments without having them interfere. - * - **/ -void blk_queue_update_dma_alignment(struct request_queue *q, int mask) -{ - BUG_ON(mask > PAGE_SIZE); + * If @b can be stacked into @t, return %true. Else return %false and clear the + * integrity information in @t. + */ +bool queue_limits_stack_integrity(struct queue_limits *t, + struct queue_limits *b) +{ + struct blk_integrity *ti = &t->integrity; + struct blk_integrity *bi = &b->integrity; + + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) + return true; + + if (ti->flags & BLK_INTEGRITY_STACKED) { + if (ti->tuple_size != bi->tuple_size) + goto incompatible; + if (ti->interval_exp != bi->interval_exp) + goto incompatible; + if (ti->tag_size != bi->tag_size) + goto incompatible; + if (ti->csum_type != bi->csum_type) + goto incompatible; + if ((ti->flags & BLK_INTEGRITY_REF_TAG) != + (bi->flags & BLK_INTEGRITY_REF_TAG)) + goto incompatible; + } else { + ti->flags = BLK_INTEGRITY_STACKED; + ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) | + (bi->flags & BLK_INTEGRITY_REF_TAG); + ti->csum_type = bi->csum_type; + ti->tuple_size = bi->tuple_size; + ti->pi_offset = bi->pi_offset; + ti->interval_exp = bi->interval_exp; + ti->tag_size = bi->tag_size; + } + return true; - if (mask > q->limits.dma_alignment) - q->limits.dma_alignment = mask; +incompatible: + memset(ti, 0, sizeof(*ti)); + return false; } -EXPORT_SYMBOL(blk_queue_update_dma_alignment); +EXPORT_SYMBOL_GPL(queue_limits_stack_integrity); /** * blk_set_queue_depth - tell the block layer about the device queue depth @@ -1016,92 +917,11 @@ void blk_set_queue_depth(struct request_queue *q, unsigned int depth) } EXPORT_SYMBOL(blk_set_queue_depth); -/** - * blk_queue_write_cache - configure queue's write cache - * @q: the request queue for the device - * @wc: write back cache on or off - * @fua: device supports FUA writes, if true - * - * Tell the block layer about the write cache of @q. - */ -void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) -{ - if (wc) { - blk_queue_flag_set(QUEUE_FLAG_HW_WC, q); - blk_queue_flag_set(QUEUE_FLAG_WC, q); - } else { - blk_queue_flag_clear(QUEUE_FLAG_HW_WC, q); - blk_queue_flag_clear(QUEUE_FLAG_WC, q); - } - if (fua) - blk_queue_flag_set(QUEUE_FLAG_FUA, q); - else - blk_queue_flag_clear(QUEUE_FLAG_FUA, q); -} -EXPORT_SYMBOL_GPL(blk_queue_write_cache); - -/** - * blk_queue_required_elevator_features - Set a queue required elevator features - * @q: the request queue for the target device - * @features: Required elevator features OR'ed together - * - * Tell the block layer that for the device controlled through @q, only the - * only elevators that can be used are those that implement at least the set of - * features specified by @features. - */ -void blk_queue_required_elevator_features(struct request_queue *q, - unsigned int features) -{ - q->required_elevator_features = features; -} -EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features); - -/** - * blk_queue_can_use_dma_map_merging - configure queue for merging segments. - * @q: the request queue for the device - * @dev: the device pointer for dma - * - * Tell the block layer about merging the segments by dma map of @q. - */ -bool blk_queue_can_use_dma_map_merging(struct request_queue *q, - struct device *dev) -{ - unsigned long boundary = dma_get_merge_boundary(dev); - - if (!boundary) - return false; - - /* No need to update max_segment_size. see blk_queue_virt_boundary() */ - blk_queue_virt_boundary(q, boundary); - - return true; -} -EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging); - -/** - * disk_set_zoned - inidicate a zoned device - * @disk: gendisk to configure - */ -void disk_set_zoned(struct gendisk *disk) -{ - struct request_queue *q = disk->queue; - - WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)); - - /* - * Set the zone write granularity to the device logical block - * size by default. The driver can change this value if needed. - */ - q->limits.zoned = true; - blk_queue_zone_write_granularity(q, queue_logical_block_size(q)); -} -EXPORT_SYMBOL_GPL(disk_set_zoned); - int bdev_alignment_offset(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); - if (q->limits.misaligned) + if (q->limits.flags & BLK_FLAG_MISALIGNED) return -1; if (bdev_is_partition(bdev)) return queue_limit_alignment_offset(&q->limits, diff --git a/block/blk-stat.c b/block/blk-stat.c index e42c263e53fb..46449da856f8 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -57,9 +57,6 @@ void blk_stat_add(struct request *rq, u64 now) value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0; - if (req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE) - blk_throtl_stat_add(rq, value); - rcu_read_lock(); cpu = get_cpu(); list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { @@ -165,7 +162,7 @@ void blk_stat_remove_callback(struct request_queue *q, blk_queue_flag_clear(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); - del_timer_sync(&cb->timer); + timer_delete_sync(&cb->timer); } static void blk_stat_free_callback_rcu(struct rcu_head *head) diff --git a/block/blk-stat.h b/block/blk-stat.h index 17e1eb4ec7e2..9e05bf18d1be 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -64,7 +64,6 @@ struct blk_stat_callback { struct blk_queue_stats *blk_alloc_queue_stats(void); void blk_free_queue_stats(struct blk_queue_stats *); -bool blk_stats_alloc_enable(struct request_queue *q); void blk_stat_add(struct request *rq, u64 now); @@ -149,7 +148,7 @@ static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb, static inline void blk_stat_deactivate(struct blk_stat_callback *cb) { - del_timer_sync(&cb->timer); + timer_delete_sync(&cb->timer); } /** diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8c8f69d8ba48..b2b9b89d6967 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -22,14 +22,18 @@ struct queue_sysfs_entry { struct attribute attr; - ssize_t (*show)(struct request_queue *, char *); - ssize_t (*store)(struct request_queue *, const char *, size_t); + ssize_t (*show)(struct gendisk *disk, char *page); + ssize_t (*show_limit)(struct gendisk *disk, char *page); + + ssize_t (*store)(struct gendisk *disk, const char *page, size_t count); + int (*store_limit)(struct gendisk *disk, const char *page, + size_t count, struct queue_limits *lim); }; static ssize_t queue_var_show(unsigned long var, char *page) { - return sprintf(page, "%lu\n", var); + return sysfs_emit(page, "%lu\n", var); } static ssize_t @@ -47,16 +51,23 @@ queue_var_store(unsigned long *var, const char *page, size_t count) return count; } -static ssize_t queue_requests_show(struct request_queue *q, char *page) +static ssize_t queue_requests_show(struct gendisk *disk, char *page) { - return queue_var_show(q->nr_requests, page); + ssize_t ret; + + mutex_lock(&disk->queue->elevator_lock); + ret = queue_var_show(disk->queue->nr_requests, page); + mutex_unlock(&disk->queue->elevator_lock); + return ret; } static ssize_t -queue_requests_store(struct request_queue *q, const char *page, size_t count) +queue_requests_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nr; int ret, err; + unsigned int memflags; + struct request_queue *q = disk->queue; if (!queue_is_mq(q)) return -EINVAL; @@ -65,307 +76,294 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (ret < 0) return ret; + memflags = blk_mq_freeze_queue(q); + mutex_lock(&q->elevator_lock); if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; - err = blk_mq_update_nr_requests(q, nr); + err = blk_mq_update_nr_requests(disk->queue, nr); if (err) - return err; - + ret = err; + mutex_unlock(&q->elevator_lock); + blk_mq_unfreeze_queue(q, memflags); return ret; } -static ssize_t queue_ra_show(struct request_queue *q, char *page) +static ssize_t queue_ra_show(struct gendisk *disk, char *page) { - unsigned long ra_kb; + ssize_t ret; - if (!q->disk) - return -EINVAL; - ra_kb = q->disk->bdi->ra_pages << (PAGE_SHIFT - 10); - return queue_var_show(ra_kb, page); + mutex_lock(&disk->queue->limits_lock); + ret = queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page); + mutex_unlock(&disk->queue->limits_lock); + + return ret; } static ssize_t -queue_ra_store(struct request_queue *q, const char *page, size_t count) +queue_ra_store(struct gendisk *disk, const char *page, size_t count) { unsigned long ra_kb; ssize_t ret; + unsigned int memflags; + struct request_queue *q = disk->queue; - if (!q->disk) - return -EINVAL; ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; - q->disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); - return ret; -} - -static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) -{ - int max_sectors_kb = queue_max_sectors(q) >> 1; - - return queue_var_show(max_sectors_kb, page); -} - -static ssize_t queue_max_segments_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_max_segments(q), page); -} - -static ssize_t queue_max_discard_segments_show(struct request_queue *q, - char *page) -{ - return queue_var_show(queue_max_discard_segments(q), page); -} - -static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) -{ - return queue_var_show(q->limits.max_integrity_segments, page); -} - -static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_max_segment_size(q), page); -} - -static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_logical_block_size(q), page); -} - -static ssize_t queue_physical_block_size_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_physical_block_size(q), page); -} + /* + * ->ra_pages is protected by ->limits_lock because it is usually + * calculated from the queue limits by queue_limits_commit_update. + */ + mutex_lock(&q->limits_lock); + memflags = blk_mq_freeze_queue(q); + disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); + mutex_unlock(&q->limits_lock); + blk_mq_unfreeze_queue(q, memflags); -static ssize_t queue_chunk_sectors_show(struct request_queue *q, char *page) -{ - return queue_var_show(q->limits.chunk_sectors, page); + return ret; } -static ssize_t queue_io_min_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_io_min(q), page); +#define QUEUE_SYSFS_LIMIT_SHOW(_field) \ +static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ +{ \ + return queue_var_show(disk->queue->limits._field, page); \ +} + +QUEUE_SYSFS_LIMIT_SHOW(max_segments) +QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments) +QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments) +QUEUE_SYSFS_LIMIT_SHOW(max_segment_size) +QUEUE_SYSFS_LIMIT_SHOW(max_write_streams) +QUEUE_SYSFS_LIMIT_SHOW(write_stream_granularity) +QUEUE_SYSFS_LIMIT_SHOW(logical_block_size) +QUEUE_SYSFS_LIMIT_SHOW(physical_block_size) +QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors) +QUEUE_SYSFS_LIMIT_SHOW(io_min) +QUEUE_SYSFS_LIMIT_SHOW(io_opt) +QUEUE_SYSFS_LIMIT_SHOW(discard_granularity) +QUEUE_SYSFS_LIMIT_SHOW(zone_write_granularity) +QUEUE_SYSFS_LIMIT_SHOW(virt_boundary_mask) +QUEUE_SYSFS_LIMIT_SHOW(dma_alignment) +QUEUE_SYSFS_LIMIT_SHOW(max_open_zones) +QUEUE_SYSFS_LIMIT_SHOW(max_active_zones) +QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_min) +QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_max) + +#define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(_field) \ +static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ +{ \ + return sysfs_emit(page, "%llu\n", \ + (unsigned long long)disk->queue->limits._field << \ + SECTOR_SHIFT); \ } -static ssize_t queue_io_opt_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_io_opt(q), page); -} +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_discard_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors) -static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page) -{ - return queue_var_show(q->limits.discard_granularity, page); +#define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(_field) \ +static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ +{ \ + return queue_var_show(disk->queue->limits._field >> 1, page); \ } -static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page) -{ +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_hw_sectors) - return sprintf(page, "%llu\n", - (unsigned long long)q->limits.max_hw_discard_sectors << 9); +#define QUEUE_SYSFS_SHOW_CONST(_name, _val) \ +static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ +{ \ + return sysfs_emit(page, "%d\n", _val); \ } -static ssize_t queue_discard_max_show(struct request_queue *q, char *page) -{ - return sprintf(page, "%llu\n", - (unsigned long long)q->limits.max_discard_sectors << 9); -} +/* deprecated fields */ +QUEUE_SYSFS_SHOW_CONST(discard_zeroes_data, 0) +QUEUE_SYSFS_SHOW_CONST(write_same_max, 0) +QUEUE_SYSFS_SHOW_CONST(poll_delay, -1) -static ssize_t queue_discard_max_store(struct request_queue *q, - const char *page, size_t count) +static int queue_max_discard_sectors_store(struct gendisk *disk, + const char *page, size_t count, struct queue_limits *lim) { unsigned long max_discard_bytes; - struct queue_limits lim; ssize_t ret; - int err; ret = queue_var_store(&max_discard_bytes, page, count); if (ret < 0) return ret; - if (max_discard_bytes & (q->limits.discard_granularity - 1)) + if (max_discard_bytes & (disk->queue->limits.discard_granularity - 1)) return -EINVAL; if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX) return -EINVAL; - blk_mq_freeze_queue(q); - lim = queue_limits_start_update(q); - lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; - err = queue_limits_commit_update(q, &lim); - blk_mq_unfreeze_queue(q); - - if (err) - return err; - return ret; -} - -static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) -{ - return queue_var_show(0, page); -} - -static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) -{ - return queue_var_show(0, page); -} - -static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) -{ - return sprintf(page, "%llu\n", - (unsigned long long)q->limits.max_write_zeroes_sectors << 9); -} - -static ssize_t queue_zone_write_granularity_show(struct request_queue *q, - char *page) -{ - return queue_var_show(queue_zone_write_granularity(q), page); -} - -static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) -{ - unsigned long long max_sectors = q->limits.max_zone_append_sectors; - - return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT); + lim->max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; + return 0; } -static ssize_t -queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) +static int +queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count, + struct queue_limits *lim) { unsigned long max_sectors_kb; - struct queue_limits lim; ssize_t ret; - int err; ret = queue_var_store(&max_sectors_kb, page, count); if (ret < 0) return ret; - blk_mq_freeze_queue(q); - lim = queue_limits_start_update(q); - lim.max_user_sectors = max_sectors_kb << 1; - err = queue_limits_commit_update(q, &lim); - blk_mq_unfreeze_queue(q); - if (err) - return err; - return ret; + lim->max_user_sectors = max_sectors_kb << 1; + return 0; } -static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) +static ssize_t queue_feature_store(struct gendisk *disk, const char *page, + size_t count, struct queue_limits *lim, blk_features_t feature) { - int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1; - - return queue_var_show(max_hw_sectors_kb, page); -} + unsigned long val; + ssize_t ret; -static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page) -{ - return queue_var_show(q->limits.virt_boundary_mask, page); -} + ret = queue_var_store(&val, page, count); + if (ret < 0) + return ret; -static ssize_t queue_dma_alignment_show(struct request_queue *q, char *page) -{ - return queue_var_show(queue_dma_alignment(q), page); + if (val) + lim->features |= feature; + else + lim->features &= ~feature; + return 0; } -#define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \ -static ssize_t \ -queue_##name##_show(struct request_queue *q, char *page) \ +#define QUEUE_SYSFS_FEATURE(_name, _feature) \ +static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ - int bit; \ - bit = test_bit(QUEUE_FLAG_##flag, &q->queue_flags); \ - return queue_var_show(neg ? !bit : bit, page); \ + return sysfs_emit(page, "%u\n", \ + !!(disk->queue->limits.features & _feature)); \ } \ -static ssize_t \ -queue_##name##_store(struct request_queue *q, const char *page, size_t count) \ +static int queue_##_name##_store(struct gendisk *disk, \ + const char *page, size_t count, struct queue_limits *lim) \ +{ \ + return queue_feature_store(disk, page, count, lim, _feature); \ +} + +QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL) +QUEUE_SYSFS_FEATURE(add_random, BLK_FEAT_ADD_RANDOM) +QUEUE_SYSFS_FEATURE(iostats, BLK_FEAT_IO_STAT) +QUEUE_SYSFS_FEATURE(stable_writes, BLK_FEAT_STABLE_WRITES); + +#define QUEUE_SYSFS_FEATURE_SHOW(_name, _feature) \ +static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ { \ - unsigned long val; \ - ssize_t ret; \ - ret = queue_var_store(&val, page, count); \ - if (ret < 0) \ - return ret; \ - if (neg) \ - val = !val; \ - \ - if (val) \ - blk_queue_flag_set(QUEUE_FLAG_##flag, q); \ - else \ - blk_queue_flag_clear(QUEUE_FLAG_##flag, q); \ - return ret; \ + return sysfs_emit(page, "%u\n", \ + !!(disk->queue->limits.features & _feature)); \ } -QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1); -QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0); -QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0); -QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0); -#undef QUEUE_SYSFS_BIT_FNS +QUEUE_SYSFS_FEATURE_SHOW(fua, BLK_FEAT_FUA); +QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX); + +static ssize_t queue_poll_show(struct gendisk *disk, char *page) +{ + if (queue_is_mq(disk->queue)) + return sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue)); + + return sysfs_emit(page, "%u\n", + !!(disk->queue->limits.features & BLK_FEAT_POLL)); +} -static ssize_t queue_zoned_show(struct request_queue *q, char *page) +static ssize_t queue_zoned_show(struct gendisk *disk, char *page) { - if (blk_queue_is_zoned(q)) - return sprintf(page, "host-managed\n"); - return sprintf(page, "none\n"); + if (blk_queue_is_zoned(disk->queue)) + return sysfs_emit(page, "host-managed\n"); + return sysfs_emit(page, "none\n"); } -static ssize_t queue_nr_zones_show(struct request_queue *q, char *page) +static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page) { - return queue_var_show(disk_nr_zones(q->disk), page); + return queue_var_show(disk_nr_zones(disk), page); } -static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page) +static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page) { - return queue_var_show(bdev_max_open_zones(q->disk->part0), page); + return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page); } -static ssize_t queue_max_active_zones_show(struct request_queue *q, char *page) +static int queue_iostats_passthrough_store(struct gendisk *disk, + const char *page, size_t count, struct queue_limits *lim) { - return queue_var_show(bdev_max_active_zones(q->disk->part0), page); + unsigned long ios; + ssize_t ret; + + ret = queue_var_store(&ios, page, count); + if (ret < 0) + return ret; + + if (ios) + lim->flags |= BLK_FLAG_IOSTATS_PASSTHROUGH; + else + lim->flags &= ~BLK_FLAG_IOSTATS_PASSTHROUGH; + return 0; } -static ssize_t queue_nomerges_show(struct request_queue *q, char *page) +static ssize_t queue_nomerges_show(struct gendisk *disk, char *page) { - return queue_var_show((blk_queue_nomerges(q) << 1) | - blk_queue_noxmerges(q), page); + return queue_var_show((blk_queue_nomerges(disk->queue) << 1) | + blk_queue_noxmerges(disk->queue), page); } -static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, +static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nm; + unsigned int memflags; + struct request_queue *q = disk->queue; ssize_t ret = queue_var_store(&nm, page, count); if (ret < 0) return ret; + memflags = blk_mq_freeze_queue(q); blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); if (nm == 2) blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); else if (nm) blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); + blk_mq_unfreeze_queue(q, memflags); return ret; } -static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page) +static ssize_t queue_rq_affinity_show(struct gendisk *disk, char *page) { - bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags); - bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags); + bool set = test_bit(QUEUE_FLAG_SAME_COMP, &disk->queue->queue_flags); + bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &disk->queue->queue_flags); return queue_var_show(set << force, page); } static ssize_t -queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) +queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) { ssize_t ret = -EINVAL; #ifdef CONFIG_SMP + struct request_queue *q = disk->queue; unsigned long val; + unsigned int memflags; ret = queue_var_store(&val, page, count); if (ret < 0) return ret; + /* + * Here we update two queue flags each using atomic bitops, although + * updating two flags isn't atomic it should be harmless as those flags + * are accessed individually using atomic test_bit operation. So we + * don't grab any lock while updating these flags. + */ + memflags = blk_mq_freeze_queue(q); if (val == 2) { blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); @@ -376,89 +374,87 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } + blk_mq_unfreeze_queue(q, memflags); #endif return ret; } -static ssize_t queue_poll_delay_show(struct request_queue *q, char *page) -{ - return sprintf(page, "%d\n", -1); -} - -static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page, +static ssize_t queue_poll_delay_store(struct gendisk *disk, const char *page, size_t count) { return count; } -static ssize_t queue_poll_show(struct request_queue *q, char *page) -{ - return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page); -} - -static ssize_t queue_poll_store(struct request_queue *q, const char *page, +static ssize_t queue_poll_store(struct gendisk *disk, const char *page, size_t count) { - if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - return -EINVAL; + unsigned int memflags; + ssize_t ret = count; + struct request_queue *q = disk->queue; + + memflags = blk_mq_freeze_queue(q); + if (!(q->limits.features & BLK_FEAT_POLL)) { + ret = -EINVAL; + goto out; + } + pr_info_ratelimited("writes to the poll attribute are ignored.\n"); pr_info_ratelimited("please use driver specific parameters instead.\n"); - return count; +out: + blk_mq_unfreeze_queue(q, memflags); + return ret; } -static ssize_t queue_io_timeout_show(struct request_queue *q, char *page) +static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page) { - return sprintf(page, "%u\n", jiffies_to_msecs(q->rq_timeout)); + return sysfs_emit(page, "%u\n", + jiffies_to_msecs(READ_ONCE(disk->queue->rq_timeout))); } -static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page, +static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, size_t count) { - unsigned int val; + unsigned int val, memflags; int err; + struct request_queue *q = disk->queue; err = kstrtou32(page, 10, &val); if (err || val == 0) return -EINVAL; + memflags = blk_mq_freeze_queue(q); blk_queue_rq_timeout(q, msecs_to_jiffies(val)); + blk_mq_unfreeze_queue(q, memflags); return count; } -static ssize_t queue_wc_show(struct request_queue *q, char *page) +static ssize_t queue_wc_show(struct gendisk *disk, char *page) { - if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) - return sprintf(page, "write back\n"); - - return sprintf(page, "write through\n"); + if (blk_queue_write_cache(disk->queue)) + return sysfs_emit(page, "write back\n"); + return sysfs_emit(page, "write through\n"); } -static ssize_t queue_wc_store(struct request_queue *q, const char *page, - size_t count) +static int queue_wc_store(struct gendisk *disk, const char *page, + size_t count, struct queue_limits *lim) { + bool disable; + if (!strncmp(page, "write back", 10)) { - if (!test_bit(QUEUE_FLAG_HW_WC, &q->queue_flags)) - return -EINVAL; - blk_queue_flag_set(QUEUE_FLAG_WC, q); + disable = false; } else if (!strncmp(page, "write through", 13) || - !strncmp(page, "none", 4)) { - blk_queue_flag_clear(QUEUE_FLAG_WC, q); + !strncmp(page, "none", 4)) { + disable = true; } else { return -EINVAL; } - return count; -} - -static ssize_t queue_fua_show(struct request_queue *q, char *page) -{ - return sprintf(page, "%u\n", test_bit(QUEUE_FLAG_FUA, &q->queue_flags)); -} - -static ssize_t queue_dax_show(struct request_queue *q, char *page) -{ - return queue_var_show(blk_queue_dax(q), page); + if (disable) + lim->flags |= BLK_FLAG_WRITE_CACHE_DISABLED; + else + lim->flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED; + return 0; } #define QUEUE_RO_ENTRY(_prefix, _name) \ @@ -474,62 +470,80 @@ static struct queue_sysfs_entry _prefix##_entry = { \ .store = _prefix##_store, \ }; +#define QUEUE_LIM_RO_ENTRY(_prefix, _name) \ +static struct queue_sysfs_entry _prefix##_entry = { \ + .attr = { .name = _name, .mode = 0444 }, \ + .show_limit = _prefix##_show, \ +} + +#define QUEUE_LIM_RW_ENTRY(_prefix, _name) \ +static struct queue_sysfs_entry _prefix##_entry = { \ + .attr = { .name = _name, .mode = 0644 }, \ + .show_limit = _prefix##_show, \ + .store_limit = _prefix##_store, \ +} + QUEUE_RW_ENTRY(queue_requests, "nr_requests"); QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb"); -QUEUE_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); -QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); -QUEUE_RO_ENTRY(queue_max_segments, "max_segments"); -QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); -QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size"); +QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); +QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); +QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments"); +QUEUE_LIM_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); +QUEUE_LIM_RO_ENTRY(queue_max_segment_size, "max_segment_size"); +QUEUE_LIM_RO_ENTRY(queue_max_write_streams, "max_write_streams"); +QUEUE_LIM_RO_ENTRY(queue_write_stream_granularity, "write_stream_granularity"); QUEUE_RW_ENTRY(elv_iosched, "scheduler"); -QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size"); -QUEUE_RO_ENTRY(queue_physical_block_size, "physical_block_size"); -QUEUE_RO_ENTRY(queue_chunk_sectors, "chunk_sectors"); -QUEUE_RO_ENTRY(queue_io_min, "minimum_io_size"); -QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size"); +QUEUE_LIM_RO_ENTRY(queue_logical_block_size, "logical_block_size"); +QUEUE_LIM_RO_ENTRY(queue_physical_block_size, "physical_block_size"); +QUEUE_LIM_RO_ENTRY(queue_chunk_sectors, "chunk_sectors"); +QUEUE_LIM_RO_ENTRY(queue_io_min, "minimum_io_size"); +QUEUE_LIM_RO_ENTRY(queue_io_opt, "optimal_io_size"); -QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); -QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity"); -QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes"); -QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes"); +QUEUE_LIM_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); +QUEUE_LIM_RO_ENTRY(queue_discard_granularity, "discard_granularity"); +QUEUE_LIM_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes"); +QUEUE_LIM_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes"); QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); +QUEUE_LIM_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes"); +QUEUE_LIM_RO_ENTRY(queue_atomic_write_boundary_sectors, + "atomic_write_boundary_bytes"); +QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes"); +QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); + QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); -QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes"); -QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); -QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); +QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes"); +QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); +QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); -QUEUE_RO_ENTRY(queue_zoned, "zoned"); +QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned"); QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); -QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones"); -QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones"); +QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones"); +QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones"); QUEUE_RW_ENTRY(queue_nomerges, "nomerges"); +QUEUE_LIM_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough"); QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity"); QUEUE_RW_ENTRY(queue_poll, "io_poll"); QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay"); -QUEUE_RW_ENTRY(queue_wc, "write_cache"); -QUEUE_RO_ENTRY(queue_fua, "fua"); -QUEUE_RO_ENTRY(queue_dax, "dax"); +QUEUE_LIM_RW_ENTRY(queue_wc, "write_cache"); +QUEUE_LIM_RO_ENTRY(queue_fua, "fua"); +QUEUE_LIM_RO_ENTRY(queue_dax, "dax"); QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); -QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); -QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment"); - -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time"); -#endif +QUEUE_LIM_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); +QUEUE_LIM_RO_ENTRY(queue_dma_alignment, "dma_alignment"); /* legacy alias for logical_block_size: */ static struct queue_sysfs_entry queue_hw_sector_size_entry = { - .attr = {.name = "hw_sector_size", .mode = 0444 }, - .show = queue_logical_block_size_show, + .attr = {.name = "hw_sector_size", .mode = 0444 }, + .show_limit = queue_logical_block_size_show, }; -QUEUE_RW_ENTRY(queue_nonrot, "rotational"); -QUEUE_RW_ENTRY(queue_iostats, "iostats"); -QUEUE_RW_ENTRY(queue_random, "add_random"); -QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes"); +QUEUE_LIM_RW_ENTRY(queue_rotational, "rotational"); +QUEUE_LIM_RW_ENTRY(queue_iostats, "iostats"); +QUEUE_LIM_RW_ENTRY(queue_add_random, "add_random"); +QUEUE_LIM_RW_ENTRY(queue_stable_writes, "stable_writes"); #ifdef CONFIG_BLK_WBT static ssize_t queue_var_store64(s64 *var, const char *page) @@ -545,23 +559,36 @@ static ssize_t queue_var_store64(s64 *var, const char *page) return 0; } -static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) +static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) { - if (!wbt_rq_qos(q)) - return -EINVAL; + ssize_t ret; + struct request_queue *q = disk->queue; - if (wbt_disabled(q)) - return sprintf(page, "0\n"); + mutex_lock(&disk->rqos_state_mutex); + if (!wbt_rq_qos(q)) { + ret = -EINVAL; + goto out; + } + + if (wbt_disabled(q)) { + ret = sysfs_emit(page, "0\n"); + goto out; + } - return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); + ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); +out: + mutex_unlock(&disk->rqos_state_mutex); + return ret; } -static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, +static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, size_t count) { + struct request_queue *q = disk->queue; struct rq_qos *rqos; ssize_t ret; s64 val; + unsigned int memflags; ret = queue_var_store64(&val, page); if (ret < 0) @@ -569,35 +596,40 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, if (val < -1) return -EINVAL; + memflags = blk_mq_freeze_queue(q); + rqos = wbt_rq_qos(q); if (!rqos) { - ret = wbt_init(q->disk); + ret = wbt_init(disk); if (ret) - return ret; + goto out; } + ret = count; if (val == -1) val = wbt_default_latency_nsec(q); else if (val >= 0) val *= 1000ULL; if (wbt_get_min_lat(q) == val) - return count; + goto out; /* * Ensure that the queue is idled, in case the latency update * ends up either enabling or disabling wbt completely. We can't * have IO inflight if that happens. */ - blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); + mutex_lock(&disk->rqos_state_mutex); wbt_set_min_lat(q, val); + mutex_unlock(&disk->rqos_state_mutex); blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); +out: + blk_mq_unfreeze_queue(q, memflags); - return count; + return ret; } QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); @@ -605,13 +637,17 @@ QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); /* Common attributes for bio-based and request-based queues. */ static struct attribute *queue_attrs[] = { - &queue_ra_entry.attr, + /* + * Attributes which are protected with q->limits_lock. + */ &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, &queue_max_segments_entry.attr, &queue_max_discard_segments_entry.attr, &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, + &queue_max_write_streams_entry.attr, + &queue_write_stream_granularity_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, &queue_physical_block_size_entry.attr, @@ -619,44 +655,60 @@ static struct attribute *queue_attrs[] = { &queue_io_min_entry.attr, &queue_io_opt_entry.attr, &queue_discard_granularity_entry.attr, - &queue_discard_max_entry.attr, - &queue_discard_max_hw_entry.attr, - &queue_discard_zeroes_data_entry.attr, - &queue_write_same_max_entry.attr, - &queue_write_zeroes_max_entry.attr, - &queue_zone_append_max_entry.attr, + &queue_max_discard_sectors_entry.attr, + &queue_max_hw_discard_sectors_entry.attr, + &queue_atomic_write_max_sectors_entry.attr, + &queue_atomic_write_boundary_sectors_entry.attr, + &queue_atomic_write_unit_min_entry.attr, + &queue_atomic_write_unit_max_entry.attr, + &queue_max_write_zeroes_sectors_entry.attr, + &queue_max_zone_append_sectors_entry.attr, &queue_zone_write_granularity_entry.attr, - &queue_nonrot_entry.attr, + &queue_rotational_entry.attr, &queue_zoned_entry.attr, - &queue_nr_zones_entry.attr, &queue_max_open_zones_entry.attr, &queue_max_active_zones_entry.attr, - &queue_nomerges_entry.attr, + &queue_iostats_passthrough_entry.attr, &queue_iostats_entry.attr, &queue_stable_writes_entry.attr, - &queue_random_entry.attr, - &queue_poll_entry.attr, + &queue_add_random_entry.attr, &queue_wc_entry.attr, &queue_fua_entry.attr, &queue_dax_entry.attr, - &queue_poll_delay_entry.attr, -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - &blk_throtl_sample_time_entry.attr, -#endif &queue_virt_boundary_mask_entry.attr, &queue_dma_alignment_entry.attr, + &queue_ra_entry.attr, + + /* + * Attributes which don't require locking. + */ + &queue_discard_zeroes_data_entry.attr, + &queue_write_same_max_entry.attr, + &queue_nr_zones_entry.attr, + &queue_nomerges_entry.attr, + &queue_poll_entry.attr, + &queue_poll_delay_entry.attr, + NULL, }; /* Request-based queue attributes that are not relevant for bio-based queues. */ static struct attribute *blk_mq_queue_attrs[] = { - &queue_requests_entry.attr, + /* + * Attributes which require some form of locking other than + * q->sysfs_lock. + */ &elv_iosched_entry.attr, - &queue_rq_affinity_entry.attr, - &queue_io_timeout_entry.attr, + &queue_requests_entry.attr, #ifdef CONFIG_BLK_WBT &queue_wb_lat_entry.attr, #endif + /* + * Attributes which don't require locking. + */ + &queue_rq_affinity_entry.attr, + &queue_io_timeout_entry.attr, + NULL, }; @@ -706,15 +758,20 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct queue_sysfs_entry *entry = to_queue(attr); struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); - struct request_queue *q = disk->queue; - ssize_t res; - if (!entry->show) + if (!entry->show && !entry->show_limit) return -EIO; - mutex_lock(&q->sysfs_lock); - res = entry->show(q, page); - mutex_unlock(&q->sysfs_lock); - return res; + + if (entry->show_limit) { + ssize_t res; + + mutex_lock(&disk->queue->limits_lock); + res = entry->show_limit(disk, page); + mutex_unlock(&disk->queue->limits_lock); + return res; + } + + return entry->show(disk, page); } static ssize_t @@ -724,15 +781,28 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, struct queue_sysfs_entry *entry = to_queue(attr); struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; - ssize_t res; - if (!entry->store) + if (!entry->store_limit && !entry->store) return -EIO; - mutex_lock(&q->sysfs_lock); - res = entry->store(q, page, length); - mutex_unlock(&q->sysfs_lock); - return res; + if (entry->store_limit) { + ssize_t res; + + struct queue_limits lim = queue_limits_start_update(q); + + res = entry->store_limit(disk, page, length, &lim); + if (res < 0) { + queue_limits_cancel_update(q); + return res; + } + + res = queue_limits_commit_update_frozen(q, &lim); + if (res) + return res; + return length; + } + + return entry->store(disk, page, length); } static const struct sysfs_ops queue_sysfs_ops = { @@ -779,7 +849,6 @@ int blk_register_queue(struct gendisk *disk) struct request_queue *q = disk->queue; int ret; - mutex_lock(&q->sysfs_dir_lock); kobject_init(&disk->queue_kobj, &blk_queue_ktype); ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue"); if (ret < 0) @@ -802,26 +871,21 @@ int blk_register_queue(struct gendisk *disk) if (ret) goto out_debugfs_remove; - if (q->elevator) { - ret = elv_register_queue(q, false); - if (ret) - goto out_unregister_ia_ranges; - } - ret = blk_crypto_sysfs_register(disk); if (ret) - goto out_elv_unregister; + goto out_unregister_ia_ranges; - blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); + if (queue_is_mq(q)) + elevator_set_default(q); wbt_enable_default(disk); - blk_throtl_register(disk); + + blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&disk->queue_kobj, KOBJ_ADD); if (q->elevator) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock); - mutex_unlock(&q->sysfs_dir_lock); /* * SCSI probing may synchronously create and destroy a lot of @@ -832,23 +896,20 @@ int blk_register_queue(struct gendisk *disk) * faster to shut down and is made fully functional here as * request_queues for non-existent devices never get registered. */ - if (!blk_queue_init_done(q)) { - blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); - percpu_ref_switch_to_percpu(&q->q_usage_counter); - } + blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); + percpu_ref_switch_to_percpu(&q->q_usage_counter); return ret; -out_elv_unregister: - elv_unregister_queue(q); out_unregister_ia_ranges: disk_unregister_independent_access_ranges(disk); out_debugfs_remove: blk_debugfs_remove(disk); mutex_unlock(&q->sysfs_lock); + if (queue_is_mq(q)) + blk_mq_sysfs_unregister(disk); out_put_queue_kobj: kobject_put(&disk->queue_kobj); - mutex_unlock(&q->sysfs_dir_lock); return ret; } @@ -879,7 +940,6 @@ void blk_unregister_queue(struct gendisk *disk) blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q); mutex_unlock(&q->sysfs_lock); - mutex_lock(&q->sysfs_dir_lock); /* * Remove the sysfs attributes before unregistering the queue data * structures that can be modified through sysfs. @@ -889,14 +949,15 @@ void blk_unregister_queue(struct gendisk *disk) blk_crypto_sysfs_unregister(disk); mutex_lock(&q->sysfs_lock); - elv_unregister_queue(q); disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); /* Now that we've deleted all child objects, we can delete the queue. */ kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE); kobject_del(&disk->queue_kobj); - mutex_unlock(&q->sysfs_dir_lock); + + if (queue_is_mq(q)) + elevator_set_none(q); blk_debugfs_remove(disk); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index f4850a6f860b..bd15357f23bd 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -25,37 +25,12 @@ #define DFL_THROTL_SLICE_HD (HZ / 10) #define DFL_THROTL_SLICE_SSD (HZ / 50) #define MAX_THROTL_SLICE (HZ) -#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */ -#define MIN_THROTL_BPS (320 * 1024) -#define MIN_THROTL_IOPS (10) -#define DFL_LATENCY_TARGET (-1L) -#define DFL_IDLE_THRESHOLD (0) -#define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */ -#define LATENCY_FILTERED_SSD (0) -/* - * For HD, very small latency comes from sequential IO. Such IO is helpless to - * help determine if its IO is impacted by others, hence we ignore the IO - */ -#define LATENCY_FILTERED_HD (1000L) /* 1ms */ /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) -/* We measure latency for request size from <= 4k to >= 1M */ -#define LATENCY_BUCKET_SIZE 9 - -struct latency_bucket { - unsigned long total_latency; /* ns / 1024 */ - int samples; -}; - -struct avg_latency_bucket { - unsigned long latency; /* ns / 1024 */ - bool valid; -}; - struct throtl_data { /* service tree for active throtl groups */ @@ -70,19 +45,6 @@ struct throtl_data /* Work for dispatching throttled bios */ struct work_struct dispatch_work; - unsigned int limit_index; - bool limit_valid[LIMIT_CNT]; - - unsigned long low_upgrade_time; - unsigned long low_downgrade_time; - - unsigned int scale; - - struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE]; - struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE]; - struct latency_bucket __percpu *latency_buckets[2]; - unsigned long last_calculate_time; - unsigned long filtered_latency; bool track_bio_latency; }; @@ -126,89 +88,26 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) return container_of(sq, struct throtl_data, service_queue); } -/* - * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to - * make the IO dispatch more smooth. - * Scale up: linearly scale up according to elapsed time since upgrade. For - * every throtl_slice, the limit scales up 1/2 .low limit till the - * limit hits .max limit - * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit - */ -static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td) -{ - /* arbitrary value to avoid too big scale */ - if (td->scale < 4096 && time_after_eq(jiffies, - td->low_upgrade_time + td->scale * td->throtl_slice)) - td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice; - - return low + (low >> 1) * td->scale; -} - static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) { struct blkcg_gq *blkg = tg_to_blkg(tg); - struct throtl_data *td; - uint64_t ret; if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return U64_MAX; - td = tg->td; - ret = tg->bps[rw][td->limit_index]; - if (ret == 0 && td->limit_index == LIMIT_LOW) { - /* intermediate node or iops isn't 0 */ - if (!list_empty(&blkg->blkcg->css.children) || - tg->iops[rw][td->limit_index]) - return U64_MAX; - else - return MIN_THROTL_BPS; - } - - if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] && - tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) { - uint64_t adjusted; - - adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td); - ret = min(tg->bps[rw][LIMIT_MAX], adjusted); - } - return ret; + return tg->bps[rw]; } static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) { struct blkcg_gq *blkg = tg_to_blkg(tg); - struct throtl_data *td; - unsigned int ret; if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return UINT_MAX; - td = tg->td; - ret = tg->iops[rw][td->limit_index]; - if (ret == 0 && tg->td->limit_index == LIMIT_LOW) { - /* intermediate node or bps isn't 0 */ - if (!list_empty(&blkg->blkcg->css.children) || - tg->bps[rw][td->limit_index]) - return UINT_MAX; - else - return MIN_THROTL_IOPS; - } - - if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] && - tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) { - uint64_t adjusted; - - adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td); - if (adjusted > UINT_MAX) - adjusted = UINT_MAX; - ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted); - } - return ret; + return tg->iops[rw]; } -#define request_bucket_index(sectors) \ - clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1) - /** * throtl_log - log debug message via blktrace * @sq: the service_queue being reported @@ -244,7 +143,8 @@ static inline unsigned int throtl_bio_data_size(struct bio *bio) static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) { INIT_LIST_HEAD(&qn->node); - bio_list_init(&qn->bios); + bio_list_init(&qn->bios_bps); + bio_list_init(&qn->bios_iops); qn->tg = tg; } @@ -252,18 +152,32 @@ static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it * @bio: bio being added * @qn: qnode to add bio to - * @queued: the service_queue->queued[] list @qn belongs to + * @sq: the service_queue @qn belongs to * - * Add @bio to @qn and put @qn on @queued if it's not already on. + * Add @bio to @qn and put @qn on @sq->queued if it's not already on. * @qn->tg's reference count is bumped when @qn is activated. See the * comment on top of throtl_qnode definition for details. */ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, - struct list_head *queued) + struct throtl_service_queue *sq) { - bio_list_add(&qn->bios, bio); + bool rw = bio_data_dir(bio); + + /* + * Split bios have already been throttled by bps, so they are + * directly queued into the iops path. + */ + if (bio_flagged(bio, BIO_TG_BPS_THROTTLED) || + bio_flagged(bio, BIO_BPS_THROTTLED)) { + bio_list_add(&qn->bios_iops, bio); + sq->nr_queued_iops[rw]++; + } else { + bio_list_add(&qn->bios_bps, bio); + sq->nr_queued_bps[rw]++; + } + if (list_empty(&qn->node)) { - list_add_tail(&qn->node, queued); + list_add_tail(&qn->node, &sq->queued[rw]); blkg_get(tg_to_blkg(qn->tg)); } } @@ -271,6 +185,10 @@ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, /** * throtl_peek_queued - peek the first bio on a qnode list * @queued: the qnode list to peek + * + * Always take a bio from the head of the iops queue first. If the queue is + * empty, we then take it from the bps queue to maintain the overall idea of + * fetching bios from the head. */ static struct bio *throtl_peek_queued(struct list_head *queued) { @@ -281,28 +199,33 @@ static struct bio *throtl_peek_queued(struct list_head *queued) return NULL; qn = list_first_entry(queued, struct throtl_qnode, node); - bio = bio_list_peek(&qn->bios); + bio = bio_list_peek(&qn->bios_iops); + if (!bio) + bio = bio_list_peek(&qn->bios_bps); WARN_ON_ONCE(!bio); return bio; } /** * throtl_pop_queued - pop the first bio form a qnode list - * @queued: the qnode list to pop a bio from + * @sq: the service_queue to pop a bio from * @tg_to_put: optional out argument for throtl_grp to put + * @rw: read/write * - * Pop the first bio from the qnode list @queued. After popping, the first - * qnode is removed from @queued if empty or moved to the end of @queued so - * that the popping order is round-robin. + * Pop the first bio from the qnode list @sq->queued. Note that we firstly + * focus on the iops list because bios are ultimately dispatched from it. + * After popping, the first qnode is removed from @sq->queued if empty or moved + * to the end of @sq->queued so that the popping order is round-robin. * * When the first qnode is removed, its associated throtl_grp should be put * too. If @tg_to_put is NULL, this function automatically puts it; * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is * responsible for putting it. */ -static struct bio *throtl_pop_queued(struct list_head *queued, - struct throtl_grp **tg_to_put) +static struct bio *throtl_pop_queued(struct throtl_service_queue *sq, + struct throtl_grp **tg_to_put, bool rw) { + struct list_head *queued = &sq->queued[rw]; struct throtl_qnode *qn; struct bio *bio; @@ -310,10 +233,17 @@ static struct bio *throtl_pop_queued(struct list_head *queued, return NULL; qn = list_first_entry(queued, struct throtl_qnode, node); - bio = bio_list_pop(&qn->bios); + bio = bio_list_pop(&qn->bios_iops); + if (bio) { + sq->nr_queued_iops[rw]--; + } else { + bio = bio_list_pop(&qn->bios_bps); + if (bio) + sq->nr_queued_bps[rw]--; + } WARN_ON_ONCE(!bio); - if (bio_list_empty(&qn->bios)) { + if (bio_list_empty(&qn->bios_bps) && bio_list_empty(&qn->bios_iops)) { list_del_init(&qn->node); if (tg_to_put) *tg_to_put = qn->tg; @@ -359,20 +289,10 @@ static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, } RB_CLEAR_NODE(&tg->rb_node); - tg->bps[READ][LIMIT_MAX] = U64_MAX; - tg->bps[WRITE][LIMIT_MAX] = U64_MAX; - tg->iops[READ][LIMIT_MAX] = UINT_MAX; - tg->iops[WRITE][LIMIT_MAX] = UINT_MAX; - tg->bps_conf[READ][LIMIT_MAX] = U64_MAX; - tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX; - tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX; - tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX; - /* LIMIT_LOW will have default value 0 */ - - tg->latency_target = DFL_LATENCY_TARGET; - tg->latency_target_conf = DFL_LATENCY_TARGET; - tg->idletime_threshold = DFL_IDLE_THRESHOLD; - tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD; + tg->bps[READ] = U64_MAX; + tg->bps[WRITE] = U64_MAX; + tg->iops[READ] = UINT_MAX; + tg->iops[WRITE] = UINT_MAX; return &tg->pd; @@ -418,18 +338,15 @@ static void throtl_pd_init(struct blkg_policy_data *pd) static void tg_update_has_rules(struct throtl_grp *tg) { struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); - struct throtl_data *td = tg->td; int rw; for (rw = READ; rw <= WRITE; rw++) { tg->has_rules_iops[rw] = (parent_tg && parent_tg->has_rules_iops[rw]) || - (td->limit_valid[td->limit_index] && - tg_iops_limit(tg, rw) != UINT_MAX); + tg_iops_limit(tg, rw) != UINT_MAX; tg->has_rules_bps[rw] = (parent_tg && parent_tg->has_rules_bps[rw]) || - (td->limit_valid[td->limit_index] && - (tg_bps_limit(tg, rw) != U64_MAX)); + tg_bps_limit(tg, rw) != U64_MAX; } } @@ -443,54 +360,11 @@ static void throtl_pd_online(struct blkg_policy_data *pd) tg_update_has_rules(tg); } -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -static void blk_throtl_update_limit_valid(struct throtl_data *td) -{ - struct cgroup_subsys_state *pos_css; - struct blkcg_gq *blkg; - bool low_valid = false; - - rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - - if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] || - tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) { - low_valid = true; - break; - } - } - rcu_read_unlock(); - - td->limit_valid[LIMIT_LOW] = low_valid; -} -#else -static inline void blk_throtl_update_limit_valid(struct throtl_data *td) -{ -} -#endif - -static void throtl_upgrade_state(struct throtl_data *td); -static void throtl_pd_offline(struct blkg_policy_data *pd) -{ - struct throtl_grp *tg = pd_to_tg(pd); - - tg->bps[READ][LIMIT_LOW] = 0; - tg->bps[WRITE][LIMIT_LOW] = 0; - tg->iops[READ][LIMIT_LOW] = 0; - tg->iops[WRITE][LIMIT_LOW] = 0; - - blk_throtl_update_limit_valid(tg->td); - - if (!tg->td->limit_valid[tg->td->limit_index]) - throtl_upgrade_state(tg->td); -} - static void throtl_pd_free(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); - del_timer_sync(&tg->service_queue.pending_timer); + timer_delete_sync(&tg->service_queue.pending_timer); blkg_rwstat_exit(&tg->stat_bytes); blkg_rwstat_exit(&tg->stat_ios); kfree(tg); @@ -635,8 +509,6 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; - tg->carryover_bytes[rw] = 0; - tg->carryover_ios[rw] = 0; /* * Previous slice has expired. We must have trimmed it after last @@ -655,16 +527,14 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, } static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, - bool clear_carryover) + bool clear) { - tg->bytes_disp[rw] = 0; - tg->io_disp[rw] = 0; + if (clear) { + tg->bytes_disp[rw] = 0; + tg->io_disp[rw] = 0; + } tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; - if (clear_carryover) { - tg->carryover_bytes[rw] = 0; - tg->carryover_ios[rw] = 0; - } throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", @@ -681,6 +551,9 @@ static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { + if (!time_before(tg->slice_end[rw], jiffy_end)) + return; + throtl_set_slice_end(tg, rw, jiffy_end); throtl_log(&tg->service_queue, "[%c] extend slice start=%lu end=%lu jiffies=%lu", @@ -697,6 +570,11 @@ static bool throtl_slice_used(struct throtl_grp *tg, bool rw) return true; } +static unsigned int sq_queued(struct throtl_service_queue *sq, int type) +{ + return sq->nr_queued_bps[type] + sq->nr_queued_iops[type]; +} + static unsigned int calculate_io_allowed(u32 iops_limit, unsigned long jiffy_elapsed) { @@ -732,6 +610,48 @@ static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); } +static long long throtl_trim_bps(struct throtl_grp *tg, bool rw, + unsigned long time_elapsed) +{ + u64 bps_limit = tg_bps_limit(tg, rw); + long long bytes_trim; + + if (bps_limit == U64_MAX) + return 0; + + /* Need to consider the case of bytes_allowed overflow. */ + bytes_trim = calculate_bytes_allowed(bps_limit, time_elapsed); + if (bytes_trim <= 0 || tg->bytes_disp[rw] < bytes_trim) { + bytes_trim = tg->bytes_disp[rw]; + tg->bytes_disp[rw] = 0; + } else { + tg->bytes_disp[rw] -= bytes_trim; + } + + return bytes_trim; +} + +static int throtl_trim_iops(struct throtl_grp *tg, bool rw, + unsigned long time_elapsed) +{ + u32 iops_limit = tg_iops_limit(tg, rw); + int io_trim; + + if (iops_limit == UINT_MAX) + return 0; + + /* Need to consider the case of io_allowed overflow. */ + io_trim = calculate_io_allowed(iops_limit, time_elapsed); + if (io_trim <= 0 || tg->io_disp[rw] < io_trim) { + io_trim = tg->io_disp[rw]; + tg->io_disp[rw] = 0; + } else { + tg->io_disp[rw] -= io_trim; + } + + return io_trim; +} + /* Trim the used slices and adjust slice start accordingly */ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) { @@ -756,34 +676,28 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) * sooner, then we need to reduce slice_end. A high bogus slice_end * is bad because it does not allow new slice to start. */ - throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); time_elapsed = rounddown(jiffies - tg->slice_start[rw], tg->td->throtl_slice); - if (!time_elapsed) + /* Don't trim slice until at least 2 slices are used */ + if (time_elapsed < tg->td->throtl_slice * 2) return; - bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw), - time_elapsed) + - tg->carryover_bytes[rw]; - io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed) + - tg->carryover_ios[rw]; - if (bytes_trim <= 0 && io_trim <= 0) + /* + * The bio submission time may be a few jiffies more than the expected + * waiting time, due to 'extra_bytes' can't be divided in + * tg_within_bps_limit(), and also due to timer wakeup delay. In this + * case, adjust slice_start will discard the extra wait time, causing + * lower rate than expected. Therefore, other than the above rounddown, + * one extra slice is preserved for deviation. + */ + time_elapsed -= tg->td->throtl_slice; + bytes_trim = throtl_trim_bps(tg, rw, time_elapsed); + io_trim = throtl_trim_iops(tg, rw, time_elapsed); + if (!bytes_trim && !io_trim) return; - tg->carryover_bytes[rw] = 0; - if ((long long)tg->bytes_disp[rw] >= bytes_trim) - tg->bytes_disp[rw] -= bytes_trim; - else - tg->bytes_disp[rw] = 0; - - tg->carryover_ios[rw] = 0; - if ((int)tg->io_disp[rw] >= io_trim) - tg->io_disp[rw] -= io_trim; - else - tg->io_disp[rw] = 0; - tg->slice_start[rw] += time_elapsed; throtl_log(&tg->service_queue, @@ -793,39 +707,60 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) jiffies); } -static void __tg_update_carryover(struct throtl_grp *tg, bool rw) +static void __tg_update_carryover(struct throtl_grp *tg, bool rw, + long long *bytes, int *ios) { unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; u64 bps_limit = tg_bps_limit(tg, rw); u32 iops_limit = tg_iops_limit(tg, rw); + long long bytes_allowed; + int io_allowed; + + /* + * If the queue is empty, carryover handling is not needed. In such cases, + * tg->[bytes/io]_disp should be reset to 0 to avoid impacting the dispatch + * of subsequent bios. The same handling applies when the previous BPS/IOPS + * limit was set to max. + */ + if (sq_queued(&tg->service_queue, rw) == 0) { + tg->bytes_disp[rw] = 0; + tg->io_disp[rw] = 0; + return; + } /* * If config is updated while bios are still throttled, calculate and - * accumulate how many bytes/ios are waited across changes. And - * carryover_bytes/ios will be used to calculate new wait time under new - * configuration. + * accumulate how many bytes/ios are waited across changes. And use the + * calculated carryover (@bytes/@ios) to update [bytes/io]_disp, which + * will be used to calculate new wait time under new configuration. + * And we need to consider the case of bytes/io_allowed overflow. */ - if (bps_limit != U64_MAX) - tg->carryover_bytes[rw] += - calculate_bytes_allowed(bps_limit, jiffy_elapsed) - - tg->bytes_disp[rw]; - if (iops_limit != UINT_MAX) - tg->carryover_ios[rw] += - calculate_io_allowed(iops_limit, jiffy_elapsed) - - tg->io_disp[rw]; + if (bps_limit != U64_MAX) { + bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed); + if (bytes_allowed > 0) + *bytes = bytes_allowed - tg->bytes_disp[rw]; + } + if (iops_limit != UINT_MAX) { + io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed); + if (io_allowed > 0) + *ios = io_allowed - tg->io_disp[rw]; + } + + tg->bytes_disp[rw] = -*bytes; + tg->io_disp[rw] = -*ios; } static void tg_update_carryover(struct throtl_grp *tg) { - if (tg->service_queue.nr_queued[READ]) - __tg_update_carryover(tg, READ); - if (tg->service_queue.nr_queued[WRITE]) - __tg_update_carryover(tg, WRITE); + long long bytes[2] = {0}; + int ios[2] = {0}; - /* see comments in struct throtl_grp for meaning of these fields. */ + __tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]); + __tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]); + + /* see comments in struct throtl_grp for meaning of carryover. */ throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__, - tg->carryover_bytes[READ], tg->carryover_bytes[WRITE], - tg->carryover_ios[READ], tg->carryover_ios[WRITE]); + bytes[READ], bytes[WRITE], ios[READ], ios[WRITE]); } static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, @@ -835,21 +770,19 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; - if (iops_limit == UINT_MAX) { - return 0; - } - jiffy_elapsed = jiffies - tg->slice_start[rw]; /* Round up to the next throttle slice, wait time must be nonzero */ jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); - io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + - tg->carryover_ios[rw]; + io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd); if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed) return 0; /* Calc approx time to dispatch */ jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; + + /* make sure at least one io can be dispatched after waiting */ + jiffy_wait = max(jiffy_wait, HZ / iops_limit + 1); return jiffy_wait; } @@ -862,11 +795,6 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned int bio_size = throtl_bio_data_size(bio); - /* no need to throttle if this bio's bytes have been accounted */ - if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) { - return 0; - } - jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; /* Slice has just started. Consider one slice interval */ @@ -874,9 +802,10 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, jiffy_elapsed_rnd = tg->td->throtl_slice; jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); - bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + - tg->carryover_bytes[rw]; - if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) + bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd); + /* Need to consider the case of bytes_allowed overflow. */ + if ((bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) + || bytes_allowed < 0) return 0; /* Calc approx time to dispatch */ @@ -894,17 +823,82 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, return jiffy_wait; } +static void throtl_charge_bps_bio(struct throtl_grp *tg, struct bio *bio) +{ + unsigned int bio_size = throtl_bio_data_size(bio); + + /* Charge the bio to the group */ + if (!bio_flagged(bio, BIO_BPS_THROTTLED) && + !bio_flagged(bio, BIO_TG_BPS_THROTTLED)) { + bio_set_flag(bio, BIO_TG_BPS_THROTTLED); + tg->bytes_disp[bio_data_dir(bio)] += bio_size; + } +} + +static void throtl_charge_iops_bio(struct throtl_grp *tg, struct bio *bio) +{ + bio_clear_flag(bio, BIO_TG_BPS_THROTTLED); + tg->io_disp[bio_data_dir(bio)]++; +} + /* - * Returns whether one can dispatch a bio or not. Also returns approx number - * of jiffies to wait before this bio is with-in IO rate and can be dispatched + * If previous slice expired, start a new one otherwise renew/extend existing + * slice to make sure it is at least throtl_slice interval long since now. New + * slice is started only for empty throttle group. If there is queued bio, that + * means there should be an active slice and it should be extended instead. */ -static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, - unsigned long *wait) +static void tg_update_slice(struct throtl_grp *tg, bool rw) +{ + if (throtl_slice_used(tg, rw) && + sq_queued(&tg->service_queue, rw) == 0) + throtl_start_new_slice(tg, rw, true); + else + throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice); +} + +static unsigned long tg_dispatch_bps_time(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); - unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; u64 bps_limit = tg_bps_limit(tg, rw); + unsigned long bps_wait; + + /* no need to throttle if this bio's bytes have been accounted */ + if (bps_limit == U64_MAX || tg->flags & THROTL_TG_CANCELING || + bio_flagged(bio, BIO_BPS_THROTTLED) || + bio_flagged(bio, BIO_TG_BPS_THROTTLED)) + return 0; + + tg_update_slice(tg, rw); + bps_wait = tg_within_bps_limit(tg, bio, bps_limit); + throtl_extend_slice(tg, rw, jiffies + bps_wait); + + return bps_wait; +} + +static unsigned long tg_dispatch_iops_time(struct throtl_grp *tg, struct bio *bio) +{ + bool rw = bio_data_dir(bio); u32 iops_limit = tg_iops_limit(tg, rw); + unsigned long iops_wait; + + if (iops_limit == UINT_MAX || tg->flags & THROTL_TG_CANCELING) + return 0; + + tg_update_slice(tg, rw); + iops_wait = tg_within_iops_limit(tg, bio, iops_limit); + throtl_extend_slice(tg, rw, jiffies + iops_wait); + + return iops_wait; +} + +/* + * Returns approx number of jiffies to wait before this bio is with-in IO rate + * and can be moved to other queue or dispatched. + */ +static unsigned long tg_dispatch_time(struct throtl_grp *tg, struct bio *bio) +{ + bool rw = bio_data_dir(bio); + unsigned long wait; /* * Currently whole state machine of group depends on first bio @@ -912,65 +906,20 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, * this function with a different bio if there are other bios * queued. */ - BUG_ON(tg->service_queue.nr_queued[rw] && + BUG_ON(sq_queued(&tg->service_queue, rw) && bio != throtl_peek_queued(&tg->service_queue.queued[rw])); - /* If tg->bps = -1, then BW is unlimited */ - if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) || - tg->flags & THROTL_TG_CANCELING) { - if (wait) - *wait = 0; - return true; - } + wait = tg_dispatch_bps_time(tg, bio); + if (wait != 0) + return wait; /* - * If previous slice expired, start a new one otherwise renew/extend - * existing slice to make sure it is at least throtl_slice interval - * long since now. New slice is started only for empty throttle group. - * If there is queued bio, that means there should be an active - * slice and it should be extended instead. + * Charge bps here because @bio will be directly placed into the + * iops queue afterward. */ - if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) - throtl_start_new_slice(tg, rw, true); - else { - if (time_before(tg->slice_end[rw], - jiffies + tg->td->throtl_slice)) - throtl_extend_slice(tg, rw, - jiffies + tg->td->throtl_slice); - } + throtl_charge_bps_bio(tg, bio); - bps_wait = tg_within_bps_limit(tg, bio, bps_limit); - iops_wait = tg_within_iops_limit(tg, bio, iops_limit); - if (bps_wait + iops_wait == 0) { - if (wait) - *wait = 0; - return true; - } - - max_wait = max(bps_wait, iops_wait); - - if (wait) - *wait = max_wait; - - if (time_before(tg->slice_end[rw], jiffies + max_wait)) - throtl_extend_slice(tg, rw, jiffies + max_wait); - - return false; -} - -static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) -{ - bool rw = bio_data_dir(bio); - unsigned int bio_size = throtl_bio_data_size(bio); - - /* Charge the bio to the group */ - if (!bio_flagged(bio, BIO_BPS_THROTTLED)) { - tg->bytes_disp[rw] += bio_size; - tg->last_bytes_disp[rw] += bio_size; - } - - tg->io_disp[rw]++; - tg->last_io_disp[rw]++; + return tg_dispatch_iops_time(tg, bio); } /** @@ -997,28 +946,36 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, * dispatched. Mark that @tg was empty. This is automatically * cleared on the next tg_update_disptime(). */ - if (!sq->nr_queued[rw]) + if (sq_queued(sq, rw) == 0) tg->flags |= THROTL_TG_WAS_EMPTY; - throtl_qnode_add_bio(bio, qn, &sq->queued[rw]); + throtl_qnode_add_bio(bio, qn, sq); + + /* + * Since we have split the queues, when the iops queue is + * previously empty and a new @bio is added into the first @qn, + * we also need to update the @tg->disptime. + */ + if (bio_flagged(bio, BIO_BPS_THROTTLED) && + bio == throtl_peek_queued(&sq->queued[rw])) + tg->flags |= THROTL_TG_IOPS_WAS_EMPTY; - sq->nr_queued[rw]++; throtl_enqueue_tg(tg); } static void tg_update_disptime(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; - unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; + unsigned long read_wait = -1, write_wait = -1, min_wait, disptime; struct bio *bio; bio = throtl_peek_queued(&sq->queued[READ]); if (bio) - tg_may_dispatch(tg, bio, &read_wait); + read_wait = tg_dispatch_time(tg, bio); bio = throtl_peek_queued(&sq->queued[WRITE]); if (bio) - tg_may_dispatch(tg, bio, &write_wait); + write_wait = tg_dispatch_time(tg, bio); min_wait = min(read_wait, write_wait); disptime = jiffies + min_wait; @@ -1030,6 +987,7 @@ static void tg_update_disptime(struct throtl_grp *tg) /* see throtl_add_bio_tg() */ tg->flags &= ~THROTL_TG_WAS_EMPTY; + tg->flags &= ~THROTL_TG_IOPS_WAS_EMPTY; } static void start_parent_slice_with_credit(struct throtl_grp *child_tg, @@ -1056,10 +1014,9 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) * getting released prematurely. Remember the tg to put and put it * after @bio is transferred to @parent_sq. */ - bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put); - sq->nr_queued[rw]--; + bio = throtl_pop_queued(sq, &tg_to_put, rw); - throtl_charge_bio(tg, bio); + throtl_charge_iops_bio(tg, bio); /* * If our parent is another tg, we just need to transfer @bio to @@ -1074,7 +1031,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) } else { bio_set_flag(bio, BIO_BPS_THROTTLED); throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], - &parent_sq->queued[rw]); + parent_sq); BUG_ON(tg->td->nr_queued[rw] <= 0); tg->td->nr_queued[rw]--; } @@ -1096,7 +1053,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg) /* Try to dispatch 75% READS and 25% WRITES */ while ((bio = throtl_peek_queued(&sq->queued[READ])) && - tg_may_dispatch(tg, bio, NULL)) { + tg_dispatch_time(tg, bio) == 0) { tg_dispatch_one_bio(tg, READ); nr_reads++; @@ -1106,7 +1063,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg) } while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && - tg_may_dispatch(tg, bio, NULL)) { + tg_dispatch_time(tg, bio) == 0) { tg_dispatch_one_bio(tg, WRITE); nr_writes++; @@ -1139,7 +1096,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) nr_disp += throtl_dispatch_tg(tg); sq = &tg->service_queue; - if (sq->nr_queued[READ] || sq->nr_queued[WRITE]) + if (sq_queued(sq, READ) || sq_queued(sq, WRITE)) tg_update_disptime(tg); else throtl_dequeue_tg(tg); @@ -1151,8 +1108,6 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) return nr_disp; } -static bool throtl_can_upgrade(struct throtl_data *td, - struct throtl_grp *this_tg); /** * throtl_pending_timer_fn - timer function for service_queue->pending_timer * @t: the pending_timer member of the throtl_service_queue being serviced @@ -1189,17 +1144,16 @@ static void throtl_pending_timer_fn(struct timer_list *t) if (!q->root_blkg) goto out_unlock; - if (throtl_can_upgrade(td, NULL)) - throtl_upgrade_state(td); - again: parent_sq = sq->parent_sq; dispatched = false; while (true) { + unsigned int __maybe_unused bio_cnt_r = sq_queued(sq, READ); + unsigned int __maybe_unused bio_cnt_w = sq_queued(sq, WRITE); + throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u", - sq->nr_queued[READ] + sq->nr_queued[WRITE], - sq->nr_queued[READ], sq->nr_queued[WRITE]); + bio_cnt_r + bio_cnt_w, bio_cnt_r, bio_cnt_w); ret = throtl_select_dispatch(sq); if (ret) { @@ -1221,7 +1175,8 @@ again: if (parent_sq) { /* @parent_sq is another throl_grp, propagate dispatch */ - if (tg->flags & THROTL_TG_WAS_EMPTY) { + if (tg->flags & THROTL_TG_WAS_EMPTY || + tg->flags & THROTL_TG_IOPS_WAS_EMPTY) { tg_update_disptime(tg); if (!throtl_schedule_next_dispatch(parent_sq, false)) { /* window is already open, repeat dispatching */ @@ -1261,7 +1216,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) spin_lock_irq(&q->queue_lock); for (rw = READ; rw <= WRITE; rw++) - while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) + while ((bio = throtl_pop_queued(td_sq, NULL, rw))) bio_list_add(&bio_list_on_stack, bio); spin_unlock_irq(&q->queue_lock); @@ -1331,22 +1286,12 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) blkg_for_each_descendant_pre(blkg, pos_css, global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) { struct throtl_grp *this_tg = blkg_to_tg(blkg); - struct throtl_grp *parent_tg; tg_update_has_rules(this_tg); /* ignore root/second level */ if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || !blkg->parent->parent) continue; - parent_tg = blkg_to_tg(blkg->parent); - /* - * make sure all children has lower idle time threshold and - * higher latency target - */ - this_tg->idletime_threshold = min(this_tg->idletime_threshold, - parent_tg->idletime_threshold); - this_tg->latency_target = max(this_tg->latency_target, - parent_tg->latency_target); } rcu_read_unlock(); @@ -1367,6 +1312,54 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) } } +static int blk_throtl_init(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + struct throtl_data *td; + unsigned int memflags; + int ret; + + td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); + if (!td) + return -ENOMEM; + + INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); + throtl_service_queue_init(&td->service_queue); + + /* + * Freeze queue before activating policy, to synchronize with IO path, + * which is protected by 'q_usage_counter'. + */ + memflags = blk_mq_freeze_queue(disk->queue); + blk_mq_quiesce_queue(disk->queue); + + q->td = td; + td->queue = q; + + /* activate policy */ + ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); + if (ret) { + q->td = NULL; + kfree(td); + goto out; + } + + if (blk_queue_nonrot(q)) + td->throtl_slice = DFL_THROTL_SLICE_SSD; + else + td->throtl_slice = DFL_THROTL_SLICE_HD; + td->track_bio_latency = !queue_is_mq(q); + if (!td->track_bio_latency) + blk_stat_enable_accounting(q); + +out: + blk_mq_unquiesce_queue(disk->queue); + blk_mq_unfreeze_queue(disk->queue, memflags); + + return ret; +} + + static ssize_t tg_set_conf(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool is_u64) { @@ -1378,6 +1371,16 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, blkg_conf_init(&ctx, buf); + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto out_finish; + + if (!blk_throtl_activated(ctx.bdev->bd_queue)) { + ret = blk_throtl_init(ctx.bdev->bd_disk); + if (ret) + goto out_finish; + } + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) goto out_finish; @@ -1444,25 +1447,25 @@ static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", - .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]), + .private = offsetof(struct throtl_grp, bps[READ]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.write_bps_device", - .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]), + .private = offsetof(struct throtl_grp, bps[WRITE]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.read_iops_device", - .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]), + .private = offsetof(struct throtl_grp, iops[READ]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, { .name = "throttle.write_iops_device", - .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]), + .private = offsetof(struct throtl_grp, iops[WRITE]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, @@ -1494,61 +1497,43 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, { struct throtl_grp *tg = pd_to_tg(pd); const char *dname = blkg_dev_name(pd->blkg); - char bufs[4][21] = { "max", "max", "max", "max" }; u64 bps_dft; unsigned int iops_dft; - char idle_time[26] = ""; - char latency_time[26] = ""; if (!dname) return 0; - if (off == LIMIT_LOW) { - bps_dft = 0; - iops_dft = 0; - } else { - bps_dft = U64_MAX; - iops_dft = UINT_MAX; - } + bps_dft = U64_MAX; + iops_dft = UINT_MAX; - if (tg->bps_conf[READ][off] == bps_dft && - tg->bps_conf[WRITE][off] == bps_dft && - tg->iops_conf[READ][off] == iops_dft && - tg->iops_conf[WRITE][off] == iops_dft && - (off != LIMIT_LOW || - (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD && - tg->latency_target_conf == DFL_LATENCY_TARGET))) + if (tg->bps[READ] == bps_dft && + tg->bps[WRITE] == bps_dft && + tg->iops[READ] == iops_dft && + tg->iops[WRITE] == iops_dft) return 0; - if (tg->bps_conf[READ][off] != U64_MAX) - snprintf(bufs[0], sizeof(bufs[0]), "%llu", - tg->bps_conf[READ][off]); - if (tg->bps_conf[WRITE][off] != U64_MAX) - snprintf(bufs[1], sizeof(bufs[1]), "%llu", - tg->bps_conf[WRITE][off]); - if (tg->iops_conf[READ][off] != UINT_MAX) - snprintf(bufs[2], sizeof(bufs[2]), "%u", - tg->iops_conf[READ][off]); - if (tg->iops_conf[WRITE][off] != UINT_MAX) - snprintf(bufs[3], sizeof(bufs[3]), "%u", - tg->iops_conf[WRITE][off]); - if (off == LIMIT_LOW) { - if (tg->idletime_threshold_conf == ULONG_MAX) - strcpy(idle_time, " idle=max"); - else - snprintf(idle_time, sizeof(idle_time), " idle=%lu", - tg->idletime_threshold_conf); + seq_printf(sf, "%s", dname); + if (tg->bps[READ] == U64_MAX) + seq_printf(sf, " rbps=max"); + else + seq_printf(sf, " rbps=%llu", tg->bps[READ]); - if (tg->latency_target_conf == ULONG_MAX) - strcpy(latency_time, " latency=max"); - else - snprintf(latency_time, sizeof(latency_time), - " latency=%lu", tg->latency_target_conf); - } + if (tg->bps[WRITE] == U64_MAX) + seq_printf(sf, " wbps=max"); + else + seq_printf(sf, " wbps=%llu", tg->bps[WRITE]); - seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n", - dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time, - latency_time); + if (tg->iops[READ] == UINT_MAX) + seq_printf(sf, " riops=max"); + else + seq_printf(sf, " riops=%u", tg->iops[READ]); + + if (tg->iops[WRITE] == UINT_MAX) + seq_printf(sf, " wiops=max"); + else + seq_printf(sf, " wiops=%u", tg->iops[WRITE]); + + seq_printf(sf, "\n"); return 0; } @@ -1566,13 +1551,20 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, struct blkg_conf_ctx ctx; struct throtl_grp *tg; u64 v[4]; - unsigned long idle_time; - unsigned long latency_time; int ret; - int index = of_cft(of)->private; blkg_conf_init(&ctx, buf); + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto out_finish; + + if (!blk_throtl_activated(ctx.bdev->bd_queue)) { + ret = blk_throtl_init(ctx.bdev->bd_disk); + if (ret) + goto out_finish; + } + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) goto out_finish; @@ -1580,13 +1572,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, tg = blkg_to_tg(ctx.blkg); tg_update_carryover(tg); - v[0] = tg->bps_conf[READ][index]; - v[1] = tg->bps_conf[WRITE][index]; - v[2] = tg->iops_conf[READ][index]; - v[3] = tg->iops_conf[WRITE][index]; + v[0] = tg->bps[READ]; + v[1] = tg->bps[WRITE]; + v[2] = tg->iops[READ]; + v[3] = tg->iops[WRITE]; - idle_time = tg->idletime_threshold_conf; - latency_time = tg->latency_target_conf; while (true) { char tok[27]; /* wiops=18446744073709551616 */ char *p; @@ -1610,68 +1600,24 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, goto out_finish; ret = -EINVAL; - if (!strcmp(tok, "rbps") && val > 1) + if (!strcmp(tok, "rbps")) v[0] = val; - else if (!strcmp(tok, "wbps") && val > 1) + else if (!strcmp(tok, "wbps")) v[1] = val; - else if (!strcmp(tok, "riops") && val > 1) + else if (!strcmp(tok, "riops")) v[2] = min_t(u64, val, UINT_MAX); - else if (!strcmp(tok, "wiops") && val > 1) + else if (!strcmp(tok, "wiops")) v[3] = min_t(u64, val, UINT_MAX); - else if (off == LIMIT_LOW && !strcmp(tok, "idle")) - idle_time = val; - else if (off == LIMIT_LOW && !strcmp(tok, "latency")) - latency_time = val; else goto out_finish; } - tg->bps_conf[READ][index] = v[0]; - tg->bps_conf[WRITE][index] = v[1]; - tg->iops_conf[READ][index] = v[2]; - tg->iops_conf[WRITE][index] = v[3]; + tg->bps[READ] = v[0]; + tg->bps[WRITE] = v[1]; + tg->iops[READ] = v[2]; + tg->iops[WRITE] = v[3]; - if (index == LIMIT_MAX) { - tg->bps[READ][index] = v[0]; - tg->bps[WRITE][index] = v[1]; - tg->iops[READ][index] = v[2]; - tg->iops[WRITE][index] = v[3]; - } - tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW], - tg->bps_conf[READ][LIMIT_MAX]); - tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW], - tg->bps_conf[WRITE][LIMIT_MAX]); - tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW], - tg->iops_conf[READ][LIMIT_MAX]); - tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW], - tg->iops_conf[WRITE][LIMIT_MAX]); - tg->idletime_threshold_conf = idle_time; - tg->latency_target_conf = latency_time; - - /* force user to configure all settings for low limit */ - if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] || - tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) || - tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD || - tg->latency_target_conf == DFL_LATENCY_TARGET) { - tg->bps[READ][LIMIT_LOW] = 0; - tg->bps[WRITE][LIMIT_LOW] = 0; - tg->iops[READ][LIMIT_LOW] = 0; - tg->iops[WRITE][LIMIT_LOW] = 0; - tg->idletime_threshold = DFL_IDLE_THRESHOLD; - tg->latency_target = DFL_LATENCY_TARGET; - } else if (index == LIMIT_LOW) { - tg->idletime_threshold = tg->idletime_threshold_conf; - tg->latency_target = tg->latency_target_conf; - } - - blk_throtl_update_limit_valid(tg->td); - if (tg->td->limit_valid[LIMIT_LOW]) { - if (index == LIMIT_LOW) - tg->td->limit_index = LIMIT_LOW; - } else - tg->td->limit_index = LIMIT_MAX; - tg_conf_updated(tg, index == LIMIT_LOW && - tg->td->limit_valid[LIMIT_LOW]); + tg_conf_updated(tg, false); ret = 0; out_finish: blkg_conf_exit(&ctx); @@ -1679,21 +1625,11 @@ out_finish: } static struct cftype throtl_files[] = { -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - { - .name = "low", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = tg_print_limit, - .write = tg_set_limit, - .private = LIMIT_LOW, - }, -#endif { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = tg_print_limit, .write = tg_set_limit, - .private = LIMIT_MAX, }, { } /* terminate */ }; @@ -1705,6 +1641,42 @@ static void throtl_shutdown_wq(struct request_queue *q) cancel_work_sync(&td->dispatch_work); } +static void tg_flush_bios(struct throtl_grp *tg) +{ + struct throtl_service_queue *sq = &tg->service_queue; + + if (tg->flags & THROTL_TG_CANCELING) + return; + /* + * Set the flag to make sure throtl_pending_timer_fn() won't + * stop until all throttled bios are dispatched. + */ + tg->flags |= THROTL_TG_CANCELING; + + /* + * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup + * will be inserted to service queue without THROTL_TG_PENDING + * set in tg_update_disptime below. Then IO dispatched from + * child in tg_dispatch_one_bio will trigger double insertion + * and corrupt the tree. + */ + if (!(tg->flags & THROTL_TG_PENDING)) + return; + + /* + * Update disptime after setting the above flag to make sure + * throtl_select_dispatch() won't exit without dispatching. + */ + tg_update_disptime(tg); + + throtl_schedule_pending_timer(sq, jiffies + 1); +} + +static void throtl_pd_offline(struct blkg_policy_data *pd) +{ + tg_flush_bios(pd_to_tg(pd)); +} + struct blkcg_policy blkcg_policy_throtl = { .dfl_cftypes = throtl_files, .legacy_cftypes = throtl_legacy_files, @@ -1722,6 +1694,9 @@ void blk_throtl_cancel_bios(struct gendisk *disk) struct cgroup_subsys_state *pos_css; struct blkcg_gq *blkg; + if (!blk_throtl_activated(q)) + return; + spin_lock_irq(&q->queue_lock); /* * queue_lock is held, rcu lock is not needed here technically. @@ -1730,449 +1705,48 @@ void blk_throtl_cancel_bios(struct gendisk *disk) */ rcu_read_lock(); blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - struct throtl_service_queue *sq = &tg->service_queue; - - /* - * Set the flag to make sure throtl_pending_timer_fn() won't - * stop until all throttled bios are dispatched. - */ - tg->flags |= THROTL_TG_CANCELING; - - /* - * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup - * will be inserted to service queue without THROTL_TG_PENDING - * set in tg_update_disptime below. Then IO dispatched from - * child in tg_dispatch_one_bio will trigger double insertion - * and corrupt the tree. - */ - if (!(tg->flags & THROTL_TG_PENDING)) - continue; - /* - * Update disptime after setting the above flag to make sure - * throtl_select_dispatch() won't exit without dispatching. + * disk_release will call pd_offline_fn to cancel bios. + * However, disk_release can't be called if someone get + * the refcount of device and issued bios which are + * inflight after del_gendisk. + * Cancel bios here to ensure no bios are inflight after + * del_gendisk. */ - tg_update_disptime(tg); - - throtl_schedule_pending_timer(sq, jiffies + 1); + tg_flush_bios(blkg_to_tg(blkg)); } rcu_read_unlock(); spin_unlock_irq(&q->queue_lock); } -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg) -{ - unsigned long rtime = jiffies, wtime = jiffies; - - if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW]) - rtime = tg->last_low_overflow_time[READ]; - if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) - wtime = tg->last_low_overflow_time[WRITE]; - return min(rtime, wtime); -} - -static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg) -{ - struct throtl_service_queue *parent_sq; - struct throtl_grp *parent = tg; - unsigned long ret = __tg_last_low_overflow_time(tg); - - while (true) { - parent_sq = parent->service_queue.parent_sq; - parent = sq_to_tg(parent_sq); - if (!parent) - break; - - /* - * The parent doesn't have low limit, it always reaches low - * limit. Its overflow time is useless for children - */ - if (!parent->bps[READ][LIMIT_LOW] && - !parent->iops[READ][LIMIT_LOW] && - !parent->bps[WRITE][LIMIT_LOW] && - !parent->iops[WRITE][LIMIT_LOW]) - continue; - if (time_after(__tg_last_low_overflow_time(parent), ret)) - ret = __tg_last_low_overflow_time(parent); - } - return ret; -} - -static bool throtl_tg_is_idle(struct throtl_grp *tg) -{ - /* - * cgroup is idle if: - * - single idle is too long, longer than a fixed value (in case user - * configure a too big threshold) or 4 times of idletime threshold - * - average think time is more than threshold - * - IO latency is largely below threshold - */ - unsigned long time; - bool ret; - - time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); - ret = tg->latency_target == DFL_LATENCY_TARGET || - tg->idletime_threshold == DFL_IDLE_THRESHOLD || - (blk_time_get_ns() >> 10) - tg->last_finish_time > time || - tg->avg_idletime > tg->idletime_threshold || - (tg->latency_target && tg->bio_cnt && - tg->bad_bio_cnt * 5 < tg->bio_cnt); - throtl_log(&tg->service_queue, - "avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d", - tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt, - tg->bio_cnt, ret, tg->td->scale); - return ret; -} - -static bool throtl_low_limit_reached(struct throtl_grp *tg, int rw) +static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) { struct throtl_service_queue *sq = &tg->service_queue; - bool limit = tg->bps[rw][LIMIT_LOW] || tg->iops[rw][LIMIT_LOW]; /* - * if low limit is zero, low limit is always reached. - * if low limit is non-zero, we can check if there is any request - * is queued to determine if low limit is reached as we throttle - * request according to limit. + * For a split bio, we need to specifically distinguish whether the + * iops queue is empty. */ - return !limit || sq->nr_queued[rw]; -} + if (bio_flagged(bio, BIO_BPS_THROTTLED)) + return sq->nr_queued_iops[rw] == 0 && + tg_dispatch_iops_time(tg, bio) == 0; -static bool throtl_tg_can_upgrade(struct throtl_grp *tg) -{ /* - * cgroup reaches low limit when low limit of READ and WRITE are - * both reached, it's ok to upgrade to next limit if cgroup reaches - * low limit + * Throtl is FIFO - if bios are already queued, should queue. + * If the bps queue is empty and @bio is within the bps limit, charge + * bps here for direct placement into the iops queue. */ - if (throtl_low_limit_reached(tg, READ) && - throtl_low_limit_reached(tg, WRITE)) - return true; - - if (time_after_eq(jiffies, - tg_last_low_overflow_time(tg) + tg->td->throtl_slice) && - throtl_tg_is_idle(tg)) - return true; - return false; -} + if (sq_queued(&tg->service_queue, rw)) { + if (sq->nr_queued_bps[rw] == 0 && + tg_dispatch_bps_time(tg, bio) == 0) + throtl_charge_bps_bio(tg, bio); -static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg) -{ - while (true) { - if (throtl_tg_can_upgrade(tg)) - return true; - tg = sq_to_tg(tg->service_queue.parent_sq); - if (!tg || !tg_to_blkg(tg)->parent) - return false; - } - return false; -} - -static bool throtl_can_upgrade(struct throtl_data *td, - struct throtl_grp *this_tg) -{ - struct cgroup_subsys_state *pos_css; - struct blkcg_gq *blkg; - - if (td->limit_index != LIMIT_LOW) - return false; - - if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice)) return false; - - rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - - if (tg == this_tg) - continue; - if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) - continue; - if (!throtl_hierarchy_can_upgrade(tg)) { - rcu_read_unlock(); - return false; - } - } - rcu_read_unlock(); - return true; -} - -static void throtl_upgrade_check(struct throtl_grp *tg) -{ - unsigned long now = jiffies; - - if (tg->td->limit_index != LIMIT_LOW) - return; - - if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) - return; - - tg->last_check_time = now; - - if (!time_after_eq(now, - __tg_last_low_overflow_time(tg) + tg->td->throtl_slice)) - return; - - if (throtl_can_upgrade(tg->td, NULL)) - throtl_upgrade_state(tg->td); -} - -static void throtl_upgrade_state(struct throtl_data *td) -{ - struct cgroup_subsys_state *pos_css; - struct blkcg_gq *blkg; - - throtl_log(&td->service_queue, "upgrade to max"); - td->limit_index = LIMIT_MAX; - td->low_upgrade_time = jiffies; - td->scale = 0; - rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - struct throtl_service_queue *sq = &tg->service_queue; - - tg->disptime = jiffies - 1; - throtl_select_dispatch(sq); - throtl_schedule_next_dispatch(sq, true); - } - rcu_read_unlock(); - throtl_select_dispatch(&td->service_queue); - throtl_schedule_next_dispatch(&td->service_queue, true); - queue_work(kthrotld_workqueue, &td->dispatch_work); -} - -static void throtl_downgrade_state(struct throtl_data *td) -{ - td->scale /= 2; - - throtl_log(&td->service_queue, "downgrade, scale %d", td->scale); - if (td->scale) { - td->low_upgrade_time = jiffies - td->scale * td->throtl_slice; - return; - } - - td->limit_index = LIMIT_LOW; - td->low_downgrade_time = jiffies; -} - -static bool throtl_tg_can_downgrade(struct throtl_grp *tg) -{ - struct throtl_data *td = tg->td; - unsigned long now = jiffies; - - /* - * If cgroup is below low limit, consider downgrade and throttle other - * cgroups - */ - if (time_after_eq(now, tg_last_low_overflow_time(tg) + - td->throtl_slice) && - (!throtl_tg_is_idle(tg) || - !list_empty(&tg_to_blkg(tg)->blkcg->css.children))) - return true; - return false; -} - -static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg) -{ - struct throtl_data *td = tg->td; - - if (time_before(jiffies, td->low_upgrade_time + td->throtl_slice)) - return false; - - while (true) { - if (!throtl_tg_can_downgrade(tg)) - return false; - tg = sq_to_tg(tg->service_queue.parent_sq); - if (!tg || !tg_to_blkg(tg)->parent) - break; - } - return true; -} - -static void throtl_downgrade_check(struct throtl_grp *tg) -{ - uint64_t bps; - unsigned int iops; - unsigned long elapsed_time; - unsigned long now = jiffies; - - if (tg->td->limit_index != LIMIT_MAX || - !tg->td->limit_valid[LIMIT_LOW]) - return; - if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) - return; - if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) - return; - - elapsed_time = now - tg->last_check_time; - tg->last_check_time = now; - - if (time_before(now, tg_last_low_overflow_time(tg) + - tg->td->throtl_slice)) - return; - - if (tg->bps[READ][LIMIT_LOW]) { - bps = tg->last_bytes_disp[READ] * HZ; - do_div(bps, elapsed_time); - if (bps >= tg->bps[READ][LIMIT_LOW]) - tg->last_low_overflow_time[READ] = now; - } - - if (tg->bps[WRITE][LIMIT_LOW]) { - bps = tg->last_bytes_disp[WRITE] * HZ; - do_div(bps, elapsed_time); - if (bps >= tg->bps[WRITE][LIMIT_LOW]) - tg->last_low_overflow_time[WRITE] = now; - } - - if (tg->iops[READ][LIMIT_LOW]) { - iops = tg->last_io_disp[READ] * HZ / elapsed_time; - if (iops >= tg->iops[READ][LIMIT_LOW]) - tg->last_low_overflow_time[READ] = now; - } - - if (tg->iops[WRITE][LIMIT_LOW]) { - iops = tg->last_io_disp[WRITE] * HZ / elapsed_time; - if (iops >= tg->iops[WRITE][LIMIT_LOW]) - tg->last_low_overflow_time[WRITE] = now; - } - - /* - * If cgroup is below low limit, consider downgrade and throttle other - * cgroups - */ - if (throtl_hierarchy_can_downgrade(tg)) - throtl_downgrade_state(tg->td); - - tg->last_bytes_disp[READ] = 0; - tg->last_bytes_disp[WRITE] = 0; - tg->last_io_disp[READ] = 0; - tg->last_io_disp[WRITE] = 0; -} - -static void blk_throtl_update_idletime(struct throtl_grp *tg) -{ - unsigned long now; - unsigned long last_finish_time = tg->last_finish_time; - - if (last_finish_time == 0) - return; - - now = blk_time_get_ns() >> 10; - if (now <= last_finish_time || - last_finish_time == tg->checked_last_finish_time) - return; - - tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3; - tg->checked_last_finish_time = last_finish_time; -} - -static void throtl_update_latency_buckets(struct throtl_data *td) -{ - struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE]; - int i, cpu, rw; - unsigned long last_latency[2] = { 0 }; - unsigned long latency[2]; - - if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW]) - return; - if (time_before(jiffies, td->last_calculate_time + HZ)) - return; - td->last_calculate_time = jiffies; - - memset(avg_latency, 0, sizeof(avg_latency)); - for (rw = READ; rw <= WRITE; rw++) { - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - struct latency_bucket *tmp = &td->tmp_buckets[rw][i]; - - for_each_possible_cpu(cpu) { - struct latency_bucket *bucket; - - /* this isn't race free, but ok in practice */ - bucket = per_cpu_ptr(td->latency_buckets[rw], - cpu); - tmp->total_latency += bucket[i].total_latency; - tmp->samples += bucket[i].samples; - bucket[i].total_latency = 0; - bucket[i].samples = 0; - } - - if (tmp->samples >= 32) { - int samples = tmp->samples; - - latency[rw] = tmp->total_latency; - - tmp->total_latency = 0; - tmp->samples = 0; - latency[rw] /= samples; - if (latency[rw] == 0) - continue; - avg_latency[rw][i].latency = latency[rw]; - } - } } - for (rw = READ; rw <= WRITE; rw++) { - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - if (!avg_latency[rw][i].latency) { - if (td->avg_buckets[rw][i].latency < last_latency[rw]) - td->avg_buckets[rw][i].latency = - last_latency[rw]; - continue; - } - - if (!td->avg_buckets[rw][i].valid) - latency[rw] = avg_latency[rw][i].latency; - else - latency[rw] = (td->avg_buckets[rw][i].latency * 7 + - avg_latency[rw][i].latency) >> 3; - - td->avg_buckets[rw][i].latency = max(latency[rw], - last_latency[rw]); - td->avg_buckets[rw][i].valid = true; - last_latency[rw] = td->avg_buckets[rw][i].latency; - } - } - - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) - throtl_log(&td->service_queue, - "Latency bucket %d: read latency=%ld, read valid=%d, " - "write latency=%ld, write valid=%d", i, - td->avg_buckets[READ][i].latency, - td->avg_buckets[READ][i].valid, - td->avg_buckets[WRITE][i].latency, - td->avg_buckets[WRITE][i].valid); -} -#else -static inline void throtl_update_latency_buckets(struct throtl_data *td) -{ -} - -static void blk_throtl_update_idletime(struct throtl_grp *tg) -{ + return tg_dispatch_time(tg, bio) == 0; } -static void throtl_downgrade_check(struct throtl_grp *tg) -{ -} - -static void throtl_upgrade_check(struct throtl_grp *tg) -{ -} - -static bool throtl_can_upgrade(struct throtl_data *td, - struct throtl_grp *this_tg) -{ - return false; -} - -static void throtl_upgrade_state(struct throtl_data *td) -{ -} -#endif - bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); @@ -2185,51 +1759,42 @@ bool __blk_throtl_bio(struct bio *bio) struct throtl_data *td = tg->td; rcu_read_lock(); - spin_lock_irq(&q->queue_lock); - - throtl_update_latency_buckets(td); - - blk_throtl_update_idletime(tg); - sq = &tg->service_queue; -again: while (true) { - if (tg->last_low_overflow_time[rw] == 0) - tg->last_low_overflow_time[rw] = jiffies; - throtl_downgrade_check(tg); - throtl_upgrade_check(tg); - /* throtl is FIFO - if bios are already queued, should queue */ - if (sq->nr_queued[rw]) - break; - - /* if above limits, break to queue */ - if (!tg_may_dispatch(tg, bio, NULL)) { - tg->last_low_overflow_time[rw] = jiffies; - if (throtl_can_upgrade(td, tg)) { - throtl_upgrade_state(td); - goto again; - } + if (tg_within_limit(tg, bio, rw)) { + /* within limits, let's charge and dispatch directly */ + throtl_charge_iops_bio(tg, bio); + + /* + * We need to trim slice even when bios are not being + * queued otherwise it might happen that a bio is not + * queued for a long time and slice keeps on extending + * and trim is not called for a long time. Now if limits + * are reduced suddenly we take into account all the IO + * dispatched so far at new low rate and * newly queued + * IO gets a really long dispatch time. + * + * So keep on trimming slice even if bio is not queued. + */ + throtl_trim_slice(tg, rw); + } else if (bio_issue_as_root_blkg(bio)) { + /* + * IOs which may cause priority inversions are + * dispatched directly, even if they're over limit. + * + * Charge and dispatch directly, and our throttle + * control algorithm is adaptive, and extra IO bytes + * will be throttled for paying the debt + */ + throtl_charge_bps_bio(tg, bio); + throtl_charge_iops_bio(tg, bio); + } else { + /* if above limits, break to queue */ break; } - /* within limits, let's charge and dispatch directly */ - throtl_charge_bio(tg, bio); - - /* - * We need to trim slice even when bios are not being queued - * otherwise it might happen that a bio is not queued for - * a long time and slice keeps on extending and trim is not - * called for a long time. Now if limits are reduced suddenly - * we take into account all the IO dispatched so far at new - * low rate and * newly queued IO gets a really long dispatch - * time. - * - * So keep on trimming slice even if bio is not queued. - */ - throtl_trim_slice(tg, rw); - /* * @bio passed through this layer without being throttled. * Climb up the ladder. If we're already at the top, it @@ -2250,9 +1815,7 @@ again: tg->bytes_disp[rw], bio->bi_iter.bi_size, tg_bps_limit(tg, rw), tg->io_disp[rw], tg_iops_limit(tg, rw), - sq->nr_queued[READ], sq->nr_queued[WRITE]); - - tg->last_low_overflow_time[rw] = jiffies; + sq_queued(sq, READ), sq_queued(sq, WRITE)); td->nr_queued[rw]++; throtl_add_bio_tg(bio, qn, tg); @@ -2260,225 +1823,37 @@ again: /* * Update @tg's dispatch time and force schedule dispatch if @tg - * was empty before @bio. The forced scheduling isn't likely to - * cause undue delay as @bio is likely to be dispatched directly if - * its @tg's disptime is not in the future. + * was empty before @bio, or the iops queue is empty and @bio will + * add to. The forced scheduling isn't likely to cause undue + * delay as @bio is likely to be dispatched directly if its @tg's + * disptime is not in the future. */ - if (tg->flags & THROTL_TG_WAS_EMPTY) { + if (tg->flags & THROTL_TG_WAS_EMPTY || + tg->flags & THROTL_TG_IOPS_WAS_EMPTY) { tg_update_disptime(tg); throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); } out_unlock: -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - if (throttled || !td->track_bio_latency) - bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY; -#endif spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); return throttled; } -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -static void throtl_track_latency(struct throtl_data *td, sector_t size, - enum req_op op, unsigned long time) -{ - const bool rw = op_is_write(op); - struct latency_bucket *latency; - int index; - - if (!td || td->limit_index != LIMIT_LOW || - !(op == REQ_OP_READ || op == REQ_OP_WRITE) || - !blk_queue_nonrot(td->queue)) - return; - - index = request_bucket_index(size); - - latency = get_cpu_ptr(td->latency_buckets[rw]); - latency[index].total_latency += time; - latency[index].samples++; - put_cpu_ptr(td->latency_buckets[rw]); -} - -void blk_throtl_stat_add(struct request *rq, u64 time_ns) -{ - struct request_queue *q = rq->q; - struct throtl_data *td = q->td; - - throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq), - time_ns >> 10); -} - -void blk_throtl_bio_endio(struct bio *bio) -{ - struct blkcg_gq *blkg; - struct throtl_grp *tg; - u64 finish_time_ns; - unsigned long finish_time; - unsigned long start_time; - unsigned long lat; - int rw = bio_data_dir(bio); - - blkg = bio->bi_blkg; - if (!blkg) - return; - tg = blkg_to_tg(blkg); - if (!tg->td->limit_valid[LIMIT_LOW]) - return; - - finish_time_ns = blk_time_get_ns(); - tg->last_finish_time = finish_time_ns >> 10; - - start_time = bio_issue_time(&bio->bi_issue) >> 10; - finish_time = __bio_issue_time(finish_time_ns) >> 10; - if (!start_time || finish_time <= start_time) - return; - - lat = finish_time - start_time; - /* this is only for bio based driver */ - if (!(bio->bi_issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY)) - throtl_track_latency(tg->td, bio_issue_size(&bio->bi_issue), - bio_op(bio), lat); - - if (tg->latency_target && lat >= tg->td->filtered_latency) { - int bucket; - unsigned int threshold; - - bucket = request_bucket_index(bio_issue_size(&bio->bi_issue)); - threshold = tg->td->avg_buckets[rw][bucket].latency + - tg->latency_target; - if (lat > threshold) - tg->bad_bio_cnt++; - /* - * Not race free, could get wrong count, which means cgroups - * will be throttled - */ - tg->bio_cnt++; - } - - if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) { - tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies; - tg->bio_cnt /= 2; - tg->bad_bio_cnt /= 2; - } -} -#endif - -int blk_throtl_init(struct gendisk *disk) -{ - struct request_queue *q = disk->queue; - struct throtl_data *td; - int ret; - - td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); - if (!td) - return -ENOMEM; - td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) * - LATENCY_BUCKET_SIZE, __alignof__(u64)); - if (!td->latency_buckets[READ]) { - kfree(td); - return -ENOMEM; - } - td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) * - LATENCY_BUCKET_SIZE, __alignof__(u64)); - if (!td->latency_buckets[WRITE]) { - free_percpu(td->latency_buckets[READ]); - kfree(td); - return -ENOMEM; - } - - INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); - throtl_service_queue_init(&td->service_queue); - - q->td = td; - td->queue = q; - - td->limit_valid[LIMIT_MAX] = true; - td->limit_index = LIMIT_MAX; - td->low_upgrade_time = jiffies; - td->low_downgrade_time = jiffies; - - /* activate policy */ - ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); - if (ret) { - free_percpu(td->latency_buckets[READ]); - free_percpu(td->latency_buckets[WRITE]); - kfree(td); - } - return ret; -} - void blk_throtl_exit(struct gendisk *disk) { struct request_queue *q = disk->queue; - BUG_ON(!q->td); - del_timer_sync(&q->td->service_queue.pending_timer); + if (!blk_throtl_activated(q)) + return; + + timer_delete_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); blkcg_deactivate_policy(disk, &blkcg_policy_throtl); - free_percpu(q->td->latency_buckets[READ]); - free_percpu(q->td->latency_buckets[WRITE]); kfree(q->td); } -void blk_throtl_register(struct gendisk *disk) -{ - struct request_queue *q = disk->queue; - struct throtl_data *td; - int i; - - td = q->td; - BUG_ON(!td); - - if (blk_queue_nonrot(q)) { - td->throtl_slice = DFL_THROTL_SLICE_SSD; - td->filtered_latency = LATENCY_FILTERED_SSD; - } else { - td->throtl_slice = DFL_THROTL_SLICE_HD; - td->filtered_latency = LATENCY_FILTERED_HD; - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY; - td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY; - } - } -#ifndef CONFIG_BLK_DEV_THROTTLING_LOW - /* if no low limit, use previous default */ - td->throtl_slice = DFL_THROTL_SLICE_HD; - -#else - td->track_bio_latency = !queue_is_mq(q); - if (!td->track_bio_latency) - blk_stat_enable_accounting(q); -#endif -} - -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page) -{ - if (!q->td) - return -EINVAL; - return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice)); -} - -ssize_t blk_throtl_sample_time_store(struct request_queue *q, - const char *page, size_t count) -{ - unsigned long v; - unsigned long t; - - if (!q->td) - return -EINVAL; - if (kstrtoul(page, 10, &v)) - return -EINVAL; - t = msecs_to_jiffies(v); - if (t == 0 || t > MAX_THROTL_SLICE) - return -EINVAL; - q->td->throtl_slice = t; - return count; -} -#endif - static int __init throtl_init(void) { kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); diff --git a/block/blk-throttle.h b/block/blk-throttle.h index bffbc9cfc8ab..3b27755bfbff 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef BLK_THROTTLE_H #define BLK_THROTTLE_H @@ -28,7 +29,8 @@ */ struct throtl_qnode { struct list_head node; /* service_queue->queued[] */ - struct bio_list bios; /* queued bios */ + struct bio_list bios_bps; /* queued bios for bps limit */ + struct bio_list bios_iops; /* queued bios for iops limit */ struct throtl_grp *tg; /* tg this qnode belongs to */ }; @@ -40,7 +42,8 @@ struct throtl_service_queue { * children throtl_grp's. */ struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ - unsigned int nr_queued[2]; /* number of queued bios */ + unsigned int nr_queued_bps[2]; /* number of queued bps bios */ + unsigned int nr_queued_iops[2]; /* number of queued iops bios */ /* * RB tree of active children throtl_grp's, which are sorted by @@ -53,15 +56,14 @@ struct throtl_service_queue { }; enum tg_state_flags { - THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ - THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ - THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */ -}; - -enum { - LIMIT_LOW, - LIMIT_MAX, - LIMIT_CNT, + THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ + THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ + /* + * The sq's iops queue is empty, and a bio is about to be enqueued + * to the first qnode's bios_iops list. + */ + THROTL_TG_IOPS_WAS_EMPTY = 1 << 2, + THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */ }; struct throtl_grp { @@ -101,53 +103,29 @@ struct throtl_grp { bool has_rules_bps[2]; bool has_rules_iops[2]; - /* internally used bytes per second rate limits */ - uint64_t bps[2][LIMIT_CNT]; - /* user configured bps limits */ - uint64_t bps_conf[2][LIMIT_CNT]; + /* bytes per second rate limits */ + uint64_t bps[2]; - /* internally used IOPS limits */ - unsigned int iops[2][LIMIT_CNT]; - /* user configured IOPS limits */ - unsigned int iops_conf[2][LIMIT_CNT]; - - /* Number of bytes dispatched in current slice */ - uint64_t bytes_disp[2]; - /* Number of bio's dispatched in current slice */ - unsigned int io_disp[2]; - - unsigned long last_low_overflow_time[2]; - - uint64_t last_bytes_disp[2]; - unsigned int last_io_disp[2]; + /* IOPS limits */ + unsigned int iops[2]; /* - * The following two fields are updated when new configuration is - * submitted while some bios are still throttled, they record how many - * bytes/ios are waited already in previous configuration, and they will - * be used to calculate wait time under new configuration. + * Number of bytes/bio's dispatched in current slice. + * When new configuration is submitted while some bios are still throttled, + * first calculate the carryover: the amount of bytes/IOs already waited + * under the previous configuration. Then, [bytes/io]_disp are represented + * as the negative of the carryover, and they will be used to calculate the + * wait time under the new configuration. */ - long long carryover_bytes[2]; - int carryover_ios[2]; + int64_t bytes_disp[2]; + int io_disp[2]; unsigned long last_check_time; - unsigned long latency_target; /* us */ - unsigned long latency_target_conf; /* us */ /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; - unsigned long last_finish_time; /* ns / 1024 */ - unsigned long checked_last_finish_time; /* ns / 1024 */ - unsigned long avg_idletime; /* ns / 1024 */ - unsigned long idletime_threshold; /* us */ - unsigned long idletime_threshold_conf; /* us */ - - unsigned int bio_cnt; /* total bios */ - unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ - unsigned long bio_cnt_reset_time; - struct blkg_rwstat stat_bytes; struct blkg_rwstat stat_ios; }; @@ -168,23 +146,33 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) * Internal throttling interface */ #ifndef CONFIG_BLK_DEV_THROTTLING -static inline int blk_throtl_init(struct gendisk *disk) { return 0; } static inline void blk_throtl_exit(struct gendisk *disk) { } -static inline void blk_throtl_register(struct gendisk *disk) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } static inline void blk_throtl_cancel_bios(struct gendisk *disk) { } #else /* CONFIG_BLK_DEV_THROTTLING */ -int blk_throtl_init(struct gendisk *disk); void blk_throtl_exit(struct gendisk *disk); -void blk_throtl_register(struct gendisk *disk); bool __blk_throtl_bio(struct bio *bio); void blk_throtl_cancel_bios(struct gendisk *disk); +static inline bool blk_throtl_activated(struct request_queue *q) +{ + return q->td != NULL; +} + static inline bool blk_should_throtl(struct bio *bio) { - struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); + struct throtl_grp *tg; int rw = bio_data_dir(bio); + /* + * This is called under bio_queue_enter(), and it's synchronized with + * the activation of blk-throtl, which is protected by + * blk_mq_freeze_queue(). + */ + if (!blk_throtl_activated(bio->bi_bdev->bd_queue)) + return false; + + tg = blkg_to_tg(bio->bi_blkg); if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { bio_set_flag(bio, BIO_CGROUP_ACCT); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 64472134dd26..a50d4cd55f41 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -37,7 +37,7 @@ enum wbt_flags { WBT_TRACKED = 1, /* write, tracked for throttling */ WBT_READ = 2, /* read */ - WBT_KSWAPD = 4, /* write, from kswapd */ + WBT_SWAP = 4, /* write, from swap_writeout() */ WBT_DISCARD = 8, /* discard */ WBT_NR_BITS = 4, /* number of bits */ @@ -45,7 +45,7 @@ enum wbt_flags { enum { WBT_RWQ_BG = 0, - WBT_RWQ_KSWAPD, + WBT_RWQ_SWAP, WBT_RWQ_DISCARD, WBT_NUM_RWQ, }; @@ -136,8 +136,9 @@ enum { RWB_MIN_WRITE_SAMPLES = 3, /* - * If we have this number of consecutive windows with not enough - * information to scale up or down, scale up. + * If we have this number of consecutive windows without enough + * information to scale up or down, slowly return to center state + * (step == 0). */ RWB_UNKNOWN_BUMP = 5, }; @@ -172,8 +173,8 @@ static bool wb_recent_wait(struct rq_wb *rwb) static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, enum wbt_flags wb_acct) { - if (wb_acct & WBT_KSWAPD) - return &rwb->rq_wait[WBT_RWQ_KSWAPD]; + if (wb_acct & WBT_SWAP) + return &rwb->rq_wait[WBT_RWQ_SWAP]; else if (wb_acct & WBT_DISCARD) return &rwb->rq_wait[WBT_RWQ_DISCARD]; @@ -206,8 +207,8 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw, */ if (wb_acct & WBT_DISCARD) limit = rwb->wb_background; - else if (test_bit(QUEUE_FLAG_WC, &rwb->rqos.disk->queue->queue_flags) && - !wb_recent_wait(rwb)) + else if (blk_queue_write_cache(rwb->rqos.disk->queue) && + !wb_recent_wait(rwb)) limit = 0; else limit = rwb->wb_normal; @@ -446,9 +447,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb) break; case LAT_UNKNOWN_WRITES: /* - * We started a the center step, but don't have a valid - * read/write sample, but we do have writes going on. - * Allow step to go negative, to increase write perf. + * We don't have a valid read/write sample, but we do have + * writes going on. Allow step to go negative, to increase + * write performance. */ scale_up(rwb); break; @@ -528,7 +529,7 @@ static bool close_io(struct rq_wb *rwb) time_before(now, rwb->last_comp + HZ / 10); } -#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) +#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO | REQ_SWAP) static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf) { @@ -539,13 +540,13 @@ static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf) /* * At this point we know it's a buffered write. If this is - * kswapd trying to free memory, or REQ_SYNC is set, then + * swap trying to free memory, or REQ_SYNC is set, then * it's WB_SYNC_ALL writeback, and we'll use the max limit for * that. If the write is marked as a background write, then use * the idle limit, or go to normal if we haven't had competing * IO for a bit. */ - if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) + if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb)) limit = rwb->rq_depth.max_depth; else if ((opf & REQ_BACKGROUND) || close_io(rwb)) { /* @@ -622,8 +623,8 @@ static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) if (bio_op(bio) == REQ_OP_READ) { flags = WBT_READ; } else if (wbt_should_throttle(bio)) { - if (current_is_kswapd()) - flags |= WBT_KSWAPD; + if (bio->bi_opf & REQ_SWAP) + flags |= WBT_SWAP; if (bio_op(bio) == REQ_OP_DISCARD) flags |= WBT_DISCARD; flags |= WBT_TRACKED; @@ -638,11 +639,7 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) __wbt_done(rqos, flags); } -/* - * May sleep, if we have exceeded the writeback limits. Caller can pass - * in an irq held spinlock, if it holds one when calling this function. - * If we do sleep, we'll release and re-grab it. - */ +/* May sleep, if we have exceeded the writeback limits. */ static void wbt_wait(struct rq_qos *rqos, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); @@ -707,8 +704,9 @@ void wbt_enable_default(struct gendisk *disk) struct rq_qos *rqos; bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ); - if (q->elevator && - test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags)) + mutex_lock(&disk->rqos_state_mutex); + + if (blk_queue_disable_wbt(q)) enable = false; /* Throttling already enabled? */ @@ -716,8 +714,10 @@ void wbt_enable_default(struct gendisk *disk) if (rqos) { if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT; + mutex_unlock(&disk->rqos_state_mutex); return; } + mutex_unlock(&disk->rqos_state_mutex); /* Queue not registered? Maybe shutting down... */ if (!blk_queue_registered(q)) @@ -777,11 +777,13 @@ void wbt_disable_default(struct gendisk *disk) struct rq_wb *rwb; if (!rqos) return; + mutex_lock(&disk->rqos_state_mutex); rwb = RQWB(rqos); if (rwb->enable_state == WBT_STATE_ON_DEFAULT) { blk_stat_deactivate(rwb->cb); rwb->enable_state = WBT_STATE_OFF_DEFAULT; } + mutex_unlock(&disk->rqos_state_mutex); } EXPORT_SYMBOL_GPL(wbt_disable_default); diff --git a/block/blk-zoned.c b/block/blk-zoned.c index da0f4b2a8fa0..8f15d1aa6eb8 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -7,17 +7,19 @@ * * Copyright (c) 2016, Damien Le Moal * Copyright (c) 2016, Western Digital + * Copyright (c) 2024, Western Digital Corporation or its affiliates. */ #include <linux/kernel.h> -#include <linux/module.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> -#include <linux/mm.h> -#include <linux/vmalloc.h> -#include <linux/sched/mm.h> +#include <linux/spinlock.h> +#include <linux/refcount.h> +#include <linux/mempool.h> #include "blk.h" +#include "blk-mq-sched.h" +#include "blk-mq-debugfs.h" #define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name static const char *const zone_cond_name[] = { @@ -32,6 +34,60 @@ static const char *const zone_cond_name[] = { }; #undef ZONE_COND_NAME +/* + * Per-zone write plug. + * @node: hlist_node structure for managing the plug using a hash table. + * @ref: Zone write plug reference counter. A zone write plug reference is + * always at least 1 when the plug is hashed in the disk plug hash table. + * The reference is incremented whenever a new BIO needing plugging is + * submitted and when a function needs to manipulate a plug. The + * reference count is decremented whenever a plugged BIO completes and + * when a function that referenced the plug returns. The initial + * reference is dropped whenever the zone of the zone write plug is reset, + * finished and when the zone becomes full (last write BIO to the zone + * completes). + * @lock: Spinlock to atomically manipulate the plug. + * @flags: Flags indicating the plug state. + * @zone_no: The number of the zone the plug is managing. + * @wp_offset: The zone write pointer location relative to the start of the zone + * as a number of 512B sectors. + * @bio_list: The list of BIOs that are currently plugged. + * @bio_work: Work struct to handle issuing of plugged BIOs + * @rcu_head: RCU head to free zone write plugs with an RCU grace period. + * @disk: The gendisk the plug belongs to. + */ +struct blk_zone_wplug { + struct hlist_node node; + refcount_t ref; + spinlock_t lock; + unsigned int flags; + unsigned int zone_no; + unsigned int wp_offset; + struct bio_list bio_list; + struct work_struct bio_work; + struct rcu_head rcu_head; + struct gendisk *disk; +}; + +/* + * Zone write plug flags bits: + * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged, + * that is, that write BIOs are being throttled due to a write BIO already + * being executed or the zone write plug bio list is not empty. + * - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone + * write pointer offset and need to update it. + * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed + * from the disk hash table and that the initial reference to the zone + * write plug set when the plug was first added to the hash table has been + * dropped. This flag is set when a zone is reset, finished or become full, + * to prevent new references to the zone write plug to be taken for + * newly incoming BIOs. A zone write plug flagged with this flag will be + * freed once all remaining references from BIOs or functions are dropped. + */ +#define BLK_ZONE_WPLUG_PLUGGED (1U << 0) +#define BLK_ZONE_WPLUG_NEED_WP_UPDATE (1U << 1) +#define BLK_ZONE_WPLUG_UNHASHED (1U << 2) + /** * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX. * @zone_cond: BLK_ZONE_COND_XXX. @@ -51,69 +107,29 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) } EXPORT_SYMBOL_GPL(blk_zone_cond_str); -/* - * Return true if a request is a write requests that needs zone write locking. - */ -bool blk_req_needs_zone_write_lock(struct request *rq) -{ - if (!rq->q->disk->seq_zones_wlock) - return false; - - return blk_rq_is_seq_zoned_write(rq); -} -EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); - -bool blk_req_zone_write_trylock(struct request *rq) -{ - unsigned int zno = blk_rq_zone_no(rq); - - if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock)) - return false; - - WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); - rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; - - return true; -} -EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock); - -void __blk_req_zone_write_lock(struct request *rq) -{ - if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), - rq->q->disk->seq_zones_wlock))) - return; +struct disk_report_zones_cb_args { + struct gendisk *disk; + report_zones_cb user_cb; + void *user_data; +}; - WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); - rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; -} -EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock); +static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk, + struct blk_zone *zone); -void __blk_req_zone_write_unlock(struct request *rq) +static int disk_report_zones_cb(struct blk_zone *zone, unsigned int idx, + void *data) { - rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED; - if (rq->q->disk->seq_zones_wlock) - WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), - rq->q->disk->seq_zones_wlock)); -} -EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); + struct disk_report_zones_cb_args *args = data; + struct gendisk *disk = args->disk; -/** - * bdev_nr_zones - Get number of zones - * @bdev: Target device - * - * Return the total number of zones of a zoned block device. For a block - * device without zone capabilities, the number of zones is always 0. - */ -unsigned int bdev_nr_zones(struct block_device *bdev) -{ - sector_t zone_sectors = bdev_zone_sectors(bdev); + if (disk->zone_wplugs_hash) + disk_zone_wplug_sync_wp_offset(disk, zone); - if (!bdev_is_zoned(bdev)) + if (!args->user_cb) return 0; - return (bdev_nr_sectors(bdev) + zone_sectors - 1) >> - ilog2(zone_sectors); + + return args->user_cb(zone, idx, args->user_data); } -EXPORT_SYMBOL_GPL(bdev_nr_zones); /** * blkdev_report_zones - Get zones information @@ -139,6 +155,11 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, { struct gendisk *disk = bdev->bd_disk; sector_t capacity = get_capacity(disk); + struct disk_report_zones_cb_args args = { + .disk = disk, + .user_cb = cb, + .user_data = data, + }; if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) return -EOPNOTSUPP; @@ -146,81 +167,11 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, if (!nr_zones || sector >= capacity) return 0; - return disk->fops->report_zones(disk, sector, nr_zones, cb, data); + return disk->fops->report_zones(disk, sector, nr_zones, + disk_report_zones_cb, &args); } EXPORT_SYMBOL_GPL(blkdev_report_zones); -static inline unsigned long *blk_alloc_zone_bitmap(int node, - unsigned int nr_zones) -{ - return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long), - GFP_NOIO, node); -} - -static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, - void *data) -{ - /* - * For an all-zones reset, ignore conventional, empty, read-only - * and offline zones. - */ - switch (zone->cond) { - case BLK_ZONE_COND_NOT_WP: - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_READONLY: - case BLK_ZONE_COND_OFFLINE: - return 0; - default: - set_bit(idx, (unsigned long *)data); - return 0; - } -} - -static int blkdev_zone_reset_all_emulated(struct block_device *bdev) -{ - struct gendisk *disk = bdev->bd_disk; - sector_t capacity = bdev_nr_sectors(bdev); - sector_t zone_sectors = bdev_zone_sectors(bdev); - unsigned long *need_reset; - struct bio *bio = NULL; - sector_t sector = 0; - int ret; - - need_reset = blk_alloc_zone_bitmap(disk->queue->node, disk->nr_zones); - if (!need_reset) - return -ENOMEM; - - ret = disk->fops->report_zones(disk, 0, disk->nr_zones, - blk_zone_need_reset_cb, need_reset); - if (ret < 0) - goto out_free_need_reset; - - ret = 0; - while (sector < capacity) { - if (!test_bit(disk_zone_no(disk, sector), need_reset)) { - sector += zone_sectors; - continue; - } - - bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC, - GFP_KERNEL); - bio->bi_iter.bi_sector = sector; - sector += zone_sectors; - - /* This may take a while, so be nice to others */ - cond_resched(); - } - - if (bio) { - ret = submit_bio_wait(bio); - bio_put(bio); - } - -out_free_need_reset: - kfree(need_reset); - return ret; -} - static int blkdev_zone_reset_all(struct block_device *bdev) { struct bio bio; @@ -247,7 +198,6 @@ static int blkdev_zone_reset_all(struct block_device *bdev) int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, sector_t sector, sector_t nr_sectors) { - struct request_queue *q = bdev_get_queue(bdev); sector_t zone_sectors = bdev_zone_sectors(bdev); sector_t capacity = bdev_nr_sectors(bdev); sector_t end_sector = sector + nr_sectors; @@ -275,16 +225,11 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, return -EINVAL; /* - * In the case of a zone reset operation over all zones, - * REQ_OP_ZONE_RESET_ALL can be used with devices supporting this - * command. For other devices, we emulate this command behavior by - * identifying the zones needing a reset. + * In the case of a zone reset operation over all zones, use + * REQ_OP_ZONE_RESET_ALL. */ - if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) { - if (!blk_queue_zone_resetall(q)) - return blkdev_zone_reset_all_emulated(bdev); + if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) return blkdev_zone_reset_all(bdev); - } while (sector < end_sector) { bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL); @@ -398,7 +343,8 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, op = REQ_OP_ZONE_RESET; /* Invalidate the page cache, including dirty pages. */ - filemap_invalidate_lock(bdev->bd_inode->i_mapping); + inode_lock(bdev->bd_mapping->host); + filemap_invalidate_lock(bdev->bd_mapping); ret = blkdev_truncate_zone_range(bdev, mode, &zrange); if (ret) goto fail; @@ -419,29 +365,1279 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors); fail: - if (cmd == BLKRESETZONE) - filemap_invalidate_unlock(bdev->bd_inode->i_mapping); + if (cmd == BLKRESETZONE) { + filemap_invalidate_unlock(bdev->bd_mapping); + inode_unlock(bdev->bd_mapping->host); + } return ret; } -void disk_free_zone_bitmaps(struct gendisk *disk) +static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) +{ + return zone->start + zone->len >= get_capacity(disk); +} + +static bool disk_zone_is_full(struct gendisk *disk, + unsigned int zno, unsigned int offset_in_zone) +{ + if (zno < disk->nr_zones - 1) + return offset_in_zone >= disk->zone_capacity; + return offset_in_zone >= disk->last_zone_capacity; +} + +static bool disk_zone_wplug_is_full(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset); +} + +static bool disk_insert_zone_wplug(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + struct blk_zone_wplug *zwplg; + unsigned long flags; + unsigned int idx = + hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits); + + /* + * Add the new zone write plug to the hash table, but carefully as we + * are racing with other submission context, so we may already have a + * zone write plug for the same zone. + */ + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) { + if (zwplg->zone_no == zwplug->zone_no) { + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + return false; + } + } + hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); + atomic_inc(&disk->nr_zone_wplugs); + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + + return true; +} + +static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk, + sector_t sector) +{ + unsigned int zno = disk_zone_no(disk, sector); + unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits); + struct blk_zone_wplug *zwplug; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) { + if (zwplug->zone_no == zno && + refcount_inc_not_zero(&zwplug->ref)) { + rcu_read_unlock(); + return zwplug; + } + } + + rcu_read_unlock(); + + return NULL; +} + +static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, + sector_t sector) +{ + if (!atomic_read(&disk->nr_zone_wplugs)) + return NULL; + + return disk_get_hashed_zone_wplug(disk, sector); +} + +static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head) +{ + struct blk_zone_wplug *zwplug = + container_of(rcu_head, struct blk_zone_wplug, rcu_head); + + mempool_free(zwplug, zwplug->disk->zone_wplugs_pool); +} + +static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) +{ + if (refcount_dec_and_test(&zwplug->ref)) { + WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); + WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); + WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)); + + call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); + } +} + +static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + lockdep_assert_held(&zwplug->lock); + + /* If the zone write plug was already removed, we are done. */ + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) + return false; + + /* If the zone write plug is still plugged, it cannot be removed. */ + if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) + return false; + + /* + * Completions of BIOs with blk_zone_write_plug_bio_endio() may + * happen after handling a request completion with + * blk_zone_write_plug_finish_request() (e.g. with split BIOs + * that are chained). In such case, disk_zone_wplug_unplug_bio() + * should not attempt to remove the zone write plug until all BIO + * completions are seen. Check by looking at the zone write plug + * reference count, which is 2 when the plug is unused (one reference + * taken when the plug was allocated and another reference taken by the + * caller context). + */ + if (refcount_read(&zwplug->ref) > 2) + return false; + + /* We can remove zone write plugs for zones that are empty or full. */ + return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug); +} + +static void disk_remove_zone_wplug(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + unsigned long flags; + + /* If the zone write plug was already removed, we have nothing to do. */ + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) + return; + + /* + * Mark the zone write plug as unhashed and drop the extra reference we + * took when the plug was inserted in the hash table. + */ + zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + hlist_del_init_rcu(&zwplug->node); + atomic_dec(&disk->nr_zone_wplugs); + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + disk_put_zone_wplug(zwplug); +} + +static void blk_zone_wplug_bio_work(struct work_struct *work); + +/* + * Get a reference on the write plug for the zone containing @sector. + * If the plug does not exist, it is allocated and hashed. + * Return a pointer to the zone write plug with the plug spinlock held. + */ +static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk, + sector_t sector, gfp_t gfp_mask, + unsigned long *flags) +{ + unsigned int zno = disk_zone_no(disk, sector); + struct blk_zone_wplug *zwplug; + +again: + zwplug = disk_get_zone_wplug(disk, sector); + if (zwplug) { + /* + * Check that a BIO completion or a zone reset or finish + * operation has not already removed the zone write plug from + * the hash table and dropped its reference count. In such case, + * we need to get a new plug so start over from the beginning. + */ + spin_lock_irqsave(&zwplug->lock, *flags); + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { + spin_unlock_irqrestore(&zwplug->lock, *flags); + disk_put_zone_wplug(zwplug); + goto again; + } + return zwplug; + } + + /* + * Allocate and initialize a zone write plug with an extra reference + * so that it is not freed when the zone write plug becomes idle without + * the zone being full. + */ + zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask); + if (!zwplug) + return NULL; + + INIT_HLIST_NODE(&zwplug->node); + refcount_set(&zwplug->ref, 2); + spin_lock_init(&zwplug->lock); + zwplug->flags = 0; + zwplug->zone_no = zno; + zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector); + bio_list_init(&zwplug->bio_list); + INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); + zwplug->disk = disk; + + spin_lock_irqsave(&zwplug->lock, *flags); + + /* + * Insert the new zone write plug in the hash table. This can fail only + * if another context already inserted a plug. Retry from the beginning + * in such case. + */ + if (!disk_insert_zone_wplug(disk, zwplug)) { + spin_unlock_irqrestore(&zwplug->lock, *flags); + mempool_free(zwplug, disk->zone_wplugs_pool); + goto again; + } + + return zwplug; +} + +static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug, + struct bio *bio) +{ + struct request_queue *q = zwplug->disk->queue; + + bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); + bio_io_error(bio); + disk_put_zone_wplug(zwplug); + /* Drop the reference taken by disk_zone_wplug_add_bio(() */ + blk_queue_exit(q); +} + +/* + * Abort (fail) all plugged BIOs of a zone write plug. + */ +static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) +{ + struct bio *bio; + + if (bio_list_empty(&zwplug->bio_list)) + return; + + pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n", + zwplug->disk->disk_name, zwplug->zone_no); + while ((bio = bio_list_pop(&zwplug->bio_list))) + blk_zone_wplug_bio_io_error(zwplug, bio); +} + +/* + * Set a zone write plug write pointer offset to the specified value. + * This aborts all plugged BIOs, which is fine as this function is called for + * a zone reset operation, a zone finish operation or if the zone needs a wp + * update from a report zone after a write error. + */ +static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, + struct blk_zone_wplug *zwplug, + unsigned int wp_offset) +{ + lockdep_assert_held(&zwplug->lock); + + /* Update the zone write pointer and abort all plugged BIOs. */ + zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE; + zwplug->wp_offset = wp_offset; + disk_zone_wplug_abort(zwplug); + + /* + * The zone write plug now has no BIO plugged: remove it from the + * hash table so that it cannot be seen. The plug will be freed + * when the last reference is dropped. + */ + if (disk_should_remove_zone_wplug(disk, zwplug)) + disk_remove_zone_wplug(disk, zwplug); +} + +static unsigned int blk_zone_wp_offset(struct blk_zone *zone) +{ + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + return zone->wp - zone->start; + case BLK_ZONE_COND_FULL: + return zone->len; + case BLK_ZONE_COND_EMPTY: + return 0; + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + default: + /* + * Conventional, offline and read-only zones do not have a valid + * write pointer. + */ + return UINT_MAX; + } +} + +static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk, + struct blk_zone *zone) +{ + struct blk_zone_wplug *zwplug; + unsigned long flags; + + zwplug = disk_get_zone_wplug(disk, zone->start); + if (!zwplug) + return; + + spin_lock_irqsave(&zwplug->lock, flags); + if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) + disk_zone_wplug_set_wp_offset(disk, zwplug, + blk_zone_wp_offset(zone)); + spin_unlock_irqrestore(&zwplug->lock, flags); + + disk_put_zone_wplug(zwplug); +} + +static int disk_zone_sync_wp_offset(struct gendisk *disk, sector_t sector) +{ + struct disk_report_zones_cb_args args = { + .disk = disk, + }; + + return disk->fops->report_zones(disk, sector, 1, + disk_report_zones_cb, &args); +} + +static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio, + unsigned int wp_offset) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + sector_t sector = bio->bi_iter.bi_sector; + struct blk_zone_wplug *zwplug; + unsigned long flags; + + /* Conventional zones cannot be reset nor finished. */ + if (!bdev_zone_is_seq(bio->bi_bdev, sector)) { + bio_io_error(bio); + return true; + } + + /* + * No-wait reset or finish BIOs do not make much sense as the callers + * issue these as blocking operations in most cases. To avoid issues + * the BIO execution potentially failing with BLK_STS_AGAIN, warn about + * REQ_NOWAIT being set and ignore that flag. + */ + if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) + bio->bi_opf &= ~REQ_NOWAIT; + + /* + * If we have a zone write plug, set its write pointer offset to 0 + * (reset case) or to the zone size (finish case). This will abort all + * BIOs plugged for the target zone. It is fine as resetting or + * finishing zones while writes are still in-flight will result in the + * writes failing anyway. + */ + zwplug = disk_get_zone_wplug(disk, sector); + if (zwplug) { + spin_lock_irqsave(&zwplug->lock, flags); + disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset); + spin_unlock_irqrestore(&zwplug->lock, flags); + disk_put_zone_wplug(zwplug); + } + + return false; +} + +static bool blk_zone_wplug_handle_reset_all(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + struct blk_zone_wplug *zwplug; + unsigned long flags; + sector_t sector; + + /* + * Set the write pointer offset of all zone write plugs to 0. This will + * abort all plugged BIOs. It is fine as resetting zones while writes + * are still in-flight will result in the writes failing anyway. + */ + for (sector = 0; sector < get_capacity(disk); + sector += disk->queue->limits.chunk_sectors) { + zwplug = disk_get_zone_wplug(disk, sector); + if (zwplug) { + spin_lock_irqsave(&zwplug->lock, flags); + disk_zone_wplug_set_wp_offset(disk, zwplug, 0); + spin_unlock_irqrestore(&zwplug->lock, flags); + disk_put_zone_wplug(zwplug); + } + } + + return false; +} + +static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + /* + * Take a reference on the zone write plug and schedule the submission + * of the next plugged BIO. blk_zone_wplug_bio_work() will release the + * reference we take here. + */ + WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); + refcount_inc(&zwplug->ref); + queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); +} + +static inline void disk_zone_wplug_add_bio(struct gendisk *disk, + struct blk_zone_wplug *zwplug, + struct bio *bio, unsigned int nr_segs) +{ + bool schedule_bio_work = false; + + /* + * Grab an extra reference on the BIO request queue usage counter. + * This reference will be reused to submit a request for the BIO for + * blk-mq devices and dropped when the BIO is failed and after + * it is issued in the case of BIO-based devices. + */ + percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter); + + /* + * The BIO is being plugged and thus will have to wait for the on-going + * write and for all other writes already plugged. So polling makes + * no sense. + */ + bio_clear_polled(bio); + + /* + * REQ_NOWAIT BIOs are always handled using the zone write plug BIO + * work, which can block. So clear the REQ_NOWAIT flag and schedule the + * work if this is the first BIO we are plugging. + */ + if (bio->bi_opf & REQ_NOWAIT) { + schedule_bio_work = !(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); + bio->bi_opf &= ~REQ_NOWAIT; + } + + /* + * Reuse the poll cookie field to store the number of segments when + * split to the hardware limits. + */ + bio->__bi_nr_segments = nr_segs; + + /* + * We always receive BIOs after they are split and ready to be issued. + * The block layer passes the parts of a split BIO in order, and the + * user must also issue write sequentially. So simply add the new BIO + * at the tail of the list to preserve the sequential write order. + */ + bio_list_add(&zwplug->bio_list, bio); + + zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; + + if (schedule_bio_work) + disk_zone_wplug_schedule_bio_work(disk, zwplug); +} + +/* + * Called from bio_attempt_back_merge() when a BIO was merged with a request. + */ +void blk_zone_write_plug_bio_merged(struct bio *bio) +{ + struct blk_zone_wplug *zwplug; + unsigned long flags; + + /* + * If the BIO was already plugged, then we were called through + * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge(). + * For this case, we already hold a reference on the zone write plug for + * the BIO and blk_zone_write_plug_init_request() will handle the + * zone write pointer offset update. + */ + if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) + return; + + bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); + + /* + * Get a reference on the zone write plug of the target zone and advance + * the zone write pointer offset. Given that this is a merge, we already + * have at least one request and one BIO referencing the zone write + * plug. So this should not fail. + */ + zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk, + bio->bi_iter.bi_sector); + if (WARN_ON_ONCE(!zwplug)) + return; + + spin_lock_irqsave(&zwplug->lock, flags); + zwplug->wp_offset += bio_sectors(bio); + spin_unlock_irqrestore(&zwplug->lock, flags); +} + +/* + * Attempt to merge plugged BIOs with a newly prepared request for a BIO that + * already went through zone write plugging (either a new BIO or one that was + * unplugged). + */ +void blk_zone_write_plug_init_request(struct request *req) +{ + sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req); + struct request_queue *q = req->q; + struct gendisk *disk = q->disk; + struct blk_zone_wplug *zwplug = + disk_get_zone_wplug(disk, blk_rq_pos(req)); + unsigned long flags; + struct bio *bio; + + if (WARN_ON_ONCE(!zwplug)) + return; + + /* + * Indicate that completion of this request needs to be handled with + * blk_zone_write_plug_finish_request(), which will drop the reference + * on the zone write plug we took above on entry to this function. + */ + req->rq_flags |= RQF_ZONE_WRITE_PLUGGING; + + if (blk_queue_nomerges(q)) + return; + + /* + * Walk through the list of plugged BIOs to check if they can be merged + * into the back of the request. + */ + spin_lock_irqsave(&zwplug->lock, flags); + while (!disk_zone_wplug_is_full(disk, zwplug)) { + bio = bio_list_peek(&zwplug->bio_list); + if (!bio) + break; + + if (bio->bi_iter.bi_sector != req_back_sector || + !blk_rq_merge_ok(req, bio)) + break; + + WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES && + !bio->__bi_nr_segments); + + bio_list_pop(&zwplug->bio_list); + if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) != + BIO_MERGE_OK) { + bio_list_add_head(&zwplug->bio_list, bio); + break; + } + + /* Drop the reference taken by disk_zone_wplug_add_bio(). */ + blk_queue_exit(q); + zwplug->wp_offset += bio_sectors(bio); + + req_back_sector += bio_sectors(bio); + } + spin_unlock_irqrestore(&zwplug->lock, flags); +} + +/* + * Check and prepare a BIO for submission by incrementing the write pointer + * offset of its zone write plug and changing zone append operations into + * regular write when zone append emulation is needed. + */ +static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, + struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + + lockdep_assert_held(&zwplug->lock); + + /* + * If we lost track of the zone write pointer due to a write error, + * the user must either execute a report zones, reset the zone or finish + * the to recover a reliable write pointer position. Fail BIOs if the + * user did not do that as we cannot handle emulated zone append + * otherwise. + */ + if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) + return false; + + /* + * Check that the user is not attempting to write to a full zone. + * We know such BIO will fail, and that would potentially overflow our + * write pointer offset beyond the end of the zone. + */ + if (disk_zone_wplug_is_full(disk, zwplug)) + return false; + + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + /* + * Use a regular write starting at the current write pointer. + * Similarly to native zone append operations, do not allow + * merging. + */ + bio->bi_opf &= ~REQ_OP_MASK; + bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE; + bio->bi_iter.bi_sector += zwplug->wp_offset; + + /* + * Remember that this BIO is in fact a zone append operation + * so that we can restore its operation code on completion. + */ + bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND); + } else { + /* + * Check for non-sequential writes early as we know that BIOs + * with a start sector not unaligned to the zone write pointer + * will fail. + */ + if (bio_offset_from_zone_start(bio) != zwplug->wp_offset) + return false; + } + + /* Advance the zone write pointer offset. */ + zwplug->wp_offset += bio_sectors(bio); + + return true; +} + +static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + sector_t sector = bio->bi_iter.bi_sector; + struct blk_zone_wplug *zwplug; + gfp_t gfp_mask = GFP_NOIO; + unsigned long flags; + + /* + * BIOs must be fully contained within a zone so that we use the correct + * zone write plug for the entire BIO. For blk-mq devices, the block + * layer should already have done any splitting required to ensure this + * and this BIO should thus not be straddling zone boundaries. For + * BIO-based devices, it is the responsibility of the driver to split + * the bio before submitting it. + */ + if (WARN_ON_ONCE(bio_straddles_zones(bio))) { + bio_io_error(bio); + return true; + } + + /* Conventional zones do not need write plugging. */ + if (!bdev_zone_is_seq(bio->bi_bdev, sector)) { + /* Zone append to conventional zones is not allowed. */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + bio_io_error(bio); + return true; + } + return false; + } + + if (bio->bi_opf & REQ_NOWAIT) + gfp_mask = GFP_NOWAIT; + + zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags); + if (!zwplug) { + if (bio->bi_opf & REQ_NOWAIT) + bio_wouldblock_error(bio); + else + bio_io_error(bio); + return true; + } + + /* Indicate that this BIO is being handled using zone write plugging. */ + bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); + + /* + * If the zone is already plugged, add the BIO to the plug BIO list. + * Do the same for REQ_NOWAIT BIOs to ensure that we will not see a + * BLK_STS_AGAIN failure if we let the BIO execute. + * Otherwise, plug and let the BIO execute. + */ + if ((zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) || + (bio->bi_opf & REQ_NOWAIT)) + goto plug; + + if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { + spin_unlock_irqrestore(&zwplug->lock, flags); + bio_io_error(bio); + return true; + } + + zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; + + spin_unlock_irqrestore(&zwplug->lock, flags); + + return false; + +plug: + disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs); + + spin_unlock_irqrestore(&zwplug->lock, flags); + + return true; +} + +static void blk_zone_wplug_handle_native_zone_append(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + struct blk_zone_wplug *zwplug; + unsigned long flags; + + /* + * We have native support for zone append operations, so we are not + * going to handle @bio through plugging. However, we may already have a + * zone write plug for the target zone if that zone was previously + * partially written using regular writes. In such case, we risk leaving + * the plug in the disk hash table if the zone is fully written using + * zone append operations. Avoid this by removing the zone write plug. + */ + zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); + if (likely(!zwplug)) + return; + + spin_lock_irqsave(&zwplug->lock, flags); + + /* + * We are about to remove the zone write plug. But if the user + * (mistakenly) has issued regular writes together with native zone + * append, we must aborts the writes as otherwise the plugged BIOs would + * not be executed by the plug BIO work as disk_get_zone_wplug() will + * return NULL after the plug is removed. Aborting the plugged write + * BIOs is consistent with the fact that these writes will most likely + * fail anyway as there is no ordering guarantees between zone append + * operations and regular write operations. + */ + if (!bio_list_empty(&zwplug->bio_list)) { + pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n", + disk->disk_name, zwplug->zone_no); + disk_zone_wplug_abort(zwplug); + } + disk_remove_zone_wplug(disk, zwplug); + spin_unlock_irqrestore(&zwplug->lock, flags); + + disk_put_zone_wplug(zwplug); +} + +/** + * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging + * @bio: The BIO being submitted + * @nr_segs: The number of physical segments of @bio + * + * Handle write, write zeroes and zone append operations requiring emulation + * using zone write plugging. + * + * Return true whenever @bio execution needs to be delayed through the zone + * write plug. Otherwise, return false to let the submission path process + * @bio normally. + */ +bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) +{ + struct block_device *bdev = bio->bi_bdev; + + if (!bdev->bd_disk->zone_wplugs_hash) + return false; + + /* + * If the BIO already has the plugging flag set, then it was already + * handled through this path and this is a submission from the zone + * plug bio submit work. + */ + if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) + return false; + + /* + * We do not need to do anything special for empty flush BIOs, e.g + * BIOs such as issued by blkdev_issue_flush(). The is because it is + * the responsibility of the user to first wait for the completion of + * write operations for flush to have any effect on the persistence of + * the written data. + */ + if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + return false; + + /* + * Regular writes and write zeroes need to be handled through the target + * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH + * which may need to go through the flush machinery depending on the + * target device capabilities. Plugging such writes is fine as the flush + * machinery operates at the request level, below the plug, and + * completion of the flush sequence will go through the regular BIO + * completion, which will handle zone write plugging. + * Zone append operations for devices that requested emulation must + * also be plugged so that these BIOs can be changed into regular + * write BIOs. + * Zone reset, reset all and finish commands need special treatment + * to correctly track the write pointer offset of zones. These commands + * are not plugged as we do not need serialization with write + * operations. It is the responsibility of the user to not issue reset + * and finish commands when write operations are in flight. + */ + switch (bio_op(bio)) { + case REQ_OP_ZONE_APPEND: + if (!bdev_emulates_zone_append(bdev)) { + blk_zone_wplug_handle_native_zone_append(bio); + return false; + } + fallthrough; + case REQ_OP_WRITE: + case REQ_OP_WRITE_ZEROES: + return blk_zone_wplug_handle_write(bio, nr_segs); + case REQ_OP_ZONE_RESET: + return blk_zone_wplug_handle_reset_or_finish(bio, 0); + case REQ_OP_ZONE_FINISH: + return blk_zone_wplug_handle_reset_or_finish(bio, + bdev_zone_sectors(bdev)); + case REQ_OP_ZONE_RESET_ALL: + return blk_zone_wplug_handle_reset_all(bio); + default: + return false; + } + + return false; +} +EXPORT_SYMBOL_GPL(blk_zone_plug_bio); + +static void disk_zone_wplug_unplug_bio(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + unsigned long flags; + + spin_lock_irqsave(&zwplug->lock, flags); + + /* Schedule submission of the next plugged BIO if we have one. */ + if (!bio_list_empty(&zwplug->bio_list)) { + disk_zone_wplug_schedule_bio_work(disk, zwplug); + spin_unlock_irqrestore(&zwplug->lock, flags); + return; + } + + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + + /* + * If the zone is full (it was fully written or finished, or empty + * (it was reset), remove its zone write plug from the hash table. + */ + if (disk_should_remove_zone_wplug(disk, zwplug)) + disk_remove_zone_wplug(disk, zwplug); + + spin_unlock_irqrestore(&zwplug->lock, flags); +} + +void blk_zone_write_plug_bio_endio(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + struct blk_zone_wplug *zwplug = + disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); + unsigned long flags; + + if (WARN_ON_ONCE(!zwplug)) + return; + + /* Make sure we do not see this BIO again by clearing the plug flag. */ + bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); + + /* + * If this is a regular write emulating a zone append operation, + * restore the original operation code. + */ + if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) { + bio->bi_opf &= ~REQ_OP_MASK; + bio->bi_opf |= REQ_OP_ZONE_APPEND; + } + + /* + * If the BIO failed, abort all plugged BIOs and mark the plug as + * needing a write pointer update. + */ + if (bio->bi_status != BLK_STS_OK) { + spin_lock_irqsave(&zwplug->lock, flags); + disk_zone_wplug_abort(zwplug); + zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE; + spin_unlock_irqrestore(&zwplug->lock, flags); + } + + /* Drop the reference we took when the BIO was issued. */ + disk_put_zone_wplug(zwplug); + + /* + * For BIO-based devices, blk_zone_write_plug_finish_request() + * is not called. So we need to schedule execution of the next + * plugged BIO here. + */ + if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) + disk_zone_wplug_unplug_bio(disk, zwplug); + + /* Drop the reference we took when entering this function. */ + disk_put_zone_wplug(zwplug); +} + +void blk_zone_write_plug_finish_request(struct request *req) +{ + struct gendisk *disk = req->q->disk; + struct blk_zone_wplug *zwplug; + + zwplug = disk_get_zone_wplug(disk, req->__sector); + if (WARN_ON_ONCE(!zwplug)) + return; + + req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING; + + /* + * Drop the reference we took when the request was initialized in + * blk_zone_write_plug_init_request(). + */ + disk_put_zone_wplug(zwplug); + + disk_zone_wplug_unplug_bio(disk, zwplug); + + /* Drop the reference we took when entering this function. */ + disk_put_zone_wplug(zwplug); +} + +static void blk_zone_wplug_bio_work(struct work_struct *work) +{ + struct blk_zone_wplug *zwplug = + container_of(work, struct blk_zone_wplug, bio_work); + struct block_device *bdev; + unsigned long flags; + struct bio *bio; + + /* + * Submit the next plugged BIO. If we do not have any, clear + * the plugged flag. + */ + spin_lock_irqsave(&zwplug->lock, flags); + +again: + bio = bio_list_pop(&zwplug->bio_list); + if (!bio) { + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + spin_unlock_irqrestore(&zwplug->lock, flags); + goto put_zwplug; + } + + if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { + blk_zone_wplug_bio_io_error(zwplug, bio); + goto again; + } + + spin_unlock_irqrestore(&zwplug->lock, flags); + + bdev = bio->bi_bdev; + submit_bio_noacct_nocheck(bio); + + /* + * blk-mq devices will reuse the extra reference on the request queue + * usage counter we took when the BIO was plugged, but the submission + * path for BIO-based devices will not do that. So drop this extra + * reference here. + */ + if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) + blk_queue_exit(bdev->bd_disk->queue); + +put_zwplug: + /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */ + disk_put_zone_wplug(zwplug); +} + +static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) +{ + return 1U << disk->zone_wplugs_hash_bits; +} + +void disk_init_zone_resources(struct gendisk *disk) +{ + spin_lock_init(&disk->zone_wplugs_lock); +} + +/* + * For the size of a disk zone write plug hash table, use the size of the + * zone write plug mempool, which is the maximum of the disk open zones and + * active zones limits. But do not exceed 4KB (512 hlist head entries), that is, + * 9 bits. For a disk that has no limits, mempool size defaults to 128. + */ +#define BLK_ZONE_WPLUG_MAX_HASH_BITS 9 +#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE 128 + +static int disk_alloc_zone_resources(struct gendisk *disk, + unsigned int pool_size) +{ + unsigned int i; + + atomic_set(&disk->nr_zone_wplugs, 0); + disk->zone_wplugs_hash_bits = + min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS); + + disk->zone_wplugs_hash = + kcalloc(disk_zone_wplugs_hash_size(disk), + sizeof(struct hlist_head), GFP_KERNEL); + if (!disk->zone_wplugs_hash) + return -ENOMEM; + + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) + INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]); + + disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size, + sizeof(struct blk_zone_wplug)); + if (!disk->zone_wplugs_pool) + goto free_hash; + + disk->zone_wplugs_wq = + alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI, + pool_size, disk->disk_name); + if (!disk->zone_wplugs_wq) + goto destroy_pool; + + return 0; + +destroy_pool: + mempool_destroy(disk->zone_wplugs_pool); + disk->zone_wplugs_pool = NULL; +free_hash: + kfree(disk->zone_wplugs_hash); + disk->zone_wplugs_hash = NULL; + disk->zone_wplugs_hash_bits = 0; + return -ENOMEM; +} + +static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) +{ + struct blk_zone_wplug *zwplug; + unsigned int i; + + if (!disk->zone_wplugs_hash) + return; + + /* Free all the zone write plugs we have. */ + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { + while (!hlist_empty(&disk->zone_wplugs_hash[i])) { + zwplug = hlist_entry(disk->zone_wplugs_hash[i].first, + struct blk_zone_wplug, node); + refcount_inc(&zwplug->ref); + disk_remove_zone_wplug(disk, zwplug); + disk_put_zone_wplug(zwplug); + } + } + + WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs)); + kfree(disk->zone_wplugs_hash); + disk->zone_wplugs_hash = NULL; + disk->zone_wplugs_hash_bits = 0; +} + +static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk, + unsigned long *bitmap) +{ + unsigned int nr_conv_zones = 0; + unsigned long flags; + + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + if (bitmap) + nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones); + bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap, + lockdep_is_held(&disk->zone_wplugs_lock)); + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + + kfree_rcu_mightsleep(bitmap); + + return nr_conv_zones; +} + +void disk_free_zone_resources(struct gendisk *disk) +{ + if (!disk->zone_wplugs_pool) + return; + + if (disk->zone_wplugs_wq) { + destroy_workqueue(disk->zone_wplugs_wq); + disk->zone_wplugs_wq = NULL; + } + + disk_destroy_zone_wplugs_hash_table(disk); + + /* + * Wait for the zone write plugs to be RCU-freed before + * destorying the mempool. + */ + rcu_barrier(); + + mempool_destroy(disk->zone_wplugs_pool); + disk->zone_wplugs_pool = NULL; + + disk_set_conv_zones_bitmap(disk, NULL); + disk->zone_capacity = 0; + disk->last_zone_capacity = 0; + disk->nr_zones = 0; +} + +static inline bool disk_need_zone_resources(struct gendisk *disk) { - kfree(disk->conv_zones_bitmap); - disk->conv_zones_bitmap = NULL; - kfree(disk->seq_zones_wlock); - disk->seq_zones_wlock = NULL; + /* + * All mq zoned devices need zone resources so that the block layer + * can automatically handle write BIO plugging. BIO-based device drivers + * (e.g. DM devices) are normally responsible for handling zone write + * ordering and do not need zone resources, unless the driver requires + * zone append emulation. + */ + return queue_is_mq(disk->queue) || + queue_emulates_zone_append(disk->queue); +} + +static int disk_revalidate_zone_resources(struct gendisk *disk, + unsigned int nr_zones) +{ + struct queue_limits *lim = &disk->queue->limits; + unsigned int pool_size; + + if (!disk_need_zone_resources(disk)) + return 0; + + /* + * If the device has no limit on the maximum number of open and active + * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE. + */ + pool_size = max(lim->max_open_zones, lim->max_active_zones); + if (!pool_size) + pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones); + + if (!disk->zone_wplugs_hash) + return disk_alloc_zone_resources(disk, pool_size); + + return 0; } struct blk_revalidate_zone_args { struct gendisk *disk; unsigned long *conv_zones_bitmap; - unsigned long *seq_zones_wlock; unsigned int nr_zones; + unsigned int zone_capacity; + unsigned int last_zone_capacity; sector_t sector; }; /* + * Update the disk zone resources information and device queue limits. + * The disk queue is frozen when this is executed. + */ +static int disk_update_zone_resources(struct gendisk *disk, + struct blk_revalidate_zone_args *args) +{ + struct request_queue *q = disk->queue; + unsigned int nr_seq_zones, nr_conv_zones; + unsigned int pool_size; + struct queue_limits lim; + + disk->nr_zones = args->nr_zones; + disk->zone_capacity = args->zone_capacity; + disk->last_zone_capacity = args->last_zone_capacity; + nr_conv_zones = + disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap); + if (nr_conv_zones >= disk->nr_zones) { + pr_warn("%s: Invalid number of conventional zones %u / %u\n", + disk->disk_name, nr_conv_zones, disk->nr_zones); + return -ENODEV; + } + + lim = queue_limits_start_update(q); + + /* + * Some devices can advertize zone resource limits that are larger than + * the number of sequential zones of the zoned block device, e.g. a + * small ZNS namespace. For such case, assume that the zoned device has + * no zone resource limits. + */ + nr_seq_zones = disk->nr_zones - nr_conv_zones; + if (lim.max_open_zones >= nr_seq_zones) + lim.max_open_zones = 0; + if (lim.max_active_zones >= nr_seq_zones) + lim.max_active_zones = 0; + + if (!disk->zone_wplugs_pool) + goto commit; + + /* + * If the device has no limit on the maximum number of open and active + * zones, set its max open zone limit to the mempool size to indicate + * to the user that there is a potential performance impact due to + * dynamic zone write plug allocation when simultaneously writing to + * more zones than the size of the mempool. + */ + pool_size = max(lim.max_open_zones, lim.max_active_zones); + if (!pool_size) + pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones); + + mempool_resize(disk->zone_wplugs_pool, pool_size); + + if (!lim.max_open_zones && !lim.max_active_zones) { + if (pool_size < nr_seq_zones) + lim.max_open_zones = pool_size; + else + lim.max_open_zones = 0; + } + +commit: + return queue_limits_commit_update_frozen(q, &lim); +} + +static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, + struct blk_revalidate_zone_args *args) +{ + struct gendisk *disk = args->disk; + + if (zone->capacity != zone->len) { + pr_warn("%s: Invalid conventional zone capacity\n", + disk->disk_name); + return -ENODEV; + } + + if (disk_zone_is_last(disk, zone)) + args->last_zone_capacity = zone->capacity; + + if (!disk_need_zone_resources(disk)) + return 0; + + if (!args->conv_zones_bitmap) { + args->conv_zones_bitmap = + bitmap_zalloc(args->nr_zones, GFP_NOIO); + if (!args->conv_zones_bitmap) + return -ENOMEM; + } + + set_bit(idx, args->conv_zones_bitmap); + + return 0; +} + +static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, + struct blk_revalidate_zone_args *args) +{ + struct gendisk *disk = args->disk; + struct blk_zone_wplug *zwplug; + unsigned int wp_offset; + unsigned long flags; + + /* + * Remember the capacity of the first sequential zone and check + * if it is constant for all zones, ignoring the last zone as it can be + * smaller. + */ + if (!args->zone_capacity) + args->zone_capacity = zone->capacity; + if (disk_zone_is_last(disk, zone)) { + args->last_zone_capacity = zone->capacity; + } else if (zone->capacity != args->zone_capacity) { + pr_warn("%s: Invalid variable zone capacity\n", + disk->disk_name); + return -ENODEV; + } + + /* + * If the device needs zone append emulation, we need to track the + * write pointer of all zones that are not empty nor full. So make sure + * we have a zone write plug for such zone if the device has a zone + * write plug hash table. + */ + if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash) + return 0; + + disk_zone_wplug_sync_wp_offset(disk, zone); + + wp_offset = blk_zone_wp_offset(zone); + if (!wp_offset || wp_offset >= zone->capacity) + return 0; + + zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags); + if (!zwplug) + return -ENOMEM; + spin_unlock_irqrestore(&zwplug->lock, flags); + disk_put_zone_wplug(zwplug); + + return 0; +} + +/* * Helper function to check the validity of zones of a zoned block device. */ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, @@ -449,9 +1645,8 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, { struct blk_revalidate_zone_args *args = data; struct gendisk *disk = args->disk; - struct request_queue *q = disk->queue; - sector_t capacity = get_capacity(disk); - sector_t zone_sectors = q->limits.chunk_sectors; + sector_t zone_sectors = disk->queue->limits.chunk_sectors; + int ret; /* Check for bad zones and holes in the zone report */ if (zone->start != args->sector) { @@ -460,7 +1655,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, return -ENODEV; } - if (zone->start >= capacity || !zone->len) { + if (zone->start >= get_capacity(disk) || !zone->len) { pr_warn("%s: Invalid zone start %llu, length %llu\n", disk->disk_name, zone->start, zone->len); return -ENODEV; @@ -470,7 +1665,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, * All zones must have the same size, with the exception on an eventual * smaller last zone. */ - if (zone->start + zone->len < capacity) { + if (!disk_zone_is_last(disk, zone)) { if (zone->len != zone_sectors) { pr_warn("%s: Invalid zoned device with non constant zone size\n", disk->disk_name); @@ -482,66 +1677,57 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, return -ENODEV; } + if (!zone->capacity || zone->capacity > zone->len) { + pr_warn("%s: Invalid zone capacity\n", + disk->disk_name); + return -ENODEV; + } + /* Check zone type */ switch (zone->type) { case BLK_ZONE_TYPE_CONVENTIONAL: - if (!args->conv_zones_bitmap) { - args->conv_zones_bitmap = - blk_alloc_zone_bitmap(q->node, args->nr_zones); - if (!args->conv_zones_bitmap) - return -ENOMEM; - } - set_bit(idx, args->conv_zones_bitmap); + ret = blk_revalidate_conv_zone(zone, idx, args); break; case BLK_ZONE_TYPE_SEQWRITE_REQ: - if (!args->seq_zones_wlock) { - args->seq_zones_wlock = - blk_alloc_zone_bitmap(q->node, args->nr_zones); - if (!args->seq_zones_wlock) - return -ENOMEM; - } + ret = blk_revalidate_seq_zone(zone, idx, args); break; case BLK_ZONE_TYPE_SEQWRITE_PREF: default: pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", disk->disk_name, (int)zone->type, zone->start); - return -ENODEV; + ret = -ENODEV; } - args->sector += zone->len; - return 0; + if (!ret) + args->sector += zone->len; + + return ret; } /** - * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps + * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs * @disk: Target disk - * @update_driver_data: Callback to update driver data on the frozen disk * - * Helper function for low-level device drivers to check and (re) allocate and - * initialize a disk request queue zone bitmaps. This functions should normally - * be called within the disk ->revalidate method for blk-mq based drivers. + * Helper function for low-level device drivers to check, (re) allocate and + * initialize resources used for managing zoned disks. This function should + * normally be called by blk-mq based drivers when a zoned gendisk is probed + * and when the zone configuration of the gendisk changes (e.g. after a format). * Before calling this function, the device driver must already have set the * device zone size (chunk_sector limit) and the max zone append limit. - * For BIO based drivers, this function cannot be used. BIO based device drivers - * only need to set disk->nr_zones so that the sysfs exposed value is correct. - * If the @update_driver_data callback function is not NULL, the callback is - * executed with the device request queue frozen after all zones have been - * checked. + * BIO based drivers can also use this function as long as the device queue + * can be safely frozen. */ -int blk_revalidate_disk_zones(struct gendisk *disk, - void (*update_driver_data)(struct gendisk *disk)) +int blk_revalidate_disk_zones(struct gendisk *disk) { struct request_queue *q = disk->queue; sector_t zone_sectors = q->limits.chunk_sectors; sector_t capacity = get_capacity(disk); struct blk_revalidate_zone_args args = { }; unsigned int noio_flag; - int ret; + int ret = -ENOMEM; if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) return -EIO; - if (WARN_ON_ONCE(!queue_is_mq(q))) - return -EIO; if (!capacity) return -ENODEV; @@ -556,12 +1742,6 @@ int blk_revalidate_disk_zones(struct gendisk *disk, return -ENODEV; } - if (!q->limits.max_zone_append_sectors) { - pr_warn("%s: Invalid 0 maximum zone append limit\n", - disk->disk_name); - return -ENODEV; - } - /* * Ensure that all memory allocations in this context are done as if * GFP_NOIO was specified. @@ -569,6 +1749,12 @@ int blk_revalidate_disk_zones(struct gendisk *disk, args.disk = disk; args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); noio_flag = memalloc_noio_save(); + ret = disk_revalidate_zone_resources(disk, args.nr_zones); + if (ret) { + memalloc_noio_restore(noio_flag); + return ret; + } + ret = disk->fops->report_zones(disk, 0, UINT_MAX, blk_revalidate_zone_cb, &args); if (!ret) { @@ -588,26 +1774,105 @@ int blk_revalidate_disk_zones(struct gendisk *disk, } /* - * Install the new bitmaps and update nr_zones only once the queue is - * stopped and all I/Os are completed (i.e. a scheduler is not - * referencing the bitmaps). + * Set the new disk zone parameters only once the queue is frozen and + * all I/Os are completed. */ - blk_mq_freeze_queue(q); - if (ret > 0) { - disk->nr_zones = args.nr_zones; - swap(disk->seq_zones_wlock, args.seq_zones_wlock); - swap(disk->conv_zones_bitmap, args.conv_zones_bitmap); - if (update_driver_data) - update_driver_data(disk); - ret = 0; - } else { + if (ret > 0) + ret = disk_update_zone_resources(disk, &args); + else pr_warn("%s: failed to revalidate zones\n", disk->disk_name); - disk_free_zone_bitmaps(disk); + if (ret) { + unsigned int memflags = blk_mq_freeze_queue(q); + + disk_free_zone_resources(disk); + blk_mq_unfreeze_queue(q, memflags); } - blk_mq_unfreeze_queue(q); - kfree(args.seq_zones_wlock); - kfree(args.conv_zones_bitmap); return ret; } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); + +/** + * blk_zone_issue_zeroout - zero-fill a block range in a zone + * @bdev: blockdev to write + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * + * Description: + * Zero-fill a block range in a zone (@sector must be equal to the zone write + * pointer), handling potential errors due to the (initially unknown) lack of + * hardware offload (See blkdev_issue_zeroout()). + */ +int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask) +{ + int ret; + + if (WARN_ON_ONCE(!bdev_is_zoned(bdev))) + return -EIO; + + ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, + BLKDEV_ZERO_NOFALLBACK); + if (ret != -EOPNOTSUPP) + return ret; + + /* + * The failed call to blkdev_issue_zeroout() advanced the zone write + * pointer. Undo this using a report zone to update the zone write + * pointer to the correct current value. + */ + ret = disk_zone_sync_wp_offset(bdev->bd_disk, sector); + if (ret != 1) + return ret < 0 ? ret : -EIO; + + /* + * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a + * regular write with zero-pages. + */ + return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0); +} +EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout); + +#ifdef CONFIG_BLK_DEBUG_FS +static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug, + struct seq_file *m) +{ + unsigned int zwp_wp_offset, zwp_flags; + unsigned int zwp_zone_no, zwp_ref; + unsigned int zwp_bio_list_size; + unsigned long flags; + + spin_lock_irqsave(&zwplug->lock, flags); + zwp_zone_no = zwplug->zone_no; + zwp_flags = zwplug->flags; + zwp_ref = refcount_read(&zwplug->ref); + zwp_wp_offset = zwplug->wp_offset; + zwp_bio_list_size = bio_list_size(&zwplug->bio_list); + spin_unlock_irqrestore(&zwplug->lock, flags); + + seq_printf(m, "%u 0x%x %u %u %u\n", zwp_zone_no, zwp_flags, zwp_ref, + zwp_wp_offset, zwp_bio_list_size); +} + +int queue_zone_wplugs_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct gendisk *disk = q->disk; + struct blk_zone_wplug *zwplug; + unsigned int i; + + if (!disk->zone_wplugs_hash) + return 0; + + rcu_read_lock(); + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) + hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i], + node) + queue_zone_wplug_show(zwplug, m); + rcu_read_unlock(); + + return 0; +} + +#endif diff --git a/block/blk.h b/block/blk.h index d9f584984bc4..37ec459fe656 100644 --- a/block/blk.h +++ b/block/blk.h @@ -2,7 +2,9 @@ #ifndef BLK_INTERNAL_H #define BLK_INTERNAL_H +#include <linux/bio-integrity.h> #include <linux/blk-crypto.h> +#include <linux/lockdep.h> #include <linux/memblock.h> /* for max_pfn/max_low_pfn */ #include <linux/sched/sysctl.h> #include <linux/timekeeping.h> @@ -11,6 +13,9 @@ struct elevator_type; +#define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9) +#define BLK_MIN_SEGMENT_SIZE 4096 + /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) @@ -33,11 +38,13 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, gfp_t flags); void blk_free_flush_queue(struct blk_flush_queue *q); -void blk_freeze_queue(struct request_queue *q); -void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); -void blk_queue_start_drain(struct request_queue *q); +bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); +bool blk_queue_start_drain(struct request_queue *q); +bool __blk_freeze_queue_start(struct request_queue *q, + struct task_struct *owner); int __bio_queue_enter(struct request_queue *q, struct bio *bio); void submit_bio_noacct_nocheck(struct bio *bio); +void bio_await_chain(struct bio *bio); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) { @@ -67,8 +74,11 @@ static inline int bio_queue_enter(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - if (blk_try_enter_queue(q, false)) + if (blk_try_enter_queue(q, false)) { + rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); + rwsem_release(&q->io_lockdep_map, _RET_IP_); return 0; + } return __bio_queue_enter(q, bio); } @@ -84,21 +94,23 @@ static inline void blk_wait_io(struct completion *done) wait_for_completion_io(done); } +struct block_device *blkdev_get_no_open(dev_t dev, bool autoload); +void blkdev_put_no_open(struct block_device *bdev); + #define BIO_INLINE_VECS 4 struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask); void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, - struct page *page, unsigned len, unsigned offset, - bool *same_page); + struct page *page, unsigned len, unsigned offset); static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { unsigned long mask = queue_segment_boundary(q); - phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset; - phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset; + phys_addr_t addr1 = bvec_phys(vec1); + phys_addr_t addr2 = bvec_phys(vec2); /* * Merging adjacent physical pages may not work correctly under KMSAN @@ -180,9 +192,11 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq) return queue_max_segments(rq->q); } -static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, - enum req_op op) +static inline unsigned int blk_queue_get_max_sectors(struct request *rq) { + struct request_queue *q = rq->q; + enum req_op op = req_op(rq); + if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)) return min(q->limits.max_discard_sectors, UINT_MAX >> SECTOR_SHIFT); @@ -190,16 +204,28 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, if (unlikely(op == REQ_OP_WRITE_ZEROES)) return q->limits.max_write_zeroes_sectors; + if (rq->cmd_flags & REQ_ATOMIC) + return q->limits.atomic_write_max_sectors; + return q->limits.max_sectors; } #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); -bool __bio_integrity_endio(struct bio *); void bio_integrity_free(struct bio *bio); + +/* + * Integrity payloads can either be owned by the submitter, in which case + * bio_uninit will free them, or owned and generated by the block layer, + * in which case we'll verify them here (for reads) and free them before + * the bio is handed back to the submitted. + */ +bool __bio_integrity_endio(struct bio *bio); static inline bool bio_integrity_endio(struct bio *bio) { - if (bio_integrity(bio)) + struct bio_integrity_payload *bip = bio_integrity(bio); + + if (bip && (bip->bip_flags & BIP_BLOCK_INTEGRITY)) return __bio_integrity_endio(bio); return true; } @@ -269,6 +295,14 @@ static inline void bio_integrity_free(struct bio *bio) unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); +enum bio_merge_status { + BIO_MERGE_OK, + BIO_MERGE_NONE, + BIO_MERGE_FAILED, +}; + +enum bio_merge_status bio_attempt_back_merge(struct request *req, + struct bio *bio, unsigned int nr_segs); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, @@ -287,11 +321,9 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, bool blk_insert_flush(struct request *rq); -int elevator_switch(struct request_queue *q, struct elevator_type *new_e); -void elevator_disable(struct request_queue *q); -void elevator_exit(struct request_queue *q); -int elv_register_queue(struct request_queue *q, bool uevent); -void elv_unregister_queue(struct request_queue *q); +void elv_update_nr_hw_queues(struct request_queue *q); +void elevator_set_default(struct request_queue *q); +void elevator_set_none(struct request_queue *q); ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); @@ -307,33 +339,92 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); -static inline bool bio_may_exceed_limits(struct bio *bio, - const struct queue_limits *lim) +struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, + unsigned *nsegs); +struct bio *bio_split_write_zeroes(struct bio *bio, + const struct queue_limits *lim, unsigned *nsegs); +struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, + unsigned *nr_segs); +struct bio *bio_split_zone_append(struct bio *bio, + const struct queue_limits *lim, unsigned *nr_segs); + +/* + * All drivers must accept single-segments bios that are smaller than PAGE_SIZE. + * + * This is a quick and dirty check that relies on the fact that bi_io_vec[0] is + * always valid if a bio has data. The check might lead to occasional false + * positives when bios are cloned, but compared to the performance impact of + * cloned bios themselves the loop below doesn't matter anyway. + */ +static inline bool bio_may_need_split(struct bio *bio, + const struct queue_limits *lim) +{ + if (lim->chunk_sectors) + return true; + if (bio->bi_vcnt != 1) + return true; + return bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > + lim->min_segment_size; +} + +/** + * __bio_split_to_limits - split a bio to fit the queue limits + * @bio: bio to be split + * @lim: queue limits to split based on + * @nr_segs: returns the number of segments in the returned bio + * + * Check if @bio needs splitting based on the queue limits, and if so split off + * a bio fitting the limits from the beginning of @bio and return it. @bio is + * shortened to the remainder and re-submitted. + * + * The split bio is allocated from @q->bio_split, which is provided by the + * block layer. + */ +static inline struct bio *__bio_split_to_limits(struct bio *bio, + const struct queue_limits *lim, unsigned int *nr_segs) { switch (bio_op(bio)) { + case REQ_OP_READ: + case REQ_OP_WRITE: + if (bio_may_need_split(bio, lim)) + return bio_split_rw(bio, lim, nr_segs); + *nr_segs = 1; + return bio; + case REQ_OP_ZONE_APPEND: + return bio_split_zone_append(bio, lim, nr_segs); case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: + return bio_split_discard(bio, lim, nr_segs); case REQ_OP_WRITE_ZEROES: - return true; /* non-trivial splitting decisions */ + return bio_split_write_zeroes(bio, lim, nr_segs); default: - break; + /* other operations can't be split */ + *nr_segs = 0; + return bio; } +} +/** + * get_max_segment_size() - maximum number of bytes to add as a single segment + * @lim: Request queue limits. + * @paddr: address of the range to add + * @len: maximum length available to add at @paddr + * + * Returns the maximum number of bytes of the range starting at @paddr that can + * be added to a single segment. + */ +static inline unsigned get_max_segment_size(const struct queue_limits *lim, + phys_addr_t paddr, unsigned int len) +{ /* - * All drivers must accept single-segments bios that are <= PAGE_SIZE. - * This is a quick and dirty check that relies on the fact that - * bi_io_vec[0] is always valid if a bio has data. The check might - * lead to occasional false negatives when bios are cloned, but compared - * to the performance impact of cloned bios themselves the loop below - * doesn't matter anyway. + * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1 + * after having calculated the minimum. */ - return lim->chunk_sectors || bio->bi_vcnt != 1 || - bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE; + return min_t(unsigned long, len, + min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr), + (unsigned long)lim->max_segment_size - 1) + 1); } -struct bio *__bio_split_to_limits(struct bio *bio, - const struct queue_limits *lim, - unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, @@ -343,19 +434,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio); enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); int blk_set_default_limits(struct queue_limits *lim); +void blk_apply_bdi_limits(struct backing_dev_info *bdi, + struct queue_limits *lim); int blk_dev_init(void); -/* - * Contribute to IO statistics IFF: - * - * a) it's attached to a gendisk, and - * b) the queue had IO stats enabled when this request was started - */ -static inline bool blk_do_io_stat(struct request *rq) -{ - return (rq->rq_flags & RQF_IO_STAT) && !blk_rq_is_passthrough(rq); -} - void update_io_ticks(struct block_device *part, unsigned long now, bool end); static inline void req_set_nomerge(struct request_queue *q, struct request *req) @@ -378,42 +460,78 @@ static inline void ioc_clear_queue(struct request_queue *q) } #endif /* CONFIG_BLK_ICQ */ -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); -extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, - const char *page, size_t count); -extern void blk_throtl_bio_endio(struct bio *bio); -extern void blk_throtl_stat_add(struct request *rq, u64 time); -#else -static inline void blk_throtl_bio_endio(struct bio *bio) { } -static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } -#endif - -struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q); - -static inline bool blk_queue_may_bounce(struct request_queue *q) +#ifdef CONFIG_BLK_DEV_ZONED +void disk_init_zone_resources(struct gendisk *disk); +void disk_free_zone_resources(struct gendisk *disk); +static inline bool bio_zone_write_plugging(struct bio *bio) { - return IS_ENABLED(CONFIG_BOUNCE) && - q->limits.bounce == BLK_BOUNCE_HIGH && - max_low_pfn >= max_pfn; + return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); } - -static inline struct bio *blk_queue_bounce(struct bio *bio, - struct request_queue *q) +void blk_zone_write_plug_bio_merged(struct bio *bio); +void blk_zone_write_plug_init_request(struct request *rq); +static inline void blk_zone_update_request_bio(struct request *rq, + struct bio *bio) { - if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio))) - return __blk_queue_bounce(bio, q); - return bio; + /* + * For zone append requests, the request sector indicates the location + * at which the BIO data was written. Return this value to the BIO + * issuer through the BIO iter sector. + * For plugged zone writes, which include emulated zone append, we need + * the original BIO sector so that blk_zone_write_plug_bio_endio() can + * lookup the zone write plug. + */ + if (req_op(rq) == REQ_OP_ZONE_APPEND || + bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) + bio->bi_iter.bi_sector = rq->__sector; +} +void blk_zone_write_plug_bio_endio(struct bio *bio); +static inline void blk_zone_bio_endio(struct bio *bio) +{ + /* + * For write BIOs to zoned devices, signal the completion of the BIO so + * that the next write BIO can be submitted by zone write plugging. + */ + if (bio_zone_write_plugging(bio)) + blk_zone_write_plug_bio_endio(bio); } -#ifdef CONFIG_BLK_DEV_ZONED -void disk_free_zone_bitmaps(struct gendisk *disk); +void blk_zone_write_plug_finish_request(struct request *rq); +static inline void blk_zone_finish_request(struct request *rq) +{ + if (rq->rq_flags & RQF_ZONE_WRITE_PLUGGING) + blk_zone_write_plug_finish_request(rq); +} int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg); int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ -static inline void disk_free_zone_bitmaps(struct gendisk *disk) {} +static inline void disk_init_zone_resources(struct gendisk *disk) +{ +} +static inline void disk_free_zone_resources(struct gendisk *disk) +{ +} +static inline bool bio_zone_write_plugging(struct bio *bio) +{ + return false; +} +static inline void blk_zone_write_plug_bio_merged(struct bio *bio) +{ +} +static inline void blk_zone_write_plug_init_request(struct request *rq) +{ +} +static inline void blk_zone_update_request_bio(struct request *rq, + struct bio *bio) +{ +} +static inline void blk_zone_bio_endio(struct bio *bio) +{ +} +static inline void blk_zone_finish_request(struct request *rq) +{ +} static inline int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg) { @@ -428,12 +546,15 @@ static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); void bdev_add(struct block_device *bdev, dev_t dev); +void bdev_unhash(struct block_device *bdev); +void bdev_drop(struct block_device *bdev); int blk_alloc_ext_minor(void); void blk_free_ext_minor(unsigned int minor); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 +#define ADDPART_FLAG_READONLY 4 int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); int bdev_del_partition(struct gendisk *disk, int partno); @@ -446,10 +567,6 @@ void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors); struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, struct lock_class_key *lkclass); -int bio_add_hw_page(struct request_queue *q, struct bio *bio, - struct page *page, unsigned int len, unsigned int offset, - unsigned int max_sectors, bool *same_page); - /* * Clean up a page appropriately, where the page may be pinned, may have a * ref taken on it or neither. @@ -481,6 +598,7 @@ blk_mode_t file_to_blk_mode(struct file *file); int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, loff_t lstart, loff_t lend); long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); +int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); extern const struct address_space_operations def_blk_aops; @@ -600,4 +718,34 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file); int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); +void blk_integrity_generate(struct bio *bio); +void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter); +void blk_integrity_prepare(struct request *rq); +void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); + +#ifdef CONFIG_LOCKDEP +static inline void blk_freeze_acquire_lock(struct request_queue *q) +{ + if (!q->mq_freeze_disk_dead) + rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_); + if (!q->mq_freeze_queue_dying) + rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_); +} + +static inline void blk_unfreeze_release_lock(struct request_queue *q) +{ + if (!q->mq_freeze_queue_dying) + rwsem_release(&q->q_lockdep_map, _RET_IP_); + if (!q->mq_freeze_disk_dead) + rwsem_release(&q->io_lockdep_map, _RET_IP_); +} +#else +static inline void blk_freeze_acquire_lock(struct request_queue *q) +{ +} +static inline void blk_unfreeze_release_lock(struct request_queue *q) +{ +} +#endif + #endif /* BLK_INTERNAL_H */ diff --git a/block/bounce.c b/block/bounce.c deleted file mode 100644 index d6a5219f29dd..000000000000 --- a/block/bounce.c +++ /dev/null @@ -1,269 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* bounce buffer handling for block devices - * - * - Split from highmem.c - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/mm.h> -#include <linux/export.h> -#include <linux/swap.h> -#include <linux/gfp.h> -#include <linux/bio.h> -#include <linux/pagemap.h> -#include <linux/mempool.h> -#include <linux/blkdev.h> -#include <linux/backing-dev.h> -#include <linux/init.h> -#include <linux/hash.h> -#include <linux/highmem.h> -#include <linux/printk.h> -#include <asm/tlbflush.h> - -#include <trace/events/block.h> -#include "blk.h" -#include "blk-cgroup.h" - -#define POOL_SIZE 64 -#define ISA_POOL_SIZE 16 - -static struct bio_set bounce_bio_set, bounce_bio_split; -static mempool_t page_pool; - -static void init_bounce_bioset(void) -{ - static bool bounce_bs_setup; - int ret; - - if (bounce_bs_setup) - return; - - ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); - BUG_ON(ret); - if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE)) - BUG_ON(1); - - ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0); - BUG_ON(ret); - bounce_bs_setup = true; -} - -static __init int init_emergency_pool(void) -{ - int ret; - -#ifndef CONFIG_MEMORY_HOTPLUG - if (max_pfn <= max_low_pfn) - return 0; -#endif - - ret = mempool_init_page_pool(&page_pool, POOL_SIZE, 0); - BUG_ON(ret); - pr_info("pool size: %d pages\n", POOL_SIZE); - - init_bounce_bioset(); - return 0; -} - -__initcall(init_emergency_pool); - -/* - * Simple bounce buffer support for highmem pages. Depending on the - * queue gfp mask set, *to may or may not be a highmem page. kmap it - * always, it will do the Right Thing - */ -static void copy_to_high_bio_irq(struct bio *to, struct bio *from) -{ - struct bio_vec tovec, fromvec; - struct bvec_iter iter; - /* - * The bio of @from is created by bounce, so we can iterate - * its bvec from start to end, but the @from->bi_iter can't be - * trusted because it might be changed by splitting. - */ - struct bvec_iter from_iter = BVEC_ITER_ALL_INIT; - - bio_for_each_segment(tovec, to, iter) { - fromvec = bio_iter_iovec(from, from_iter); - if (tovec.bv_page != fromvec.bv_page) { - /* - * fromvec->bv_offset and fromvec->bv_len might have - * been modified by the block layer, so use the original - * copy, bounce_copy_vec already uses tovec->bv_len - */ - memcpy_to_bvec(&tovec, page_address(fromvec.bv_page) + - tovec.bv_offset); - } - bio_advance_iter(from, &from_iter, tovec.bv_len); - } -} - -static void bounce_end_io(struct bio *bio) -{ - struct bio *bio_orig = bio->bi_private; - struct bio_vec *bvec, orig_vec; - struct bvec_iter orig_iter = bio_orig->bi_iter; - struct bvec_iter_all iter_all; - - /* - * free up bounce indirect pages used - */ - bio_for_each_segment_all(bvec, bio, iter_all) { - orig_vec = bio_iter_iovec(bio_orig, orig_iter); - if (bvec->bv_page != orig_vec.bv_page) { - dec_zone_page_state(bvec->bv_page, NR_BOUNCE); - mempool_free(bvec->bv_page, &page_pool); - } - bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len); - } - - bio_orig->bi_status = bio->bi_status; - bio_endio(bio_orig); - bio_put(bio); -} - -static void bounce_end_io_write(struct bio *bio) -{ - bounce_end_io(bio); -} - -static void bounce_end_io_read(struct bio *bio) -{ - struct bio *bio_orig = bio->bi_private; - - if (!bio->bi_status) - copy_to_high_bio_irq(bio_orig, bio); - - bounce_end_io(bio); -} - -static struct bio *bounce_clone_bio(struct bio *bio_src) -{ - struct bvec_iter iter; - struct bio_vec bv; - struct bio *bio; - - /* - * Pre immutable biovecs, __bio_clone() used to just do a memcpy from - * bio_src->bi_io_vec to bio->bi_io_vec. - * - * We can't do that anymore, because: - * - * - The point of cloning the biovec is to produce a bio with a biovec - * the caller can modify: bi_idx and bi_bvec_done should be 0. - * - * - The original bio could've had more than BIO_MAX_VECS biovecs; if - * we tried to clone the whole thing bio_alloc_bioset() would fail. - * But the clone should succeed as long as the number of biovecs we - * actually need to allocate is fewer than BIO_MAX_VECS. - * - * - Lastly, bi_vcnt should not be looked at or relied upon by code - * that does not own the bio - reason being drivers don't use it for - * iterating over the biovec anymore, so expecting it to be kept up - * to date (i.e. for clones that share the parent biovec) is just - * asking for trouble and would force extra work. - */ - bio = bio_alloc_bioset(bio_src->bi_bdev, bio_segments(bio_src), - bio_src->bi_opf, GFP_NOIO, &bounce_bio_set); - if (bio_flagged(bio_src, BIO_REMAPPED)) - bio_set_flag(bio, BIO_REMAPPED); - bio->bi_ioprio = bio_src->bi_ioprio; - bio->bi_write_hint = bio_src->bi_write_hint; - bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; - bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; - - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_ZEROES: - break; - default: - bio_for_each_segment(bv, bio_src, iter) - bio->bi_io_vec[bio->bi_vcnt++] = bv; - break; - } - - if (bio_crypt_clone(bio, bio_src, GFP_NOIO) < 0) - goto err_put; - - if (bio_integrity(bio_src) && - bio_integrity_clone(bio, bio_src, GFP_NOIO) < 0) - goto err_put; - - bio_clone_blkg_association(bio, bio_src); - - return bio; - -err_put: - bio_put(bio); - return NULL; -} - -struct bio *__blk_queue_bounce(struct bio *bio_orig, struct request_queue *q) -{ - struct bio *bio; - int rw = bio_data_dir(bio_orig); - struct bio_vec *to, from; - struct bvec_iter iter; - unsigned i = 0, bytes = 0; - bool bounce = false; - int sectors; - - bio_for_each_segment(from, bio_orig, iter) { - if (i++ < BIO_MAX_VECS) - bytes += from.bv_len; - if (PageHighMem(from.bv_page)) - bounce = true; - } - if (!bounce) - return bio_orig; - - /* - * Individual bvecs might not be logical block aligned. Round down - * the split size so that each bio is properly block size aligned, - * even if we do not use the full hardware limits. - */ - sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >> - SECTOR_SHIFT; - if (sectors < bio_sectors(bio_orig)) { - bio = bio_split(bio_orig, sectors, GFP_NOIO, &bounce_bio_split); - bio_chain(bio, bio_orig); - submit_bio_noacct(bio_orig); - bio_orig = bio; - } - bio = bounce_clone_bio(bio_orig); - - /* - * Bvec table can't be updated by bio_for_each_segment_all(), - * so retrieve bvec from the table directly. This way is safe - * because the 'bio' is single-page bvec. - */ - for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) { - struct page *bounce_page; - - if (!PageHighMem(to->bv_page)) - continue; - - bounce_page = mempool_alloc(&page_pool, GFP_NOIO); - inc_zone_page_state(bounce_page, NR_BOUNCE); - - if (rw == WRITE) { - flush_dcache_page(to->bv_page); - memcpy_from_bvec(page_address(bounce_page), to); - } - to->bv_page = bounce_page; - } - - trace_block_bio_bounce(bio_orig); - - bio->bi_flags |= (1 << BIO_BOUNCED); - - if (rw == READ) - bio->bi_end_io = bounce_end_io_read; - else - bio->bi_end_io = bounce_end_io_write; - - bio->bi_private = bio_orig; - return bio; -} diff --git a/block/bsg-lib.c b/block/bsg-lib.c index bcc7dee6abce..9ceb5d0832f5 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -219,7 +219,7 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) if (!buf->sg_list) return -ENOMEM; sg_init_table(buf->sg_list, req->nr_phys_segments); - buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list); + buf->sg_cnt = blk_rq_map_sg(req, buf->sg_list); buf->payload_len = blk_rq_bytes(req); return 0; } @@ -354,12 +354,14 @@ static const struct blk_mq_ops bsg_mq_ops = { * bsg_setup_queue - Create and add the bsg hooks so we can receive requests * @dev: device to attach bsg device to * @name: device to give bsg device + * @lim: queue limits for the bsg queue * @job_fn: bsg job handler * @timeout: timeout handler function pointer * @dd_job_size: size of LLD data needed for each job */ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, - bsg_job_fn *job_fn, bsg_timeout_fn *timeout, int dd_job_size) + struct queue_limits *lim, bsg_job_fn *job_fn, + bsg_timeout_fn *timeout, int dd_job_size) { struct bsg_set *bset; struct blk_mq_tag_set *set; @@ -379,17 +381,16 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, set->queue_depth = 128; set->numa_node = NUMA_NO_NODE; set->cmd_size = sizeof(struct bsg_job) + dd_job_size; - set->flags = BLK_MQ_F_NO_SCHED | BLK_MQ_F_BLOCKING; + set->flags = BLK_MQ_F_BLOCKING; if (blk_mq_alloc_tag_set(set)) goto out_tag_set; - q = blk_mq_alloc_queue(set, NULL, NULL); + q = blk_mq_alloc_queue(set, lim, dev); if (IS_ERR(q)) { ret = PTR_ERR(q); goto out_queue; } - q->queuedata = dev; blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); bset->bd = bsg_register_queue(q, dev, name, bsg_transport_sg_io_fn); diff --git a/block/early-lookup.c b/block/early-lookup.c index 3effbd0d35e9..3fb57f7d2b12 100644 --- a/block/early-lookup.c +++ b/block/early-lookup.c @@ -78,7 +78,7 @@ static int __init devt_from_partuuid(const char *uuid_str, dev_t *devt) * to the partition number found by UUID. */ *devt = part_devt(dev_to_disk(dev), - dev_to_bdev(dev)->bd_partno + offset); + bdev_partno(dev_to_bdev(dev)) + offset); } else { *devt = dev->devt; } diff --git a/block/elevator.c b/block/elevator.c index 5ff093cb3cf8..ab22542e6cf0 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -45,6 +45,17 @@ #include "blk-wbt.h" #include "blk-cgroup.h" +/* Holding context data for changing elevator */ +struct elv_change_ctx { + const char *name; + bool no_uevent; + + /* for unregistering old elevator */ + struct elevator_queue *old; + /* for registering new elevator */ + struct elevator_queue *new; +}; + static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -83,13 +94,6 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); -static inline bool elv_support_features(struct request_queue *q, - const struct elevator_type *e) -{ - return (q->required_elevator_features & e->elevator_features) == - q->required_elevator_features; -} - /** * elevator_match - Check whether @e's name or alias matches @name * @e: Scheduler to test @@ -113,14 +117,13 @@ static struct elevator_type *__elevator_find(const char *name) return NULL; } -static struct elevator_type *elevator_find_get(struct request_queue *q, - const char *name) +static struct elevator_type *elevator_find_get(const char *name) { struct elevator_type *e; spin_lock(&elv_list_lock); e = __elevator_find(name); - if (e && (!elv_support_features(q, e) || !elevator_tryget(e))) + if (e && (!elevator_tryget(e))) e = NULL; spin_unlock(&elv_list_lock); return e; @@ -156,18 +159,18 @@ static void elevator_release(struct kobject *kobj) kfree(e); } -void elevator_exit(struct request_queue *q) +static void elevator_exit(struct request_queue *q) { struct elevator_queue *e = q->elevator; + lockdep_assert_held(&q->elevator_lock); + ioc_clear_queue(q); blk_mq_sched_free_rqs(q); mutex_lock(&e->sysfs_lock); blk_mq_exit_sched(q, e); mutex_unlock(&e->sysfs_lock); - - kobject_put(&e->kobj); } static inline void __elv_rqhash_del(struct request *rq) @@ -413,21 +416,22 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) return NULL; } -#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) +#define to_elv(atr) container_of_const((atr), struct elv_fs_entry, attr) static ssize_t elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { - struct elv_fs_entry *entry = to_elv(attr); + const struct elv_fs_entry *entry = to_elv(attr); struct elevator_queue *e; - ssize_t error; + ssize_t error = -ENODEV; if (!entry->show) return -EIO; e = container_of(kobj, struct elevator_queue, kobj); mutex_lock(&e->sysfs_lock); - error = e->type ? entry->show(e, page) : -ENOENT; + if (!test_bit(ELEVATOR_FLAG_DYING, &e->flags)) + error = entry->show(e, page); mutex_unlock(&e->sysfs_lock); return error; } @@ -436,16 +440,17 @@ static ssize_t elv_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { - struct elv_fs_entry *entry = to_elv(attr); + const struct elv_fs_entry *entry = to_elv(attr); struct elevator_queue *e; - ssize_t error; + ssize_t error = -ENODEV; if (!entry->store) return -EIO; e = container_of(kobj, struct elevator_queue, kobj); mutex_lock(&e->sysfs_lock); - error = e->type ? entry->store(e, page, length) : -ENOENT; + if (!test_bit(ELEVATOR_FLAG_DYING, &e->flags)) + error = entry->store(e, page, length); mutex_unlock(&e->sysfs_lock); return error; } @@ -460,16 +465,15 @@ static const struct kobj_type elv_ktype = { .release = elevator_release, }; -int elv_register_queue(struct request_queue *q, bool uevent) +static int elv_register_queue(struct request_queue *q, + struct elevator_queue *e, + bool uevent) { - struct elevator_queue *e = q->elevator; int error; - lockdep_assert_held(&q->sysfs_lock); - error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched"); if (!error) { - struct elv_fs_entry *attr = e->type->elevator_attrs; + const struct elv_fs_entry *attr = e->type->elevator_attrs; if (attr) { while (attr->attr.name) { if (sysfs_create_file(&e->kobj, &attr->attr)) @@ -480,20 +484,25 @@ int elv_register_queue(struct request_queue *q, bool uevent) if (uevent) kobject_uevent(&e->kobj, KOBJ_ADD); + /* + * Sched is initialized, it is ready to export it via + * debugfs + */ + blk_mq_sched_reg_debugfs(q); set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags); } return error; } -void elv_unregister_queue(struct request_queue *q) +static void elv_unregister_queue(struct request_queue *q, + struct elevator_queue *e) { - struct elevator_queue *e = q->elevator; - - lockdep_assert_held(&q->sysfs_lock); - if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) { kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); + + /* unexport via debugfs before exiting sched */ + blk_mq_sched_unreg_debugfs(q); } } @@ -555,238 +564,267 @@ void elv_unregister(struct elevator_type *e) } EXPORT_SYMBOL_GPL(elv_unregister); -static inline bool elv_support_iosched(struct request_queue *q) -{ - if (!queue_is_mq(q) || - (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))) - return false; - return true; -} - /* - * For single queue devices, default to using mq-deadline. If we have multiple - * queues or mq-deadline is not available, default to "none". + * Switch to new_e io scheduler. + * + * If switching fails, we are most likely running out of memory and not able + * to restore the old io scheduler, so leaving the io scheduler being none. */ -static struct elevator_type *elevator_get_default(struct request_queue *q) +static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx) { - if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) - return NULL; + struct elevator_type *new_e = NULL; + int ret = 0; - if (q->nr_hw_queues != 1 && - !blk_mq_is_shared_tags(q->tag_set->flags)) - return NULL; + WARN_ON_ONCE(q->mq_freeze_depth == 0); + lockdep_assert_held(&q->elevator_lock); - return elevator_find_get(q, "mq-deadline"); -} + if (strncmp(ctx->name, "none", 4)) { + new_e = elevator_find_get(ctx->name); + if (!new_e) + return -EINVAL; + } -/* - * Get the first elevator providing the features required by the request queue. - * Default to "none" if no matching elevator is found. - */ -static struct elevator_type *elevator_get_by_features(struct request_queue *q) -{ - struct elevator_type *e, *found = NULL; + blk_mq_quiesce_queue(q); - spin_lock(&elv_list_lock); + if (q->elevator) { + ctx->old = q->elevator; + elevator_exit(q); + } - list_for_each_entry(e, &elv_list, list) { - if (elv_support_features(q, e)) { - found = e; - break; - } + if (new_e) { + ret = blk_mq_init_sched(q, new_e); + if (ret) + goto out_unfreeze; + ctx->new = q->elevator; + } else { + blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); + q->elevator = NULL; + q->nr_requests = q->tag_set->queue_depth; } + blk_add_trace_msg(q, "elv switch: %s", ctx->name); - if (found && !elevator_tryget(found)) - found = NULL; +out_unfreeze: + blk_mq_unquiesce_queue(q); - spin_unlock(&elv_list_lock); - return found; + if (ret) { + pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n", + new_e->elevator_name); + } + + if (new_e) + elevator_put(new_e); + return ret; } -/* - * For a device queue that has no required features, use the default elevator - * settings. Otherwise, use the first elevator available matching the required - * features. If no suitable elevator is find or if the chosen elevator - * initialization fails, fall back to the "none" elevator (no elevator). - */ -void elevator_init_mq(struct request_queue *q) +static void elv_exit_and_release(struct request_queue *q) { - struct elevator_type *e; - int err; + struct elevator_queue *e; + unsigned memflags; - if (!elv_support_iosched(q)) - return; + memflags = blk_mq_freeze_queue(q); + mutex_lock(&q->elevator_lock); + e = q->elevator; + elevator_exit(q); + mutex_unlock(&q->elevator_lock); + blk_mq_unfreeze_queue(q, memflags); + if (e) + kobject_put(&e->kobj); +} - WARN_ON_ONCE(blk_queue_registered(q)); +static int elevator_change_done(struct request_queue *q, + struct elv_change_ctx *ctx) +{ + int ret = 0; - if (unlikely(q->elevator)) - return; + if (ctx->old) { + bool enable_wbt = test_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT, + &ctx->old->flags); - if (!q->required_elevator_features) - e = elevator_get_default(q); - else - e = elevator_get_by_features(q); - if (!e) - return; + elv_unregister_queue(q, ctx->old); + kobject_put(&ctx->old->kobj); + if (enable_wbt) + wbt_enable_default(q->disk); + } + if (ctx->new) { + ret = elv_register_queue(q, ctx->new, !ctx->no_uevent); + if (ret) + elv_exit_and_release(q); + } + return ret; +} + +/* + * Switch this queue to the given IO scheduler. + */ +static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) +{ + unsigned int memflags; + int ret = 0; + + lockdep_assert_held(&q->tag_set->update_nr_hwq_lock); + memflags = blk_mq_freeze_queue(q); /* - * We are called before adding disk, when there isn't any FS I/O, + * May be called before adding disk, when there isn't any FS I/O, * so freezing queue plus canceling dispatch work is enough to * drain any dispatch activities originated from passthrough * requests, then no need to quiesce queue which may add long boot * latency, especially when lots of disks are involved. + * + * Disk isn't added yet, so verifying queue lock only manually. */ - blk_mq_freeze_queue(q); blk_mq_cancel_work_sync(q); + mutex_lock(&q->elevator_lock); + if (!(q->elevator && elevator_match(q->elevator->type, ctx->name))) + ret = elevator_switch(q, ctx); + mutex_unlock(&q->elevator_lock); + blk_mq_unfreeze_queue(q, memflags); + if (!ret) + ret = elevator_change_done(q, ctx); - err = blk_mq_init_sched(q, e); - - blk_mq_unfreeze_queue(q); - - if (err) { - pr_warn("\"%s\" elevator initialization failed, " - "falling back to \"none\"\n", e->elevator_name); - } - - elevator_put(e); + return ret; } /* - * Switch to new_e io scheduler. - * - * If switching fails, we are most likely running out of memory and not able - * to restore the old io scheduler, so leaving the io scheduler being none. + * The I/O scheduler depends on the number of hardware queues, this forces a + * reattachment when nr_hw_queues changes. */ -int elevator_switch(struct request_queue *q, struct elevator_type *new_e) +void elv_update_nr_hw_queues(struct request_queue *q) { - int ret; - - lockdep_assert_held(&q->sysfs_lock); - - blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); - - if (q->elevator) { - elv_unregister_queue(q); - elevator_exit(q); - } + struct elv_change_ctx ctx = {}; + int ret = -ENODEV; - ret = blk_mq_init_sched(q, new_e); - if (ret) - goto out_unfreeze; + WARN_ON_ONCE(q->mq_freeze_depth == 0); - ret = elv_register_queue(q, true); - if (ret) { - elevator_exit(q); - goto out_unfreeze; - } - blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + mutex_lock(&q->elevator_lock); + if (q->elevator && !blk_queue_dying(q) && blk_queue_registered(q)) { + ctx.name = q->elevator->type->elevator_name; -out_unfreeze: - blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); - - if (ret) { - pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n", - new_e->elevator_name); + /* force to reattach elevator after nr_hw_queue is updated */ + ret = elevator_switch(q, &ctx); } - - return ret; + mutex_unlock(&q->elevator_lock); + blk_mq_unfreeze_queue_nomemrestore(q); + if (!ret) + WARN_ON_ONCE(elevator_change_done(q, &ctx)); } -void elevator_disable(struct request_queue *q) +/* + * Use the default elevator settings. If the chosen elevator initialization + * fails, fall back to the "none" elevator (no elevator). + */ +void elevator_set_default(struct request_queue *q) { - lockdep_assert_held(&q->sysfs_lock); + struct elv_change_ctx ctx = { + .name = "mq-deadline", + .no_uevent = true, + }; + int err = 0; - blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); + /* now we allow to switch elevator */ + blk_queue_flag_clear(QUEUE_FLAG_NO_ELV_SWITCH, q); - elv_unregister_queue(q); - elevator_exit(q); - blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); - q->elevator = NULL; - q->nr_requests = q->tag_set->queue_depth; - blk_add_trace_msg(q, "elv switch: none"); + if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) + return; - blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); + /* + * For single queue devices, default to using mq-deadline. If we + * have multiple queues or mq-deadline is not available, default + * to "none". + */ + if (elevator_find_get(ctx.name) && (q->nr_hw_queues == 1 || + blk_mq_is_shared_tags(q->tag_set->flags))) + err = elevator_change(q, &ctx); + if (err < 0) + pr_warn("\"%s\" elevator initialization, failed %d, " + "falling back to \"none\"\n", ctx.name, err); } -/* - * Switch this queue to the given IO scheduler. - */ -static int elevator_change(struct request_queue *q, const char *elevator_name) +void elevator_set_none(struct request_queue *q) { - struct elevator_type *e; - int ret; + struct elv_change_ctx ctx = { + .name = "none", + }; + int err; - /* Make sure queue is not in the middle of being removed */ - if (!blk_queue_registered(q)) - return -ENOENT; + err = elevator_change(q, &ctx); + if (err < 0) + pr_warn("%s: set none elevator failed %d\n", __func__, err); +} - if (!strncmp(elevator_name, "none", 4)) { - if (q->elevator) - elevator_disable(q); - return 0; - } +static void elv_iosched_load_module(const char *elevator_name) +{ + struct elevator_type *found; - if (q->elevator && elevator_match(q->elevator->type, elevator_name)) - return 0; + spin_lock(&elv_list_lock); + found = __elevator_find(elevator_name); + spin_unlock(&elv_list_lock); - e = elevator_find_get(q, elevator_name); - if (!e) { + if (!found) request_module("%s-iosched", elevator_name); - e = elevator_find_get(q, elevator_name); - if (!e) - return -EINVAL; - } - ret = elevator_switch(q, e); - elevator_put(e); - return ret; } -ssize_t elv_iosched_store(struct request_queue *q, const char *buf, +ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, size_t count) { char elevator_name[ELV_NAME_MAX]; + struct elv_change_ctx ctx = {}; int ret; + struct request_queue *q = disk->queue; + struct blk_mq_tag_set *set = q->tag_set; - if (!elv_support_iosched(q)) - return count; + /* Make sure queue is not in the middle of being removed */ + if (!blk_queue_registered(q)) + return -ENOENT; + /* + * If the attribute needs to load a module, do it before freezing the + * queue to ensure that the module file can be read when the request + * queue is the one for the device storing the module file. + */ strscpy(elevator_name, buf, sizeof(elevator_name)); - ret = elevator_change(q, strstrip(elevator_name)); - if (!ret) - return count; + ctx.name = strstrip(elevator_name); + + elv_iosched_load_module(ctx.name); + + down_read(&set->update_nr_hwq_lock); + if (!blk_queue_no_elv_switch(q)) { + ret = elevator_change(q, &ctx); + if (!ret) + ret = count; + } else { + ret = -ENOENT; + } + up_read(&set->update_nr_hwq_lock); return ret; } -ssize_t elv_iosched_show(struct request_queue *q, char *name) +ssize_t elv_iosched_show(struct gendisk *disk, char *name) { - struct elevator_queue *eq = q->elevator; + struct request_queue *q = disk->queue; struct elevator_type *cur = NULL, *e; int len = 0; - if (!elv_support_iosched(q)) - return sprintf(name, "none\n"); - + mutex_lock(&q->elevator_lock); if (!q->elevator) { len += sprintf(name+len, "[none] "); } else { len += sprintf(name+len, "none "); - cur = eq->type; + cur = q->elevator->type; } spin_lock(&elv_list_lock); list_for_each_entry(e, &elv_list, list) { if (e == cur) len += sprintf(name+len, "[%s] ", e->elevator_name); - else if (elv_support_features(q, e)) + else len += sprintf(name+len, "%s ", e->elevator_name); } spin_unlock(&elv_list_lock); len += sprintf(name+len, "\n"); + mutex_unlock(&q->elevator_lock); + return len; } diff --git a/block/elevator.h b/block/elevator.h index 7ca3d7b6ed82..a07ce773a38f 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -71,10 +71,9 @@ struct elevator_type size_t icq_size; /* see iocontext.h */ size_t icq_align; /* ditto */ - struct elv_fs_entry *elevator_attrs; + const struct elv_fs_entry *elevator_attrs; const char *elevator_name; const char *elevator_alias; - const unsigned int elevator_features; struct module *elevator_owner; #ifdef CONFIG_BLK_DEBUG_FS const struct blk_mq_debugfs_attr *queue_debugfs_attrs; @@ -122,7 +121,8 @@ struct elevator_queue }; #define ELEVATOR_FLAG_REGISTERED 0 -#define ELEVATOR_FLAG_DISABLE_WBT 1 +#define ELEVATOR_FLAG_DYING 1 +#define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT 2 /* * block elevator interface @@ -148,8 +148,8 @@ extern void elv_unregister(struct elevator_type *); /* * io scheduler sysfs switching */ -extern ssize_t elv_iosched_show(struct request_queue *, char *); -extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t); +ssize_t elv_iosched_show(struct gendisk *disk, char *page); +ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); extern bool elv_bio_merge_ok(struct request *, struct bio *); extern struct elevator_queue *elevator_alloc(struct request_queue *, @@ -183,4 +183,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t); #define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) #define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist) +void blk_mq_sched_reg_debugfs(struct request_queue *q); +void blk_mq_sched_unreg_debugfs(struct request_queue *q); + #endif /* _ELEVATOR_H */ diff --git a/block/fops.c b/block/fops.c index 679d9b752fe8..1309861d4c2c 100644 --- a/block/fops.c +++ b/block/fops.c @@ -17,6 +17,7 @@ #include <linux/fs.h> #include <linux/iomap.h> #include <linux/module.h> +#include <linux/io_uring/cmd.h> #include "blk.h" static inline struct inode *bdev_file_inode(struct file *file) @@ -34,28 +35,26 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb) return opf; } -static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos, - struct iov_iter *iter) +static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb, + struct iov_iter *iter) { - return pos & (bdev_logical_block_size(bdev) - 1) || + return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) || !bdev_iter_is_aligned(bdev, iter); } #define DIO_INLINE_BIO_VECS 4 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, - struct iov_iter *iter, unsigned int nr_pages) + struct iov_iter *iter, struct block_device *bdev, + unsigned int nr_pages) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; loff_t pos = iocb->ki_pos; bool should_dirty = false; struct bio bio; ssize_t ret; - if (blkdev_dio_unaligned(bdev, pos, iter)) - return -EINVAL; - + WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA); if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { @@ -74,7 +73,10 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; + bio.bi_write_stream = iocb->ki_write_stream; bio.bi_ioprio = iocb->ki_ioprio; + if (iocb->ki_flags & IOCB_ATOMIC) + bio.bi_opf |= REQ_ATOMIC; ret = bio_iov_iter_get_pages(&bio, iter); if (unlikely(ret)) @@ -124,12 +126,16 @@ static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->flags & DIO_SHOULD_DIRTY; + bool is_sync = dio->flags & DIO_IS_SYNC; if (bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; + if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA)) + bio_integrity_unmap_user(bio); + if (atomic_dec_and_test(&dio->ref)) { - if (!(dio->flags & DIO_IS_SYNC)) { + if (!is_sync) { struct kiocb *iocb = dio->iocb; ssize_t ret; @@ -161,9 +167,8 @@ static void blkdev_bio_end_io(struct bio *bio) } static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - unsigned int nr_pages) + struct block_device *bdev, unsigned int nr_pages) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; @@ -172,9 +177,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos = iocb->ki_pos; int ret = 0; - if (blkdev_dio_unaligned(bdev, pos, iter)) - return -EINVAL; - if (iocb->ki_flags & IOCB_ALLOC_CACHE) opf |= REQ_ALLOC_CACHE; bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, @@ -205,6 +207,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, for (;;) { bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; + bio->bi_write_stream = iocb->ki_write_stream; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; @@ -225,14 +228,16 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * a retry of this from blocking context. */ if (unlikely(iov_iter_count(iter))) { - bio_release_pages(bio, false); - bio_clear_flag(bio, BIO_REFFED); - bio_put(bio); - blk_finish_plug(&plug); - return -EAGAIN; + ret = -EAGAIN; + goto fail; } bio->bi_opf |= REQ_NOWAIT; } + if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) { + ret = bio_integrity_map_iter(bio, iocb->private); + if (unlikely(ret)) + goto fail; + } if (is_read) { if (dio->flags & DIO_SHOULD_DIRTY) @@ -273,6 +278,12 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio_put(&dio->bio); return ret; +fail: + bio_release_pages(bio, false); + bio_clear_flag(bio, BIO_REFFED); + bio_put(bio); + blk_finish_plug(&plug); + return ret; } static void blkdev_bio_end_io_async(struct bio *bio) @@ -290,6 +301,9 @@ static void blkdev_bio_end_io_async(struct bio *bio) ret = blk_status_to_errno(bio->bi_status); } + if (iocb->ki_flags & IOCB_HAS_METADATA) + bio_integrity_unmap_user(bio); + iocb->ki_complete(iocb, ret); if (dio->flags & DIO_SHOULD_DIRTY) { @@ -302,9 +316,9 @@ static void blkdev_bio_end_io_async(struct bio *bio) static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, struct iov_iter *iter, + struct block_device *bdev, unsigned int nr_pages) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); bool is_read = iov_iter_rw(iter) == READ; blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); struct blkdev_dio *dio; @@ -312,9 +326,6 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, loff_t pos = iocb->ki_pos; int ret = 0; - if (blkdev_dio_unaligned(bdev, pos, iter)) - return -EINVAL; - if (iocb->ki_flags & IOCB_ALLOC_CACHE) opf |= REQ_ALLOC_CACHE; bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, @@ -324,6 +335,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, dio->iocb = iocb; bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; + bio->bi_write_stream = iocb->ki_write_stream; bio->bi_end_io = blkdev_bio_end_io_async; bio->bi_ioprio = iocb->ki_ioprio; @@ -337,10 +349,8 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, bio_iov_bvec_set(bio, iter); } else { ret = bio_iov_iter_get_pages(bio, iter); - if (unlikely(ret)) { - bio_put(bio); - return ret; - } + if (unlikely(ret)) + goto out_bio_put; } dio->size = bio->bi_iter.bi_size; @@ -353,6 +363,16 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, task_io_account_write(bio->bi_iter.bi_size); } + if (iocb->ki_flags & IOCB_HAS_METADATA) { + ret = bio_integrity_map_iter(bio, iocb->private); + WRITE_ONCE(iocb->private, NULL); + if (unlikely(ret)) + goto out_bio_put; + } + + if (iocb->ki_flags & IOCB_ATOMIC) + bio->bi_opf |= REQ_ATOMIC; + if (iocb->ki_flags & IOCB_NOWAIT) bio->bi_opf |= REQ_NOWAIT; @@ -364,22 +384,53 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, submit_bio(bio); } return -EIOCBQUEUED; + +out_bio_put: + bio_put(bio); + return ret; } static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); unsigned int nr_pages; if (!iov_iter_count(iter)) return 0; + if (blkdev_dio_invalid(bdev, iocb, iter)) + return -EINVAL; + + if (iov_iter_rw(iter) == WRITE) { + u16 max_write_streams = bdev_max_write_streams(bdev); + + if (iocb->ki_write_stream) { + if (iocb->ki_write_stream > max_write_streams) + return -EINVAL; + } else if (max_write_streams) { + enum rw_hint write_hint = + file_inode(iocb->ki_filp)->i_write_hint; + + /* + * Just use the write hint as write stream for block + * device writes. This assumes no file system is + * mounted that would use the streams differently. + */ + if (write_hint <= max_write_streams) + iocb->ki_write_stream = write_hint; + } + } + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); if (likely(nr_pages <= BIO_MAX_VECS)) { if (is_sync_kiocb(iocb)) - return __blkdev_direct_IO_simple(iocb, iter, nr_pages); - return __blkdev_direct_IO_async(iocb, iter, nr_pages); + return __blkdev_direct_IO_simple(iocb, iter, bdev, + nr_pages); + return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages); + } else if (iocb->ki_flags & IOCB_ATOMIC) { + return -EINVAL; } - return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); + return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages)); } static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, @@ -388,10 +439,11 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, struct block_device *bdev = I_BDEV(inode); loff_t isize = i_size_read(inode); + if (offset >= isize) + return -EIO; + iomap->bdev = bdev; iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); - if (iomap->offset >= isize) - return -EIO; iomap->type = IOMAP_MAPPED; iomap->addr = iomap->offset; iomap->length = isize - iomap->offset; @@ -422,12 +474,13 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { + struct folio *folio = NULL; struct blk_plug plug; int err; blk_start_plug(&plug); - err = write_cache_pages(mapping, wbc, block_write_full_folio, - blkdev_get_block); + while ((folio = writeback_iter(mapping, wbc, folio, &err))) + err = block_write_full_folio(folio, wbc, blkdev_get_block); blk_finish_plug(&plug); return err; @@ -444,20 +497,20 @@ static void blkdev_readahead(struct readahead_control *rac) } static int blkdev_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { - return block_write_begin(mapping, pos, len, pagep, blkdev_get_block); + return block_write_begin(mapping, pos, len, foliop, blkdev_get_block); } static int blkdev_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct page *page, + loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { int ret; - ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); + ret = block_write_end(file, mapping, pos, len, copied, folio, fsdata); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return ret; } @@ -613,10 +666,13 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (ret) return ret; - bdev = blkdev_get_no_open(inode->i_rdev); + bdev = blkdev_get_no_open(inode->i_rdev, true); if (!bdev) return -ENXIO; + if (bdev_can_atomic_write(bdev)) + filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + ret = bdev_open(bdev, mode, filp->private_data, NULL, filp); if (ret) blkdev_put_no_open(bdev); @@ -655,7 +711,7 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) { - return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops); + return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL); } /* @@ -668,8 +724,9 @@ static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct block_device *bdev = I_BDEV(file->f_mapping->host); - struct inode *bd_inode = bdev->bd_inode; + struct inode *bd_inode = bdev_file_inode(file); + struct block_device *bdev = I_BDEV(bd_inode); + bool atomic = iocb->ki_flags & IOCB_ATOMIC; loff_t size = bdev_nr_bytes(bdev); size_t shorted = 0; ssize_t ret; @@ -689,8 +746,16 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) return -EOPNOTSUPP; + if (atomic) { + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + size -= iocb->ki_pos; if (iov_iter_count(from) > size) { + if (atomic) + return -EINVAL; shorted = iov_iter_count(from) - size; iov_iter_truncate(from, size); } @@ -705,7 +770,14 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = direct_write_fallback(iocb, from, ret, blkdev_buffered_write(iocb, from)); } else { + /* + * Take i_rwsem and invalidate_lock to avoid racing with + * set_blocksize changing i_blkbits/folio order and punching + * out the pagecache. + */ + inode_lock_shared(bd_inode); ret = blkdev_buffered_write(iocb, from); + inode_unlock_shared(bd_inode); } if (ret > 0) @@ -716,6 +788,7 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { + struct inode *bd_inode = bdev_file_inode(iocb->ki_filp); struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); loff_t size = bdev_nr_bytes(bdev); loff_t pos = iocb->ki_pos; @@ -742,16 +815,23 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) file_accessed(iocb->ki_filp); ret = blkdev_direct_IO(iocb, to); - if (ret >= 0) { + if (ret > 0) { iocb->ki_pos += ret; count -= ret; } - iov_iter_revert(to, count - iov_iter_count(to)); + if (ret != -EIOCBQUEUED) + iov_iter_revert(to, count - iov_iter_count(to)); if (ret < 0 || !count) goto reexpand; } + /* + * Take i_rwsem and invalidate_lock to avoid racing with set_blocksize + * changing i_blkbits/folio order and punching out the pagecache. + */ + inode_lock_shared(bd_inode); ret = filemap_read(iocb, to, ret); + inode_unlock_shared(bd_inode); reexpand: if (unlikely(shorted)) @@ -761,7 +841,7 @@ reexpand: #define BLKDEV_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ - FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE) + FALLOC_FL_ZERO_RANGE) static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) @@ -794,6 +874,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, if ((start | len) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; + inode_lock(inode); filemap_invalidate_lock(inode->i_mapping); /* @@ -820,20 +901,13 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, len >> SECTOR_SHIFT, GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); break; - case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: - error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); - if (error) - goto fail; - - error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, - len >> SECTOR_SHIFT, GFP_KERNEL); - break; default: error = -EOPNOTSUPP; } fail: filemap_invalidate_unlock(inode->i_mapping); + inode_unlock(inode); return error; } @@ -863,6 +937,8 @@ const struct file_operations def_blk_fops = { .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fallocate = blkdev_fallocate, + .uring_cmd = blkdev_uring_cmd, + .fop_flags = FOP_BUFFER_RASYNC, }; static __init int blkdev_init(void) diff --git a/block/genhd.c b/block/genhd.c index bb29a68e1d67..8171a6bc3210 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -58,6 +58,13 @@ static DEFINE_IDA(ext_devt_ida); void set_capacity(struct gendisk *disk, sector_t sectors) { + if (sectors > BLK_DEV_MAX_SECTORS) { + pr_warn_once("%s: truncate capacity from %lld to %lld\n", + disk->disk_name, sectors, + BLK_DEV_MAX_SECTORS); + sectors = BLK_DEV_MAX_SECTORS; + } + bdev_set_nr_sectors(disk->part0, sectors); } EXPORT_SYMBOL(set_capacity); @@ -118,37 +125,46 @@ static void part_stat_read_all(struct block_device *part, } } -static unsigned int part_in_flight(struct block_device *part) +static void bdev_count_inflight_rw(struct block_device *part, + unsigned int inflight[2], bool mq_driver) { - unsigned int inflight = 0; int cpu; - for_each_possible_cpu(cpu) { - inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) + - part_stat_local_read_cpu(part, in_flight[1], cpu); + if (mq_driver) { + blk_mq_in_driver_rw(part, inflight); + } else { + for_each_possible_cpu(cpu) { + inflight[READ] += part_stat_local_read_cpu( + part, in_flight[READ], cpu); + inflight[WRITE] += part_stat_local_read_cpu( + part, in_flight[WRITE], cpu); + } } - if ((int)inflight < 0) - inflight = 0; - return inflight; + if (WARN_ON_ONCE((int)inflight[READ] < 0)) + inflight[READ] = 0; + if (WARN_ON_ONCE((int)inflight[WRITE] < 0)) + inflight[WRITE] = 0; } -static void part_in_flight_rw(struct block_device *part, - unsigned int inflight[2]) +/** + * bdev_count_inflight - get the number of inflight IOs for a block device. + * + * @part: the block device. + * + * Inflight here means started IO accounting, from bdev_start_io_acct() for + * bio-based block device, and from blk_account_io_start() for rq-based block + * device. + */ +unsigned int bdev_count_inflight(struct block_device *part) { - int cpu; + unsigned int inflight[2] = {0}; - inflight[0] = 0; - inflight[1] = 0; - for_each_possible_cpu(cpu) { - inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu); - inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu); - } - if ((int)inflight[0] < 0) - inflight[0] = 0; - if ((int)inflight[1] < 0) - inflight[1] = 0; + bdev_count_inflight_rw(part, inflight, false); + + return inflight[READ] + inflight[WRITE]; } +EXPORT_SYMBOL_GPL(bdev_count_inflight); /* * Can be deleted altogether. Later. @@ -345,9 +361,7 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) struct file *file; int ret = 0; - if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) - return -EINVAL; - if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) + if (!disk_has_partscan(disk)) return -EINVAL; if (disk->open_partitions) return -EBUSY; @@ -384,36 +398,54 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) return ret; } -/** - * device_add_disk - add disk information to kernel list - * @parent: parent device for the disk - * @disk: per-device partitioning information - * @groups: Additional per-device sysfs groups - * - * This function registers the partitioning information in @disk - * with the kernel. - */ -int __must_check device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups) +static void add_disk_final(struct gendisk *disk) +{ + struct device *ddev = disk_to_dev(disk); + + if (!(disk->flags & GENHD_FL_HIDDEN)) { + /* Make sure the first partition scan will be proceed */ + if (get_capacity(disk) && disk_has_partscan(disk)) + set_bit(GD_NEED_PART_SCAN, &disk->state); + + bdev_add(disk->part0, ddev->devt); + if (get_capacity(disk)) + disk_scan_partitions(disk, BLK_OPEN_READ); + + /* + * Announce the disk and partitions after all partitions are + * created. (for hidden disks uevents remain suppressed forever) + */ + dev_set_uevent_suppress(ddev, 0); + disk_uevent(disk, KOBJ_ADD); + } + + blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); + disk_add_events(disk); + set_bit(GD_ADDED, &disk->state); +} + +static int __add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups, + struct fwnode_handle *fwnode) { struct device *ddev = disk_to_dev(disk); int ret; - /* Only makes sense for bio-based to set ->poll_bio */ - if (queue_is_mq(disk->queue) && disk->fops->poll_bio) + if (WARN_ON_ONCE(bdev_nr_sectors(disk->part0) > BLK_DEV_MAX_SECTORS)) return -EINVAL; - /* - * The disk queue should now be all set with enough information about - * the device for the elevator code to pick an adequate default - * elevator if one is needed, that is, for devices requesting queue - * registration. - */ - elevator_init_mq(disk->queue); - - /* Mark bdev as having a submit_bio, if needed */ - disk->part0->bd_has_submit_bio = disk->fops->submit_bio != NULL; + if (queue_is_mq(disk->queue)) { + /* + * ->submit_bio and ->poll_bio are bypassed for blk-mq drivers. + */ + if (disk->fops->submit_bio || disk->fops->poll_bio) + return -EINVAL; + } else { + if (!disk->fops->submit_bio) + return -EINVAL; + bdev_set_flag(disk->part0, BD_HAS_SUBMIT_BIO); + } /* * If the driver provides an explicit major number it also must provide @@ -425,7 +457,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, ret = -EINVAL; if (disk->major) { if (WARN_ON(!disk->minors)) - goto out_exit_elevator; + goto out; if (disk->minors > DISK_MAX_PARTS) { pr_err("block: can't allocate more than %d partitions\n", @@ -435,14 +467,14 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, if (disk->first_minor > MINORMASK || disk->minors > MINORMASK + 1 || disk->first_minor + disk->minors > MINORMASK + 1) - goto out_exit_elevator; + goto out; } else { if (WARN_ON(disk->minors)) - goto out_exit_elevator; + goto out; ret = blk_alloc_ext_minor(); if (ret < 0) - goto out_exit_elevator; + goto out; disk->major = BLOCK_EXT_MAJOR; disk->first_minor = ret; } @@ -453,6 +485,8 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, ddev->parent = parent; ddev->groups = groups; dev_set_name(ddev, "%s", disk->disk_name); + if (fwnode) + device_set_node(ddev, fwnode); if (!(disk->flags & GENHD_FL_HIDDEN)) ddev->devt = MKDEV(disk->major, disk->first_minor); ret = device_add(ddev); @@ -501,22 +535,6 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, &disk->bdi->dev->kobj, "bdi"); if (ret) goto out_unregister_bdi; - - /* Make sure the first partition scan will be proceed */ - if (get_capacity(disk) && !(disk->flags & GENHD_FL_NO_PART) && - !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) - set_bit(GD_NEED_PART_SCAN, &disk->state); - - bdev_add(disk->part0, ddev->devt); - if (get_capacity(disk)) - disk_scan_partitions(disk, BLK_OPEN_READ); - - /* - * Announce the disk and partitions after all partitions are - * created. (for hidden disks uevents remain suppressed forever) - */ - dev_set_uevent_suppress(ddev, 0); - disk_uevent(disk, KOBJ_ADD); } else { /* * Even if the block_device for a hidden gendisk is not @@ -525,10 +543,6 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, */ disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor); } - - disk_update_readahead(disk); - disk_add_events(disk); - set_bit(GD_ADDED, &disk->state); return 0; out_unregister_bdi: @@ -550,11 +564,64 @@ out_device_del: out_free_ext_minor: if (disk->major == BLOCK_EXT_MAJOR) blk_free_ext_minor(disk->first_minor); -out_exit_elevator: - if (disk->queue->elevator) - elevator_exit(disk->queue); +out: + return ret; +} + +/** + * add_disk_fwnode - add disk information to kernel list with fwnode + * @parent: parent device for the disk + * @disk: per-device partitioning information + * @groups: Additional per-device sysfs groups + * @fwnode: attached disk fwnode + * + * This function registers the partitioning information in @disk + * with the kernel. Also attach a fwnode to the disk device. + */ +int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups, + struct fwnode_handle *fwnode) +{ + struct blk_mq_tag_set *set; + unsigned int memflags; + int ret; + + if (queue_is_mq(disk->queue)) { + set = disk->queue->tag_set; + memflags = memalloc_noio_save(); + down_read(&set->update_nr_hwq_lock); + ret = __add_disk(parent, disk, groups, fwnode); + up_read(&set->update_nr_hwq_lock); + memalloc_noio_restore(memflags); + } else { + ret = __add_disk(parent, disk, groups, fwnode); + } + + /* + * add_disk_final() needn't to read `nr_hw_queues`, so move it out + * of read lock `set->update_nr_hwq_lock` for avoiding unnecessary + * lock dependency on `disk->open_mutex` from scanning partition. + */ + if (!ret) + add_disk_final(disk); return ret; } +EXPORT_SYMBOL_GPL(add_disk_fwnode); + +/** + * device_add_disk - add disk information to kernel list + * @parent: parent device for the disk + * @disk: per-device partitioning information + * @groups: Additional per-device sysfs groups + * + * This function registers the partitioning information in @disk + * with the kernel. + */ +int __must_check device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups) +{ + return add_disk_fwnode(parent, disk, groups, NULL); +} EXPORT_SYMBOL(device_add_disk); static void blk_report_disk_dead(struct gendisk *disk, bool surprise) @@ -583,13 +650,13 @@ static void blk_report_disk_dead(struct gendisk *disk, bool surprise) rcu_read_unlock(); } -static void __blk_mark_disk_dead(struct gendisk *disk) +static bool __blk_mark_disk_dead(struct gendisk *disk) { /* * Fail any new I/O. */ if (test_and_set_bit(GD_DEAD, &disk->state)) - return; + return false; if (test_bit(GD_OWNS_QUEUE, &disk->state)) blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue); @@ -602,7 +669,7 @@ static void __blk_mark_disk_dead(struct gendisk *disk) /* * Prevent new I/O from crossing bio_queue_enter(). */ - blk_queue_start_drain(disk->queue); + return blk_queue_start_drain(disk->queue); } /** @@ -619,30 +686,12 @@ void blk_mark_disk_dead(struct gendisk *disk) } EXPORT_SYMBOL_GPL(blk_mark_disk_dead); -/** - * del_gendisk - remove the gendisk - * @disk: the struct gendisk to remove - * - * Removes the gendisk and all its associated resources. This deletes the - * partitions associated with the gendisk, and unregisters the associated - * request_queue. - * - * This is the counter to the respective __device_add_disk() call. - * - * The final removal of the struct gendisk happens when its refcount reaches 0 - * with put_disk(), which should be called after del_gendisk(), if - * __device_add_disk() was used. - * - * Drivers exist which depend on the release of the gendisk to be synchronous, - * it should not be deferred. - * - * Context: can sleep - */ -void del_gendisk(struct gendisk *disk) +static void __del_gendisk(struct gendisk *disk) { struct request_queue *q = disk->queue; struct block_device *part; unsigned long idx; + bool start_drain; might_sleep(); @@ -656,7 +705,7 @@ void del_gendisk(struct gendisk *disk) */ mutex_lock(&disk->open_mutex); xa_for_each(&disk->part_tbl, idx, part) - remove_inode_hash(part->bd_inode); + bdev_unhash(part); mutex_unlock(&disk->open_mutex); /* @@ -665,12 +714,14 @@ void del_gendisk(struct gendisk *disk) */ if (!test_bit(GD_DEAD, &disk->state)) blk_report_disk_dead(disk, false); - __blk_mark_disk_dead(disk); /* * Drop all partitions now that the disk is marked dead. */ mutex_lock(&disk->open_mutex); + start_drain = __blk_mark_disk_dead(disk); + if (start_drain) + blk_freeze_acquire_lock(q); xa_for_each_start(&disk->part_tbl, idx, part, 1) drop_partition(part); mutex_unlock(&disk->open_mutex); @@ -707,25 +758,67 @@ void del_gendisk(struct gendisk *disk) if (queue_is_mq(q)) blk_mq_cancel_work_sync(q); - blk_mq_quiesce_queue(q); - if (q->elevator) { - mutex_lock(&q->sysfs_lock); - elevator_exit(q); - mutex_unlock(&q->sysfs_lock); - } rq_qos_exit(q); - blk_mq_unquiesce_queue(q); /* * If the disk does not own the queue, allow using passthrough requests * again. Else leave the queue frozen to fail all I/O. */ - if (!test_bit(GD_OWNS_QUEUE, &disk->state)) { - blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q); + if (!test_bit(GD_OWNS_QUEUE, &disk->state)) __blk_mq_unfreeze_queue(q, true); + else if (queue_is_mq(q)) + blk_mq_exit_queue(q); + + if (start_drain) + blk_unfreeze_release_lock(q); +} + +static void disable_elv_switch(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + WARN_ON_ONCE(!queue_is_mq(q)); + + down_write(&set->update_nr_hwq_lock); + blk_queue_flag_set(QUEUE_FLAG_NO_ELV_SWITCH, q); + up_write(&set->update_nr_hwq_lock); +} + +/** + * del_gendisk - remove the gendisk + * @disk: the struct gendisk to remove + * + * Removes the gendisk and all its associated resources. This deletes the + * partitions associated with the gendisk, and unregisters the associated + * request_queue. + * + * This is the counter to the respective __device_add_disk() call. + * + * The final removal of the struct gendisk happens when its refcount reaches 0 + * with put_disk(), which should be called after del_gendisk(), if + * __device_add_disk() was used. + * + * Drivers exist which depend on the release of the gendisk to be synchronous, + * it should not be deferred. + * + * Context: can sleep + */ +void del_gendisk(struct gendisk *disk) +{ + struct blk_mq_tag_set *set; + unsigned int memflags; + + if (!queue_is_mq(disk->queue)) { + __del_gendisk(disk); } else { - if (queue_is_mq(q)) - blk_mq_exit_queue(q); + set = disk->queue->tag_set; + + disable_elv_switch(disk->queue); + + memflags = memalloc_noio_save(); + down_read(&set->update_nr_hwq_lock); + __del_gendisk(disk); + up_read(&set->update_nr_hwq_lock); + memalloc_noio_restore(memflags); } } EXPORT_SYMBOL(del_gendisk); @@ -745,7 +838,7 @@ void invalidate_disk(struct gendisk *disk) struct block_device *bdev = disk->part0; invalidate_bdev(bdev); - bdev->bd_inode->i_mapping->wb_err = 0; + bdev->bd_mapping->wb_err = 0; set_capacity(disk, 0); } EXPORT_SYMBOL(invalidate_disk); @@ -758,7 +851,7 @@ static ssize_t disk_badblocks_show(struct device *dev, struct gendisk *disk = dev_to_disk(dev); if (!disk->bb) - return sprintf(page, "\n"); + return sysfs_emit(page, "\n"); return badblocks_show(disk->bb, page, 0); } @@ -776,7 +869,7 @@ static ssize_t disk_badblocks_store(struct device *dev, } #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD -void blk_request_module(dev_t devt) +static bool blk_probe_dev(dev_t devt) { unsigned int major = MAJOR(devt); struct blk_major_name **n; @@ -786,14 +879,26 @@ void blk_request_module(dev_t devt) if ((*n)->major == major && (*n)->probe) { (*n)->probe(devt); mutex_unlock(&major_names_lock); - return; + return true; } } mutex_unlock(&major_names_lock); + return false; +} + +void blk_request_module(dev_t devt) +{ + int error; - if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) - /* Make old-style 2.4 aliases work */ - request_module("block-major-%d", MAJOR(devt)); + if (blk_probe_dev(devt)) + return; + + error = request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)); + /* Make old-style 2.4 aliases work */ + if (error > 0) + error = request_module("block-major-%d", MAJOR(devt)); + if (!error) + blk_probe_dev(devt); } #endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */ @@ -906,7 +1011,7 @@ static ssize_t disk_range_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", disk->minors); + return sysfs_emit(buf, "%d\n", disk->minors); } static ssize_t disk_ext_range_show(struct device *dev, @@ -914,7 +1019,7 @@ static ssize_t disk_ext_range_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", + return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS); } @@ -923,7 +1028,7 @@ static ssize_t disk_removable_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", + return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); } @@ -932,7 +1037,7 @@ static ssize_t disk_hidden_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", + return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_HIDDEN ? 1 : 0)); } @@ -941,35 +1046,30 @@ static ssize_t disk_ro_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0); + return sysfs_emit(buf, "%d\n", get_disk_ro(disk) ? 1 : 0); } ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev))); + return sysfs_emit(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev))); } ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); - struct request_queue *q = bdev_get_queue(bdev); struct disk_stats stat; unsigned int inflight; - if (queue_is_mq(q)) - inflight = blk_mq_in_flight(q, bdev); - else - inflight = part_in_flight(bdev); - + inflight = bdev_count_inflight(bdev); if (inflight) { part_stat_lock(); update_io_ticks(bdev, jiffies, true); part_stat_unlock(); } part_stat_read_all(bdev, &stat); - return sprintf(buf, + return sysfs_emit(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " "%8u %8u %8u " @@ -999,26 +1099,28 @@ ssize_t part_stat_show(struct device *dev, (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); } +/* + * Show the number of IOs issued to driver. + * For bio-based device, started from bdev_start_io_acct(); + * For rq-based device, started from blk_mq_start_request(); + */ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); struct request_queue *q = bdev_get_queue(bdev); - unsigned int inflight[2]; + unsigned int inflight[2] = {0}; - if (queue_is_mq(q)) - blk_mq_in_flight_rw(q, bdev, inflight); - else - part_in_flight_rw(bdev, inflight); + bdev_count_inflight_rw(bdev, inflight, queue_is_mq(q)); - return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); + return sysfs_emit(buf, "%8u %8u\n", inflight[READ], inflight[WRITE]); } static ssize_t disk_capability_show(struct device *dev, struct device_attribute *attr, char *buf) { dev_warn_once(dev, "the capability attribute has been deprecated.\n"); - return sprintf(buf, "0\n"); + return sysfs_emit(buf, "0\n"); } static ssize_t disk_alignment_offset_show(struct device *dev, @@ -1027,7 +1129,7 @@ static ssize_t disk_alignment_offset_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0)); + return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t disk_discard_alignment_show(struct device *dev, @@ -1036,7 +1138,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0)); + return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t diskseq_show(struct device *dev, @@ -1044,7 +1146,13 @@ static ssize_t diskseq_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%llu\n", disk->diskseq); + return sysfs_emit(buf, "%llu\n", disk->diskseq); +} + +static ssize_t partscan_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%u\n", disk_has_partscan(dev_to_disk(dev))); } static DEVICE_ATTR(range, 0444, disk_range_show, NULL); @@ -1060,12 +1168,14 @@ static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); +static DEVICE_ATTR(partscan, 0444, partscan_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_make_it_fail); + return sysfs_emit(buf, "%d\n", + bdev_test_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL)); } ssize_t part_fail_store(struct device *dev, @@ -1074,9 +1184,12 @@ ssize_t part_fail_store(struct device *dev, { int i; - if (count > 0 && sscanf(buf, "%d", &i) > 0) - dev_to_bdev(dev)->bd_make_it_fail = i; - + if (count > 0 && sscanf(buf, "%d", &i) > 0) { + if (i) + bdev_set_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL); + else + bdev_clear_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL); + } return count; } @@ -1106,6 +1219,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_events_async.attr, &dev_attr_events_poll_msecs.attr, &dev_attr_diskseq.attr, + &dev_attr_partscan.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1182,7 +1296,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); - disk_free_zone_bitmaps(disk); + disk_free_zone_resources(disk); xa_destroy(&disk->part_tbl); disk->queue->disk = NULL; @@ -1191,7 +1305,7 @@ static void disk_release(struct device *dev) if (test_bit(GD_ADDED, &disk->state) && disk->fops->free_disk) disk->fops->free_disk(disk); - iput(disk->part0->bd_inode); /* frees the disk */ + bdev_drop(disk->part0); /* frees the disk */ } static int block_uevent(const struct device *dev, struct kobj_uevent_env *env) @@ -1251,51 +1365,43 @@ static int diskstats_show(struct seq_file *seqf, void *v) xa_for_each(&gp->part_tbl, idx, hd) { if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) continue; - if (queue_is_mq(gp->queue)) - inflight = blk_mq_in_flight(gp->queue, hd); - else - inflight = part_in_flight(hd); + inflight = bdev_count_inflight(hd); if (inflight) { part_stat_lock(); update_io_ticks(hd, jiffies, true); part_stat_unlock(); } part_stat_read_all(hd, &stat); - seq_printf(seqf, "%4d %7d %pg " - "%lu %lu %lu %u " - "%lu %lu %lu %u " - "%u %u %u " - "%lu %lu %lu %u " - "%lu %u" - "\n", - MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd, - stat.ios[STAT_READ], - stat.merges[STAT_READ], - stat.sectors[STAT_READ], - (unsigned int)div_u64(stat.nsecs[STAT_READ], - NSEC_PER_MSEC), - stat.ios[STAT_WRITE], - stat.merges[STAT_WRITE], - stat.sectors[STAT_WRITE], - (unsigned int)div_u64(stat.nsecs[STAT_WRITE], - NSEC_PER_MSEC), - inflight, - jiffies_to_msecs(stat.io_ticks), - (unsigned int)div_u64(stat.nsecs[STAT_READ] + - stat.nsecs[STAT_WRITE] + - stat.nsecs[STAT_DISCARD] + - stat.nsecs[STAT_FLUSH], - NSEC_PER_MSEC), - stat.ios[STAT_DISCARD], - stat.merges[STAT_DISCARD], - stat.sectors[STAT_DISCARD], - (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], - NSEC_PER_MSEC), - stat.ios[STAT_FLUSH], - (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], - NSEC_PER_MSEC) - ); + seq_put_decimal_ull_width(seqf, "", MAJOR(hd->bd_dev), 4); + seq_put_decimal_ull_width(seqf, " ", MINOR(hd->bd_dev), 7); + seq_printf(seqf, " %pg", hd); + seq_put_decimal_ull(seqf, " ", stat.ios[STAT_READ]); + seq_put_decimal_ull(seqf, " ", stat.merges[STAT_READ]); + seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_READ]); + seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ], + NSEC_PER_MSEC)); + seq_put_decimal_ull(seqf, " ", stat.ios[STAT_WRITE]); + seq_put_decimal_ull(seqf, " ", stat.merges[STAT_WRITE]); + seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_WRITE]); + seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_WRITE], + NSEC_PER_MSEC)); + seq_put_decimal_ull(seqf, " ", inflight); + seq_put_decimal_ull(seqf, " ", jiffies_to_msecs(stat.io_ticks)); + seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ] + + stat.nsecs[STAT_WRITE] + + stat.nsecs[STAT_DISCARD] + + stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC)); + seq_put_decimal_ull(seqf, " ", stat.ios[STAT_DISCARD]); + seq_put_decimal_ull(seqf, " ", stat.merges[STAT_DISCARD]); + seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_DISCARD]); + seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], + NSEC_PER_MSEC)); + seq_put_decimal_ull(seqf, " ", stat.ios[STAT_FLUSH]); + seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC)); + seq_putc(seqf, '\n'); } rcu_read_unlock(); @@ -1364,6 +1470,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, if (blkcg_init_disk(disk)) goto out_erase_part0; + disk_init_zone_resources(disk); rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; @@ -1374,6 +1481,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&disk->slave_bdevs); #endif + mutex_init(&disk->rqos_state_mutex); return disk; out_erase_part0: @@ -1381,7 +1489,7 @@ out_erase_part0: out_destroy_part_tbl: xa_destroy(&disk->part_tbl); disk->part0->bd_disk = NULL; - iput(disk->part0->bd_inode); + bdev_drop(disk->part0); out_free_bdi: bdi_put(disk->bdi); out_free_bioset: diff --git a/block/ioctl.c b/block/ioctl.c index f505f9c341eb..e472cc1030c6 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -11,7 +11,11 @@ #include <linux/blktrace_api.h> #include <linux/pr.h> #include <linux/uaccess.h> +#include <linux/pagemap.h> +#include <linux/io_uring/cmd.h> +#include <uapi/linux/blkdev.h> #include "blk.h" +#include "blk-crypto-internal.h" static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition __user *upart, int op) @@ -33,7 +37,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, if (op == BLKPG_DEL_PARTITION) return bdev_del_partition(disk, p.pno); - if (p.start < 0 || p.length <= 0 || p.start + p.length < 0) + if (p.start < 0 || p.length <= 0 || LLONG_MAX - p.length < p.start) return -EINVAL; /* Check that the partition is aligned to the block size */ if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev))) @@ -92,49 +96,93 @@ static int compat_blkpg_ioctl(struct block_device *bdev, } #endif +/* + * Check that [start, start + len) is a valid range from the block device's + * perspective, including verifying that it can be correctly translated into + * logical block addresses. + */ +static int blk_validate_byte_range(struct block_device *bdev, + uint64_t start, uint64_t len) +{ + unsigned int bs_mask = bdev_logical_block_size(bdev) - 1; + uint64_t end; + + if ((start | len) & bs_mask) + return -EINVAL; + if (!len) + return -EINVAL; + if (check_add_overflow(start, len, &end) || end > bdev_nr_bytes(bdev)) + return -EINVAL; + + return 0; +} + static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { - uint64_t range[2]; - uint64_t start, len, end; - struct inode *inode = bdev->bd_inode; + uint64_t range[2], start, len; + struct bio *prev = NULL, *bio; + sector_t sector, nr_sects; + struct blk_plug plug; int err; - if (!(mode & BLK_OPEN_WRITE)) - return -EBADF; - - if (!bdev_max_discard_sectors(bdev)) - return -EOPNOTSUPP; - if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; - start = range[0]; len = range[1]; - if (start & 511) - return -EINVAL; - if (len & 511) - return -EINVAL; + if (!bdev_max_discard_sectors(bdev)) + return -EOPNOTSUPP; - if (check_add_overflow(start, len, &end) || - end > bdev_nr_bytes(bdev)) - return -EINVAL; + if (!(mode & BLK_OPEN_WRITE)) + return -EBADF; + if (bdev_read_only(bdev)) + return -EPERM; + err = blk_validate_byte_range(bdev, start, len); + if (err) + return err; - filemap_invalidate_lock(inode->i_mapping); + inode_lock(bdev->bd_mapping->host); + filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, start + len - 1); if (err) goto fail; - err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); + + sector = start >> SECTOR_SHIFT; + nr_sects = len >> SECTOR_SHIFT; + + blk_start_plug(&plug); + while (1) { + if (fatal_signal_pending(current)) { + if (prev) + bio_await_chain(prev); + err = -EINTR; + goto out_unplug; + } + bio = blk_alloc_discard_bio(bdev, §or, &nr_sects, + GFP_KERNEL); + if (!bio) + break; + prev = bio_chain_and_submit(prev, bio); + } + if (prev) { + err = submit_bio_wait(prev); + if (err == -EOPNOTSUPP) + err = 0; + bio_put(prev); + } +out_unplug: + blk_finish_plug(&plug); fail: - filemap_invalidate_unlock(inode->i_mapping); + filemap_invalidate_unlock(bdev->bd_mapping); + inode_unlock(bdev->bd_mapping->host); return err; } static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode, void __user *argp) { - uint64_t start, len; + uint64_t start, len, end; uint64_t range[2]; int err; @@ -149,15 +197,18 @@ static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode, len = range[1]; if ((start & 511) || (len & 511)) return -EINVAL; - if (start + len > bdev_nr_bytes(bdev)) + if (check_add_overflow(start, len, &end) || + end > bdev_nr_bytes(bdev)) return -EINVAL; - filemap_invalidate_lock(bdev->bd_inode->i_mapping); - err = truncate_bdev_range(bdev, mode, start, start + len - 1); + inode_lock(bdev->bd_mapping->host); + filemap_invalidate_lock(bdev->bd_mapping); + err = truncate_bdev_range(bdev, mode, start, end - 1); if (!err) err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9, GFP_KERNEL); - filemap_invalidate_unlock(bdev->bd_inode->i_mapping); + filemap_invalidate_unlock(bdev->bd_mapping); + inode_unlock(bdev->bd_mapping->host); return err; } @@ -167,7 +218,6 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode, { uint64_t range[2]; uint64_t start, end, len; - struct inode *inode = bdev->bd_inode; int err; if (!(mode & BLK_OPEN_WRITE)) @@ -190,16 +240,18 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode, return -EINVAL; /* Invalidate the page cache, including dirty pages */ - filemap_invalidate_lock(inode->i_mapping); + inode_lock(bdev->bd_mapping->host); + filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, end); if (err) goto fail; err = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, - BLKDEV_ZERO_NOUNMAP); + BLKDEV_ZERO_NOUNMAP | BLKDEV_ZERO_KILLABLE); fail: - filemap_invalidate_unlock(inode->i_mapping); + filemap_invalidate_unlock(bdev->bd_mapping); + inode_unlock(bdev->bd_mapping->host); return err; } @@ -403,7 +455,10 @@ static int blkdev_roset(struct block_device *bdev, unsigned cmd, if (ret) return ret; } - bdev->bd_read_only = n; + if (n) + bdev_set_flag(bdev, BD_READ_ONLY); + else + bdev_clear_flag(bdev, BD_READ_ONLY); return 0; } @@ -473,11 +528,14 @@ static int compat_hdio_getgeo(struct block_device *bdev, #endif /* set the logical block size */ -static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode, +static int blkdev_bszset(struct file *file, blk_mode_t mode, int __user *argp) { + // this one might be file_inode(file)->i_rdev - a rare valid + // use of file_inode() for those. + dev_t dev = I_BDEV(file->f_mapping->host)->bd_dev; + struct file *excl_file; int ret, n; - struct file *file; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -487,13 +545,13 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode, return -EFAULT; if (mode & BLK_OPEN_EXCL) - return set_blocksize(bdev, n); + return set_blocksize(file, n); - file = bdev_file_open_by_dev(bdev->bd_dev, mode, &bdev, NULL); - if (IS_ERR(file)) + excl_file = bdev_file_open_by_dev(dev, mode, &dev, NULL); + if (IS_ERR(excl_file)) return -EBUSY; - ret = set_blocksize(bdev, n); - fput(file); + ret = set_blocksize(excl_file, n); + fput(excl_file); return ret; } @@ -569,6 +627,10 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, case BLKTRACESTOP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); + case BLKCRYPTOIMPORTKEY: + case BLKCRYPTOGENERATEKEY: + case BLKCRYPTOPREPAREKEY: + return blk_crypto_ioctl(bdev, cmd, argp); case IOC_PR_REGISTER: return blkdev_pr_register(bdev, mode, argp); case IOC_PR_RESERVE: @@ -622,7 +684,7 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */ return put_int(argp, block_size(bdev)); case BLKBSZSET: - return blkdev_bszset(bdev, mode, argp); + return blkdev_bszset(file, mode, argp); case BLKGETSIZE64: return put_u64(argp, bdev_nr_bytes(bdev)); @@ -682,7 +744,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ return put_int(argp, bdev_logical_block_size(bdev)); case BLKBSZSET_32: - return blkdev_bszset(bdev, mode, argp); + return blkdev_bszset(file, mode, argp); case BLKGETSIZE64_32: return put_u64(argp, bdev_nr_bytes(bdev)); @@ -700,3 +762,112 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) return ret; } #endif + +struct blk_iou_cmd { + int res; + bool nowait; +}; + +static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); + + if (bic->res == -EAGAIN && bic->nowait) + io_uring_cmd_issue_blocking(cmd); + else + io_uring_cmd_done(cmd, bic->res, 0, issue_flags); +} + +static void bio_cmd_bio_end_io(struct bio *bio) +{ + struct io_uring_cmd *cmd = bio->bi_private; + struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); + + if (unlikely(bio->bi_status) && !bic->res) + bic->res = blk_status_to_errno(bio->bi_status); + + io_uring_cmd_do_in_task_lazy(cmd, blk_cmd_complete); + bio_put(bio); +} + +static int blkdev_cmd_discard(struct io_uring_cmd *cmd, + struct block_device *bdev, + uint64_t start, uint64_t len, bool nowait) +{ + struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); + gfp_t gfp = nowait ? GFP_NOWAIT : GFP_KERNEL; + sector_t sector = start >> SECTOR_SHIFT; + sector_t nr_sects = len >> SECTOR_SHIFT; + struct bio *prev = NULL, *bio; + int err; + + if (!bdev_max_discard_sectors(bdev)) + return -EOPNOTSUPP; + if (!(file_to_blk_mode(cmd->file) & BLK_OPEN_WRITE)) + return -EBADF; + if (bdev_read_only(bdev)) + return -EPERM; + err = blk_validate_byte_range(bdev, start, len); + if (err) + return err; + + err = filemap_invalidate_pages(bdev->bd_mapping, start, + start + len - 1, nowait); + if (err) + return err; + + while (true) { + bio = blk_alloc_discard_bio(bdev, §or, &nr_sects, gfp); + if (!bio) + break; + if (nowait) { + /* + * Don't allow multi-bio non-blocking submissions as + * subsequent bios may fail but we won't get a direct + * indication of that. Normally, the caller should + * retry from a blocking context. + */ + if (unlikely(nr_sects)) { + bio_put(bio); + return -EAGAIN; + } + bio->bi_opf |= REQ_NOWAIT; + } + + prev = bio_chain_and_submit(prev, bio); + } + if (unlikely(!prev)) + return -EAGAIN; + if (unlikely(nr_sects)) + bic->res = -EAGAIN; + + prev->bi_private = cmd; + prev->bi_end_io = bio_cmd_bio_end_io; + submit_bio(prev); + return -EIOCBQUEUED; +} + +int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct block_device *bdev = I_BDEV(cmd->file->f_mapping->host); + struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); + const struct io_uring_sqe *sqe = cmd->sqe; + u32 cmd_op = cmd->cmd_op; + uint64_t start, len; + + if (unlikely(sqe->ioprio || sqe->__pad1 || sqe->len || + sqe->rw_flags || sqe->file_index)) + return -EINVAL; + + bic->res = 0; + bic->nowait = issue_flags & IO_URING_F_NONBLOCK; + + start = READ_ONCE(sqe->addr); + len = READ_ONCE(sqe->addr3); + + switch (cmd_op) { + case BLOCK_URING_CMD_DISCARD: + return blkdev_cmd_discard(cmd, bdev, start, len, bic->nowait); + } + return -EINVAL; +} diff --git a/block/ioprio.c b/block/ioprio.c index 73301a261429..f0ee2798539c 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -46,12 +46,8 @@ int ioprio_check_cap(int ioprio) */ if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) return -EPERM; - fallthrough; - /* rt has prio field too */ - case IOPRIO_CLASS_BE: - if (level >= IOPRIO_NR_LEVELS) - return -EINVAL; break; + case IOPRIO_CLASS_BE: case IOPRIO_CLASS_IDLE: break; case IOPRIO_CLASS_NONE: diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 4155594aefc6..0f0f8452609a 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -568,7 +568,7 @@ static bool kyber_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(bio->bi_opf, ctx); struct kyber_hctx_data *khd = hctx->sched_data; struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]]; unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); @@ -889,7 +889,7 @@ KYBER_LAT_SHOW_STORE(KYBER_WRITE, write); #undef KYBER_LAT_SHOW_STORE #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store) -static struct elv_fs_entry kyber_sched_attrs[] = { +static const struct elv_fs_entry kyber_sched_attrs[] = { KYBER_LAT_ATTR(read), KYBER_LAT_ATTR(write), __ATTR_NULL diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 02a916ba62ee..2edf1cac06d5 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -102,7 +102,6 @@ struct deadline_data { int prio_aging_expire; spinlock_t lock; - spinlock_t zone_lock; }; /* Maps an I/O priority class to a deadline scheduler priority. */ @@ -129,36 +128,7 @@ static u8 dd_rq_ioclass(struct request *rq) } /* - * get the request before `rq' in sector-sorted order - */ -static inline struct request * -deadline_earlier_request(struct request *rq) -{ - struct rb_node *node = rb_prev(&rq->rb_node); - - if (node) - return rb_entry_rq(node); - - return NULL; -} - -/* - * get the request after `rq' in sector-sorted order - */ -static inline struct request * -deadline_latter_request(struct request *rq) -{ - struct rb_node *node = rb_next(&rq->rb_node); - - if (node) - return rb_entry_rq(node); - - return NULL; -} - -/* - * Return the first request for which blk_rq_pos() >= @pos. For zoned devices, - * return the first request after the start of the zone containing @pos. + * Return the first request for which blk_rq_pos() >= @pos. */ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, enum dd_data_dir data_dir, sector_t pos) @@ -170,14 +140,6 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, return NULL; rq = rb_entry_rq(node); - /* - * A zoned write may have been requeued with a starting position that - * is below that of the most recently dispatched request. Hence, for - * zoned writes, start searching from the start of a zone. - */ - if (blk_rq_is_seq_zoned_write(rq)) - pos = round_down(pos, rq->q->limits.chunk_sectors); - while (node) { rq = rb_entry_rq(node); if (blk_rq_pos(rq) >= pos) { @@ -309,36 +271,6 @@ static inline bool deadline_check_fifo(struct dd_per_prio *per_prio, } /* - * Check if rq has a sequential request preceding it. - */ -static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq) -{ - struct request *prev = deadline_earlier_request(rq); - - if (!prev) - return false; - - return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq); -} - -/* - * Skip all write requests that are sequential from @rq, even if we cross - * a zone boundary. - */ -static struct request *deadline_skip_seq_writes(struct deadline_data *dd, - struct request *rq) -{ - sector_t pos = blk_rq_pos(rq); - - do { - pos += blk_rq_sectors(rq); - rq = deadline_latter_request(rq); - } while (rq && blk_rq_pos(rq) == pos); - - return rq; -} - -/* * For the specified data direction, return the next request to * dispatch using arrival ordered lists. */ @@ -346,40 +278,10 @@ static struct request * deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { - struct request *rq, *rb_rq, *next; - unsigned long flags; - if (list_empty(&per_prio->fifo_list[data_dir])) return NULL; - rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); - if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) - return rq; - - /* - * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. For some HDDs, breaking a sequential - * write stream can lead to lower throughput, so make sure to preserve - * sequential write streams, even if that stream crosses into the next - * zones and these zones are unlocked. - */ - spin_lock_irqsave(&dd->zone_lock, flags); - list_for_each_entry_safe(rq, next, &per_prio->fifo_list[DD_WRITE], - queuelist) { - /* Check whether a prior request exists for the same zone. */ - rb_rq = deadline_from_pos(per_prio, data_dir, blk_rq_pos(rq)); - if (rb_rq && blk_rq_pos(rb_rq) < blk_rq_pos(rq)) - rq = rb_rq; - if (blk_req_can_dispatch_to_zone(rq) && - (blk_queue_nonrot(rq->q) || - !deadline_is_seq_write(dd, rq))) - goto out; - } - rq = NULL; -out: - spin_unlock_irqrestore(&dd->zone_lock, flags); - - return rq; + return rq_entry_fifo(per_prio->fifo_list[data_dir].next); } /* @@ -390,36 +292,8 @@ static struct request * deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { - struct request *rq; - unsigned long flags; - - rq = deadline_from_pos(per_prio, data_dir, - per_prio->latest_pos[data_dir]); - if (!rq) - return NULL; - - if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) - return rq; - - /* - * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. For some HDDs, breaking a sequential - * write stream can lead to lower throughput, so make sure to preserve - * sequential write streams, even if that stream crosses into the next - * zones and these zones are unlocked. - */ - spin_lock_irqsave(&dd->zone_lock, flags); - while (rq) { - if (blk_req_can_dispatch_to_zone(rq)) - break; - if (blk_queue_nonrot(rq->q)) - rq = deadline_latter_request(rq); - else - rq = deadline_skip_seq_writes(dd, rq); - } - spin_unlock_irqrestore(&dd->zone_lock, flags); - - return rq; + return deadline_from_pos(per_prio, data_dir, + per_prio->latest_pos[data_dir]); } /* @@ -525,10 +399,6 @@ dispatch_find_request: rq = next_rq; } - /* - * For a zoned block device, if we only have writes queued and none of - * them can be dispatched, rq will be NULL. - */ if (!rq) return NULL; @@ -549,10 +419,6 @@ done: prio = ioprio_class_to_prio[ioprio_class]; dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq); dd->per_prio[prio].stats.dispatched++; - /* - * If the request needs its target zone locked, do it. - */ - blk_req_zone_write_lock(rq); rq->rq_flags |= RQF_STARTED; return rq; } @@ -622,6 +488,20 @@ unlock: } /* + * 'depth' is a number in the range 1..INT_MAX representing a number of + * requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since + * 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow(). + * Values larger than q->nr_requests have the same effect as q->nr_requests. + */ +static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth) +{ + struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags; + const unsigned int nrr = hctx->queue->nr_requests; + + return ((qdepth << bt->sb.shift) + nrr - 1) / nrr; +} + +/* * Called by __blk_mq_alloc_request(). The shallow_depth value set by this * function is used by __blk_mq_get_tag(). */ @@ -637,7 +517,7 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) * Throttle asynchronous requests and writes such that these requests * do not block the allocation of synchronous requests. */ - data->shallow_depth = dd->async_depth; + data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth); } /* Called by blk_mq_update_nr_requests(). */ @@ -647,9 +527,9 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) struct deadline_data *dd = q->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; - dd->async_depth = max(1UL, 3 * q->nr_requests / 4); + dd->async_depth = q->nr_requests; - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth); + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); } /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */ @@ -722,7 +602,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) dd->fifo_batch = fifo_batch; dd->prio_aging_expire = prio_aging_expire; spin_lock_init(&dd->lock); - spin_lock_init(&dd->zone_lock); /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); @@ -804,18 +683,11 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, lockdep_assert_held(&dd->lock); - /* - * This may be a requeue of a write request that has locked its - * target zone. If it is the case, this releases the zone lock. - */ - blk_req_zone_write_unlock(rq); - prio = ioprio_class_to_prio[ioprio_class]; per_prio = &dd->per_prio[prio]; - if (!rq->elv.priv[0]) { + if (!rq->elv.priv[0]) per_prio->stats.inserted++; - rq->elv.priv[0] = (void *)(uintptr_t)1; - } + rq->elv.priv[0] = per_prio; if (blk_mq_sched_try_insert_merge(q, rq, free)) return; @@ -826,8 +698,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, list_add(&rq->queuelist, &per_prio->dispatch); rq->fifo_time = jiffies; } else { - struct list_head *insert_before; - deadline_add_rq_rb(per_prio, rq); if (rq_mergeable(rq)) { @@ -840,25 +710,12 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * set expire time and add to fifo list */ rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; - insert_before = &per_prio->fifo_list[data_dir]; -#ifdef CONFIG_BLK_DEV_ZONED - /* - * Insert zoned writes such that requests are sorted by - * position per zone. - */ - if (blk_rq_is_seq_zoned_write(rq)) { - struct request *rq2 = deadline_latter_request(rq); - - if (rq2 && blk_rq_zone_no(rq2) == blk_rq_zone_no(rq)) - insert_before = &rq2->queuelist; - } -#endif - list_add_tail(&rq->queuelist, insert_before); + list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]); } } /* - * Called from blk_mq_insert_request() or blk_mq_dispatch_plug_list(). + * Called from blk_mq_insert_request() or blk_mq_dispatch_list(). */ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list, @@ -887,62 +744,20 @@ static void dd_prepare_request(struct request *rq) rq->elv.priv[0] = NULL; } -static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx) -{ - struct deadline_data *dd = hctx->queue->elevator->elevator_data; - enum dd_prio p; - - for (p = 0; p <= DD_PRIO_MAX; p++) - if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE])) - return true; - - return false; -} - /* * Callback from inside blk_mq_free_request(). - * - * For zoned block devices, write unlock the target zone of - * completed write requests. Do this while holding the zone lock - * spinlock so that the zone is never unlocked while deadline_fifo_request() - * or deadline_next_request() are executing. This function is called for - * all requests, whether or not these requests complete successfully. - * - * For a zoned block device, __dd_dispatch_request() may have stopped - * dispatching requests if all the queued requests are write requests directed - * at zones that are already locked due to on-going write requests. To ensure - * write request dispatch progress in this case, mark the queue as needing a - * restart to ensure that the queue is run again after completion of the - * request and zones being unlocked. */ static void dd_finish_request(struct request *rq) { - struct request_queue *q = rq->q; - struct deadline_data *dd = q->elevator->elevator_data; - const u8 ioprio_class = dd_rq_ioclass(rq); - const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; - struct dd_per_prio *per_prio = &dd->per_prio[prio]; + struct dd_per_prio *per_prio = rq->elv.priv[0]; /* * The block layer core may call dd_finish_request() without having * called dd_insert_requests(). Skip requests that bypassed I/O * scheduling. See also blk_mq_request_bypass_insert(). */ - if (!rq->elv.priv[0]) - return; - - atomic_inc(&per_prio->stats.completed); - - if (blk_queue_is_zoned(q)) { - unsigned long flags; - - spin_lock_irqsave(&dd->zone_lock, flags); - blk_req_zone_write_unlock(rq); - spin_unlock_irqrestore(&dd->zone_lock, flags); - - if (dd_has_write_work(rq->mq_hctx)) - blk_mq_sched_mark_restart_hctx(rq->mq_hctx); - } + if (per_prio) + atomic_inc(&per_prio->stats.completed); } static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) @@ -1019,7 +834,7 @@ STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #define DD_ATTR(name) \ __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) -static struct elv_fs_entry deadline_attrs[] = { +static const struct elv_fs_entry deadline_attrs[] = { DD_ATTR(read_expire), DD_ATTR(write_expire), DD_ATTR(writes_starved), @@ -1266,7 +1081,6 @@ static struct elevator_type mq_deadline = { .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", .elevator_alias = "deadline", - .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE, .elevator_owner = THIS_MODULE, }; MODULE_ALIAS("mq-deadline-iosched"); diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 7aff4eb81c60..ce17e41451af 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -270,4 +270,13 @@ config CMDLINE_PARTITION Say Y here if you want to read the partition table from bootargs. The format for the command line is just like mtdparts. +config OF_PARTITION + bool "Device Tree partition support" if PARTITION_ADVANCED + depends on OF + help + Say Y here if you want to enable support for partition table + defined in Device Tree. (mainly for eMMC) + The format for the device tree node is just like MTD fixed-partition + schema. + endmenu diff --git a/block/partitions/Makefile b/block/partitions/Makefile index a7f05cdb02a8..25d424922c6e 100644 --- a/block/partitions/Makefile +++ b/block/partitions/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_CMDLINE_PARTITION) += cmdline.o obj-$(CONFIG_MAC_PARTITION) += mac.o obj-$(CONFIG_LDM_PARTITION) += ldm.o obj-$(CONFIG_MSDOS_PARTITION) += msdos.o +obj-$(CONFIG_OF_PARTITION) += of.o obj-$(CONFIG_OSF_PARTITION) += osf.o obj-$(CONFIG_SGI_PARTITION) += sgi.o obj-$(CONFIG_SUN_PARTITION) += sun.o diff --git a/block/partitions/check.h b/block/partitions/check.h index 8d70a880c372..e5c1c61eb353 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -62,6 +62,7 @@ int karma_partition(struct parsed_partitions *state); int ldm_partition(struct parsed_partitions *state); int mac_partition(struct parsed_partitions *state); int msdos_partition(struct parsed_partitions *state); +int of_partition(struct parsed_partitions *state); int osf_partition(struct parsed_partitions *state); int sgi_partition(struct parsed_partitions *state); int sun_partition(struct parsed_partitions *state); diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index c03bc105e575..da3e719d8e51 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -70,8 +70,8 @@ static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) } if (*partdef == '(') { - int length; - char *next = strchr(++partdef, ')'); + partdef++; + char *next = strsep(&partdef, ")"); if (!next) { pr_warn("cmdline partition format is invalid."); @@ -79,11 +79,7 @@ static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) goto fail; } - length = min_t(int, next - partdef, - sizeof(new_subpart->name) - 1); - strscpy(new_subpart->name, partdef, length); - - partdef = ++next; + strscpy(new_subpart->name, next, sizeof(new_subpart->name)); } else new_subpart->name[0] = '\0'; @@ -117,14 +113,12 @@ static void free_subpart(struct cmdline_parts *parts) } } -static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) +static int parse_parts(struct cmdline_parts **parts, char *bdevdef) { int ret = -EINVAL; char *next; - int length; struct cmdline_subpart **next_subpart; struct cmdline_parts *newparts; - char buf[BDEVNAME_SIZE + 32 + 4]; *parts = NULL; @@ -132,28 +126,19 @@ static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) if (!newparts) return -ENOMEM; - next = strchr(bdevdef, ':'); + next = strsep(&bdevdef, ":"); if (!next) { pr_warn("cmdline partition has no block device."); goto fail; } - length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); - strscpy(newparts->name, bdevdef, length); + strscpy(newparts->name, next, sizeof(newparts->name)); newparts->nr_subparts = 0; next_subpart = &newparts->subpart; - while (next && *(++next)) { - bdevdef = next; - next = strchr(bdevdef, ','); - - length = (!next) ? (sizeof(buf) - 1) : - min_t(int, next - bdevdef, sizeof(buf) - 1); - - strscpy(buf, bdevdef, length); - - ret = parse_subpart(next_subpart, buf); + while ((next = strsep(&bdevdef, ","))) { + ret = parse_subpart(next_subpart, next); if (ret) goto fail; @@ -199,24 +184,17 @@ static int cmdline_parts_parse(struct cmdline_parts **parts, *parts = NULL; - next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); + pbuf = buf = kstrdup(cmdline, GFP_KERNEL); if (!buf) return -ENOMEM; next_parts = parts; - while (next && *pbuf) { - next = strchr(pbuf, ';'); - if (next) - *next = '\0'; - - ret = parse_parts(next_parts, pbuf); + while ((next = strsep(&pbuf, ";"))) { + ret = parse_parts(next_parts, next); if (ret) goto fail; - if (next) - pbuf = ++next; - next_parts = &(*next_parts)->next_parts; } @@ -250,7 +228,6 @@ static struct cmdline_parts *bdev_parts; static int add_part(int slot, struct cmdline_subpart *subpart, struct parsed_partitions *state) { - int label_min; struct partition_meta_info *info; char tmp[sizeof(info->volname) + 4]; @@ -260,11 +237,12 @@ static int add_part(int slot, struct cmdline_subpart *subpart, put_partition(state, slot, subpart->from >> 9, subpart->size >> 9); + if (subpart->flags & PF_RDONLY) + state->parts[slot].flags |= ADDPART_FLAG_READONLY; + info = &state->parts[slot].info; - label_min = min_t(int, sizeof(info->volname) - 1, - sizeof(subpart->name)); - strscpy(info->volname, subpart->name, label_min); + strscpy(info->volname, subpart->name, sizeof(info->volname)); snprintf(tmp, sizeof(tmp), "(%s)", info->volname); strlcat(state->pp_buf, tmp, PAGE_SIZE); diff --git a/block/partitions/core.c b/block/partitions/core.c index b11e88c82c8c..815ed33caa1b 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -43,6 +43,9 @@ static int (*const check_part[])(struct parsed_partitions *) = { #ifdef CONFIG_CMDLINE_PARTITION cmdline_partition, #endif +#ifdef CONFIG_OF_PARTITION + of_partition, /* cmdline have priority to OF */ +#endif #ifdef CONFIG_EFI_PARTITION efi_partition, /* this must come before msdos */ #endif @@ -173,7 +176,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd) static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_partno); + return sprintf(buf, "%d\n", bdev_partno(dev_to_bdev(dev))); } static ssize_t part_start_show(struct device *dev, @@ -243,16 +246,18 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { put_disk(dev_to_bdev(dev)->bd_disk); - iput(dev_to_bdev(dev)->bd_inode); + bdev_drop(dev_to_bdev(dev)); } static int part_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct block_device *part = dev_to_bdev(dev); - add_uevent_var(env, "PARTN=%u", part->bd_partno); + add_uevent_var(env, "PARTN=%u", bdev_partno(part)); if (part->bd_meta_info && part->bd_meta_info->volname[0]) add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname); + if (part->bd_meta_info && part->bd_meta_info->uuid[0]) + add_uevent_var(env, "PARTUUID=%s", part->bd_meta_info->uuid); return 0; } @@ -267,7 +272,7 @@ void drop_partition(struct block_device *part) { lockdep_assert_held(&part->bd_disk->open_mutex); - xa_erase(&part->bd_disk->part_tbl, part->bd_partno); + xa_erase(&part->bd_disk->part_tbl, bdev_partno(part)); kobject_put(part->bd_holder_dir); device_del(&part->bd_device); @@ -338,8 +343,8 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, pdev->parent = ddev; /* in consecutive minor range? */ - if (bdev->bd_partno < disk->minors) { - devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno); + if (bdev_partno(bdev) < disk->minors) { + devt = MKDEV(disk->major, disk->first_minor + bdev_partno(bdev)); } else { err = blk_alloc_ext_minor(); if (err < 0) @@ -373,6 +378,9 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, goto out_del; } + if (flags & ADDPART_FLAG_READONLY) + bdev_set_flag(bdev, BD_READ_ONLY); + /* everything is up and running, commence */ err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL); if (err) @@ -404,7 +412,7 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, rcu_read_lock(); xa_for_each_start(&disk->part_tbl, idx, part, 1) { - if (part->bd_partno != skip_partno && + if (bdev_partno(part) != skip_partno && start < part->bd_start_sect + bdev_nr_sectors(part) && start + length > part->bd_start_sect) { overlap = true; @@ -469,7 +477,7 @@ int bdev_del_partition(struct gendisk *disk, int partno) * Just delete the partition and invalidate it. */ - remove_inode_hash(part->bd_inode); + bdev_unhash(part); invalidate_bdev(part); drop_partition(part); ret = 0; @@ -555,9 +563,11 @@ static bool blk_add_partition(struct gendisk *disk, part = add_partition(disk, p, from, size, state->parts[p].flags, &state->parts[p].info); - if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) { - printk(KERN_ERR " %s: p%d could not be added: %pe\n", - disk->disk_name, p, part); + if (IS_ERR(part)) { + if (PTR_ERR(part) != -ENXIO) { + printk(KERN_ERR " %s: p%d could not be added: %pe\n", + disk->disk_name, p, part); + } return true; } @@ -573,10 +583,7 @@ static int blk_add_partitions(struct gendisk *disk) struct parsed_partitions *state; int ret = -EAGAIN, p; - if (disk->flags & GENHD_FL_NO_PART) - return 0; - - if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) + if (!disk_has_partscan(disk)) return 0; state = check_partition(disk); @@ -655,7 +662,7 @@ rescan: * it cannot be looked up any more even when openers * still hold references. */ - remove_inode_hash(part->bd_inode); + bdev_unhash(part); /* * If @disk->open_partitions isn't elevated but there's @@ -704,7 +711,7 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed); void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { - struct address_space *mapping = state->disk->part0->bd_inode->i_mapping; + struct address_space *mapping = state->disk->part0->bd_mapping; struct folio *folio; if (n >= get_capacity(state->disk)) { diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 5e9be13a56a8..7acba66eed48 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -682,7 +682,7 @@ static void utf16_le_to_7bit(const __le16 *in, unsigned int size, u8 *out) out[size] = 0; while (i < size) { - u8 c = le16_to_cpu(in[i]) & 0xff; + u8 c = le16_to_cpu(in[i]) & 0x7f; if (c && !isprint(c)) c = '!'; diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index 38e58960ae03..2bd42fedb907 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -131,8 +131,7 @@ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) ldm_crit ("Cannot find TOCBLOCK, database may be corrupt."); return false; } - strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name)); - toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0; + strscpy_pad(toc->bitmap1_name, data + 0x24, sizeof(toc->bitmap1_name)); toc->bitmap1_start = get_unaligned_be64(data + 0x2E); toc->bitmap1_size = get_unaligned_be64(data + 0x36); @@ -142,8 +141,7 @@ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) TOC_BITMAP1, toc->bitmap1_name); return false; } - strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name)); - toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0; + strscpy_pad(toc->bitmap2_name, data + 0x46, sizeof(toc->bitmap2_name)); toc->bitmap2_start = get_unaligned_be64(data + 0x50); toc->bitmap2_size = get_unaligned_be64(data + 0x58); if (strncmp (toc->bitmap2_name, TOC_BITMAP2, diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h index 0a747a0c782d..aa3bd050d8cd 100644 --- a/block/partitions/ldm.h +++ b/block/partitions/ldm.h @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * ldm - Part of the Linux-NTFS project. * * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org> @@ -15,7 +15,7 @@ #include <linux/types.h> #include <linux/list.h> #include <linux/fs.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <asm/byteorder.h> struct parsed_partitions; diff --git a/block/partitions/mac.c b/block/partitions/mac.c index c80183156d68..b02530d98629 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -53,13 +53,25 @@ int mac_partition(struct parsed_partitions *state) } secsize = be16_to_cpu(md->block_size); put_dev_sector(sect); + + /* + * If the "block size" is not a power of 2, things get weird - we might + * end up with a partition straddling a sector boundary, so we wouldn't + * be able to read a partition entry with read_part_sector(). + * Real block sizes are probably (?) powers of two, so just require + * that. + */ + if (!is_power_of_2(secsize)) + return -1; datasize = round_down(secsize, 512); data = read_part_sector(state, datasize / 512, §); if (!data) return -1; partoffset = secsize % 512; - if (partoffset + sizeof(*part) > datasize) + if (partoffset + sizeof(*part) > datasize) { + put_dev_sector(sect); return -1; + } part = (struct mac_partition *) (data + partoffset); if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { put_dev_sector(sect); @@ -112,8 +124,8 @@ int mac_partition(struct parsed_partitions *state) int i, l; goodness++; - l = strlen(part->name); - if (strcmp(part->name, "/") == 0) + l = strnlen(part->name, sizeof(part->name)); + if (strncmp(part->name, "/", sizeof(part->name)) == 0) goodness++; for (i = 0; i <= l - 4; ++i) { if (strncasecmp(part->name + i, "root", diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index b5d5c229cc3b..073be78ba0b0 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -36,7 +36,7 @@ * the nr_sects and start_sect partition table entries are * at a 2 (mod 4) address. */ -#include <asm/unaligned.h> +#include <linux/unaligned.h> static inline sector_t nr_sects(struct msdos_partition *p) { diff --git a/block/partitions/of.c b/block/partitions/of.c new file mode 100644 index 000000000000..4e760fdffb3f --- /dev/null +++ b/block/partitions/of.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/blkdev.h> +#include <linux/major.h> +#include <linux/of.h> +#include <linux/string.h> +#include "check.h" + +static int validate_of_partition(struct device_node *np, int slot) +{ + u64 offset, size; + int len; + + const __be32 *reg = of_get_property(np, "reg", &len); + int a_cells = of_n_addr_cells(np); + int s_cells = of_n_size_cells(np); + + /* Make sure reg len match the expected addr and size cells */ + if (len / sizeof(*reg) != a_cells + s_cells) + return -EINVAL; + + /* Validate offset conversion from bytes to sectors */ + offset = of_read_number(reg, a_cells); + if (offset % SECTOR_SIZE) + return -EINVAL; + + /* Validate size conversion from bytes to sectors */ + size = of_read_number(reg + a_cells, s_cells); + if (!size || size % SECTOR_SIZE) + return -EINVAL; + + return 0; +} + +static void add_of_partition(struct parsed_partitions *state, int slot, + struct device_node *np) +{ + struct partition_meta_info *info; + char tmp[sizeof(info->volname) + 4]; + const char *partname; + int len; + + const __be32 *reg = of_get_property(np, "reg", &len); + int a_cells = of_n_addr_cells(np); + int s_cells = of_n_size_cells(np); + + /* Convert bytes to sector size */ + u64 offset = of_read_number(reg, a_cells) / SECTOR_SIZE; + u64 size = of_read_number(reg + a_cells, s_cells) / SECTOR_SIZE; + + put_partition(state, slot, offset, size); + + if (of_property_read_bool(np, "read-only")) + state->parts[slot].flags |= ADDPART_FLAG_READONLY; + + /* + * Follow MTD label logic, search for label property, + * fallback to node name if not found. + */ + info = &state->parts[slot].info; + partname = of_get_property(np, "label", &len); + if (!partname) + partname = of_get_property(np, "name", &len); + strscpy(info->volname, partname, sizeof(info->volname)); + + snprintf(tmp, sizeof(tmp), "(%s)", info->volname); + strlcat(state->pp_buf, tmp, PAGE_SIZE); +} + +int of_partition(struct parsed_partitions *state) +{ + struct device *ddev = disk_to_dev(state->disk); + struct device_node *np; + int slot; + + struct device_node *partitions_np = of_node_get(ddev->of_node); + + if (!partitions_np || + !of_device_is_compatible(partitions_np, "fixed-partitions")) + return 0; + + slot = 1; + /* Validate parition offset and size */ + for_each_child_of_node(partitions_np, np) { + if (validate_of_partition(np, slot)) { + of_node_put(np); + of_node_put(partitions_np); + + return -1; + } + + slot++; + } + + slot = 1; + for_each_child_of_node(partitions_np, np) { + if (slot >= state->limit) { + of_node_put(np); + break; + } + + add_of_partition(state, slot, np); + + slot++; + } + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + return 1; +} diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c index 9cc6b8c1eea4..b5ecddd5181a 100644 --- a/block/partitions/sgi.c +++ b/block/partitions/sgi.c @@ -50,8 +50,6 @@ int sgi_partition(struct parsed_partitions *state) p = &label->partitions[0]; magic = label->magic_mushroom; if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { - /*printk("Dev %s SGI disklabel: bad magic %08x\n", - state->disk->disk_name, be32_to_cpu(magic));*/ put_dev_sector(sect); return 0; } diff --git a/block/partitions/sun.c b/block/partitions/sun.c index ddf9e6def4b2..2419af76120f 100644 --- a/block/partitions/sun.c +++ b/block/partitions/sun.c @@ -74,8 +74,6 @@ int sun_partition(struct parsed_partitions *state) p = label->partitions; if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { -/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", - state->disk->disk_name, be16_to_cpu(label->magic)); */ put_dev_sector(sect); return 0; } diff --git a/block/sed-opal.c b/block/sed-opal.c index 14fe0fef811c..5a28f23f7f22 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -314,7 +314,7 @@ static int read_sed_opal_key(const char *key_name, u_char *buffer, int buflen) &key_type_user, key_name, true); if (IS_ERR(kref)) - ret = PTR_ERR(kref); + return PTR_ERR(kref); key = key_ref_to_ptr(kref); down_read(&key->sem); @@ -3037,6 +3037,29 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) return ret; } +static int opal_set_new_sid_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) +{ + int ret; + struct opal_key *newkey = &opal_pw->new_user_pw.opal_key; + struct opal_key *oldkey = &opal_pw->session.opal_key; + + const struct opal_step pw_steps[] = { + { start_SIDASP_opal_session, oldkey }, + { set_sid_cpin_pin, newkey }, + { end_opal_session, } + }; + + if (!dev) + return -ENODEV; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + static int opal_activate_user(struct opal_dev *dev, struct opal_session_info *opal_session) { @@ -3286,6 +3309,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_DISCOVERY: ret = opal_get_discv(dev, p); break; + case IOC_OPAL_SET_SID_PW: + ret = opal_set_new_sid_pw(dev, p); + break; default: break; diff --git a/block/t10-pi.c b/block/t10-pi.c index d90892fd6f2a..851db518ee5e 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -8,20 +8,25 @@ #include <linux/blk-integrity.h> #include <linux/crc-t10dif.h> #include <linux/crc64.h> -#include <linux/module.h> #include <net/checksum.h> -#include <asm/unaligned.h> - -typedef __be16 (csum_fn) (__be16, void *, unsigned int); - -static __be16 t10_pi_crc_fn(__be16 crc, void *data, unsigned int len) -{ - return cpu_to_be16(crc_t10dif_update(be16_to_cpu(crc), data, len)); -} +#include <linux/unaligned.h> +#include "blk.h" + +struct blk_integrity_iter { + void *prot_buf; + void *data_buf; + sector_t seed; + unsigned int data_size; + unsigned short interval; + const char *disk_name; +}; -static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len) +static __be16 t10_pi_csum(__be16 csum, void *data, unsigned int len, + unsigned char csum_type) { - return (__force __be16)ip_compute_csum(data, len); + if (csum_type == BLK_INTEGRITY_CSUM_IP) + return (__force __be16)ip_compute_csum(data, len); + return cpu_to_be16(crc_t10dif_update(be16_to_cpu(csum), data, len)); } /* @@ -29,48 +34,44 @@ static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len) * 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref * tag. */ -static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, - csum_fn *fn, enum t10_dif_type type) +static void t10_pi_generate(struct blk_integrity_iter *iter, + struct blk_integrity *bi) { - u8 offset = iter->pi_offset; + u8 offset = bi->pi_offset; unsigned int i; for (i = 0 ; i < iter->data_size ; i += iter->interval) { struct t10_pi_tuple *pi = iter->prot_buf + offset; - pi->guard_tag = fn(0, iter->data_buf, iter->interval); + pi->guard_tag = t10_pi_csum(0, iter->data_buf, iter->interval, + bi->csum_type); if (offset) - pi->guard_tag = fn(pi->guard_tag, iter->prot_buf, - offset); + pi->guard_tag = t10_pi_csum(pi->guard_tag, + iter->prot_buf, offset, bi->csum_type); pi->app_tag = 0; - if (type == T10_PI_TYPE1_PROTECTION) + if (bi->flags & BLK_INTEGRITY_REF_TAG) pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed)); else pi->ref_tag = 0; iter->data_buf += iter->interval; - iter->prot_buf += iter->tuple_size; + iter->prot_buf += bi->tuple_size; iter->seed++; } - - return BLK_STS_OK; } static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, - csum_fn *fn, enum t10_dif_type type) + struct blk_integrity *bi) { - u8 offset = iter->pi_offset; + u8 offset = bi->pi_offset; unsigned int i; - BUG_ON(type == T10_PI_TYPE0_PROTECTION); - for (i = 0 ; i < iter->data_size ; i += iter->interval) { struct t10_pi_tuple *pi = iter->prot_buf + offset; __be16 csum; - if (type == T10_PI_TYPE1_PROTECTION || - type == T10_PI_TYPE2_PROTECTION) { + if (bi->flags & BLK_INTEGRITY_REF_TAG) { if (pi->app_tag == T10_PI_APP_ESCAPE) goto next; @@ -82,15 +83,17 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, iter->seed, be32_to_cpu(pi->ref_tag)); return BLK_STS_PROTECTION; } - } else if (type == T10_PI_TYPE3_PROTECTION) { + } else { if (pi->app_tag == T10_PI_APP_ESCAPE && pi->ref_tag == T10_PI_REF_ESCAPE) goto next; } - csum = fn(0, iter->data_buf, iter->interval); + csum = t10_pi_csum(0, iter->data_buf, iter->interval, + bi->csum_type); if (offset) - csum = fn(csum, iter->prot_buf, offset); + csum = t10_pi_csum(csum, iter->prot_buf, offset, + bi->csum_type); if (pi->guard_tag != csum) { pr_err("%s: guard tag error at sector %llu " \ @@ -102,33 +105,13 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, next: iter->data_buf += iter->interval; - iter->prot_buf += iter->tuple_size; + iter->prot_buf += bi->tuple_size; iter->seed++; } return BLK_STS_OK; } -static blk_status_t t10_pi_type1_generate_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION); -} - -static blk_status_t t10_pi_type1_generate_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); -} - -static blk_status_t t10_pi_type1_verify_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION); -} - -static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); -} - /** * t10_pi_type1_prepare - prepare PI prior submitting request to device * @rq: request with PI that should be prepared @@ -141,7 +124,7 @@ static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) */ static void t10_pi_type1_prepare(struct request *rq) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; const int tuple_sz = bi->tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); u8 offset = bi->pi_offset; @@ -192,7 +175,7 @@ static void t10_pi_type1_prepare(struct request *rq) */ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; unsigned intervals = nr_bytes >> bi->interval_exp; const int tuple_sz = bi->tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); @@ -225,81 +208,15 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) } } -static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); -} - -/* Type 3 does not have a reference tag so no remapping is required. */ -static void t10_pi_type3_prepare(struct request *rq) -{ -} - -/* Type 3 does not have a reference tag so no remapping is required. */ -static void t10_pi_type3_complete(struct request *rq, unsigned int nr_bytes) -{ -} - -const struct blk_integrity_profile t10_pi_type1_crc = { - .name = "T10-DIF-TYPE1-CRC", - .generate_fn = t10_pi_type1_generate_crc, - .verify_fn = t10_pi_type1_verify_crc, - .prepare_fn = t10_pi_type1_prepare, - .complete_fn = t10_pi_type1_complete, -}; -EXPORT_SYMBOL(t10_pi_type1_crc); - -const struct blk_integrity_profile t10_pi_type1_ip = { - .name = "T10-DIF-TYPE1-IP", - .generate_fn = t10_pi_type1_generate_ip, - .verify_fn = t10_pi_type1_verify_ip, - .prepare_fn = t10_pi_type1_prepare, - .complete_fn = t10_pi_type1_complete, -}; -EXPORT_SYMBOL(t10_pi_type1_ip); - -const struct blk_integrity_profile t10_pi_type3_crc = { - .name = "T10-DIF-TYPE3-CRC", - .generate_fn = t10_pi_type3_generate_crc, - .verify_fn = t10_pi_type3_verify_crc, - .prepare_fn = t10_pi_type3_prepare, - .complete_fn = t10_pi_type3_complete, -}; -EXPORT_SYMBOL(t10_pi_type3_crc); - -const struct blk_integrity_profile t10_pi_type3_ip = { - .name = "T10-DIF-TYPE3-IP", - .generate_fn = t10_pi_type3_generate_ip, - .verify_fn = t10_pi_type3_verify_ip, - .prepare_fn = t10_pi_type3_prepare, - .complete_fn = t10_pi_type3_complete, -}; -EXPORT_SYMBOL(t10_pi_type3_ip); - static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len) { - return cpu_to_be64(crc64_rocksoft_update(crc, data, len)); + return cpu_to_be64(crc64_nvme(crc, data, len)); } -static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter, - enum t10_dif_type type) +static void ext_pi_crc64_generate(struct blk_integrity_iter *iter, + struct blk_integrity *bi) { - u8 offset = iter->pi_offset; + u8 offset = bi->pi_offset; unsigned int i; for (i = 0 ; i < iter->data_size ; i += iter->interval) { @@ -311,30 +228,28 @@ static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter, iter->prot_buf, offset); pi->app_tag = 0; - if (type == T10_PI_TYPE1_PROTECTION) + if (bi->flags & BLK_INTEGRITY_REF_TAG) put_unaligned_be48(iter->seed, pi->ref_tag); else put_unaligned_be48(0ULL, pi->ref_tag); iter->data_buf += iter->interval; - iter->prot_buf += iter->tuple_size; + iter->prot_buf += bi->tuple_size; iter->seed++; } - - return BLK_STS_OK; } -static bool ext_pi_ref_escape(u8 *ref_tag) +static bool ext_pi_ref_escape(const u8 ref_tag[6]) { - static u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + static const u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; return memcmp(ref_tag, ref_escape, sizeof(ref_escape)) == 0; } static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, - enum t10_dif_type type) + struct blk_integrity *bi) { - u8 offset = iter->pi_offset; + u8 offset = bi->pi_offset; unsigned int i; for (i = 0; i < iter->data_size; i += iter->interval) { @@ -342,7 +257,7 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, u64 ref, seed; __be64 csum; - if (type == T10_PI_TYPE1_PROTECTION) { + if (bi->flags & BLK_INTEGRITY_REF_TAG) { if (pi->app_tag == T10_PI_APP_ESCAPE) goto next; @@ -353,7 +268,7 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, iter->disk_name, seed, ref); return BLK_STS_PROTECTION; } - } else if (type == T10_PI_TYPE3_PROTECTION) { + } else { if (pi->app_tag == T10_PI_APP_ESCAPE && ext_pi_ref_escape(pi->ref_tag)) goto next; @@ -374,26 +289,16 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, next: iter->data_buf += iter->interval; - iter->prot_buf += iter->tuple_size; + iter->prot_buf += bi->tuple_size; iter->seed++; } return BLK_STS_OK; } -static blk_status_t ext_pi_type1_verify_crc64(struct blk_integrity_iter *iter) -{ - return ext_pi_crc64_verify(iter, T10_PI_TYPE1_PROTECTION); -} - -static blk_status_t ext_pi_type1_generate_crc64(struct blk_integrity_iter *iter) -{ - return ext_pi_crc64_generate(iter, T10_PI_TYPE1_PROTECTION); -} - static void ext_pi_type1_prepare(struct request *rq) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; const int tuple_sz = bi->tuple_size; u64 ref_tag = ext_pi_ref_tag(rq); u8 offset = bi->pi_offset; @@ -433,7 +338,7 @@ static void ext_pi_type1_prepare(struct request *rq) static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; unsigned intervals = nr_bytes >> bi->interval_exp; const int tuple_sz = bi->tuple_size; u64 ref_tag = ext_pi_ref_tag(rq); @@ -467,33 +372,102 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) } } -static blk_status_t ext_pi_type3_verify_crc64(struct blk_integrity_iter *iter) +void blk_integrity_generate(struct bio *bio) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + struct blk_integrity_iter iter; + struct bvec_iter bviter; + struct bio_vec bv; + + iter.disk_name = bio->bi_bdev->bd_disk->disk_name; + iter.interval = 1 << bi->interval_exp; + iter.seed = bio->bi_iter.bi_sector; + iter.prot_buf = bvec_virt(bip->bip_vec); + bio_for_each_segment(bv, bio, bviter) { + void *kaddr = bvec_kmap_local(&bv); + + iter.data_buf = kaddr; + iter.data_size = bv.bv_len; + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + ext_pi_crc64_generate(&iter, bi); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + t10_pi_generate(&iter, bi); + break; + default: + break; + } + kunmap_local(kaddr); + } +} + +void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter) { - return ext_pi_crc64_verify(iter, T10_PI_TYPE3_PROTECTION); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + struct blk_integrity_iter iter; + struct bvec_iter bviter; + struct bio_vec bv; + + /* + * At the moment verify is called bi_iter has been advanced during split + * and completion, so use the copy created during submission here. + */ + iter.disk_name = bio->bi_bdev->bd_disk->disk_name; + iter.interval = 1 << bi->interval_exp; + iter.seed = saved_iter->bi_sector; + iter.prot_buf = bvec_virt(bip->bip_vec); + __bio_for_each_segment(bv, bio, bviter, *saved_iter) { + void *kaddr = bvec_kmap_local(&bv); + blk_status_t ret = BLK_STS_OK; + + iter.data_buf = kaddr; + iter.data_size = bv.bv_len; + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + ret = ext_pi_crc64_verify(&iter, bi); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + ret = t10_pi_verify(&iter, bi); + break; + default: + break; + } + kunmap_local(kaddr); + + if (ret) { + bio->bi_status = ret; + return; + } + } } -static blk_status_t ext_pi_type3_generate_crc64(struct blk_integrity_iter *iter) +void blk_integrity_prepare(struct request *rq) { - return ext_pi_crc64_generate(iter, T10_PI_TYPE3_PROTECTION); + struct blk_integrity *bi = &rq->q->limits.integrity; + + if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) + return; + + if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64) + ext_pi_type1_prepare(rq); + else + t10_pi_type1_prepare(rq); } -const struct blk_integrity_profile ext_pi_type1_crc64 = { - .name = "EXT-DIF-TYPE1-CRC64", - .generate_fn = ext_pi_type1_generate_crc64, - .verify_fn = ext_pi_type1_verify_crc64, - .prepare_fn = ext_pi_type1_prepare, - .complete_fn = ext_pi_type1_complete, -}; -EXPORT_SYMBOL_GPL(ext_pi_type1_crc64); - -const struct blk_integrity_profile ext_pi_type3_crc64 = { - .name = "EXT-DIF-TYPE3-CRC64", - .generate_fn = ext_pi_type3_generate_crc64, - .verify_fn = ext_pi_type3_verify_crc64, - .prepare_fn = t10_pi_type3_prepare, - .complete_fn = t10_pi_type3_complete, -}; -EXPORT_SYMBOL_GPL(ext_pi_type3_crc64); +void blk_integrity_complete(struct request *rq, unsigned int nr_bytes) +{ + struct blk_integrity *bi = &rq->q->limits.integrity; -MODULE_LICENSE("GPL"); -MODULE_LICENSE("GPL"); + if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) + return; + + if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64) + ext_pi_type1_complete(rq, nr_bytes); + else + t10_pi_type1_complete(rq, nr_bytes); +} |