Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig  |  32
-rw-r--r--  block/Makefile  |  12
-rw-r--r--  block/badblocks.c  |  327
-rw-r--r--  block/bdev.c  |  204
-rw-r--r--  block/bfq-cgroup.c  |  58
-rw-r--r--  block/bfq-iosched.c  |  296
-rw-r--r--  block/bfq-iosched.h  |  11
-rw-r--r--  block/bio-integrity-auto.c  |  223
-rw-r--r--  block/bio-integrity.c  |  445
-rw-r--r--  block/bio.c  |  456
-rw-r--r--  block/blk-cgroup-rwstat.c  |  18
-rw-r--r--  block/blk-cgroup-rwstat.h  |  5
-rw-r--r--  block/blk-cgroup.c  |  216
-rw-r--r--  block/blk-cgroup.h  |  36
-rw-r--r--  block/blk-core.c  |  136
-rw-r--r--  block/blk-crypto-fallback.c  |  10
-rw-r--r--  block/blk-crypto-internal.h  |  10
-rw-r--r--  block/blk-crypto-profile.c  |  101
-rw-r--r--  block/blk-crypto-sysfs.c  |  35
-rw-r--r--  block/blk-crypto.c  |  204
-rw-r--r--  block/blk-flush.c  |  51
-rw-r--r--  block/blk-ia-ranges.c  |  4
-rw-r--r--  block/blk-integrity.c  |  263
-rw-r--r--  block/blk-ioc.c  |  9
-rw-r--r--  block/blk-iocost.c  |  56
-rw-r--r--  block/blk-iolatency.c  |  6
-rw-r--r--  block/blk-ioprio.c  |  80
-rw-r--r--  block/blk-ioprio.h  |  9
-rw-r--r--  block/blk-lib.c  |  285
-rw-r--r--  block/blk-map.c  |  260
-rw-r--r--  block/blk-merge.c  |  488
-rw-r--r--  block/blk-mq-cpumap.c  |  36
-rw-r--r--  block/blk-mq-debugfs-zoned.c  |  22
-rw-r--r--  block/blk-mq-debugfs.c  |  115
-rw-r--r--  block/blk-mq-debugfs.h  |  6
-rw-r--r--  block/blk-mq-dma.c  |  116
-rw-r--r--  block/blk-mq-pci.c  |  46
-rw-r--r--  block/blk-mq-sched.c  |  58
-rw-r--r--  block/blk-mq-sysfs.c  |  44
-rw-r--r--  block/blk-mq-tag.c  |  49
-rw-r--r--  block/blk-mq-virtio.c  |  46
-rw-r--r--  block/blk-mq.c  |  1088
-rw-r--r--  block/blk-mq.h  |  70
-rw-r--r--  block/blk-pm.c  |  2
-rw-r--r--  block/blk-rq-qos.c  |  102
-rw-r--r--  block/blk-rq-qos.h  |  21
-rw-r--r--  block/blk-settings.c  |  1064
-rw-r--r--  block/blk-stat.c  |  5
-rw-r--r--  block/blk-stat.h  |  3
-rw-r--r--  block/blk-sysfs.c  |  731
-rw-r--r--  block/blk-throttle.c  |  1603
-rw-r--r--  block/blk-throttle.h  |  92
-rw-r--r--  block/blk-wbt.c  |  48
-rw-r--r--  block/blk-zoned.c  |  1681
-rw-r--r--  block/blk.h  |  294
-rw-r--r--  block/bounce.c  |  269
-rw-r--r--  block/bsg-lib.c  |  11
-rw-r--r--  block/early-lookup.c  |  2
-rw-r--r--  block/elevator.c  |  400
-rw-r--r--  block/elevator.h  |  13
-rw-r--r--  block/fops.c  |  186
-rw-r--r--  block/genhd.c  |  478
-rw-r--r--  block/ioctl.c  |  251
-rw-r--r--  block/ioprio.c  |  6
-rw-r--r--  block/kyber-iosched.c  |  4
-rw-r--r--  block/mq-deadline.c  |  244
-rw-r--r--  block/partitions/Kconfig  |  9
-rw-r--r--  block/partitions/Makefile  |  1
-rw-r--r--  block/partitions/check.h  |  1
-rw-r--r--  block/partitions/cmdline.c  |  52
-rw-r--r--  block/partitions/core.c  |  41
-rw-r--r--  block/partitions/efi.c  |  2
-rw-r--r--  block/partitions/ldm.c  |  6
-rw-r--r--  block/partitions/ldm.h  |  4
-rw-r--r--  block/partitions/mac.c  |  18
-rw-r--r--  block/partitions/msdos.c  |  2
-rw-r--r--  block/partitions/of.c  |  110
-rw-r--r--  block/partitions/sgi.c  |  2
-rw-r--r--  block/partitions/sun.c  |  2
-rw-r--r--  block/sed-opal.c  |  28
-rw-r--r--  block/t10-pi.c  |  312
81 files changed, 7581 insertions, 6561 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 1de4682d48cc..15027963472d 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -62,6 +62,8 @@ config BLK_DEV_BSGLIB
config BLK_DEV_INTEGRITY
bool "Block layer data integrity support"
+ select CRC_T10DIF
+ select CRC64
help
Some storage devices allow extra information to be
stored/retrieved to help protect the data. The block layer
@@ -72,12 +74,6 @@ config BLK_DEV_INTEGRITY
T10/SCSI Data Integrity Field or the T13/ATA External Path
Protection. If in doubt, say N.
-config BLK_DEV_INTEGRITY_T10
- tristate
- depends on BLK_DEV_INTEGRITY
- select CRC_T10DIF
- select CRC64_ROCKSOFT
-
config BLK_DEV_WRITE_MOUNTED
bool "Allow writing to mounted block devices"
default y
@@ -100,7 +96,6 @@ config BLK_DEV_WRITE_MOUNTED
config BLK_DEV_ZONED
bool "Zoned block device support"
- select MQ_IOSCHED_DEADLINE
help
Block layer zoned block device support. This option enables
support for ZAC/ZBC/ZNS host-managed and host-aware zoned block
@@ -120,17 +115,6 @@ config BLK_DEV_THROTTLING
See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information.
-config BLK_DEV_THROTTLING_LOW
- bool "Block throttling .low limit interface support (EXPERIMENTAL)"
- depends on BLK_DEV_THROTTLING
- help
- Add .low limit interface for block throttling. The low limit is a best
- effort limit to prioritize cgroups. Depending on the setting, the limit
- can be used to protect cgroups in terms of bandwidth/iops and better
- utilize disk resource.
-
- Note, this is an experimental interface and could be changed someday.
-
config BLK_WBT
bool "Enable support for block device writeback throttling"
help
@@ -198,10 +182,6 @@ config BLK_DEBUG_FS
Unless you are building a kernel for a tiny system, you should
say Y here.
-config BLK_DEBUG_FS_ZONED
- bool
- default BLK_DEBUG_FS && BLK_DEV_ZONED
-
config BLK_SED_OPAL
bool "Logic for interfacing with Opal enabled SEDs"
depends on KEYS
@@ -231,14 +211,6 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
source "block/partitions/Kconfig"
-config BLK_MQ_PCI
- def_bool PCI
-
-config BLK_MQ_VIRTIO
- bool
- depends on VIRTIO
- default y
-
config BLK_PM
def_bool PM
diff --git a/block/Makefile b/block/Makefile
index 46ada9dc8bbf..c65f4da93702 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,13 +5,12 @@
obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
- blk-merge.o blk-timeout.o \
- blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
+ blk-merge.o blk-timeout.o blk-lib.o blk-mq.o \
+ blk-mq-tag.o blk-mq-dma.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
disk-events.o blk-ia-ranges.o early-lookup.o
-obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
@@ -26,14 +25,11 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
obj-$(CONFIG_IOSCHED_BFQ) += bfq.o
-obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o
-obj-$(CONFIG_BLK_DEV_INTEGRITY_T10) += t10-pi.o
-obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o
-obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o
+obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o \
+ bio-integrity-auto.o
obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o
obj-$(CONFIG_BLK_WBT) += blk-wbt.o
obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
-obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \
diff --git a/block/badblocks.c b/block/badblocks.c
index db4ec8b9b2a8..ece64e76fe8f 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -528,51 +528,6 @@ out:
}
/*
- * Return 'true' if the range indicated by 'bad' can be backward merged
- * with the bad range (from the bad table) index by 'behind'.
- */
-static bool can_merge_behind(struct badblocks *bb,
- struct badblocks_context *bad, int behind)
-{
- sector_t sectors = bad->len;
- sector_t s = bad->start;
- u64 *p = bb->page;
-
- if ((s < BB_OFFSET(p[behind])) &&
- ((s + sectors) >= BB_OFFSET(p[behind])) &&
- ((BB_END(p[behind]) - s) <= BB_MAX_LEN) &&
- BB_ACK(p[behind]) == bad->ack)
- return true;
- return false;
-}
-
-/*
- * Do backward merge for range indicated by 'bad' and the bad range
- * (from the bad table) indexed by 'behind'. The return value is merged
- * sectors from bad->len.
- */
-static int behind_merge(struct badblocks *bb, struct badblocks_context *bad,
- int behind)
-{
- sector_t sectors = bad->len;
- sector_t s = bad->start;
- u64 *p = bb->page;
- int merged = 0;
-
- WARN_ON(s >= BB_OFFSET(p[behind]));
- WARN_ON((s + sectors) < BB_OFFSET(p[behind]));
-
- if (s < BB_OFFSET(p[behind])) {
- merged = BB_OFFSET(p[behind]) - s;
- p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack);
-
- WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN);
- }
-
- return merged;
-}
-
-/*
* Return 'true' if the range indicated by 'bad' can be forward
* merged with the bad range (from the bad table) indexed by 'prev'.
*/
@@ -745,7 +700,7 @@ static bool can_front_overwrite(struct badblocks *bb, int prev,
*extra = 2;
}
- if ((bb->count + (*extra)) >= MAX_BADBLOCKS)
+ if ((bb->count + (*extra)) > MAX_BADBLOCKS)
return false;
return true;
@@ -855,40 +810,60 @@ static void badblocks_update_acked(struct badblocks *bb)
bb->unacked_exist = 0;
}
+/*
+ * Combine the bad ranges indexed by 'prev' and 'prev + 1' in the bad table
+ * if they are adjacent and can be merged, and return 'true' on success.
+ */
+static bool try_adjacent_combine(struct badblocks *bb, int prev)
+{
+ u64 *p = bb->page;
+
+ if (prev >= 0 && (prev + 1) < bb->count &&
+ BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) &&
+ (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN &&
+ BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) {
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ BB_LEN(p[prev]) + BB_LEN(p[prev + 1]),
+ BB_ACK(p[prev]));
+
+ if ((prev + 2) < bb->count)
+ memmove(p + prev + 1, p + prev + 2,
+ (bb->count - (prev + 2)) * 8);
+ bb->count--;
+ return true;
+ }
+ return false;
+}
+
/* Do exact work to set bad block range into the bad block table */
-static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors,
- int acknowledged)
+static bool _badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors,
+ int acknowledged)
{
- int retried = 0, space_desired = 0;
- int orig_len, len = 0, added = 0;
+ int len = 0, added = 0;
struct badblocks_context bad;
int prev = -1, hint = -1;
- sector_t orig_start;
unsigned long flags;
- int rv = 0;
u64 *p;
if (bb->shift < 0)
/* badblocks are disabled */
- return 1;
+ return false;
if (sectors == 0)
/* Invalid sectors number */
- return 1;
+ return false;
if (bb->shift) {
/* round the start down, and the end up */
sector_t next = s + sectors;
- rounddown(s, bb->shift);
- roundup(next, bb->shift);
+ rounddown(s, 1 << bb->shift);
+ roundup(next, 1 << bb->shift);
sectors = next - s;
}
write_seqlock_irqsave(&bb->lock, flags);
- orig_start = s;
- orig_len = sectors;
bad.ack = acknowledged;
p = bb->page;
@@ -897,6 +872,9 @@ re_insert:
bad.len = sectors;
len = 0;
+ if (badblocks_full(bb))
+ goto out;
+
if (badblocks_empty(bb)) {
len = insert_at(bb, 0, &bad);
bb->count++;
@@ -908,32 +886,14 @@ re_insert:
/* start before all badblocks */
if (prev < 0) {
- if (!badblocks_full(bb)) {
- /* insert on the first */
- if (bad.len > (BB_OFFSET(p[0]) - bad.start))
- bad.len = BB_OFFSET(p[0]) - bad.start;
- len = insert_at(bb, 0, &bad);
- bb->count++;
- added++;
- hint = 0;
- goto update_sectors;
- }
-
- /* No sapce, try to merge */
- if (overlap_behind(bb, &bad, 0)) {
- if (can_merge_behind(bb, &bad, 0)) {
- len = behind_merge(bb, &bad, 0);
- added++;
- } else {
- len = BB_OFFSET(p[0]) - s;
- space_desired = 1;
- }
- hint = 0;
- goto update_sectors;
- }
-
- /* no table space and give up */
- goto out;
+ /* insert on the first */
+ if (bad.len > (BB_OFFSET(p[0]) - bad.start))
+ bad.len = BB_OFFSET(p[0]) - bad.start;
+ len = insert_at(bb, 0, &bad);
+ bb->count++;
+ added++;
+ hint = ++prev;
+ goto update_sectors;
}
/* in case p[prev-1] can be merged with p[prev] */
@@ -945,33 +905,6 @@ re_insert:
goto update_sectors;
}
- if (overlap_front(bb, prev, &bad)) {
- if (can_merge_front(bb, prev, &bad)) {
- len = front_merge(bb, prev, &bad);
- added++;
- } else {
- int extra = 0;
-
- if (!can_front_overwrite(bb, prev, &bad, &extra)) {
- len = min_t(sector_t,
- BB_END(p[prev]) - s, sectors);
- hint = prev;
- goto update_sectors;
- }
-
- len = front_overwrite(bb, prev, &bad, extra);
- added++;
- bb->count += extra;
-
- if (can_combine_front(bb, prev, &bad)) {
- front_combine(bb, prev);
- bb->count--;
- }
- }
- hint = prev;
- goto update_sectors;
- }
-
if (can_merge_front(bb, prev, &bad)) {
len = front_merge(bb, prev, &bad);
added++;
@@ -979,21 +912,29 @@ re_insert:
goto update_sectors;
}
- /* if no space in table, still try to merge in the covered range */
- if (badblocks_full(bb)) {
- /* skip the cannot-merge range */
- if (((prev + 1) < bb->count) &&
- overlap_behind(bb, &bad, prev + 1) &&
- ((s + sectors) >= BB_END(p[prev + 1]))) {
- len = BB_END(p[prev + 1]) - s;
- hint = prev + 1;
+ if (overlap_front(bb, prev, &bad)) {
+ int extra = 0;
+
+ if (!can_front_overwrite(bb, prev, &bad, &extra)) {
+ if (extra > 0)
+ goto out;
+
+ len = min_t(sector_t,
+ BB_END(p[prev]) - s, sectors);
+ hint = prev;
goto update_sectors;
}
- /* no retry any more */
- len = sectors;
- space_desired = 1;
- hint = -1;
+ len = front_overwrite(bb, prev, &bad, extra);
+ added++;
+ bb->count += extra;
+
+ if (can_combine_front(bb, prev, &bad)) {
+ front_combine(bb, prev);
+ bb->count--;
+ }
+
+ hint = prev;
goto update_sectors;
}
@@ -1006,7 +947,7 @@ re_insert:
len = insert_at(bb, prev + 1, &bad);
bb->count++;
added++;
- hint = prev + 1;
+ hint = ++prev;
update_sectors:
s += len;
@@ -1015,35 +956,12 @@ update_sectors:
if (sectors > 0)
goto re_insert;
- WARN_ON(sectors < 0);
-
/*
* Check whether the following already set range can be
* merged. (prev < 0) condition is not handled here,
* because it's already complicated enough.
*/
- if (prev >= 0 &&
- (prev + 1) < bb->count &&
- BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) &&
- (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN &&
- BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) {
- p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
- BB_LEN(p[prev]) + BB_LEN(p[prev + 1]),
- BB_ACK(p[prev]));
-
- if ((prev + 2) < bb->count)
- memmove(p + prev + 1, p + prev + 2,
- (bb->count - (prev + 2)) * 8);
- bb->count--;
- }
-
- if (space_desired && !badblocks_full(bb)) {
- s = orig_start;
- sectors = orig_len;
- space_desired = 0;
- if (retried++ < 3)
- goto re_insert;
- }
+ try_adjacent_combine(bb, prev);
out:
if (added) {
@@ -1057,10 +975,7 @@ out:
write_sequnlock_irqrestore(&bb->lock, flags);
- if (!added)
- rv = 1;
-
- return rv;
+ return sectors == 0;
}
/*
@@ -1131,21 +1046,20 @@ static int front_splitting_clear(struct badblocks *bb, int prev,
}
/* Do the exact work to clear bad block range from the bad block table */
-static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+static bool _badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors)
{
struct badblocks_context bad;
int prev = -1, hint = -1;
int len = 0, cleared = 0;
- int rv = 0;
u64 *p;
if (bb->shift < 0)
/* badblocks are disabled */
- return 1;
+ return false;
if (sectors == 0)
/* Invalid sectors number */
- return 1;
+ return false;
if (bb->shift) {
sector_t target;
@@ -1157,8 +1071,8 @@ static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
* isn't than to think a block is not bad when it is.
*/
target = s + sectors;
- roundup(s, bb->shift);
- rounddown(target, bb->shift);
+ roundup(s, 1 << bb->shift);
+ rounddown(target, 1 << bb->shift);
sectors = target - s;
}
@@ -1214,7 +1128,7 @@ re_clear:
if ((BB_OFFSET(p[prev]) < bad.start) &&
(BB_END(p[prev]) > (bad.start + bad.len))) {
/* Splitting */
- if ((bb->count + 1) < MAX_BADBLOCKS) {
+ if ((bb->count + 1) <= MAX_BADBLOCKS) {
len = front_splitting_clear(bb, prev, &bad);
bb->count += 1;
cleared++;
@@ -1255,8 +1169,6 @@ update_sectors:
if (sectors > 0)
goto re_clear;
- WARN_ON(sectors < 0);
-
if (cleared) {
badblocks_update_acked(bb);
set_changed(bb);
@@ -1265,40 +1177,21 @@ update_sectors:
write_sequnlock_irq(&bb->lock);
if (!cleared)
- rv = 1;
+ return false;
- return rv;
+ return true;
}
/* Do the exact work to check bad blocks range from the bad block table */
-static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors,
- sector_t *first_bad, int *bad_sectors)
+static int _badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors,
+ sector_t *first_bad, sector_t *bad_sectors)
{
- int unacked_badblocks, acked_badblocks;
int prev = -1, hint = -1, set = 0;
struct badblocks_context bad;
- unsigned int seq;
+ int unacked_badblocks = 0;
+ int acked_badblocks = 0;
+ u64 *p = bb->page;
int len, rv;
- u64 *p;
-
- WARN_ON(bb->shift < 0 || sectors == 0);
-
- if (bb->shift > 0) {
- sector_t target;
-
- /* round the start down, and the end up */
- target = s + sectors;
- rounddown(s, bb->shift);
- roundup(target, bb->shift);
- sectors = target - s;
- }
-
-retry:
- seq = read_seqbegin(&bb->lock);
-
- p = bb->page;
- unacked_badblocks = 0;
- acked_badblocks = 0;
re_check:
bad.start = s;
@@ -1349,14 +1242,15 @@ re_check:
len = sectors;
update_sectors:
+ /* This situation should never happen */
+ WARN_ON(sectors < len);
+
s += len;
sectors -= len;
if (sectors > 0)
goto re_check;
- WARN_ON(sectors < 0);
-
if (unacked_badblocks > 0)
rv = -1;
else if (acked_badblocks > 0)
@@ -1364,9 +1258,6 @@ update_sectors:
else
rv = 0;
- if (read_seqretry(&bb->lock, seq))
- goto retry;
-
return rv;
}
@@ -1404,10 +1295,30 @@ update_sectors:
* -1: there are bad blocks which have not yet been acknowledged in metadata.
* plus the start/length of the first bad section we overlap.
*/
-int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
- sector_t *first_bad, int *bad_sectors)
+int badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors,
+ sector_t *first_bad, sector_t *bad_sectors)
{
- return _badblocks_check(bb, s, sectors, first_bad, bad_sectors);
+ unsigned int seq;
+ int rv;
+
+ WARN_ON(bb->shift < 0 || sectors == 0);
+
+ if (bb->shift > 0) {
+ /* round the start down, and the end up */
+ sector_t target = s + sectors;
+
+ rounddown(s, 1 << bb->shift);
+ roundup(target, 1 << bb->shift);
+ sectors = target - s;
+ }
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+ rv = _badblocks_check(bb, s, sectors, first_bad, bad_sectors);
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ return rv;
}
EXPORT_SYMBOL_GPL(badblocks_check);
@@ -1423,11 +1334,12 @@ EXPORT_SYMBOL_GPL(badblocks_check);
* decide how best to handle it.
*
* Return:
- * 0: success
- * 1: failed to set badblocks (out of space)
+ * true: success
+ * false: failed to set badblocks (out of space). Partial setting will be
+ * treated as failure.
*/
-int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
- int acknowledged)
+bool badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors,
+ int acknowledged)
{
return _badblocks_set(bb, s, sectors, acknowledged);
}
@@ -1444,10 +1356,10 @@ EXPORT_SYMBOL_GPL(badblocks_set);
* drop the remove request.
*
* Return:
- * 0: success
- * 1: failed to clear badblocks
+ * true: success
+ * false: failed to clear badblocks
*/
-int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+bool badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors)
{
return _badblocks_clear(bb, s, sectors);
}
@@ -1479,6 +1391,11 @@ void ack_all_badblocks(struct badblocks *bb)
p[i] = BB_MAKE(start, len, 1);
}
}
+
+ for (i = 0; i < bb->count ; i++)
+ while (try_adjacent_combine(bb, i))
+ ;
+
bb->unacked_exist = 0;
}
write_sequnlock_irq(&bb->lock);
@@ -1564,10 +1481,10 @@ ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
return -EINVAL;
}
- if (badblocks_set(bb, sector, length, !unack))
+ if (!badblocks_set(bb, sector, length, !unack))
return -ENOSPC;
- else
- return len;
+
+ return len;
}
EXPORT_SYMBOL_GPL(badblocks_store);
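
The badblocks hunks above change badblocks_set() and badblocks_clear() from an int return (0 on success) to a bool return (true on success, with a partial insertion treated as failure), and widen the sector count to sector_t. A minimal caller-side sketch of the new convention; the example_record_badblocks() helper is hypothetical and only the badblocks_set() signature comes from this diff:

	#include <linux/badblocks.h>

	/* Hypothetical caller: map the new boolean return to an errno. */
	static int example_record_badblocks(struct badblocks *bb, sector_t s,
					    sector_t nr_sectors)
	{
		/*
		 * badblocks_set() now returns true on success and false when
		 * the table is full; a partial insertion counts as failure.
		 */
		if (!badblocks_set(bb, s, nr_sectors, true))
			return -ENOSPC;
		return 0;
	}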
diff --git a/block/bdev.c b/block/bdev.c
index da2a167a4d08..b77ddd12dc06 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -24,6 +24,7 @@
#include <linux/pseudo_fs.h>
#include <linux/uio.h>
#include <linux/namei.h>
+#include <linux/security.h>
#include <linux/part_stat.h>
#include <linux/uaccess.h>
#include <linux/stat.h>
@@ -43,6 +44,11 @@ static inline struct bdev_inode *BDEV_I(struct inode *inode)
return container_of(inode, struct bdev_inode, vfs_inode);
}
+static inline struct inode *BD_INODE(struct block_device *bdev)
+{
+ return &container_of(bdev, struct bdev_inode, bdev)->vfs_inode;
+}
+
struct block_device *I_BDEV(struct inode *inode)
{
return &BDEV_I(inode)->bdev;
@@ -57,7 +63,7 @@ EXPORT_SYMBOL(file_bdev);
static void bdev_write_inode(struct block_device *bdev)
{
- struct inode *inode = bdev->bd_inode;
+ struct inode *inode = BD_INODE(bdev);
int ret;
spin_lock(&inode->i_lock);
@@ -76,7 +82,7 @@ static void bdev_write_inode(struct block_device *bdev)
/* Kill _all_ buffers and pagecache , dirty or not.. */
static void kill_bdev(struct block_device *bdev)
{
- struct address_space *mapping = bdev->bd_inode->i_mapping;
+ struct address_space *mapping = bdev->bd_mapping;
if (mapping_empty(mapping))
return;
@@ -88,7 +94,7 @@ static void kill_bdev(struct block_device *bdev)
/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
{
- struct address_space *mapping = bdev->bd_inode->i_mapping;
+ struct address_space *mapping = bdev->bd_mapping;
if (mapping->nrpages) {
invalidate_bh_lrus();
@@ -116,7 +122,7 @@ int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
goto invalidate;
}
- truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
+ truncate_inode_pages_range(bdev->bd_mapping, lstart, lend);
if (!(mode & BLK_OPEN_EXCL))
bd_abort_claiming(bdev, truncate_bdev_range);
return 0;
@@ -126,7 +132,7 @@ invalidate:
* Someone else has handle exclusively open. Try invalidating instead.
* The 'end' argument is inclusive so the rounding is safe.
*/
- return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
+ return invalidate_inode_pages2_range(bdev->bd_mapping,
lstart >> PAGE_SHIFT,
lend >> PAGE_SHIFT);
}
@@ -134,31 +140,77 @@ invalidate:
static void set_init_blocksize(struct block_device *bdev)
{
unsigned int bsize = bdev_logical_block_size(bdev);
- loff_t size = i_size_read(bdev->bd_inode);
+ loff_t size = i_size_read(BD_INODE(bdev));
while (bsize < PAGE_SIZE) {
if (size & bsize)
break;
bsize <<= 1;
}
- bdev->bd_inode->i_blkbits = blksize_bits(bsize);
+ BD_INODE(bdev)->i_blkbits = blksize_bits(bsize);
+ mapping_set_folio_min_order(BD_INODE(bdev)->i_mapping,
+ get_order(bsize));
}
-int set_blocksize(struct block_device *bdev, int size)
+/**
+ * bdev_validate_blocksize - check that this block size is acceptable
+ * @bdev: blockdevice to check
+ * @block_size: block size to check
+ *
+ * For block device users that do not use buffer heads or the block device
+ * page cache, make sure that this block size can be used with the device.
+ *
+ * Return: On success zero is returned, negative error code on failure.
+ */
+int bdev_validate_blocksize(struct block_device *bdev, int block_size)
{
- /* Size must be a power of two, and between 512 and PAGE_SIZE */
- if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
+ if (blk_validate_block_size(block_size))
return -EINVAL;
/* Size cannot be smaller than the size supported by the device */
- if (size < bdev_logical_block_size(bdev))
+ if (block_size < bdev_logical_block_size(bdev))
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bdev_validate_blocksize);
+
+int set_blocksize(struct file *file, int size)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct block_device *bdev = I_BDEV(inode);
+ int ret;
+
+ ret = bdev_validate_blocksize(bdev, size);
+ if (ret)
+ return ret;
+
+ if (!file->private_data)
return -EINVAL;
/* Don't change the size if it is same as current */
- if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
+ if (inode->i_blkbits != blksize_bits(size)) {
+ /*
+ * Flush and truncate the pagecache before we reconfigure the
+ * mapping geometry because folio sizes are variable now. If a
+ * reader has already allocated a folio whose size is smaller
+ * than the new min_order but invokes readahead after the new
+ * min_order becomes visible, readahead will think there are
+ * "zero" blocks per folio and crash. Take the inode and
+ * invalidation locks to avoid racing with
+ * read/write/fallocate.
+ */
+ inode_lock(inode);
+ filemap_invalidate_lock(inode->i_mapping);
+
sync_blockdev(bdev);
- bdev->bd_inode->i_blkbits = blksize_bits(size);
kill_bdev(bdev);
+
+ inode->i_blkbits = blksize_bits(size);
+ mapping_set_folio_min_order(inode->i_mapping, get_order(size));
+ kill_bdev(bdev);
+ filemap_invalidate_unlock(inode->i_mapping);
+ inode_unlock(inode);
}
return 0;
}
@@ -167,10 +219,11 @@ EXPORT_SYMBOL(set_blocksize);
int sb_set_blocksize(struct super_block *sb, int size)
{
- if (set_blocksize(sb->s_bdev, size))
+ if (!(sb->s_type->fs_flags & FS_LBS) && size > PAGE_SIZE)
return 0;
- /* If we get here, we know size is power of two
- * and it's value is between 512 and PAGE_SIZE */
+ if (set_blocksize(sb->s_bdev_file, size))
+ return 0;
+ /* If we get here, we know size is validated */
sb->s_blocksize = size;
sb->s_blocksize_bits = blksize_bits(size);
return sb->s_blocksize;
@@ -192,7 +245,7 @@ int sync_blockdev_nowait(struct block_device *bdev)
{
if (!bdev)
return 0;
- return filemap_flush(bdev->bd_inode->i_mapping);
+ return filemap_flush(bdev->bd_mapping);
}
EXPORT_SYMBOL_GPL(sync_blockdev_nowait);
@@ -204,13 +257,13 @@ int sync_blockdev(struct block_device *bdev)
{
if (!bdev)
return 0;
- return filemap_write_and_wait(bdev->bd_inode->i_mapping);
+ return filemap_write_and_wait(bdev->bd_mapping);
}
EXPORT_SYMBOL(sync_blockdev);
int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
{
- return filemap_write_and_wait_range(bdev->bd_inode->i_mapping,
+ return filemap_write_and_wait_range(bdev->bd_mapping,
lstart, lend);
}
EXPORT_SYMBOL(sync_blockdev_range);
@@ -313,6 +366,11 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
if (!ei)
return NULL;
memset(&ei->bdev, 0, sizeof(ei->bdev));
+
+ if (security_bdev_alloc(&ei->bdev)) {
+ kmem_cache_free(bdev_cachep, ei);
+ return NULL;
+ }
return &ei->vfs_inode;
}
@@ -322,6 +380,7 @@ static void bdev_free_inode(struct inode *inode)
free_percpu(bdev->bd_stats);
kfree(bdev->bd_meta_info);
+ security_bdev_free(bdev);
if (!bdev_is_partition(bdev)) {
if (bdev->bd_disk && bdev->bd_disk->bdi)
@@ -374,7 +433,7 @@ static struct file_system_type bd_type = {
};
struct super_block *blockdev_superblock __ro_after_init;
-struct vfsmount *blockdev_mnt __ro_after_init;
+static struct vfsmount *blockdev_mnt __ro_after_init;
EXPORT_SYMBOL_GPL(blockdev_superblock);
void __init bdev_cache_init(void)
@@ -411,13 +470,11 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
mutex_init(&bdev->bd_fsfreeze_mutex);
spin_lock_init(&bdev->bd_size_lock);
mutex_init(&bdev->bd_holder_lock);
- bdev->bd_partno = partno;
- bdev->bd_inode = inode;
+ atomic_set(&bdev->__bd_flags, partno);
+ bdev->bd_mapping = &inode->i_data;
bdev->bd_queue = disk->queue;
- if (partno)
- bdev->bd_has_submit_bio = disk->part0->bd_has_submit_bio;
- else
- bdev->bd_has_submit_bio = false;
+ if (partno && bdev_test_flag(disk->part0, BD_HAS_SUBMIT_BIO))
+ bdev_set_flag(bdev, BD_HAS_SUBMIT_BIO);
bdev->bd_stats = alloc_percpu(struct disk_stats);
if (!bdev->bd_stats) {
iput(inode);
@@ -430,19 +487,30 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
spin_lock(&bdev->bd_size_lock);
- i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
+ i_size_write(BD_INODE(bdev), (loff_t)sectors << SECTOR_SHIFT);
bdev->bd_nr_sectors = sectors;
spin_unlock(&bdev->bd_size_lock);
}
void bdev_add(struct block_device *bdev, dev_t dev)
{
+ struct inode *inode = BD_INODE(bdev);
if (bdev_stable_writes(bdev))
- mapping_set_stable_writes(bdev->bd_inode->i_mapping);
+ mapping_set_stable_writes(bdev->bd_mapping);
bdev->bd_dev = dev;
- bdev->bd_inode->i_rdev = dev;
- bdev->bd_inode->i_ino = dev;
- insert_inode_hash(bdev->bd_inode);
+ inode->i_rdev = dev;
+ inode->i_ino = dev;
+ insert_inode_hash(inode);
+}
+
+void bdev_unhash(struct block_device *bdev)
+{
+ remove_inode_hash(BD_INODE(bdev));
+}
+
+void bdev_drop(struct block_device *bdev)
+{
+ iput(BD_INODE(bdev));
}
long nr_blockdev_pages(void)
@@ -528,7 +596,7 @@ retry:
/* if claiming is already in progress, wait for it to finish */
if (whole->bd_claiming) {
- wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
+ wait_queue_head_t *wq = __var_waitqueue(&whole->bd_claiming);
DEFINE_WAIT(wait);
prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
@@ -551,7 +619,7 @@ static void bd_clear_claiming(struct block_device *whole, void *holder)
/* tell others that we're done */
BUG_ON(whole->bd_claiming != holder);
whole->bd_claiming = NULL;
- wake_up_bit(&whole->bd_claiming, 0);
+ wake_up_var(&whole->bd_claiming);
}
/**
@@ -620,7 +688,7 @@ static void bd_end_claim(struct block_device *bdev, void *holder)
bdev->bd_holder = NULL;
bdev->bd_holder_ops = NULL;
mutex_unlock(&bdev->bd_holder_lock);
- if (bdev->bd_write_holder)
+ if (bdev_test_flag(bdev, BD_WRITE_HOLDER))
unblock = true;
}
if (!whole->bd_holders)
@@ -633,7 +701,7 @@ static void bd_end_claim(struct block_device *bdev, void *holder)
*/
if (unblock) {
disk_unblock_events(bdev->bd_disk);
- bdev->bd_write_holder = false;
+ bdev_clear_flag(bdev, BD_WRITE_HOLDER);
}
}
@@ -747,13 +815,13 @@ static void blkdev_put_part(struct block_device *part)
blkdev_put_whole(whole);
}
-struct block_device *blkdev_get_no_open(dev_t dev)
+struct block_device *blkdev_get_no_open(dev_t dev, bool autoload)
{
struct block_device *bdev;
struct inode *inode;
inode = ilookup(blockdev_superblock, dev);
- if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
+ if (!inode && autoload && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
blk_request_module(dev);
inode = ilookup(blockdev_superblock, dev);
if (inode)
@@ -900,9 +968,10 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
* writeable reference is too fragile given the way @mode is
* used in blkdev_get/put().
*/
- if ((mode & BLK_OPEN_WRITE) && !bdev->bd_write_holder &&
+ if ((mode & BLK_OPEN_WRITE) &&
+ !bdev_test_flag(bdev, BD_WRITE_HOLDER) &&
(disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
- bdev->bd_write_holder = true;
+ bdev_set_flag(bdev, BD_WRITE_HOLDER);
unblock_events = false;
}
}
@@ -912,12 +981,12 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
disk_unblock_events(disk);
bdev_file->f_flags |= O_LARGEFILE;
- bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT;
+ bdev_file->f_mode |= FMODE_CAN_ODIRECT;
if (bdev_nowait(bdev))
bdev_file->f_mode |= FMODE_NOWAIT;
if (mode & BLK_OPEN_RESTRICT_WRITES)
bdev_file->f_mode |= FMODE_WRITE_RESTRICTED;
- bdev_file->f_mapping = bdev->bd_inode->i_mapping;
+ bdev_file->f_mapping = bdev->bd_mapping;
bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping);
bdev_file->private_data = holder;
@@ -974,18 +1043,18 @@ struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
if (ret)
return ERR_PTR(ret);
- bdev = blkdev_get_no_open(dev);
+ bdev = blkdev_get_no_open(dev, true);
if (!bdev)
return ERR_PTR(-ENXIO);
flags = blk_to_file_flags(mode);
- bdev_file = alloc_file_pseudo_noaccount(bdev->bd_inode,
+ bdev_file = alloc_file_pseudo_noaccount(BD_INODE(bdev),
blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops);
if (IS_ERR(bdev_file)) {
blkdev_put_no_open(bdev);
return bdev_file;
}
- ihold(bdev->bd_inode);
+ ihold(BD_INODE(bdev));
ret = bdev_open(bdev, mode, holder, hops, bdev_file);
if (ret) {
@@ -1239,27 +1308,54 @@ void sync_bdevs(bool wait)
}
/*
- * Handle STATX_DIOALIGN for block devices.
- *
- * Note that the inode passed to this is the inode of a block device node file,
- * not the block device's internal inode. Therefore it is *not* valid to use
- * I_BDEV() here; the block device has to be looked up by i_rdev instead.
+ * Handle STATX_{DIOALIGN, WRITE_ATOMIC} for block devices.
*/
-void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
+void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask)
{
struct block_device *bdev;
- bdev = blkdev_get_no_open(inode->i_rdev);
+ /*
+ * Note that d_backing_inode() returns the block device node inode, not
+ * the block device's internal inode. Therefore it is *not* valid to
+ * use I_BDEV() here; the block device has to be looked up by i_rdev
+ * instead.
+ */
+ bdev = blkdev_get_no_open(d_backing_inode(path->dentry)->i_rdev, false);
if (!bdev)
return;
- stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
- stat->dio_offset_align = bdev_logical_block_size(bdev);
- stat->result_mask |= STATX_DIOALIGN;
+ if (request_mask & STATX_DIOALIGN) {
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+ stat->result_mask |= STATX_DIOALIGN;
+ }
+
+ if (request_mask & STATX_WRITE_ATOMIC && bdev_can_atomic_write(bdev)) {
+ struct request_queue *bd_queue = bdev->bd_queue;
+
+ generic_fill_statx_atomic_writes(stat,
+ queue_atomic_write_unit_min_bytes(bd_queue),
+ queue_atomic_write_unit_max_bytes(bd_queue),
+ 0);
+ }
+
+ stat->blksize = bdev_io_min(bdev);
blkdev_put_no_open(bdev);
}
+bool disk_live(struct gendisk *disk)
+{
+ return !inode_unhashed(BD_INODE(disk->part0));
+}
+EXPORT_SYMBOL_GPL(disk_live);
+
+unsigned int block_size(struct block_device *bdev)
+{
+ return 1 << BD_INODE(bdev)->i_blkbits;
+}
+EXPORT_SYMBOL_GPL(block_size);
+
static int __init setup_bdev_allow_write_mounted(char *str)
{
if (kstrtobool(str, &bdev_allow_write_mounted))
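
The bdev.c hunks above move set_blocksize() from taking a struct block_device * to taking the opened struct file *, and export bdev_validate_blocksize() for users that only need the size check. A minimal sketch of an updated filesystem-side caller; example_set_fs_blocksize() is hypothetical and only the two exported helpers come from this diff:

	#include <linux/blkdev.h>
	#include <linux/fs.h>

	/*
	 * Hypothetical caller: validate the size against the device first, then
	 * switch the block size through the superblock's bdev file, mirroring
	 * what sb_set_blocksize() does after this change.
	 */
	static int example_set_fs_blocksize(struct super_block *sb, int size)
	{
		int ret = bdev_validate_blocksize(sb->s_bdev, size);

		if (ret)
			return ret;
		return set_blocksize(sb->s_bdev_file, size);
	}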
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index d442ee358fc2..9fb9f3533150 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -679,12 +679,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
bfqg_and_blkg_put(old_parent);
- if (entity->parent &&
- entity->parent->last_bfqq_created == bfqq)
- entity->parent->last_bfqq_created = NULL;
- else if (bfqd->last_bfqq_created == bfqq)
- bfqd->last_bfqq_created = NULL;
-
+ bfq_reassign_last_bfqq(bfqq, NULL);
entity->parent = bfqg->my_entity;
entity->sched_data = &bfqg->sched_data;
/* pin down bfqg and its associated blkg */
@@ -797,57 +792,6 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
*/
bfq_link_bfqg(bfqd, bfqg);
__bfq_bic_change_cgroup(bfqd, bic, bfqg);
- /*
- * Update blkg_path for bfq_log_* functions. We cache this
- * path, and update it here, for the following
- * reasons. Operations on blkg objects in blk-cgroup are
- * protected with the request_queue lock, and not with the
- * lock that protects the instances of this scheduler
- * (bfqd->lock). This exposes BFQ to the following sort of
- * race.
- *
- * The blkg_lookup performed in bfq_get_queue, protected
- * through rcu, may happen to return the address of a copy of
- * the original blkg. If this is the case, then the
- * bfqg_and_blkg_get performed in bfq_get_queue, to pin down
- * the blkg, is useless: it does not prevent blk-cgroup code
- * from destroying both the original blkg and all objects
- * directly or indirectly referred by the copy of the
- * blkg.
- *
- * On the bright side, destroy operations on a blkg invoke, as
- * a first step, hooks of the scheduler associated with the
- * blkg. And these hooks are executed with bfqd->lock held for
- * BFQ. As a consequence, for any blkg associated with the
- * request queue this instance of the scheduler is attached
- * to, we are guaranteed that such a blkg is not destroyed, and
- * that all the pointers it contains are consistent, while we
- * are holding bfqd->lock. A blkg_lookup performed with
- * bfqd->lock held then returns a fully consistent blkg, which
- * remains consistent until this lock is held.
- *
- * Thanks to the last fact, and to the fact that: (1) bfqg has
- * been obtained through a blkg_lookup in the above
- * assignment, and (2) bfqd->lock is being held, here we can
- * safely use the policy data for the involved blkg (i.e., the
- * field bfqg->pd) to get to the blkg associated with bfqg,
- * and then we can safely use any field of blkg. After we
- * release bfqd->lock, even just getting blkg through this
- * bfqg may cause dangling references to be traversed, as
- * bfqg->pd may not exist any more.
- *
- * In view of the above facts, here we cache, in the bfqg, any
- * blkg data we may need for this bic, and for its associated
- * bfq_queue. As of now, we need to cache only the path of the
- * blkg, which is used in the bfq_log_* functions.
- *
- * Finally, note that bfqg itself needs to be protected from
- * destruction on the blkg_free of the original blkg (which
- * invokes bfq_pd_free). We use an additional private
- * refcounter for bfqg, to let it disappear only after no
- * bfq_queue refers to it any longer.
- */
- blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path));
bic->blkcg_serial_nr = serial_nr;
}
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 4b88a54a9b76..0cb1e9873aab 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -582,23 +582,31 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
#define BFQ_LIMIT_INLINE_DEPTH 16
#ifdef CONFIG_BFQ_GROUP_IOSCHED
-static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
+static bool bfqq_request_over_limit(struct bfq_data *bfqd,
+ struct bfq_io_cq *bic, blk_opf_t opf,
+ unsigned int act_idx, int limit)
{
- struct bfq_data *bfqd = bfqq->bfqd;
- struct bfq_entity *entity = &bfqq->entity;
struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH];
struct bfq_entity **entities = inline_entities;
- int depth, level, alloc_depth = BFQ_LIMIT_INLINE_DEPTH;
- int class_idx = bfqq->ioprio_class - 1;
+ int alloc_depth = BFQ_LIMIT_INLINE_DEPTH;
struct bfq_sched_data *sched_data;
+ struct bfq_entity *entity;
+ struct bfq_queue *bfqq;
unsigned long wsum;
bool ret = false;
-
- if (!entity->on_st_or_in_serv)
- return false;
+ int depth;
+ int level;
retry:
spin_lock_irq(&bfqd->lock);
+ bfqq = bic_to_bfqq(bic, op_is_sync(opf), act_idx);
+ if (!bfqq)
+ goto out;
+
+ entity = &bfqq->entity;
+ if (!entity->on_st_or_in_serv)
+ goto out;
+
/* +1 for bfqq entity, root cgroup not included */
depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1;
if (depth > alloc_depth) {
@@ -643,7 +651,7 @@ retry:
* class.
*/
wsum = 0;
- for (i = 0; i <= class_idx; i++) {
+ for (i = 0; i <= bfqq->ioprio_class - 1; i++) {
wsum = wsum * IOPRIO_BE_NR +
sched_data->service_tree[i].wsum;
}
@@ -666,7 +674,9 @@ out:
return ret;
}
#else
-static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
+static bool bfqq_request_over_limit(struct bfq_data *bfqd,
+ struct bfq_io_cq *bic, blk_opf_t opf,
+ unsigned int act_idx, int limit)
{
return false;
}
@@ -704,8 +714,9 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
}
for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) {
- struct bfq_queue *bfqq =
- bic_to_bfqq(bic, op_is_sync(opf), act_idx);
+ /* Fast path to check if bfqq is already allocated. */
+ if (!bic_to_bfqq(bic, op_is_sync(opf), act_idx))
+ continue;
/*
* Does queue (or any parent entity) exceed number of
@@ -713,7 +724,7 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
* limit depth so that it cannot consume more
* available requests and thus starve other entities.
*/
- if (bfqq && bfqq_request_over_limit(bfqq, limit)) {
+ if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) {
depth = 1;
break;
}
@@ -2911,8 +2922,12 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx];
/* if a merge has already been setup, then proceed with that first */
- if (bfqq->new_bfqq)
- return bfqq->new_bfqq;
+ new_bfqq = bfqq->new_bfqq;
+ if (new_bfqq) {
+ while (new_bfqq->new_bfqq)
+ new_bfqq = new_bfqq->new_bfqq;
+ return new_bfqq;
+ }
/*
* Check delayed stable merge for rotational or non-queueing
@@ -3093,8 +3108,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
}
-static void
-bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq)
+void bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq,
+ struct bfq_queue *new_bfqq)
{
if (cur_bfqq->entity.parent &&
cur_bfqq->entity.parent->last_bfqq_created == cur_bfqq)
@@ -3125,10 +3140,12 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_put_queue(bfqq);
}
-static void
-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
- struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
+static struct bfq_queue *bfq_merge_bfqqs(struct bfq_data *bfqd,
+ struct bfq_io_cq *bic,
+ struct bfq_queue *bfqq)
{
+ struct bfq_queue *new_bfqq = bfqq->new_bfqq;
+
bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
(unsigned long)new_bfqq->pid);
/* Save weight raising and idle window of the merged queues */
@@ -3222,6 +3239,8 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
bfq_reassign_last_bfqq(bfqq, new_bfqq);
bfq_release_process_ref(bfqd, bfqq);
+
+ return new_bfqq;
}
static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
@@ -3257,14 +3276,8 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
* fulfilled, i.e., bic can be redirected to new_bfqq
* and bfqq can be put.
*/
- bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq,
- new_bfqq);
- /*
- * If we get here, bio will be queued into new_queue,
- * so use new_bfqq to decide whether bio and rq can be
- * merged.
- */
- bfqq = new_bfqq;
+ while (bfqq != new_bfqq)
+ bfqq = bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq);
/*
* Change also bqfd->bio_bfqq, as
@@ -5463,40 +5476,42 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync,
}
}
+static void _bfq_exit_icq(struct bfq_io_cq *bic, unsigned int num_actuators)
+{
+ struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data;
+ unsigned int act_idx;
+
+ for (act_idx = 0; act_idx < num_actuators; act_idx++) {
+ if (bfqq_data[act_idx].stable_merge_bfqq)
+ bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq);
+
+ bfq_exit_icq_bfqq(bic, true, act_idx);
+ bfq_exit_icq_bfqq(bic, false, act_idx);
+ }
+}
+
static void bfq_exit_icq(struct io_cq *icq)
{
struct bfq_io_cq *bic = icq_to_bic(icq);
struct bfq_data *bfqd = bic_to_bfqd(bic);
unsigned long flags;
- unsigned int act_idx;
+
/*
* If bfqd and thus bfqd->num_actuators is not available any
* longer, then cycle over all possible per-actuator bfqqs in
* next loop. We rely on bic being zeroed on creation, and
* therefore on its unused per-actuator fields being NULL.
- */
- unsigned int num_actuators = BFQ_MAX_ACTUATORS;
- struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data;
-
- /*
+ *
* bfqd is NULL if scheduler already exited, and in that case
* this is the last time these queues are accessed.
*/
if (bfqd) {
spin_lock_irqsave(&bfqd->lock, flags);
- num_actuators = bfqd->num_actuators;
- }
-
- for (act_idx = 0; act_idx < num_actuators; act_idx++) {
- if (bfqq_data[act_idx].stable_merge_bfqq)
- bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq);
-
- bfq_exit_icq_bfqq(bic, true, act_idx);
- bfq_exit_icq_bfqq(bic, false, act_idx);
- }
-
- if (bfqd)
+ _bfq_exit_icq(bic, bfqd->num_actuators);
spin_unlock_irqrestore(&bfqd->lock, flags);
+ } else {
+ _bfq_exit_icq(bic, BFQ_MAX_ACTUATORS);
+ }
}
/*
@@ -5699,9 +5714,7 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq,
* state before killing it.
*/
bfqq->bic = bic;
- bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
-
- return new_bfqq;
+ return bfq_merge_bfqqs(bfqd, bic, bfqq);
}
/*
@@ -6156,6 +6169,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
bool waiting, idle_timer_disabled = false;
if (new_bfqq) {
+ struct bfq_queue *old_bfqq = bfqq;
/*
* Release the request's reference to the old bfqq
* and make sure one is taken to the shared queue.
@@ -6172,18 +6186,18 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
* new_bfqq.
*/
if (bic_to_bfqq(RQ_BIC(rq), true,
- bfq_actuator_index(bfqd, rq->bio)) == bfqq)
- bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
- bfqq, new_bfqq);
+ bfq_actuator_index(bfqd, rq->bio)) == bfqq) {
+ while (bfqq != new_bfqq)
+ bfqq = bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq);
+ }
- bfq_clear_bfqq_just_created(bfqq);
+ bfq_clear_bfqq_just_created(old_bfqq);
/*
* rq is about to be enqueued into new_bfqq,
* release rq reference on bfqq
*/
- bfq_put_queue(bfqq);
+ bfq_put_queue(old_bfqq);
rq->elv.priv[1] = new_bfqq;
- bfqq = new_bfqq;
}
bfq_update_io_thinktime(bfqd, bfqq);
@@ -6721,7 +6735,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
{
bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
- if (bfqq_process_refs(bfqq) == 1) {
+ if (bfqq_process_refs(bfqq) == 1 && !bfqq->new_bfqq) {
bfqq->pid = current->pid;
bfq_clear_bfqq_coop(bfqq);
bfq_clear_bfqq_split_coop(bfqq);
@@ -6736,11 +6750,10 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
return NULL;
}
-static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
- struct bfq_io_cq *bic,
- struct bio *bio,
- bool split, bool is_sync,
- bool *new_queue)
+static struct bfq_queue *
+__bfq_get_bfqq_handle_split(struct bfq_data *bfqd, struct bfq_io_cq *bic,
+ struct bio *bio, bool split, bool is_sync,
+ bool *new_queue)
{
unsigned int act_idx = bfq_actuator_index(bfqd, bio);
struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, act_idx);
@@ -6819,6 +6832,92 @@ static void bfq_prepare_request(struct request *rq)
rq->elv.priv[0] = rq->elv.priv[1] = NULL;
}
+static struct bfq_queue *bfq_waker_bfqq(struct bfq_queue *bfqq)
+{
+ struct bfq_queue *new_bfqq = bfqq->new_bfqq;
+ struct bfq_queue *waker_bfqq = bfqq->waker_bfqq;
+
+ if (!waker_bfqq)
+ return NULL;
+
+ while (new_bfqq) {
+ if (new_bfqq == waker_bfqq) {
+ /*
+ * If waker_bfqq is in the merge chain, and current
+ * is the only process, waker_bfqq can be freed.
+ */
+ if (bfqq_process_refs(waker_bfqq) == 1)
+ return NULL;
+
+ return waker_bfqq;
+ }
+
+ new_bfqq = new_bfqq->new_bfqq;
+ }
+
+ /*
+ * If waker_bfqq is not in the merge chain, and its process reference
+ * is 0, waker_bfqq can be freed.
+ */
+ if (bfqq_process_refs(waker_bfqq) == 0)
+ return NULL;
+
+ return waker_bfqq;
+}
+
+static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
+ struct bfq_io_cq *bic,
+ struct bio *bio,
+ unsigned int idx,
+ bool is_sync)
+{
+ struct bfq_queue *waker_bfqq;
+ struct bfq_queue *bfqq;
+ bool new_queue = false;
+
+ bfqq = __bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync,
+ &new_queue);
+ if (unlikely(new_queue))
+ return bfqq;
+
+ /* If the queue was seeky for too long, break it apart. */
+ if (!bfq_bfqq_coop(bfqq) || !bfq_bfqq_split_coop(bfqq) ||
+ bic->bfqq_data[idx].stably_merged)
+ return bfqq;
+
+ waker_bfqq = bfq_waker_bfqq(bfqq);
+
+ /* Update bic before losing reference to bfqq */
+ if (bfq_bfqq_in_large_burst(bfqq))
+ bic->bfqq_data[idx].saved_in_large_burst = true;
+
+ bfqq = bfq_split_bfqq(bic, bfqq);
+ if (bfqq) {
+ bfq_bfqq_resume_state(bfqq, bfqd, bic, true);
+ return bfqq;
+ }
+
+ bfqq = __bfq_get_bfqq_handle_split(bfqd, bic, bio, true, is_sync, NULL);
+ if (unlikely(bfqq == &bfqd->oom_bfqq))
+ return bfqq;
+
+ bfq_bfqq_resume_state(bfqq, bfqd, bic, false);
+ bfqq->waker_bfqq = waker_bfqq;
+ bfqq->tentative_waker_bfqq = NULL;
+
+ /*
+ * If the waker queue disappears, then new_bfqq->waker_bfqq must be
+ * reset. So insert new_bfqq into the woken_list of the waker. See
+ * bfq_check_waker for details.
+ */
+ if (waker_bfqq)
+ hlist_add_head(&bfqq->woken_list_node,
+ &bfqq->waker_bfqq->woken_list);
+
+ return bfqq;
+}
+
/*
* If needed, init rq, allocate bfq data structures associated with
* rq, and increment reference counters in the destination bfq_queue
@@ -6850,8 +6949,6 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
struct bfq_io_cq *bic;
const int is_sync = rq_is_sync(rq);
struct bfq_queue *bfqq;
- bool new_queue = false;
- bool bfqq_already_existing = false, split = false;
unsigned int a_idx = bfq_actuator_index(bfqd, bio);
if (unlikely(!rq->elv.icq))
@@ -6868,54 +6965,9 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
return RQ_BFQQ(rq);
bic = icq_to_bic(rq->elv.icq);
-
bfq_check_ioprio_change(bic, bio);
-
bfq_bic_update_cgroup(bic, bio);
-
- bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync,
- &new_queue);
-
- if (likely(!new_queue)) {
- /* If the queue was seeky for too long, break it apart. */
- if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) &&
- !bic->bfqq_data[a_idx].stably_merged) {
- struct bfq_queue *old_bfqq = bfqq;
-
- /* Update bic before losing reference to bfqq */
- if (bfq_bfqq_in_large_burst(bfqq))
- bic->bfqq_data[a_idx].saved_in_large_burst =
- true;
-
- bfqq = bfq_split_bfqq(bic, bfqq);
- split = true;
-
- if (!bfqq) {
- bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
- true, is_sync,
- NULL);
- if (unlikely(bfqq == &bfqd->oom_bfqq))
- bfqq_already_existing = true;
- } else
- bfqq_already_existing = true;
-
- if (!bfqq_already_existing) {
- bfqq->waker_bfqq = old_bfqq->waker_bfqq;
- bfqq->tentative_waker_bfqq = NULL;
-
- /*
- * If the waker queue disappears, then
- * new_bfqq->waker_bfqq must be
- * reset. So insert new_bfqq into the
- * woken_list of the waker. See
- * bfq_check_waker for details.
- */
- if (bfqq->waker_bfqq)
- hlist_add_head(&bfqq->woken_list_node,
- &bfqq->waker_bfqq->woken_list);
- }
- }
- }
+ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, a_idx, is_sync);
bfqq_request_allocated(bfqq);
bfqq->ref++;
@@ -6932,18 +6984,9 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
* addition, if the queue has also just been split, we have to
* resume its state.
*/
- if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
+ if (likely(bfqq != &bfqd->oom_bfqq) && !bfqq->new_bfqq &&
+ bfqq_process_refs(bfqq) == 1)
bfqq->bic = bic;
- if (split) {
- /*
- * The queue has just been split from a shared
- * queue: restore the idle window and the
- * possible weight raising period.
- */
- bfq_bfqq_resume_state(bfqq, bfqd, bic,
- bfqq_already_existing);
- }
- }
/*
* Consider bfqq as possibly belonging to a burst of newly
@@ -7167,8 +7210,8 @@ static void bfq_exit_queue(struct elevator_queue *e)
#endif
blk_stat_disable_accounting(bfqd->queue);
- clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags);
- wbt_enable_default(bfqd->queue->disk);
+ blk_queue_flag_clear(QUEUE_FLAG_DISABLE_WBT_DEF, bfqd->queue);
+ set_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT, &e->flags);
kfree(bfqd);
}
@@ -7272,9 +7315,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
INIT_LIST_HEAD(&bfqd->dispatch);
- hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL);
- bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
+ hrtimer_setup(&bfqd->idle_slice_timer, bfq_idle_slice_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
bfqd->queue_weights_tree = RB_ROOT_CACHED;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
@@ -7355,7 +7397,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
/* We dispatch from request queue wide instead of hw queue */
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
- set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags);
+ blk_queue_flag_set(QUEUE_FLAG_DISABLE_WBT_DEF, q);
wbt_disable_default(q->disk);
blk_stat_enable_accounting(q);
@@ -7579,7 +7621,7 @@ static ssize_t bfq_low_latency_store(struct elevator_queue *e,
#define BFQ_ATTR(name) \
__ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)
-static struct elv_fs_entry bfq_attrs[] = {
+static const struct elv_fs_entry bfq_attrs[] = {
BFQ_ATTR(fifo_expire_sync),
BFQ_ATTR(fifo_expire_async),
BFQ_ATTR(back_seek_max),
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 467e8cfc41a2..687a3a7ba784 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -1003,9 +1003,6 @@ struct bfq_group {
/* must be the first member */
struct blkg_policy_data pd;
- /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */
- char blkg_path[128];
-
/* reference counter (see comments in bfq_bic_update_cgroup) */
refcount_t ref;
@@ -1159,6 +1156,8 @@ void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration);
void bfq_add_bfqq_busy(struct bfq_queue *bfqq);
void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
+void bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq,
+ struct bfq_queue *new_bfqq);
/* --------------- end of interface of B-WF2Q+ ---------------- */
@@ -1186,11 +1185,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
"%s " fmt, pid_str, ##args); \
} while (0)
-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
- blk_add_cgroup_trace_msg((bfqd)->queue, \
- &bfqg_to_blkg(bfqg)->blkcg->css, fmt, ##args); \
-} while (0)
-
#else /* CONFIG_BFQ_GROUP_IOSCHED */
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
@@ -1200,7 +1194,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \
blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args); \
} while (0)
-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c
new file mode 100644
index 000000000000..9c6657664792
--- /dev/null
+++ b/block/bio-integrity-auto.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2007, 2008, 2009 Oracle Corporation
+ * Written by: Martin K. Petersen <martin.petersen@oracle.com>
+ *
+ * Automatically generate and verify integrity data on PI capable devices if the
+ * bio submitter didn't provide PI itself. This ensures that the kernel verifies
+ * data integrity even if the file system (or other user of the block device) is
+ * not aware of PI.
+ */
+#include <linux/blk-integrity.h>
+#include <linux/t10-pi.h>
+#include <linux/workqueue.h>
+#include "blk.h"
+
+struct bio_integrity_data {
+ struct bio *bio;
+ struct bvec_iter saved_bio_iter;
+ struct work_struct work;
+ struct bio_integrity_payload bip;
+ struct bio_vec bvec;
+};
+
+static struct kmem_cache *bid_slab;
+static mempool_t bid_pool;
+static struct workqueue_struct *kintegrityd_wq;
+
+static void bio_integrity_finish(struct bio_integrity_data *bid)
+{
+ bid->bio->bi_integrity = NULL;
+ bid->bio->bi_opf &= ~REQ_INTEGRITY;
+ kfree(bvec_virt(bid->bip.bip_vec));
+ mempool_free(bid, &bid_pool);
+}
+
+static void bio_integrity_verify_fn(struct work_struct *work)
+{
+ struct bio_integrity_data *bid =
+ container_of(work, struct bio_integrity_data, work);
+ struct bio *bio = bid->bio;
+
+ blk_integrity_verify_iter(bio, &bid->saved_bio_iter);
+ bio_integrity_finish(bid);
+ bio_endio(bio);
+}
+
+#define BIP_CHECK_FLAGS (BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG)
+static bool bip_should_check(struct bio_integrity_payload *bip)
+{
+ return bip->bip_flags & BIP_CHECK_FLAGS;
+}
+
+static bool bi_offload_capable(struct blk_integrity *bi)
+{
+ switch (bi->csum_type) {
+ case BLK_INTEGRITY_CSUM_CRC64:
+ return bi->tuple_size == sizeof(struct crc64_pi_tuple);
+ case BLK_INTEGRITY_CSUM_CRC:
+ case BLK_INTEGRITY_CSUM_IP:
+ return bi->tuple_size == sizeof(struct t10_pi_tuple);
+ default:
+ pr_warn_once("%s: unknown integrity checksum type:%d\n",
+ __func__, bi->csum_type);
+ fallthrough;
+ case BLK_INTEGRITY_CSUM_NONE:
+ return false;
+ }
+}
+
+/**
+ * __bio_integrity_endio - Integrity I/O completion function
+ * @bio: Protected bio
+ *
+ * Normally I/O completion is done in interrupt context. However, verifying I/O
+ * integrity is a time-consuming task which must be run in process context.
+ *
+ * This function postpones completion accordingly.
+ */
+bool __bio_integrity_endio(struct bio *bio)
+{
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+ struct bio_integrity_data *bid =
+ container_of(bip, struct bio_integrity_data, bip);
+
+ if (bio_op(bio) == REQ_OP_READ && !bio->bi_status &&
+ bip_should_check(bip)) {
+ INIT_WORK(&bid->work, bio_integrity_verify_fn);
+ queue_work(kintegrityd_wq, &bid->work);
+ return false;
+ }
+
+ bio_integrity_finish(bid);
+ return true;
+}
+
+/**
+ * bio_integrity_prep - Prepare bio for integrity I/O
+ * @bio: bio to prepare
+ *
+ * Checks if the bio already has an integrity payload attached. If it does, the
+ * payload has been generated by another kernel subsystem, and we just pass it
+ * through.
+ * Otherwise allocates an integrity payload; for writes, the integrity metadata
+ * will be generated. For reads, the completion handler will verify the
+ * metadata.
+ */
+bool bio_integrity_prep(struct bio *bio)
+{
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
+ struct bio_integrity_data *bid;
+ bool set_flags = true;
+ gfp_t gfp = GFP_NOIO;
+ unsigned int len;
+ void *buf;
+
+ if (!bi)
+ return true;
+
+ if (!bio_sectors(bio))
+ return true;
+
+ /* Already protected? */
+ if (bio_integrity(bio))
+ return true;
+
+ switch (bio_op(bio)) {
+ case REQ_OP_READ:
+ if (bi->flags & BLK_INTEGRITY_NOVERIFY) {
+ if (bi_offload_capable(bi))
+ return true;
+ set_flags = false;
+ }
+ break;
+ case REQ_OP_WRITE:
+ /*
+ * Zero the memory allocated to not leak uninitialized kernel
+ * memory to disk for non-integrity metadata where nothing else
+ * initializes the memory.
+ */
+ if (bi->flags & BLK_INTEGRITY_NOGENERATE) {
+ if (bi_offload_capable(bi))
+ return true;
+ set_flags = false;
+ gfp |= __GFP_ZERO;
+ } else if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE)
+ gfp |= __GFP_ZERO;
+ break;
+ default:
+ return true;
+ }
+
+ if (WARN_ON_ONCE(bio_has_crypt_ctx(bio)))
+ return true;
+
+ /* Allocate kernel buffer for protection data */
+ len = bio_integrity_bytes(bi, bio_sectors(bio));
+ buf = kmalloc(len, gfp);
+ if (!buf)
+ goto err_end_io;
+ bid = mempool_alloc(&bid_pool, GFP_NOIO);
+ if (!bid)
+ goto err_free_buf;
+ bio_integrity_init(bio, &bid->bip, &bid->bvec, 1);
+
+ bid->bio = bio;
+
+ bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY;
+ bip_set_seed(&bid->bip, bio->bi_iter.bi_sector);
+
+ if (set_flags) {
+ if (bi->csum_type == BLK_INTEGRITY_CSUM_IP)
+ bid->bip.bip_flags |= BIP_IP_CHECKSUM;
+ if (bi->csum_type)
+ bid->bip.bip_flags |= BIP_CHECK_GUARD;
+ if (bi->flags & BLK_INTEGRITY_REF_TAG)
+ bid->bip.bip_flags |= BIP_CHECK_REFTAG;
+ }
+
+ if (bio_integrity_add_page(bio, virt_to_page(buf), len,
+ offset_in_page(buf)) < len)
+ goto err_end_io;
+
+ /* Auto-generate integrity metadata if this is a write */
+ if (bio_data_dir(bio) == WRITE && bip_should_check(&bid->bip))
+ blk_integrity_generate(bio);
+ else
+ bid->saved_bio_iter = bio->bi_iter;
+ return true;
+
+err_free_buf:
+ kfree(buf);
+err_end_io:
+ bio->bi_status = BLK_STS_RESOURCE;
+ bio_endio(bio);
+ return false;
+}
+EXPORT_SYMBOL(bio_integrity_prep);
+
+void blk_flush_integrity(void)
+{
+ flush_workqueue(kintegrityd_wq);
+}
+
+static int __init blk_integrity_auto_init(void)
+{
+ bid_slab = kmem_cache_create("bio_integrity_data",
+ sizeof(struct bio_integrity_data), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+
+ if (mempool_init_slab_pool(&bid_pool, BIO_POOL_SIZE, bid_slab))
+ panic("bio: can't create integrity pool\n");
+
+ /*
+ * kintegrityd won't block much but may burn a lot of CPU cycles.
+ * Make it highpri CPU intensive wq with max concurrency of 1.
+ */
+ kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
+ WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
+ if (!kintegrityd_wq)
+ panic("Failed to create kintegrityd\n");
+ return 0;
+}
+subsys_initcall(blk_integrity_auto_init);
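
For reference, a minimal sketch of how a submitter is expected to drive the new auto-integrity path; only bio_integrity_prep() and the completion behaviour come from the file above, while the wrapper and its name are hypothetical and not part of the patch:

	/* Hypothetical wrapper, for illustration only. */
	static bool submit_bio_with_auto_pi(struct bio *bio)
	{
		/*
		 * bio_integrity_prep() returns true when the bio may be submitted,
		 * with or without an attached payload; it returns false only after
		 * it has already ended the bio with BLK_STS_RESOURCE.
		 */
		if (!bio_integrity_prep(bio))
			return false;
		submit_bio(bio);	/* reads are verified in kintegrityd at completion */
		return true;
	}
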
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 2e3e8e04961e..cb94e9be26dc 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -7,32 +7,36 @@
*/
#include <linux/blk-integrity.h>
-#include <linux/mempool.h>
-#include <linux/export.h>
-#include <linux/bio.h>
-#include <linux/workqueue.h>
-#include <linux/slab.h>
#include "blk.h"
-static struct kmem_cache *bip_slab;
-static struct workqueue_struct *kintegrityd_wq;
+struct bio_integrity_alloc {
+ struct bio_integrity_payload bip;
+ struct bio_vec bvecs[];
+};
-void blk_flush_integrity(void)
+/**
+ * bio_integrity_free - Free bio integrity payload
+ * @bio: bio containing bip to be freed
+ *
+ * Description: Free the integrity portion of a bio.
+ */
+void bio_integrity_free(struct bio *bio)
{
- flush_workqueue(kintegrityd_wq);
+ kfree(bio_integrity(bio));
+ bio->bi_integrity = NULL;
+ bio->bi_opf &= ~REQ_INTEGRITY;
}
-static void __bio_integrity_free(struct bio_set *bs,
- struct bio_integrity_payload *bip)
+void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip,
+ struct bio_vec *bvecs, unsigned int nr_vecs)
{
- if (bs && mempool_initialized(&bs->bio_integrity_pool)) {
- if (bip->bip_vec)
- bvec_free(&bs->bvec_integrity_pool, bip->bip_vec,
- bip->bip_max_vcnt);
- mempool_free(bip, &bs->bio_integrity_pool);
- } else {
- kfree(bip);
- }
+ memset(bip, 0, sizeof(*bip));
+ bip->bip_max_vcnt = nr_vecs;
+ if (nr_vecs)
+ bip->bip_vec = bvecs;
+
+ bio->bi_integrity = bip;
+ bio->bi_opf |= REQ_INTEGRITY;
}
/**
@@ -49,109 +53,61 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
gfp_t gfp_mask,
unsigned int nr_vecs)
{
- struct bio_integrity_payload *bip;
- struct bio_set *bs = bio->bi_pool;
- unsigned inline_vecs;
+ struct bio_integrity_alloc *bia;
if (WARN_ON_ONCE(bio_has_crypt_ctx(bio)))
return ERR_PTR(-EOPNOTSUPP);
- if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) {
- bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask);
- inline_vecs = nr_vecs;
- } else {
- bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask);
- inline_vecs = BIO_INLINE_VECS;
- }
-
- if (unlikely(!bip))
+ bia = kmalloc(struct_size(bia, bvecs, nr_vecs), gfp_mask);
+ if (unlikely(!bia))
return ERR_PTR(-ENOMEM);
-
- memset(bip, 0, sizeof(*bip));
-
- /* always report as many vecs as asked explicitly, not inline vecs */
- bip->bip_max_vcnt = nr_vecs;
- if (nr_vecs > inline_vecs) {
- bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool,
- &bip->bip_max_vcnt, gfp_mask);
- if (!bip->bip_vec)
- goto err;
- } else {
- bip->bip_vec = bip->bip_inline_vecs;
- }
-
- bip->bip_bio = bio;
- bio->bi_integrity = bip;
- bio->bi_opf |= REQ_INTEGRITY;
-
- return bip;
-err:
- __bio_integrity_free(bs, bip);
- return ERR_PTR(-ENOMEM);
+ bio_integrity_init(bio, &bia->bip, bia->bvecs, nr_vecs);
+ return &bia->bip;
}
EXPORT_SYMBOL(bio_integrity_alloc);
-static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs,
- bool dirty)
+static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs)
{
int i;
- for (i = 0; i < nr_vecs; i++) {
- if (dirty && !PageCompound(bv[i].bv_page))
- set_page_dirty_lock(bv[i].bv_page);
+ for (i = 0; i < nr_vecs; i++)
unpin_user_page(bv[i].bv_page);
- }
}
static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
{
- unsigned short nr_vecs = bip->bip_max_vcnt - 1;
- struct bio_vec *copy = &bip->bip_vec[1];
- size_t bytes = bip->bip_iter.bi_size;
- struct iov_iter iter;
+ unsigned short orig_nr_vecs = bip->bip_max_vcnt - 1;
+ struct bio_vec *orig_bvecs = &bip->bip_vec[1];
+ struct bio_vec *bounce_bvec = &bip->bip_vec[0];
+ size_t bytes = bounce_bvec->bv_len;
+ struct iov_iter orig_iter;
int ret;
- iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
- ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter);
+ iov_iter_bvec(&orig_iter, ITER_DEST, orig_bvecs, orig_nr_vecs, bytes);
+ ret = copy_to_iter(bvec_virt(bounce_bvec), bytes, &orig_iter);
WARN_ON_ONCE(ret != bytes);
- bio_integrity_unpin_bvec(copy, nr_vecs, true);
-}
-
-static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
-{
- bool dirty = bio_data_dir(bip->bip_bio) == READ;
-
- if (bip->bip_flags & BIP_COPY_USER) {
- if (dirty)
- bio_integrity_uncopy_user(bip);
- kfree(bvec_virt(bip->bip_vec));
- return;
- }
-
- bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt, dirty);
+ bio_integrity_unpin_bvec(orig_bvecs, orig_nr_vecs);
}
/**
- * bio_integrity_free - Free bio integrity payload
- * @bio: bio containing bip to be freed
+ * bio_integrity_unmap_user - Unmap user integrity payload
+ * @bio: bio containing bip to be unmapped
*
- * Description: Used to free the integrity portion of a bio. Usually
- * called from bio_free().
+ * Unmap the user mapped integrity portion of a bio.
*/
-void bio_integrity_free(struct bio *bio)
+void bio_integrity_unmap_user(struct bio *bio)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
- struct bio_set *bs = bio->bi_pool;
- if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
+ if (bip->bip_flags & BIP_COPY_USER) {
+ if (bio_data_dir(bio) == READ)
+ bio_integrity_uncopy_user(bip);
kfree(bvec_virt(bip->bip_vec));
- else if (bip->bip_flags & BIP_INTEGRITY_USER)
- bio_integrity_unmap_user(bip);
+ return;
+ }
- __bio_integrity_free(bs, bip);
- bio->bi_integrity = NULL;
- bio->bi_opf &= ~REQ_INTEGRITY;
+ bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt);
}
/**
@@ -169,16 +125,10 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
struct bio_integrity_payload *bip = bio_integrity(bio);
- if (((bip->bip_iter.bi_size + len) >> SECTOR_SHIFT) >
- queue_max_hw_sectors(q))
- return 0;
-
if (bip->bip_vcnt > 0) {
struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1];
- bool same_page = false;
- if (bvec_try_merge_hw_page(q, bv, page, len, offset,
- &same_page)) {
+ if (bvec_try_merge_hw_page(q, bv, page, len, offset)) {
bip->bip_iter.bi_size += len;
return len;
}
@@ -205,7 +155,7 @@ EXPORT_SYMBOL(bio_integrity_add_page);
static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
int nr_vecs, unsigned int len,
- unsigned int direction, u32 seed)
+ unsigned int direction)
{
bool write = direction == ITER_SOURCE;
struct bio_integrity_payload *bip;
@@ -241,7 +191,7 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
}
if (write)
- bio_integrity_unpin_bvec(bvec, nr_vecs, false);
+ bio_integrity_unpin_bvec(bvec, nr_vecs);
else
memcpy(&bip->bip_vec[1], bvec, nr_vecs * sizeof(*bvec));
@@ -252,8 +202,8 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
goto free_bip;
}
- bip->bip_flags |= BIP_INTEGRITY_USER | BIP_COPY_USER;
- bip->bip_iter.bi_sector = seed;
+ bip->bip_flags |= BIP_COPY_USER;
+ bip->bip_vcnt = nr_vecs;
return 0;
free_bip:
bio_integrity_free(bio);
@@ -263,7 +213,7 @@ free_buf:
}
static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec,
- int nr_vecs, unsigned int len, u32 seed)
+ int nr_vecs, unsigned int len)
{
struct bio_integrity_payload *bip;
@@ -272,9 +222,8 @@ static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec,
return PTR_ERR(bip);
memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec));
- bip->bip_flags |= BIP_INTEGRITY_USER;
- bip->bip_iter.bi_sector = seed;
bip->bip_iter.bi_size = len;
+ bip->bip_vcnt = nr_vecs;
return 0;
}
@@ -308,17 +257,15 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
return nr_bvecs;
}
-int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
- u32 seed)
+int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- unsigned int align = q->dma_pad_mask | queue_dma_alignment(q);
+ unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits);
struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages;
struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec;
+ size_t offset, bytes = iter->count;
unsigned int direction, nr_bvecs;
- struct iov_iter iter;
int ret, nr_vecs;
- size_t offset;
bool copy;
if (bio_integrity(bio))
@@ -331,8 +278,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
else
direction = ITER_SOURCE;
- iov_iter_ubuf(&iter, direction, ubuf, bytes);
- nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
+ nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS + 1);
if (nr_vecs > BIO_MAX_VECS)
return -E2BIG;
if (nr_vecs > UIO_FASTIOV) {
@@ -342,8 +288,8 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
pages = NULL;
}
- copy = !iov_iter_is_aligned(&iter, align, align);
- ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset);
+ copy = !iov_iter_is_aligned(iter, align, align);
+ ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset);
if (unlikely(ret < 0))
goto free_bvec;
@@ -355,9 +301,9 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
if (copy)
ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes,
- direction, seed);
+ direction);
else
- ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed);
+ ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes);
if (ret)
goto release_pages;
if (bvec != stack_vec)
@@ -366,208 +312,60 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
return 0;
release_pages:
- bio_integrity_unpin_bvec(bvec, nr_bvecs, false);
+ bio_integrity_unpin_bvec(bvec, nr_bvecs);
free_bvec:
if (bvec != stack_vec)
kfree(bvec);
return ret;
}
-EXPORT_SYMBOL_GPL(bio_integrity_map_user);
-/**
- * bio_integrity_process - Process integrity metadata for a bio
- * @bio: bio to generate/verify integrity metadata for
- * @proc_iter: iterator to process
- * @proc_fn: Pointer to the relevant processing function
- */
-static blk_status_t bio_integrity_process(struct bio *bio,
- struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn)
+static void bio_uio_meta_to_bip(struct bio *bio, struct uio_meta *meta)
{
- struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
- struct blk_integrity_iter iter;
- struct bvec_iter bviter;
- struct bio_vec bv;
struct bio_integrity_payload *bip = bio_integrity(bio);
- blk_status_t ret = BLK_STS_OK;
- iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
- iter.interval = 1 << bi->interval_exp;
- iter.tuple_size = bi->tuple_size;
- iter.seed = proc_iter->bi_sector;
- iter.prot_buf = bvec_virt(bip->bip_vec);
- iter.pi_offset = bi->pi_offset;
+ if (meta->flags & IO_INTEGRITY_CHK_GUARD)
+ bip->bip_flags |= BIP_CHECK_GUARD;
+ if (meta->flags & IO_INTEGRITY_CHK_APPTAG)
+ bip->bip_flags |= BIP_CHECK_APPTAG;
+ if (meta->flags & IO_INTEGRITY_CHK_REFTAG)
+ bip->bip_flags |= BIP_CHECK_REFTAG;
- __bio_for_each_segment(bv, bio, bviter, *proc_iter) {
- void *kaddr = bvec_kmap_local(&bv);
-
- iter.data_buf = kaddr;
- iter.data_size = bv.bv_len;
- ret = proc_fn(&iter);
- kunmap_local(kaddr);
-
- if (ret)
- break;
-
- }
- return ret;
+ bip->app_tag = meta->app_tag;
}
-/**
- * bio_integrity_prep - Prepare bio for integrity I/O
- * @bio: bio to prepare
- *
- * Description: Checks if the bio already has an integrity payload attached.
- * If it does, the payload has been generated by another kernel subsystem,
- * and we just pass it through. Otherwise allocates integrity payload.
- * The bio must have data direction, target device and start sector set priot
- * to calling. In the WRITE case, integrity metadata will be generated using
- * the block device's integrity function. In the READ case, the buffer
- * will be prepared for DMA and a suitable end_io handler set up.
- */
-bool bio_integrity_prep(struct bio *bio)
+int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta)
{
- struct bio_integrity_payload *bip;
struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
- void *buf;
- unsigned long start, end;
- unsigned int len, nr_pages;
- unsigned int bytes, offset, i;
+ unsigned int integrity_bytes;
+ int ret;
+ struct iov_iter it;
if (!bi)
- return true;
-
- if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE)
- return true;
-
- if (!bio_sectors(bio))
- return true;
-
- /* Already protected? */
- if (bio_integrity(bio))
- return true;
-
- if (bio_data_dir(bio) == READ) {
- if (!bi->profile->verify_fn ||
- !(bi->flags & BLK_INTEGRITY_VERIFY))
- return true;
- } else {
- if (!bi->profile->generate_fn ||
- !(bi->flags & BLK_INTEGRITY_GENERATE))
- return true;
- }
-
- /* Allocate kernel buffer for protection data */
- len = bio_integrity_bytes(bi, bio_sectors(bio));
- buf = kmalloc(len, GFP_NOIO);
- if (unlikely(buf == NULL)) {
- printk(KERN_ERR "could not allocate integrity buffer\n");
- goto err_end_io;
- }
-
- end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- start = ((unsigned long) buf) >> PAGE_SHIFT;
- nr_pages = end - start;
-
- /* Allocate bio integrity payload and integrity vectors */
- bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
- if (IS_ERR(bip)) {
- printk(KERN_ERR "could not allocate data integrity bioset\n");
- kfree(buf);
- goto err_end_io;
- }
-
- bip->bip_flags |= BIP_BLOCK_INTEGRITY;
- bip_set_seed(bip, bio->bi_iter.bi_sector);
-
- if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM)
- bip->bip_flags |= BIP_IP_CHECKSUM;
-
- /* Map it */
- offset = offset_in_page(buf);
- for (i = 0; i < nr_pages && len > 0; i++) {
- bytes = PAGE_SIZE - offset;
-
- if (bytes > len)
- bytes = len;
-
- if (bio_integrity_add_page(bio, virt_to_page(buf),
- bytes, offset) < bytes) {
- printk(KERN_ERR "could not attach integrity payload\n");
- goto err_end_io;
- }
-
- buf += bytes;
- len -= bytes;
- offset = 0;
- }
-
- /* Auto-generate integrity metadata if this is a write */
- if (bio_data_dir(bio) == WRITE) {
- bio_integrity_process(bio, &bio->bi_iter,
- bi->profile->generate_fn);
- } else {
- bip->bio_iter = bio->bi_iter;
- }
- return true;
-
-err_end_io:
- bio->bi_status = BLK_STS_RESOURCE;
- bio_endio(bio);
- return false;
-}
-EXPORT_SYMBOL(bio_integrity_prep);
-
-/**
- * bio_integrity_verify_fn - Integrity I/O completion worker
- * @work: Work struct stored in bio to be verified
- *
- * Description: This workqueue function is called to complete a READ
- * request. The function verifies the transferred integrity metadata
- * and then calls the original bio end_io function.
- */
-static void bio_integrity_verify_fn(struct work_struct *work)
-{
- struct bio_integrity_payload *bip =
- container_of(work, struct bio_integrity_payload, bip_work);
- struct bio *bio = bip->bip_bio;
- struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
-
+ return -EINVAL;
/*
- * At the moment verify is called bio's iterator was advanced
- * during split and completion, we need to rewind iterator to
- * it's original position.
+ * The original meta iterator can be bigger; process only the integrity
+ * info corresponding to the current data buffer.
*/
- bio->bi_status = bio_integrity_process(bio, &bip->bio_iter,
- bi->profile->verify_fn);
- bio_integrity_free(bio);
- bio_endio(bio);
-}
+ it = meta->iter;
+ integrity_bytes = bio_integrity_bytes(bi, bio_sectors(bio));
+ if (it.count < integrity_bytes)
+ return -EINVAL;
-/**
- * __bio_integrity_endio - Integrity I/O completion function
- * @bio: Protected bio
- *
- * Description: Completion for integrity I/O
- *
- * Normally I/O completion is done in interrupt context. However,
- * verifying I/O integrity is a time-consuming task which must be run
- * in process context. This function postpones completion
- * accordingly.
- */
-bool __bio_integrity_endio(struct bio *bio)
-{
- struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
- struct bio_integrity_payload *bip = bio_integrity(bio);
+ /* should fit into two bytes */
+ BUILD_BUG_ON(IO_INTEGRITY_VALID_FLAGS >= (1 << 16));
- if (bio_op(bio) == REQ_OP_READ && !bio->bi_status &&
- (bip->bip_flags & BIP_BLOCK_INTEGRITY) && bi->profile->verify_fn) {
- INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
- queue_work(kintegrityd_wq, &bip->bip_work);
- return false;
- }
+ if (meta->flags && (meta->flags & ~IO_INTEGRITY_VALID_FLAGS))
+ return -EINVAL;
- bio_integrity_free(bio);
- return true;
+ it.count = integrity_bytes;
+ ret = bio_integrity_map_user(bio, &it);
+ if (!ret) {
+ bio_uio_meta_to_bip(bio, meta);
+ bip_set_seed(bio_integrity(bio), meta->seed);
+ iov_iter_advance(&meta->iter, integrity_bytes);
+ meta->seed += bio_integrity_intervals(bi, bio_sectors(bio));
+ }
+ return ret;
}
/**
@@ -620,57 +418,14 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
BUG_ON(bip_src == NULL);
- bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
+ bip = bio_integrity_alloc(bio, gfp_mask, 0);
if (IS_ERR(bip))
return PTR_ERR(bip);
- memcpy(bip->bip_vec, bip_src->bip_vec,
- bip_src->bip_vcnt * sizeof(struct bio_vec));
-
- bip->bip_vcnt = bip_src->bip_vcnt;
+ bip->bip_vec = bip_src->bip_vec;
bip->bip_iter = bip_src->bip_iter;
- bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY;
+ bip->bip_flags = bip_src->bip_flags & BIP_CLONE_FLAGS;
+ bip->app_tag = bip_src->app_tag;
return 0;
}
-
-int bioset_integrity_create(struct bio_set *bs, int pool_size)
-{
- if (mempool_initialized(&bs->bio_integrity_pool))
- return 0;
-
- if (mempool_init_slab_pool(&bs->bio_integrity_pool,
- pool_size, bip_slab))
- return -1;
-
- if (biovec_init_pool(&bs->bvec_integrity_pool, pool_size)) {
- mempool_exit(&bs->bio_integrity_pool);
- return -1;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(bioset_integrity_create);
-
-void bioset_integrity_free(struct bio_set *bs)
-{
- mempool_exit(&bs->bio_integrity_pool);
- mempool_exit(&bs->bvec_integrity_pool);
-}
-
-void __init bio_integrity_init(void)
-{
- /*
- * kintegrityd won't block much but may burn a lot of CPU cycles.
- * Make it highpri CPU intensive wq with max concurrency of 1.
- */
- kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
- WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
- if (!kintegrityd_wq)
- panic("Failed to create kintegrityd\n");
-
- bip_slab = kmem_cache_create("bio_integrity_payload",
- sizeof(struct bio_integrity_payload) +
- sizeof(struct bio_vec) * BIO_INLINE_VECS,
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-}
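
For reference, bio_integrity_map_user() now takes an iov_iter instead of a raw user pointer and seed; the hedged sketch below assumes a plain user buffer, and the helper name is illustrative (in-tree callers such as NVMe build the iterator from a uio_meta and use bio_integrity_map_iter() instead):

	/* Illustrative helper: map a user PI buffer with the iov_iter interface. */
	static int attach_user_pi(struct bio *bio, void __user *ubuf, size_t bytes)
	{
		struct iov_iter iter;

		iov_iter_ubuf(&iter, bio_data_dir(bio) == READ ? ITER_DEST : ITER_SOURCE,
			      ubuf, bytes);
		return bio_integrity_map_user(bio, &iter);
	}
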
diff --git a/block/bio.c b/block/bio.c
index d24420ed1c4c..3c0a558c90f5 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -4,7 +4,7 @@
*/
#include <linux/mm.h>
#include <linux/swap.h>
-#include <linux/bio.h>
+#include <linux/bio-integrity.h>
#include <linux/blkdev.h>
#include <linux/uio.h>
#include <linux/iocontext.h>
@@ -77,7 +77,7 @@ struct bio_slab {
struct kmem_cache *slab;
unsigned int slab_ref;
unsigned int slab_size;
- char name[8];
+ char name[12];
};
static DEFINE_MUTEX(bio_slab_lock);
static DEFINE_XARRAY(bio_slabs);
@@ -251,6 +251,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
bio->bi_flags = 0;
bio->bi_ioprio = 0;
bio->bi_write_hint = 0;
+ bio->bi_write_stream = 0;
bio->bi_status = 0;
bio->bi_iter.bi_sector = 0;
bio->bi_iter.bi_size = 0;
@@ -345,18 +346,29 @@ void bio_chain(struct bio *bio, struct bio *parent)
}
EXPORT_SYMBOL(bio_chain);
-struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
- unsigned int nr_pages, blk_opf_t opf, gfp_t gfp)
+/**
+ * bio_chain_and_submit - submit a bio after chaining it to another one
+ * @prev: bio to chain and submit
+ * @new: bio to chain to
+ *
+ * If @prev is non-NULL, chain it to @new and submit it.
+ *
+ * Return: @new.
+ */
+struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new)
{
- struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp);
-
- if (bio) {
- bio_chain(bio, new);
- submit_bio(bio);
+ if (prev) {
+ bio_chain(prev, new);
+ submit_bio(prev);
}
-
return new;
}
+
+struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
+ unsigned int nr_pages, blk_opf_t opf, gfp_t gfp)
+{
+ return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp));
+}
EXPORT_SYMBOL_GPL(blk_next_bio);
static void bio_alloc_rescue(struct work_struct *work)
@@ -600,7 +612,7 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask)
{
struct bio *bio;
- if (nr_vecs > UIO_MAXIOV)
+ if (nr_vecs > BIO_MAX_INLINE_VECS)
return NULL;
return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask);
}
@@ -816,6 +828,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
bio_set_flag(bio, BIO_CLONED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
+ bio->bi_write_stream = bio_src->bi_write_stream;
bio->bi_iter = bio_src->bi_iter;
if (bio->bi_bdev) {
@@ -907,7 +920,7 @@ static inline bool bio_full(struct bio *bio, unsigned len)
}
static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
- unsigned int len, unsigned int off, bool *same_page)
+ unsigned int len, unsigned int off)
{
size_t bv_end = bv->bv_offset + bv->bv_len;
phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1;
@@ -920,8 +933,7 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
return false;
- *same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
- if (!*same_page) {
+ if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) {
if (IS_ENABLED(CONFIG_KMSAN))
return false;
if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE)
@@ -934,133 +946,25 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
/*
* Try to merge a page into a segment, while obeying the hardware segment
- * size limit. This is not for normal read/write bios, but for passthrough
- * or Zone Append operations that we can't split.
+ * size limit.
+ *
+ * This is kept around for the integrity metadata, which still tries
+ * to build the initial bio to the hardware limit and doesn't have proper
+ * helpers to split. Hopefully this will go away soon.
*/
bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
- struct page *page, unsigned len, unsigned offset,
- bool *same_page)
+ struct page *page, unsigned len, unsigned offset)
{
unsigned long mask = queue_segment_boundary(q);
- phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset;
+ phys_addr_t addr1 = bvec_phys(bv);
phys_addr_t addr2 = page_to_phys(page) + offset + len - 1;
if ((addr1 | mask) != (addr2 | mask))
return false;
if (len > queue_max_segment_size(q) - bv->bv_len)
return false;
- return bvec_try_merge_page(bv, page, len, offset, same_page);
-}
-
-/**
- * bio_add_hw_page - attempt to add a page to a bio with hw constraints
- * @q: the target queue
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- * @max_sectors: maximum number of sectors that can be added
- * @same_page: return if the segment has been merged inside the same page
- *
- * Add a page to a bio while respecting the hardware max_sectors, max_segment
- * and gap limitations.
- */
-int bio_add_hw_page(struct request_queue *q, struct bio *bio,
- struct page *page, unsigned int len, unsigned int offset,
- unsigned int max_sectors, bool *same_page)
-{
- unsigned int max_size = max_sectors << SECTOR_SHIFT;
-
- if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
- return 0;
-
- len = min3(len, max_size, queue_max_segment_size(q));
- if (len > max_size - bio->bi_iter.bi_size)
- return 0;
-
- if (bio->bi_vcnt > 0) {
- struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
-
- if (bvec_try_merge_hw_page(q, bv, page, len, offset,
- same_page)) {
- bio->bi_iter.bi_size += len;
- return len;
- }
-
- if (bio->bi_vcnt >=
- min(bio->bi_max_vecs, queue_max_segments(q)))
- return 0;
-
- /*
- * If the queue doesn't support SG gaps and adding this segment
- * would create a gap, disallow it.
- */
- if (bvec_gap_to_prev(&q->limits, bv, offset))
- return 0;
- }
-
- bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset);
- bio->bi_vcnt++;
- bio->bi_iter.bi_size += len;
- return len;
-}
-
-/**
- * bio_add_pc_page - attempt to add page to passthrough bio
- * @q: the target queue
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- *
- * Attempt to add a page to the bio_vec maplist. This can fail for a
- * number of reasons, such as the bio being full or target block device
- * limitations. The target block device must allow bio's up to PAGE_SIZE,
- * so it is always possible to add a single page to an empty bio.
- *
- * This should only be used by passthrough bios.
- */
-int bio_add_pc_page(struct request_queue *q, struct bio *bio,
- struct page *page, unsigned int len, unsigned int offset)
-{
- bool same_page = false;
- return bio_add_hw_page(q, bio, page, len, offset,
- queue_max_hw_sectors(q), &same_page);
+ return bvec_try_merge_page(bv, page, len, offset);
}
-EXPORT_SYMBOL(bio_add_pc_page);
-
-/**
- * bio_add_zone_append_page - attempt to add page to zone-append bio
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- *
- * Attempt to add a page to the bio_vec maplist of a bio that will be submitted
- * for a zone-append request. This can fail for a number of reasons, such as the
- * bio being full or the target block device is not a zoned block device or
- * other limitations of the target block device. The target block device must
- * allow bio's up to PAGE_SIZE, so it is always possible to add a single page
- * to an empty bio.
- *
- * Returns: number of bytes added to the bio, or 0 in case of a failure.
- */
-int bio_add_zone_append_page(struct bio *bio, struct page *page,
- unsigned int len, unsigned int offset)
-{
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- bool same_page = false;
-
- if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
- return 0;
-
- if (WARN_ON_ONCE(!bdev_is_zoned(bio->bi_bdev)))
- return 0;
-
- return bio_add_hw_page(q, bio, page, len, offset,
- queue_max_zone_append_sectors(q), &same_page);
-}
-EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
/**
* __bio_add_page - add page(s) to a bio in a new segment
@@ -1085,6 +989,22 @@ void __bio_add_page(struct bio *bio, struct page *page,
EXPORT_SYMBOL_GPL(__bio_add_page);
/**
+ * bio_add_virt_nofail - add data in the direct kernel mapping to a bio
+ * @bio: destination bio
+ * @vaddr: data to add
+ * @len: length of the data to add, may cross pages
+ *
+ * Add the data at @vaddr to @bio. The caller must have ensured a segment
+ * is available for the added data. No merging into an existing segment
+ * will be performed.
+ */
+void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len)
+{
+ __bio_add_page(bio, virt_to_page(vaddr), len, offset_in_page(vaddr));
+}
+EXPORT_SYMBOL_GPL(bio_add_virt_nofail);
+
+/**
* bio_add_page - attempt to add page(s) to bio
* @bio: destination bio
* @page: start page to add
@@ -1097,8 +1017,6 @@ EXPORT_SYMBOL_GPL(__bio_add_page);
int bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
- bool same_page = false;
-
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return 0;
if (bio->bi_iter.bi_size > UINT_MAX - len)
@@ -1106,7 +1024,7 @@ int bio_add_page(struct bio *bio, struct page *page,
if (bio->bi_vcnt > 0 &&
bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
- page, len, offset, &same_page)) {
+ page, len, offset)) {
bio->bi_iter.bi_size += len;
return len;
}
@@ -1121,10 +1039,12 @@ EXPORT_SYMBOL(bio_add_page);
void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len,
size_t off)
{
+ unsigned long nr = off / PAGE_SIZE;
+
WARN_ON_ONCE(len > UINT_MAX);
- WARN_ON_ONCE(off > UINT_MAX);
- __bio_add_page(bio, &folio->page, len, off);
+ __bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE);
}
+EXPORT_SYMBOL_GPL(bio_add_folio_nofail);
/**
* bio_add_folio - Attempt to add part of a folio to a bio.
@@ -1143,18 +1063,74 @@ void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len,
bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
size_t off)
{
- if (len > UINT_MAX || off > UINT_MAX)
+ unsigned long nr = off / PAGE_SIZE;
+
+ if (len > UINT_MAX)
return false;
- return bio_add_page(bio, &folio->page, len, off) > 0;
+ return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0;
}
EXPORT_SYMBOL(bio_add_folio);
+/**
+ * bio_add_vmalloc_chunk - add a vmalloc chunk to a bio
+ * @bio: destination bio
+ * @vaddr: vmalloc address to add
+ * @len: total length in bytes of the data to add
+ *
+ * Add data starting at @vaddr to @bio and return how many bytes were added.
+ * This may be less than the amount originally asked for. Returns 0 if no data
+ * could be added to @bio.
+ *
+ * This helper calls flush_kernel_vmap_range() for the range added. For reads
+ * the caller still needs to manually call invalidate_kernel_vmap_range() in
+ * the completion handler.
+ */
+unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len)
+{
+ unsigned int offset = offset_in_page(vaddr);
+
+ len = min(len, PAGE_SIZE - offset);
+ if (bio_add_page(bio, vmalloc_to_page(vaddr), len, offset) < len)
+ return 0;
+ if (op_is_write(bio_op(bio)))
+ flush_kernel_vmap_range(vaddr, len);
+ return len;
+}
+EXPORT_SYMBOL_GPL(bio_add_vmalloc_chunk);
+
+/**
+ * bio_add_vmalloc - add a vmalloc region to a bio
+ * @bio: destination bio
+ * @vaddr: vmalloc address to add
+ * @len: total length in bytes of the data to add
+ *
+ * Add data starting at @vaddr to @bio. Return %true on success or %false if
+ * @bio does not have enough space for the payload.
+ *
+ * This helper calls flush_kernel_vmap_range() for the range added. For reads
+ * the caller still needs to manually call invalidate_kernel_vmap_range() in
+ * the completion handler.
+ */
+bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len)
+{
+ do {
+ unsigned int added = bio_add_vmalloc_chunk(bio, vaddr, len);
+
+ if (!added)
+ return false;
+ vaddr += added;
+ len -= added;
+ } while (len);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(bio_add_vmalloc);
+
void __bio_release_pages(struct bio *bio, bool mark_dirty)
{
struct folio_iter fi;
bio_for_each_folio_all(fi, bio) {
- struct page *page;
size_t nr_pages;
if (mark_dirty) {
@@ -1162,68 +1138,52 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
folio_mark_dirty(fi.folio);
folio_unlock(fi.folio);
}
- page = folio_page(fi.folio, fi.offset / PAGE_SIZE);
nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE -
fi.offset / PAGE_SIZE + 1;
- do {
- bio_release_page(bio, page++);
- } while (--nr_pages != 0);
+ unpin_user_folio(fi.folio, nr_pages);
}
}
EXPORT_SYMBOL_GPL(__bio_release_pages);
-void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
+void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
{
- size_t size = iov_iter_count(iter);
-
WARN_ON_ONCE(bio->bi_max_vecs);
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- size_t max_sectors = queue_max_zone_append_sectors(q);
-
- size = min(size, max_sectors << SECTOR_SHIFT);
- }
-
bio->bi_vcnt = iter->nr_segs;
bio->bi_io_vec = (struct bio_vec *)iter->bvec;
bio->bi_iter.bi_bvec_done = iter->iov_offset;
- bio->bi_iter.bi_size = size;
+ bio->bi_iter.bi_size = iov_iter_count(iter);
bio_set_flag(bio, BIO_CLONED);
}
-static int bio_iov_add_page(struct bio *bio, struct page *page,
- unsigned int len, unsigned int offset)
+static unsigned int get_contig_folio_len(unsigned int *num_pages,
+ struct page **pages, unsigned int i,
+ struct folio *folio, size_t left,
+ size_t offset)
{
- bool same_page = false;
+ size_t bytes = left;
+ size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes);
+ unsigned int j;
- if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len))
- return -EIO;
+ /*
+ * We might COW a single page in the middle of
+ * a large folio, so we have to check that all
+ * pages belong to the same folio.
+ */
+ bytes -= contig_sz;
+ for (j = i + 1; j < i + *num_pages; j++) {
+ size_t next = min_t(size_t, PAGE_SIZE, bytes);
- if (bio->bi_vcnt > 0 &&
- bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
- page, len, offset, &same_page)) {
- bio->bi_iter.bi_size += len;
- if (same_page)
- bio_release_page(bio, page);
- return 0;
+ if (page_folio(pages[j]) != folio ||
+ pages[j] != pages[j - 1] + 1) {
+ break;
+ }
+ contig_sz += next;
+ bytes -= next;
}
- __bio_add_page(bio, page, len, offset);
- return 0;
-}
+ *num_pages = j - i;
-static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
- unsigned int len, unsigned int offset)
-{
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- bool same_page = false;
-
- if (bio_add_hw_page(q, bio, page, len, offset,
- queue_max_zone_append_sectors(q), &same_page) != len)
- return -EINVAL;
- if (same_page)
- bio_release_page(bio, page);
- return 0;
+ return contig_sz;
}
#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))
@@ -1245,9 +1205,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
- ssize_t size, left;
- unsigned len, i = 0;
- size_t offset;
+ ssize_t size;
+ unsigned int num_pages, i = 0;
+ size_t offset, folio_offset, left, len;
int ret = 0;
/*
@@ -1287,18 +1247,39 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
goto out;
}
- for (left = size, i = 0; left > 0; left -= len, i++) {
+ for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
struct page *page = pages[i];
+ struct folio *folio = page_folio(page);
+ unsigned int old_vcnt = bio->bi_vcnt;
- len = min_t(size_t, PAGE_SIZE - offset, left);
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- ret = bio_iov_add_zone_append_page(bio, page, len,
- offset);
- if (ret)
- break;
- } else
- bio_iov_add_page(bio, page, len, offset);
+ folio_offset = ((size_t)folio_page_idx(folio, page) <<
+ PAGE_SHIFT) + offset;
+ len = min(folio_size(folio) - folio_offset, left);
+
+ num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+
+ if (num_pages > 1)
+ len = get_contig_folio_len(&num_pages, pages, i,
+ folio, left, offset);
+
+ if (!bio_add_folio(bio, folio, len, folio_offset)) {
+ WARN_ON_ONCE(1);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (bio_flagged(bio, BIO_PAGE_PINNED)) {
+ /*
+ * We're adding another fragment of a page that already
+ * was part of the last segment. Undo our pin as the
+ * page was pinned when an earlier fragment of it was
+ * added to the bio and __bio_release_pages expects a
+ * single pin per page.
+ */
+ if (offset && bio->bi_vcnt == old_vcnt)
+ unpin_user_folio(folio, 1);
+ }
offset = 0;
}
@@ -1384,6 +1365,56 @@ int submit_bio_wait(struct bio *bio)
}
EXPORT_SYMBOL(submit_bio_wait);
+/**
+ * bdev_rw_virt - synchronously read into / write from kernel mapping
+ * @bdev: block device to access
+ * @sector: sector to access
+ * @data: data to read/write
+ * @len: length in bytes to read/write
+ * @op: operation (e.g. REQ_OP_READ/REQ_OP_WRITE)
+ *
+ * Performs synchronous I/O to @bdev for @data/@len. @data must be in
+ * the kernel direct mapping and not a vmalloc address.
+ */
+int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
+ size_t len, enum req_op op)
+{
+ struct bio_vec bv;
+ struct bio bio;
+ int error;
+
+ if (WARN_ON_ONCE(is_vmalloc_addr(data)))
+ return -EIO;
+
+ bio_init(&bio, bdev, &bv, 1, op);
+ bio.bi_iter.bi_sector = sector;
+ bio_add_virt_nofail(&bio, data, len);
+ error = submit_bio_wait(&bio);
+ bio_uninit(&bio);
+ return error;
+}
+EXPORT_SYMBOL_GPL(bdev_rw_virt);
+
+static void bio_wait_end_io(struct bio *bio)
+{
+ complete(bio->bi_private);
+ bio_put(bio);
+}
+
+/*
+ * bio_await_chain - ends @bio and waits for every chained bio to complete
+ */
+void bio_await_chain(struct bio *bio)
+{
+ DECLARE_COMPLETION_ONSTACK_MAP(done,
+ bio->bi_bdev->bd_disk->lockdep_map);
+
+ bio->bi_private = &done;
+ bio->bi_end_io = bio_wait_end_io;
+ bio_endio(bio);
+ blk_wait_io(&done);
+}
+
void __bio_advance(struct bio *bio, unsigned bytes)
{
if (bio_integrity(bio))
@@ -1576,6 +1607,8 @@ again:
if (!bio_integrity_endio(bio))
return;
+ blk_zone_bio_endio(bio);
+
rq_qos_done_bio(bio);
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
@@ -1596,9 +1629,18 @@ again:
goto again;
}
- blk_throtl_bio_endio(bio);
- /* release cgroup info */
- bio_uninit(bio);
+#ifdef CONFIG_BLK_CGROUP
+ /*
+ * Release cgroup info. We shouldn't have to do this here, but quite
+ * a few callers of bio_init fail to call bio_uninit, so we cover up
+ * for that here at least for now.
+ */
+ if (bio->bi_blkg) {
+ blkg_put(bio->bi_blkg);
+ bio->bi_blkg = NULL;
+ }
+#endif
+
if (bio->bi_end_io)
bio->bi_end_io(bio);
}
@@ -1623,16 +1665,22 @@ struct bio *bio_split(struct bio *bio, int sectors,
{
struct bio *split;
- BUG_ON(sectors <= 0);
- BUG_ON(sectors >= bio_sectors(bio));
+ if (WARN_ON_ONCE(sectors <= 0))
+ return ERR_PTR(-EINVAL);
+ if (WARN_ON_ONCE(sectors >= bio_sectors(bio)))
+ return ERR_PTR(-EINVAL);
/* Zone append commands cannot be split */
if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
- return NULL;
+ return ERR_PTR(-EINVAL);
+
+ /* atomic writes cannot be split */
+ if (bio->bi_opf & REQ_ATOMIC)
+ return ERR_PTR(-EINVAL);
split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs);
if (!split)
- return NULL;
+ return ERR_PTR(-ENOMEM);
split->bi_iter.bi_size = sectors << 9;
@@ -1659,6 +1707,10 @@ EXPORT_SYMBOL(bio_split);
*/
void bio_trim(struct bio *bio, sector_t offset, sector_t size)
{
+ /* We should never trim an atomic write */
+ if (WARN_ON_ONCE(bio->bi_opf & REQ_ATOMIC && size))
+ return;
+
if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS ||
offset + size > bio_sectors(bio)))
return;
@@ -1702,7 +1754,6 @@ void bioset_exit(struct bio_set *bs)
mempool_exit(&bs->bio_pool);
mempool_exit(&bs->bvec_pool);
- bioset_integrity_free(bs);
if (bs->bio_slab)
bio_put_slab(bs);
bs->bio_slab = NULL;
@@ -1782,8 +1833,6 @@ static int __init init_bio(void)
BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags));
- bio_integrity_init();
-
for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) {
struct biovec_slab *bvs = bvec_slabs + i;
@@ -1799,9 +1848,6 @@ static int __init init_bio(void)
BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE))
panic("bio: can't allocate bios\n");
- if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
- panic("bio: can't create integrity pool\n");
-
return 0;
}
subsys_initcall(init_bio);
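
For reference, a minimal usage sketch of the bdev_rw_virt() helper added above; the function name and the 4096-byte size are illustrative, and the buffer is assumed to come from kmalloc() (kernel direct mapping) since the helper rejects vmalloc addresses:

	/* Illustrative only: synchronously read 4 KiB from the start of @bdev. */
	static int read_first_block(struct block_device *bdev, void *buf)
	{
		return bdev_rw_virt(bdev, 0, buf, 4096, REQ_OP_READ);
	}
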
diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c
index 3304e841df7c..a55fb0c53558 100644
--- a/block/blk-cgroup-rwstat.c
+++ b/block/blk-cgroup-rwstat.c
@@ -9,25 +9,19 @@ int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
{
int i, ret;
- for (i = 0; i < BLKG_RWSTAT_NR; i++) {
- ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
- if (ret) {
- while (--i >= 0)
- percpu_counter_destroy(&rwstat->cpu_cnt[i]);
- return ret;
- }
+ ret = percpu_counter_init_many(rwstat->cpu_cnt, 0, gfp, BLKG_RWSTAT_NR);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
atomic64_set(&rwstat->aux_cnt[i], 0);
- }
return 0;
}
EXPORT_SYMBOL_GPL(blkg_rwstat_init);
void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
{
- int i;
-
- for (i = 0; i < BLKG_RWSTAT_NR; i++)
- percpu_counter_destroy(&rwstat->cpu_cnt[i]);
+ percpu_counter_destroy_many(rwstat->cpu_cnt, BLKG_RWSTAT_NR);
}
EXPORT_SYMBOL_GPL(blkg_rwstat_exit);
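
For reference, the conversion above relies on the batched percpu counter helpers; a standalone sketch of the same init/exit pairing, with made-up struct and function names:

	struct demo_stats {
		struct percpu_counter cnt[4];
	};

	static int demo_stats_init(struct demo_stats *s, gfp_t gfp)
	{
		/* one call initializes all four counters, or none on failure */
		return percpu_counter_init_many(s->cnt, 0, gfp, 4);
	}

	static void demo_stats_exit(struct demo_stats *s)
	{
		percpu_counter_destroy_many(s->cnt, 4);
	}
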
diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h
index 022527b0b043..703a16fe1404 100644
--- a/block/blk-cgroup-rwstat.h
+++ b/block/blk-cgroup-rwstat.h
@@ -52,7 +52,7 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
/**
* blkg_rwstat_add - add a value to a blkg_rwstat
* @rwstat: target blkg_rwstat
- * @op: REQ_OP and flags
+ * @opf: REQ_OP and flags
* @val: value to add
*
* Add @val to @rwstat. The counters are chosen according to @rw. The
@@ -83,8 +83,9 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
/**
* blkg_rwstat_read - read the current values of a blkg_rwstat
* @rwstat: blkg_rwstat to read
+ * @result: where to put the current values
*
- * Read the current snapshot of @rwstat and return it in the aux counts.
+ * Read the current snapshot of @rwstat and return it in the @result counts.
*/
static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat,
struct blkg_rwstat_sample *result)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 059467086b13..5936db7f8475 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -218,8 +218,7 @@ static void blkg_async_bio_workfn(struct work_struct *work)
/* as long as there are pending bios, @blkg can't go away */
spin_lock(&blkg->async_bio_lock);
- bio_list_merge(&bios, &blkg->async_bios);
- bio_list_init(&blkg->async_bios);
+ bio_list_merge_init(&bios, &blkg->async_bios);
spin_unlock(&blkg->async_bio_lock);
/* start plug only when bio_list contains at least 2 bios */
@@ -323,6 +322,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
blkg->q = disk->queue;
INIT_LIST_HEAD(&blkg->q_node);
blkg->blkcg = blkcg;
+ blkg->iostat.blkg = blkg;
#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
spin_lock_init(&blkg->async_bio_lock);
bio_list_init(&blkg->async_bios);
@@ -619,13 +619,47 @@ restart:
spin_unlock_irq(&q->queue_lock);
}
+static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+ int i;
+
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+ dst->bytes[i] = src->bytes[i];
+ dst->ios[i] = src->ios[i];
+ }
+}
+
+static void __blkg_clear_stat(struct blkg_iostat_set *bis)
+{
+ struct blkg_iostat cur = {0};
+ unsigned long flags;
+
+ flags = u64_stats_update_begin_irqsave(&bis->sync);
+ blkg_iostat_set(&bis->cur, &cur);
+ blkg_iostat_set(&bis->last, &cur);
+ u64_stats_update_end_irqrestore(&bis->sync, flags);
+}
+
+static void blkg_clear_stat(struct blkcg_gq *blkg)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct blkg_iostat_set *s = per_cpu_ptr(blkg->iostat_cpu, cpu);
+
+ __blkg_clear_stat(s);
+ }
+ __blkg_clear_stat(&blkg->iostat);
+}
+
static int blkcg_reset_stats(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 val)
{
struct blkcg *blkcg = css_to_blkcg(css);
struct blkcg_gq *blkg;
- int i, cpu;
+ int i;
+ pr_info_once("blkio.%s is deprecated\n", cftype->name);
mutex_lock(&blkcg_pol_mutex);
spin_lock_irq(&blkcg->lock);
@@ -635,18 +669,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
* anyway. If you get hit by a race, retry.
*/
hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
- for_each_possible_cpu(cpu) {
- struct blkg_iostat_set *bis =
- per_cpu_ptr(blkg->iostat_cpu, cpu);
- memset(bis, 0, sizeof(*bis));
-
- /* Re-initialize the cleared blkg_iostat_set */
- u64_stats_init(&bis->sync);
- bis->blkg = blkg;
- }
- memset(&blkg->iostat, 0, sizeof(blkg->iostat));
- u64_stats_init(&blkg->iostat.sync);
-
+ blkg_clear_stat(blkg);
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
@@ -774,7 +797,7 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
return -EINVAL;
input = skip_spaces(input);
- bdev = blkdev_get_no_open(MKDEV(major, minor));
+ bdev = blkdev_get_no_open(MKDEV(major, minor), false);
if (!bdev)
return -ENODEV;
if (bdev_is_partition(bdev)) {
@@ -793,6 +816,41 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
ctx->bdev = bdev;
return 0;
}
+/*
+ * Similar to blkg_conf_open_bdev, but additionally freezes the queue,
+ * acquires q->elevator_lock, and ensures the correct locking order
+ * between q->elevator_lock and q->rq_qos_mutex.
+ *
+ * This function returns negative error on failure. On success it returns
+ * memflags which must be saved and later passed to blkg_conf_exit_frozen
+ * for restoring the memalloc scope.
+ */
+unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
+{
+ int ret;
+ unsigned long memflags;
+
+ if (ctx->bdev)
+ return -EINVAL;
+
+ ret = blkg_conf_open_bdev(ctx);
+ if (ret < 0)
+ return ret;
+ /*
+ * At this point, we haven't started protecting anything related to QoS,
+ * so we release q->rq_qos_mutex here, which was first acquired in
+ * blkg_conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after
+ * freezing the queue and acquiring q->elevator_lock to maintain the
+ * correct locking order.
+ */
+ mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
+
+ memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue);
+ mutex_lock(&ctx->bdev->bd_queue->elevator_lock);
+ mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex);
+
+ return memflags;
+}
/**
* blkg_conf_prep - parse and prepare for per-blkg config update
@@ -949,13 +1007,19 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx)
}
EXPORT_SYMBOL_GPL(blkg_conf_exit);
-static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
+/*
+ * Similar to blkg_conf_exit, but also unfreezes the queue and releases
+ * q->elevator_lock. Should be used when blkg_conf_open_bdev_frozen
+ * is used to open the bdev.
+ */
+void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags)
{
- int i;
+ if (ctx->bdev) {
+ struct request_queue *q = ctx->bdev->bd_queue;
- for (i = 0; i < BLKG_IOSTAT_NR; i++) {
- dst->bytes[i] = src->bytes[i];
- dst->ios[i] = src->ios[i];
+ blkg_conf_exit(ctx);
+ mutex_unlock(&q->elevator_lock);
+ blk_mq_unfreeze_queue(q, memflags);
}
}
@@ -1010,8 +1074,8 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
/*
* For covering concurrent parent blkg update from blkg_release().
*
- * When flushing from cgroup, cgroup_rstat_lock is always held, so
- * this lock won't cause contention most of time.
+ * When flushing from cgroup, the subsystem rstat lock is always held,
+ * so this lock won't cause contention most of time.
*/
raw_spin_lock_irqsave(&blkg_stat_lock, flags);
@@ -1024,7 +1088,19 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
struct blkg_iostat cur;
unsigned int seq;
+ /*
+ * Order the assignment of `next_bisc` from `bisc->lnode.next` in
+ * llist_for_each_entry_safe against the clearing of `bisc->lqueued`,
+ * so that `next_bisc` cannot pick up a new next pointer added in
+ * blk_cgroup_bio_start() due to re-ordering.
+ *
+ * The pair barrier is implied in llist_add() in blk_cgroup_bio_start().
+ */
+ smp_mb();
+
WRITE_ONCE(bisc->lqueued, false);
+ if (bisc == &blkg->iostat)
+ goto propagate_up; /* propagate up to parent only */
/* fetch the current per-cpu values */
do {
@@ -1034,10 +1110,24 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
blkcg_iostat_update(blkg, &cur, &bisc->last);
+propagate_up:
/* propagate global delta to parent (unless that's root) */
- if (parent && parent->parent)
+ if (parent && parent->parent) {
blkcg_iostat_update(parent, &blkg->iostat.cur,
&blkg->iostat.last);
+ /*
+ * Queue parent->iostat to its blkcg's lockless
+ * list to propagate up to the grandparent if the
+ * iostat hasn't been queued yet.
+ */
+ if (!parent->iostat.lqueued) {
+ struct llist_head *plhead;
+
+ plhead = per_cpu_ptr(parent->blkcg->lhead, cpu);
+ llist_add(&parent->iostat.lnode, plhead);
+ parent->iostat.lqueued = true;
+ }
+ }
}
raw_spin_unlock_irqrestore(&blkg_stat_lock, flags);
out:
@@ -1054,7 +1144,7 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
/*
* We source root cgroup stats from the system-wide stats to avoid
* tracking the same information twice and incurring overhead when no
- * cgroups are defined. For that reason, cgroup_rstat_flush in
+ * cgroups are defined. For that reason, css_rstat_flush in
* blkcg_print_stat does not actually fill out the iostat in the root
* cgroup's blkcg_gq.
*
@@ -1100,6 +1190,7 @@ static void blkcg_fill_root_iostats(void)
blkg_iostat_set(&blkg->iostat.cur, &tmp);
u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
+ class_dev_iter_exit(&iter);
}
static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
@@ -1162,7 +1253,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
if (!seq_css(sf)->parent)
blkcg_fill_root_iostats();
else
- cgroup_rstat_flush(blkcg->css.cgroup);
+ css_rstat_flush(&blkcg->css);
rcu_read_lock();
hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
@@ -1286,10 +1377,14 @@ void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css)
struct blkcg *blkcg = css_to_blkcg(blkcg_css);
do {
+ struct blkcg *parent;
+
if (!refcount_dec_and_test(&blkcg->online_pin))
break;
+
+ parent = blkcg_parent(blkcg);
blkcg_destroy_blkgs(blkcg);
- blkcg = blkcg_parent(blkcg);
+ blkcg = parent;
} while (blkcg);
}
@@ -1420,7 +1515,6 @@ int blkcg_init_disk(struct gendisk *disk)
struct request_queue *q = disk->queue;
struct blkcg_gq *new_blkg, *blkg;
bool preloaded;
- int ret;
new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
if (!new_blkg)
@@ -1440,21 +1534,8 @@ int blkcg_init_disk(struct gendisk *disk)
if (preloaded)
radix_tree_preload_end();
- ret = blk_ioprio_init(disk);
- if (ret)
- goto err_destroy_all;
-
- ret = blk_throtl_init(disk);
- if (ret)
- goto err_ioprio_exit;
-
return 0;
-err_ioprio_exit:
- blk_ioprio_exit(disk);
-err_destroy_all:
- blkg_destroy_all(disk);
- return ret;
err_unlock:
spin_unlock_irq(&q->queue_lock);
if (preloaded)
@@ -1517,13 +1598,22 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
struct request_queue *q = disk->queue;
struct blkg_policy_data *pd_prealloc = NULL;
struct blkcg_gq *blkg, *pinned_blkg = NULL;
+ unsigned int memflags;
int ret;
if (blkcg_policy_enabled(q, pol))
return 0;
+ /*
+ * A policy is allowed to be registered without pd_alloc_fn/pd_free_fn,
+ * for example, ioprio. Such a policy works at the blkcg level, not the
+ * disk level, and doesn't need to be activated.
+ */
+ if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
+ return -EINVAL;
+
if (queue_is_mq(q))
- blk_mq_freeze_queue(q);
+ memflags = blk_mq_freeze_queue(q);
retry:
spin_lock_irq(&q->queue_lock);
@@ -1587,7 +1677,7 @@ retry:
spin_unlock_irq(&q->queue_lock);
out:
if (queue_is_mq(q))
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
if (pinned_blkg)
blkg_put(pinned_blkg);
if (pd_prealloc)
@@ -1631,12 +1721,13 @@ void blkcg_deactivate_policy(struct gendisk *disk,
{
struct request_queue *q = disk->queue;
struct blkcg_gq *blkg;
+ unsigned int memflags;
if (!blkcg_policy_enabled(q, pol))
return;
if (queue_is_mq(q))
- blk_mq_freeze_queue(q);
+ memflags = blk_mq_freeze_queue(q);
mutex_lock(&q->blkcg_mutex);
spin_lock_irq(&q->queue_lock);
@@ -1660,7 +1751,7 @@ void blkcg_deactivate_policy(struct gendisk *disk,
mutex_unlock(&q->blkcg_mutex);
if (queue_is_mq(q))
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
@@ -1688,24 +1779,27 @@ int blkcg_policy_register(struct blkcg_policy *pol)
struct blkcg *blkcg;
int i, ret;
+ /*
+ * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn come in pairs; a policy
+ * without pd_alloc_fn/pd_free_fn can't be activated.
+ */
+ if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
+ (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
+ return -EINVAL;
+
mutex_lock(&blkcg_pol_register_mutex);
mutex_lock(&blkcg_pol_mutex);
/* find an empty slot */
- ret = -ENOSPC;
for (i = 0; i < BLKCG_MAX_POLS; i++)
if (!blkcg_policy[i])
break;
if (i >= BLKCG_MAX_POLS) {
pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
+ ret = -ENOSPC;
goto err_unlock;
}
- /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
- if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
- (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
- goto err_unlock;
-
/* register @pol */
pol->plid = i;
blkcg_policy[pol->plid] = pol;
@@ -1716,8 +1810,10 @@ int blkcg_policy_register(struct blkcg_policy *pol)
struct blkcg_policy_data *cpd;
cpd = pol->cpd_alloc_fn(GFP_KERNEL);
- if (!cpd)
+ if (!cpd) {
+ ret = -ENOMEM;
goto err_free_cpds;
+ }
blkcg->cpd[pol->plid] = cpd;
cpd->blkcg = blkcg;
@@ -1728,12 +1824,15 @@ int blkcg_policy_register(struct blkcg_policy *pol)
mutex_unlock(&blkcg_pol_mutex);
/* everything is in place, add intf files for the new policy */
- if (pol->dfl_cftypes)
+ if (pol->dfl_cftypes == pol->legacy_cftypes) {
+ WARN_ON(cgroup_add_cftypes(&io_cgrp_subsys,
+ pol->dfl_cftypes));
+ } else {
WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
pol->dfl_cftypes));
- if (pol->legacy_cftypes)
WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
pol->legacy_cftypes));
+ }
mutex_unlock(&blkcg_pol_register_mutex);
return 0;
@@ -2144,18 +2243,19 @@ void blk_cgroup_bio_start(struct bio *bio)
}
u64_stats_update_end_irqrestore(&bis->sync, flags);
- cgroup_rstat_updated(blkcg->css.cgroup, cpu);
+ css_rstat_updated(&blkcg->css, cpu);
put_cpu();
}
bool blk_cgroup_congested(void)
{
- struct cgroup_subsys_state *css;
+ struct blkcg *blkcg;
bool ret = false;
rcu_read_lock();
- for (css = blkcg_css(); css; css = css->parent) {
- if (atomic_read(&css->cgroup->congestion_count)) {
+ for (blkcg = css_to_blkcg(blkcg_css()); blkcg;
+ blkcg = blkcg_parent(blkcg)) {
+ if (atomic_read(&blkcg->congestion_count)) {
ret = true;
break;
}
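
For reference, the frozen variants added above pair up as in the hedged sketch below; the handler name and the parsing step are placeholders, and only the blkg_conf_open_bdev_frozen()/blkg_conf_exit_frozen() pairing reflects this patch:

	/* Illustrative cftype write handler that needs the queue frozen. */
	static ssize_t demo_set_limit(struct kernfs_open_file *of, char *buf,
				      size_t nbytes, loff_t off)
	{
		struct blkg_conf_ctx ctx;
		unsigned long memflags;

		blkg_conf_init(&ctx, buf);

		memflags = blkg_conf_open_bdev_frozen(&ctx);
		if (IS_ERR_VALUE(memflags)) {
			blkg_conf_exit(&ctx);
			return (ssize_t)memflags;
		}

		/* ... parse ctx.body and update per-queue state here ... */

		blkg_conf_exit_frozen(&ctx, memflags);
		return nbytes;
	}
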
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 90b3959d88cf..81868ad86330 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -95,6 +95,8 @@ struct blkcg {
struct cgroup_subsys_state css;
spinlock_t lock;
refcount_t online_pin;
+ /* If there is block congestion on this cgroup. */
+ atomic_t congestion_count;
struct radix_tree_root blkg_tree;
struct blkcg_gq __rcu *blkg_hint;
@@ -217,13 +219,17 @@ struct blkg_conf_ctx {
void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input);
int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx);
+unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx);
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
struct blkg_conf_ctx *ctx);
void blkg_conf_exit(struct blkg_conf_ctx *ctx);
+void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags);
/**
* bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
- * @return: true if this bio needs to be submitted with the root blkg context.
+ * @bio: the target &bio
+ *
+ * Return: true if this bio needs to be submitted with the root blkg context.
*
* In order to avoid priority inversions we sometimes need to issue a bio as if
* it were attached to the root blkg, and then backcharge to the actual owning
@@ -243,7 +249,7 @@ static inline bool bio_issue_as_root_blkg(struct bio *bio)
* @q: request_queue of interest
*
* Lookup blkg for the @blkcg - @q pair.
-
+ *
* Must be called in a RCU critical section.
*/
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
@@ -266,7 +272,7 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
}
/**
- * blkg_to_pdata - get policy private data
+ * blkg_to_pd - get policy private data
* @blkg: blkg of interest
* @pol: policy of interest
*
@@ -285,7 +291,7 @@ static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
}
/**
- * pdata_to_blkg - get blkg associated with policy private data
+ * pd_to_blkg - get blkg associated with policy private data
* @pd: policy private data of interest
*
* @pd is policy private data. Determine the blkg it's associated with.
@@ -301,19 +307,6 @@ static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
}
/**
- * blkg_path - format cgroup path of blkg
- * @blkg: blkg of interest
- * @buf: target buffer
- * @buflen: target buffer length
- *
- * Format the path of the cgroup of @blkg into @buf.
- */
-static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
-{
- return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
-}
-
-/**
* blkg_get - get a blkg reference
* @blkg: blkg to get
*
@@ -387,7 +380,7 @@ static inline void blkcg_use_delay(struct blkcg_gq *blkg)
if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
return;
if (atomic_add_return(1, &blkg->use_delay) == 1)
- atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+ atomic_inc(&blkg->blkcg->congestion_count);
}
static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
@@ -412,7 +405,7 @@ static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
if (old == 0)
return 0;
if (old == 1)
- atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+ atomic_dec(&blkg->blkcg->congestion_count);
return 1;
}
@@ -431,7 +424,7 @@ static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay)
/* We only want 1 person setting the congestion count for this blkg. */
if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1))
- atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+ atomic_inc(&blkg->blkcg->congestion_count);
atomic64_set(&blkg->delay_nsec, delay);
}
@@ -448,7 +441,7 @@ static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
/* We only want 1 person clearing the congestion count for this blkg. */
if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0))
- atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+ atomic_dec(&blkg->blkcg->congestion_count);
}
/**
@@ -496,7 +489,6 @@ static inline void blkcg_deactivate_policy(struct gendisk *disk,
static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
struct blkcg_policy *pol) { return NULL; }
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
-static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
static inline void blkg_get(struct blkcg_gq *blkg) { }
static inline void blkg_put(struct blkcg_gq *blkg) { }
static inline void blkcg_bio_issue_init(struct bio *bio) { }
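A note on the new frozen config helpers declared above: blkg_conf_open_bdev_frozen() opens the target bdev with its queue frozen and returns the saved memalloc flags (or an error value), and blkg_conf_exit_frozen() restores them on the way out. Below is a minimal sketch of a policy config writer using the pair, modeled on the ioc_qos_write() conversion later in this diff; the function name and the elided policy update are illustrative only.

static ssize_t example_conf_write(struct kernfs_open_file *of, char *input,
				  size_t nbytes, loff_t off)
{
	struct blkg_conf_ctx ctx;
	unsigned long memflags;
	ssize_t ret = nbytes;

	blkg_conf_init(&ctx, input);

	/* Open the bdev and freeze its queue; an error is returned as -errno. */
	memflags = blkg_conf_open_bdev_frozen(&ctx);
	if (IS_ERR_VALUE(memflags)) {
		ret = memflags;
		goto out;
	}

	/* ... parse ctx.body and update the policy for ctx.bdev here ... */

out:
	/* Unfreeze the queue (if it was frozen) and restore the memalloc flags. */
	blkg_conf_exit_frozen(&ctx, memflags);
	return ret;
}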
diff --git a/block/blk-core.c b/block/blk-core.c
index b795ac177281..b862c66018f2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -94,20 +94,6 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
}
EXPORT_SYMBOL(blk_queue_flag_clear);
-/**
- * blk_queue_flag_test_and_set - atomically test and set a queue flag
- * @flag: flag to be set
- * @q: request queue
- *
- * Returns the previous value of @flag - 0 if the flag was not set and 1 if
- * the flag was already set.
- */
-bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
-{
- return test_and_set_bit(flag, &q->queue_flags);
-}
-EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
-
#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
static const char *const blk_op_name[] = {
REQ_OP_NAME(READ),
@@ -174,6 +160,8 @@ static const struct {
/* Command duration limit device-side timeout */
[BLK_STS_DURATION_LIMIT] = { -ETIME, "duration limit exceeded" },
+ [BLK_STS_INVAL] = { -EINVAL, "invalid" },
+
/* everything else not covered above: */
[BLK_STS_IOERR] = { -EIO, "I/O" },
};
@@ -231,7 +219,7 @@ EXPORT_SYMBOL_GPL(blk_status_to_str);
*/
void blk_sync_queue(struct request_queue *q)
{
- del_timer_sync(&q->timeout);
+ timer_delete_sync(&q->timeout);
cancel_work_sync(&q->timeout_work);
}
EXPORT_SYMBOL(blk_sync_queue);
@@ -273,6 +261,8 @@ static void blk_free_queue(struct request_queue *q)
blk_mq_release(q);
ida_free(&blk_queue_ida, q->id);
+ lockdep_unregister_key(&q->io_lock_cls_key);
+ lockdep_unregister_key(&q->q_lock_cls_key);
call_rcu(&q->rcu_head, blk_free_queue_rcu);
}
@@ -290,18 +280,20 @@ void blk_put_queue(struct request_queue *q)
}
EXPORT_SYMBOL(blk_put_queue);
-void blk_queue_start_drain(struct request_queue *q)
+bool blk_queue_start_drain(struct request_queue *q)
{
/*
* When queue DYING flag is set, we need to block new req
* entering queue, so we call blk_freeze_queue_start() to
* prevent I/O from crossing blk_queue_enter().
*/
- blk_freeze_queue_start(q);
+ bool freeze = __blk_freeze_queue_start(q, current);
if (queue_is_mq(q))
blk_mq_wake_waiters(q);
/* Make blk_queue_enter() reexamine the DYING flag. */
wake_up_all(&q->mq_freeze_wq);
+
+ return freeze;
}
/**
@@ -333,6 +325,8 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
return -ENODEV;
}
+ rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_);
+ rwsem_release(&q->q_lockdep_map, _RET_IP_);
return 0;
}
@@ -364,6 +358,8 @@ int __bio_queue_enter(struct request_queue *q, struct bio *bio)
goto dead;
}
+ rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
+ rwsem_release(&q->io_lockdep_map, _RET_IP_);
return 0;
dead:
bio_io_error(bio);
@@ -433,8 +429,8 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
refcount_set(&q->refs, 1);
mutex_init(&q->debugfs_mutex);
+ mutex_init(&q->elevator_lock);
mutex_init(&q->sysfs_lock);
- mutex_init(&q->sysfs_dir_lock);
mutex_init(&q->limits_lock);
mutex_init(&q->rq_qos_mutex);
spin_lock_init(&q->queue_lock);
@@ -453,6 +449,18 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
if (error)
goto fail_stats;
+ lockdep_register_key(&q->io_lock_cls_key);
+ lockdep_register_key(&q->q_lock_cls_key);
+ lockdep_init_map(&q->io_lockdep_map, "&q->q_usage_counter(io)",
+ &q->io_lock_cls_key, 0);
+ lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)",
+ &q->q_lock_cls_key, 0);
+
+ /* Teach lockdep about lock ordering (reclaim WRT queue freeze lock). */
+ fs_reclaim_acquire(GFP_KERNEL);
+ rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
+ rwsem_release(&q->io_lockdep_map, _RET_IP_);
+ fs_reclaim_release(GFP_KERNEL);
q->nr_requests = BLKDEV_DEFAULT_RQ;
@@ -496,7 +504,8 @@ __setup("fail_make_request=", setup_fail_make_request);
bool should_fail_request(struct block_device *part, unsigned int bytes)
{
- return part->bd_make_it_fail && should_fail(&fail_make_request, bytes);
+ return bdev_test_flag(part, BD_MAKE_IT_FAIL) &&
+ should_fail(&fail_make_request, bytes);
}
static int __init fail_make_request_debugfs(void)
@@ -516,10 +525,11 @@ static inline void bio_check_ro(struct bio *bio)
if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
return;
- if (bio->bi_bdev->bd_ro_warned)
+ if (bdev_test_flag(bio->bi_bdev, BD_RO_WARNED))
return;
- bio->bi_bdev->bd_ro_warned = true;
+ bdev_set_flag(bio->bi_bdev, BD_RO_WARNED);
+
/*
* Use ioctl to set underlying disk of raid/dm to read-only
* will trigger this.
@@ -591,8 +601,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
return BLK_STS_NOTSUPP;
/* The bio sector must point to the start of a sequential zone */
- if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) ||
- !bio_zone_is_seq(bio))
+ if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector))
return BLK_STS_IOERR;
/*
@@ -614,17 +623,30 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
static void __submit_bio(struct bio *bio)
{
+ /* If no plug is in use yet, add one here so the nsecs time can be cached. */
+ struct blk_plug plug;
+
if (unlikely(!blk_crypto_bio_prep(&bio)))
return;
- if (!bio->bi_bdev->bd_has_submit_bio) {
+ blk_start_plug(&plug);
+
+ if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) {
blk_mq_submit_bio(bio);
} else if (likely(bio_queue_enter(bio) == 0)) {
struct gendisk *disk = bio->bi_bdev->bd_disk;
-
- disk->fops->submit_bio(bio);
+
+ if ((bio->bi_opf & REQ_POLLED) &&
+ !(disk->queue->limits.features & BLK_FEAT_POLL)) {
+ bio->bi_status = BLK_STS_NOTSUPP;
+ bio_endio(bio);
+ } else {
+ disk->fops->submit_bio(bio);
+ }
blk_queue_exit(disk->queue);
}
+
+ blk_finish_plug(&plug);
}
/*
@@ -725,12 +747,24 @@ void submit_bio_noacct_nocheck(struct bio *bio)
*/
if (current->bio_list)
bio_list_add(&current->bio_list[0], bio);
- else if (!bio->bi_bdev->bd_has_submit_bio)
+ else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
__submit_bio_noacct_mq(bio);
else
__submit_bio_noacct(bio);
}
+static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q,
+ struct bio *bio)
+{
+ if (bio->bi_iter.bi_size > queue_atomic_write_unit_max_bytes(q))
+ return BLK_STS_INVAL;
+
+ if (bio->bi_iter.bi_size % queue_atomic_write_unit_min_bytes(q))
+ return BLK_STS_INVAL;
+
+ return BLK_STS_OK;
+}
+
/**
* submit_bio_noacct - re-submit a bio to the block device layer for I/O
* @bio: The bio describing the location in memory and on the device.
@@ -761,7 +795,8 @@ void submit_bio_noacct(struct bio *bio)
if (!bio_flagged(bio, BIO_REMAPPED)) {
if (unlikely(bio_check_eod(bio)))
goto end_io;
- if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
+ if (bdev_is_partition(bdev) &&
+ unlikely(blk_partition_remap(bio)))
goto end_io;
}
@@ -773,7 +808,7 @@ void submit_bio_noacct(struct bio *bio)
if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE &&
bio_op(bio) != REQ_OP_ZONE_APPEND))
goto end_io;
- if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
+ if (!bdev_write_cache(bdev)) {
bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
if (!bio_sectors(bio)) {
status = BLK_STS_OK;
@@ -782,12 +817,15 @@ void submit_bio_noacct(struct bio *bio)
}
}
- if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
- bio_clear_polled(bio);
-
switch (bio_op(bio)) {
case REQ_OP_READ:
+ break;
case REQ_OP_WRITE:
+ if (bio->bi_opf & REQ_ATOMIC) {
+ status = blk_validate_atomic_write_op_size(q, bio);
+ if (status != BLK_STS_OK)
+ goto end_io;
+ }
break;
case REQ_OP_FLUSH:
/*
@@ -816,11 +854,8 @@ void submit_bio_noacct(struct bio *bio)
case REQ_OP_ZONE_OPEN:
case REQ_OP_ZONE_CLOSE:
case REQ_OP_ZONE_FINISH:
- if (!bdev_is_zoned(bio->bi_bdev))
- goto not_supported;
- break;
case REQ_OP_ZONE_RESET_ALL:
- if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q))
+ if (!bdev_is_zoned(bio->bi_bdev))
goto not_supported;
break;
case REQ_OP_DRV_IN:
@@ -906,16 +941,9 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
return 0;
q = bdev_get_queue(bdev);
- if (cookie == BLK_QC_T_NONE ||
- !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+ if (cookie == BLK_QC_T_NONE)
return 0;
- /*
- * As the requests that require a zone lock are not plugged in the
- * first place, directly accessing the plug instead of using
- * blk_mq_plug() should not have any consequences during flushing for
- * zoned devices.
- */
blk_flush_plug(current->plug, false);
/*
@@ -934,7 +962,8 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
} else {
struct gendisk *disk = q->disk;
- if (disk && disk->fops->poll_bio)
+ if ((q->limits.features & BLK_FEAT_POLL) && disk &&
+ disk->fops->poll_bio)
ret = disk->fops->poll_bio(bio, iob, flags);
}
blk_queue_exit(q);
@@ -987,11 +1016,12 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end)
unsigned long stamp;
again:
stamp = READ_ONCE(part->bd_stamp);
- if (unlikely(time_after(now, stamp))) {
- if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
- __part_stat_add(part, io_ticks, end ? now - stamp : 1);
- }
- if (part->bd_partno) {
+ if (unlikely(time_after(now, stamp)) &&
+ likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
+ (end || bdev_count_inflight(part)))
+ __part_stat_add(part, io_ticks, now - stamp);
+
+ if (bdev_is_partition(part)) {
part = bdev_whole(part);
goto again;
}
@@ -1097,8 +1127,8 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
return;
plug->cur_ktime = 0;
- plug->mq_list = NULL;
- plug->cached_rq = NULL;
+ rq_list_init(&plug->mq_list);
+ rq_list_init(&plug->cached_rqs);
plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
plug->rq_count = 0;
plug->multiple_queues = false;
@@ -1194,7 +1224,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
* queue for cached requests, we don't want a blocked task holding
* up a queue freeze/quiesce event.
*/
- if (unlikely(!rq_list_empty(plug->cached_rq)))
+ if (unlikely(!rq_list_empty(&plug->cached_rqs)))
blk_mq_free_plug_rqs(plug);
plug->cur_ktime = 0;
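For reference, the atomic-write path added to submit_bio_noacct() above rejects a REQ_ATOMIC write whose size exceeds the queue's atomic write unit maximum or is not a multiple of the minimum. A hedged sketch of a submitter checking a length against those limits before building such a bio; the helper name is illustrative.

/* Illustrative: would a REQ_ATOMIC write of @len bytes pass the checks above? */
static bool example_atomic_write_len_ok(struct block_device *bdev,
					unsigned int len)
{
	struct request_queue *q = bdev_get_queue(bdev);
	unsigned int max = queue_atomic_write_unit_max_bytes(q);
	unsigned int min = queue_atomic_write_unit_min_bytes(q);

	/* Mirrors blk_validate_atomic_write_op_size(); max/min are 0 when unsupported. */
	return min && len <= max && (len % min) == 0;
}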
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index b1e7415f8439..005c9157ffb3 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -87,7 +87,7 @@ static struct bio_set crypto_bio_split;
* This is the key we set when evicting a keyslot. This *should* be the all 0's
* key, but AES-XTS rejects that key, so we use some random bytes instead.
*/
-static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE];
+static u8 blank_key[BLK_CRYPTO_MAX_RAW_KEY_SIZE];
static void blk_crypto_fallback_evict_keyslot(unsigned int slot)
{
@@ -119,7 +119,7 @@ blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile,
blk_crypto_fallback_evict_keyslot(slot);
slotp->crypto_mode = crypto_mode;
- err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw,
+ err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->bytes,
key->size);
if (err) {
blk_crypto_fallback_evict_keyslot(slot);
@@ -173,6 +173,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
bio_set_flag(bio, BIO_REMAPPED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
+ bio->bi_write_stream = bio_src->bi_write_stream;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
@@ -226,7 +227,7 @@ static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr)
split_bio = bio_split(bio, num_sectors, GFP_NOIO,
&crypto_bio_split);
- if (!split_bio) {
+ if (IS_ERR(split_bio)) {
bio->bi_status = BLK_STS_RESOURCE;
return false;
}
@@ -539,7 +540,7 @@ static int blk_crypto_fallback_init(void)
if (blk_crypto_fallback_inited)
return 0;
- get_random_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE);
+ get_random_bytes(blank_key, sizeof(blank_key));
err = bioset_init(&crypto_bio_split, 64, 0, 0);
if (err)
@@ -561,6 +562,7 @@ static int blk_crypto_fallback_init(void)
blk_crypto_fallback_profile->ll_ops = blk_crypto_fallback_ll_ops;
blk_crypto_fallback_profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
+ blk_crypto_fallback_profile->key_types_supported = BLK_CRYPTO_KEY_TYPE_RAW;
/* All blk-crypto modes have a crypto API fallback. */
for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++)
diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h
index 93a141979694..ccf6dff6ff6b 100644
--- a/block/blk-crypto-internal.h
+++ b/block/blk-crypto-internal.h
@@ -14,6 +14,7 @@ struct blk_crypto_mode {
const char *name; /* name of this mode, shown in sysfs */
const char *cipher_str; /* crypto API name (for fallback case) */
unsigned int keysize; /* key size in bytes */
+ unsigned int security_strength; /* security strength in bytes */
unsigned int ivsize; /* iv size in bytes */
};
@@ -82,6 +83,9 @@ int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
const struct blk_crypto_config *cfg);
+int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd,
+ void __user *argp);
+
#else /* CONFIG_BLK_INLINE_ENCRYPTION */
static inline int blk_crypto_sysfs_register(struct gendisk *disk)
@@ -129,6 +133,12 @@ static inline bool blk_crypto_rq_has_keyslot(struct request *rq)
return false;
}
+static inline int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd,
+ void __user *argp)
+{
+ return -ENOTTY;
+}
+
#endif /* CONFIG_BLK_INLINE_ENCRYPTION */
void __bio_crypt_advance(struct bio *bio, unsigned int bytes);
diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c
index 7fabc883e39f..94a155912bf1 100644
--- a/block/blk-crypto-profile.c
+++ b/block/blk-crypto-profile.c
@@ -352,6 +352,8 @@ bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
return false;
if (profile->max_dun_bytes_supported < cfg->dun_bytes)
return false;
+ if (!(profile->key_types_supported & cfg->key_type))
+ return false;
return true;
}
@@ -463,6 +465,99 @@ bool blk_crypto_register(struct blk_crypto_profile *profile,
EXPORT_SYMBOL_GPL(blk_crypto_register);
/**
+ * blk_crypto_derive_sw_secret() - Derive software secret from wrapped key
+ * @bdev: a block device that supports hardware-wrapped keys
+ * @eph_key: a hardware-wrapped key in ephemerally-wrapped form
+ * @eph_key_size: size of @eph_key in bytes
+ * @sw_secret: (output) the software secret
+ *
+ * Given a hardware-wrapped key in ephemerally-wrapped form (the same form that
+ * it is used in for I/O), ask the hardware to derive the secret which software can
+ * use for cryptographic tasks other than inline encryption. This secret is
+ * guaranteed to be cryptographically isolated from the inline encryption key,
+ * i.e. derived with a different KDF context.
+ *
+ * Return: 0 on success, -EOPNOTSUPP if the block device doesn't support
+ * hardware-wrapped keys, -EBADMSG if the key isn't a valid
+ * ephemerally-wrapped key, or another -errno code.
+ */
+int blk_crypto_derive_sw_secret(struct block_device *bdev,
+ const u8 *eph_key, size_t eph_key_size,
+ u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE])
+{
+ struct blk_crypto_profile *profile =
+ bdev_get_queue(bdev)->crypto_profile;
+ int err;
+
+ if (!profile)
+ return -EOPNOTSUPP;
+ if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED))
+ return -EOPNOTSUPP;
+ if (!profile->ll_ops.derive_sw_secret)
+ return -EOPNOTSUPP;
+ blk_crypto_hw_enter(profile);
+ err = profile->ll_ops.derive_sw_secret(profile, eph_key, eph_key_size,
+ sw_secret);
+ blk_crypto_hw_exit(profile);
+ return err;
+}
+
+int blk_crypto_import_key(struct blk_crypto_profile *profile,
+ const u8 *raw_key, size_t raw_key_size,
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE])
+{
+ int ret;
+
+ if (!profile)
+ return -EOPNOTSUPP;
+ if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED))
+ return -EOPNOTSUPP;
+ if (!profile->ll_ops.import_key)
+ return -EOPNOTSUPP;
+ blk_crypto_hw_enter(profile);
+ ret = profile->ll_ops.import_key(profile, raw_key, raw_key_size,
+ lt_key);
+ blk_crypto_hw_exit(profile);
+ return ret;
+}
+
+int blk_crypto_generate_key(struct blk_crypto_profile *profile,
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE])
+{
+ int ret;
+
+ if (!profile)
+ return -EOPNOTSUPP;
+ if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED))
+ return -EOPNOTSUPP;
+ if (!profile->ll_ops.generate_key)
+ return -EOPNOTSUPP;
+ blk_crypto_hw_enter(profile);
+ ret = profile->ll_ops.generate_key(profile, lt_key);
+ blk_crypto_hw_exit(profile);
+ return ret;
+}
+
+int blk_crypto_prepare_key(struct blk_crypto_profile *profile,
+ const u8 *lt_key, size_t lt_key_size,
+ u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE])
+{
+ int ret;
+
+ if (!profile)
+ return -EOPNOTSUPP;
+ if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED))
+ return -EOPNOTSUPP;
+ if (!profile->ll_ops.prepare_key)
+ return -EOPNOTSUPP;
+ blk_crypto_hw_enter(profile);
+ ret = profile->ll_ops.prepare_key(profile, lt_key, lt_key_size,
+ eph_key);
+ blk_crypto_hw_exit(profile);
+ return ret;
+}
+
+/**
* blk_crypto_intersect_capabilities() - restrict supported crypto capabilities
* by child device
* @parent: the crypto profile for the parent device
@@ -485,10 +580,12 @@ void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent,
child->max_dun_bytes_supported);
for (i = 0; i < ARRAY_SIZE(child->modes_supported); i++)
parent->modes_supported[i] &= child->modes_supported[i];
+ parent->key_types_supported &= child->key_types_supported;
} else {
parent->max_dun_bytes_supported = 0;
memset(parent->modes_supported, 0,
sizeof(parent->modes_supported));
+ parent->key_types_supported = 0;
}
}
EXPORT_SYMBOL_GPL(blk_crypto_intersect_capabilities);
@@ -521,6 +618,9 @@ bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target,
target->max_dun_bytes_supported)
return false;
+ if (reference->key_types_supported & ~target->key_types_supported)
+ return false;
+
return true;
}
EXPORT_SYMBOL_GPL(blk_crypto_has_capabilities);
@@ -555,5 +655,6 @@ void blk_crypto_update_capabilities(struct blk_crypto_profile *dst,
sizeof(dst->modes_supported));
dst->max_dun_bytes_supported = src->max_dun_bytes_supported;
+ dst->key_types_supported = src->key_types_supported;
}
EXPORT_SYMBOL_GPL(blk_crypto_update_capabilities);
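The wrapped-key helpers above are thin wrappers that take the profile's hardware mutex and forward to the low-level ops. A sketch of the driver side, assuming the usual blk_crypto_ll_ops and key_types_supported fields; the example_* names and the stub body are placeholders, not a real driver.

#include <linux/blk-crypto-profile.h>
#include <linux/errno.h>

static int example_derive_sw_secret(struct blk_crypto_profile *profile,
				    const u8 *eph_key, size_t eph_key_size,
				    u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE])
{
	/* Placeholder: ask the hardware key manager to derive the secret. */
	return -EOPNOTSUPP;
}

static void example_init_wrapped_key_support(struct blk_crypto_profile *profile)
{
	profile->ll_ops.derive_sw_secret = example_derive_sw_secret;
	/* import_key/generate_key/prepare_key would be wired up the same way. */
	profile->key_types_supported |= BLK_CRYPTO_KEY_TYPE_HW_WRAPPED;
}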
diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c
index a304434489ba..e832f403f200 100644
--- a/block/blk-crypto-sysfs.c
+++ b/block/blk-crypto-sysfs.c
@@ -31,6 +31,13 @@ static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr)
return container_of(attr, struct blk_crypto_attr, attr);
}
+static ssize_t hw_wrapped_keys_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ /* Always show supported, since the file doesn't exist otherwise. */
+ return sysfs_emit(page, "supported\n");
+}
+
static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile,
struct blk_crypto_attr *attr, char *page)
{
@@ -43,20 +50,48 @@ static ssize_t num_keyslots_show(struct blk_crypto_profile *profile,
return sysfs_emit(page, "%u\n", profile->num_slots);
}
+static ssize_t raw_keys_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ /* Always show supported, since the file doesn't exist otherwise. */
+ return sysfs_emit(page, "supported\n");
+}
+
#define BLK_CRYPTO_RO_ATTR(_name) \
static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name)
+BLK_CRYPTO_RO_ATTR(hw_wrapped_keys);
BLK_CRYPTO_RO_ATTR(max_dun_bits);
BLK_CRYPTO_RO_ATTR(num_keyslots);
+BLK_CRYPTO_RO_ATTR(raw_keys);
+
+static umode_t blk_crypto_is_visible(struct kobject *kobj,
+ struct attribute *attr, int n)
+{
+ struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
+ struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
+
+ if (a == &hw_wrapped_keys_attr &&
+ !(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED))
+ return 0;
+ if (a == &raw_keys_attr &&
+ !(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_RAW))
+ return 0;
+
+ return 0444;
+}
static struct attribute *blk_crypto_attrs[] = {
+ &hw_wrapped_keys_attr.attr,
&max_dun_bits_attr.attr,
&num_keyslots_attr.attr,
+ &raw_keys_attr.attr,
NULL,
};
static const struct attribute_group blk_crypto_attr_group = {
.attrs = blk_crypto_attrs,
+ .is_visible = blk_crypto_is_visible,
};
/*
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index 4d760b092deb..4b1ad84d1b5a 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -23,24 +23,28 @@ const struct blk_crypto_mode blk_crypto_modes[] = {
.name = "AES-256-XTS",
.cipher_str = "xts(aes)",
.keysize = 64,
+ .security_strength = 32,
.ivsize = 16,
},
[BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = {
.name = "AES-128-CBC-ESSIV",
.cipher_str = "essiv(cbc(aes),sha256)",
.keysize = 16,
+ .security_strength = 16,
.ivsize = 16,
},
[BLK_ENCRYPTION_MODE_ADIANTUM] = {
.name = "Adiantum",
.cipher_str = "adiantum(xchacha12,aes)",
.keysize = 32,
+ .security_strength = 32,
.ivsize = 32,
},
[BLK_ENCRYPTION_MODE_SM4_XTS] = {
.name = "SM4-XTS",
.cipher_str = "xts(sm4)",
.keysize = 32,
+ .security_strength = 16,
.ivsize = 16,
},
};
@@ -76,9 +80,15 @@ static int __init bio_crypt_ctx_init(void)
/* This is assumed in various places. */
BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0);
- /* Sanity check that no algorithm exceeds the defined limits. */
+ /*
+ * Validate the crypto mode properties. This ideally would be done with
+ * static assertions, but boot-time checks are the next best thing.
+ */
for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) {
- BUG_ON(blk_crypto_modes[i].keysize > BLK_CRYPTO_MAX_KEY_SIZE);
+ BUG_ON(blk_crypto_modes[i].keysize >
+ BLK_CRYPTO_MAX_RAW_KEY_SIZE);
+ BUG_ON(blk_crypto_modes[i].security_strength >
+ blk_crypto_modes[i].keysize);
BUG_ON(blk_crypto_modes[i].ivsize > BLK_CRYPTO_MAX_IV_SIZE);
}
@@ -315,17 +325,20 @@ int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
/**
* blk_crypto_init_key() - Prepare a key for use with blk-crypto
* @blk_key: Pointer to the blk_crypto_key to initialize.
- * @raw_key: Pointer to the raw key. Must be the correct length for the chosen
- * @crypto_mode; see blk_crypto_modes[].
+ * @key_bytes: the bytes of the key
+ * @key_size: size of the key in bytes
+ * @key_type: type of the key -- either raw or hardware-wrapped
* @crypto_mode: identifier for the encryption algorithm to use
* @dun_bytes: number of bytes that will be used to specify the DUN when this
* key is used
* @data_unit_size: the data unit size to use for en/decryption
*
* Return: 0 on success, -errno on failure. The caller is responsible for
- * zeroizing both blk_key and raw_key when done with them.
+ * zeroizing both blk_key and key_bytes when done with them.
*/
-int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
+int blk_crypto_init_key(struct blk_crypto_key *blk_key,
+ const u8 *key_bytes, size_t key_size,
+ enum blk_crypto_key_type key_type,
enum blk_crypto_mode_num crypto_mode,
unsigned int dun_bytes,
unsigned int data_unit_size)
@@ -338,8 +351,19 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
return -EINVAL;
mode = &blk_crypto_modes[crypto_mode];
- if (mode->keysize == 0)
+ switch (key_type) {
+ case BLK_CRYPTO_KEY_TYPE_RAW:
+ if (key_size != mode->keysize)
+ return -EINVAL;
+ break;
+ case BLK_CRYPTO_KEY_TYPE_HW_WRAPPED:
+ if (key_size < mode->security_strength ||
+ key_size > BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE)
+ return -EINVAL;
+ break;
+ default:
return -EINVAL;
+ }
if (dun_bytes == 0 || dun_bytes > mode->ivsize)
return -EINVAL;
@@ -350,9 +374,10 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
blk_key->crypto_cfg.crypto_mode = crypto_mode;
blk_key->crypto_cfg.dun_bytes = dun_bytes;
blk_key->crypto_cfg.data_unit_size = data_unit_size;
+ blk_key->crypto_cfg.key_type = key_type;
blk_key->data_unit_size_bits = ilog2(data_unit_size);
- blk_key->size = mode->keysize;
- memcpy(blk_key->raw, raw_key, mode->keysize);
+ blk_key->size = key_size;
+ memcpy(blk_key->bytes, key_bytes, key_size);
return 0;
}
@@ -372,8 +397,10 @@ bool blk_crypto_config_supported_natively(struct block_device *bdev,
bool blk_crypto_config_supported(struct block_device *bdev,
const struct blk_crypto_config *cfg)
{
- return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
- blk_crypto_config_supported_natively(bdev, cfg);
+ if (IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) &&
+ cfg->key_type == BLK_CRYPTO_KEY_TYPE_RAW)
+ return true;
+ return blk_crypto_config_supported_natively(bdev, cfg);
}
/**
@@ -387,15 +414,21 @@ bool blk_crypto_config_supported(struct block_device *bdev,
* an skcipher, and *should not* be called from the data path, since that might
* cause a deadlock
*
- * Return: 0 on success; -ENOPKG if the hardware doesn't support the key and
- * blk-crypto-fallback is either disabled or the needed algorithm
- * is disabled in the crypto API; or another -errno code.
+ * Return: 0 on success; -EOPNOTSUPP if the key is wrapped but the hardware does
+ * not support wrapped keys; -ENOPKG if the key is a raw key but the
+ * hardware does not support raw keys and blk-crypto-fallback is either
+ * disabled or the needed algorithm is disabled in the crypto API; or
+ * another -errno code if something else went wrong.
*/
int blk_crypto_start_using_key(struct block_device *bdev,
const struct blk_crypto_key *key)
{
if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg))
return 0;
+ if (key->crypto_cfg.key_type != BLK_CRYPTO_KEY_TYPE_RAW) {
+ pr_warn_ratelimited("%pg: no support for wrapped keys\n", bdev);
+ return -EOPNOTSUPP;
+ }
return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
}
@@ -436,3 +469,146 @@ void blk_crypto_evict_key(struct block_device *bdev,
pr_warn_ratelimited("%pg: error %d evicting key\n", bdev, err);
}
EXPORT_SYMBOL_GPL(blk_crypto_evict_key);
+
+static int blk_crypto_ioctl_import_key(struct blk_crypto_profile *profile,
+ void __user *argp)
+{
+ struct blk_crypto_import_key_arg arg;
+ u8 raw_key[BLK_CRYPTO_MAX_RAW_KEY_SIZE];
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE];
+ int ret;
+
+ if (copy_from_user(&arg, argp, sizeof(arg)))
+ return -EFAULT;
+
+ if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved)))
+ return -EINVAL;
+
+ if (arg.raw_key_size < 16 || arg.raw_key_size > sizeof(raw_key))
+ return -EINVAL;
+
+ if (copy_from_user(raw_key, u64_to_user_ptr(arg.raw_key_ptr),
+ arg.raw_key_size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = blk_crypto_import_key(profile, raw_key, arg.raw_key_size, lt_key);
+ if (ret < 0)
+ goto out;
+ if (ret > arg.lt_key_size) {
+ ret = -EOVERFLOW;
+ goto out;
+ }
+ arg.lt_key_size = ret;
+ if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key,
+ arg.lt_key_size) ||
+ copy_to_user(argp, &arg, sizeof(arg))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = 0;
+
+out:
+ memzero_explicit(raw_key, sizeof(raw_key));
+ memzero_explicit(lt_key, sizeof(lt_key));
+ return ret;
+}
+
+static int blk_crypto_ioctl_generate_key(struct blk_crypto_profile *profile,
+ void __user *argp)
+{
+ struct blk_crypto_generate_key_arg arg;
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE];
+ int ret;
+
+ if (copy_from_user(&arg, argp, sizeof(arg)))
+ return -EFAULT;
+
+ if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved)))
+ return -EINVAL;
+
+ ret = blk_crypto_generate_key(profile, lt_key);
+ if (ret < 0)
+ goto out;
+ if (ret > arg.lt_key_size) {
+ ret = -EOVERFLOW;
+ goto out;
+ }
+ arg.lt_key_size = ret;
+ if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key,
+ arg.lt_key_size) ||
+ copy_to_user(argp, &arg, sizeof(arg))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = 0;
+
+out:
+ memzero_explicit(lt_key, sizeof(lt_key));
+ return ret;
+}
+
+static int blk_crypto_ioctl_prepare_key(struct blk_crypto_profile *profile,
+ void __user *argp)
+{
+ struct blk_crypto_prepare_key_arg arg;
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE];
+ u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE];
+ int ret;
+
+ if (copy_from_user(&arg, argp, sizeof(arg)))
+ return -EFAULT;
+
+ if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved)))
+ return -EINVAL;
+
+ if (arg.lt_key_size > sizeof(lt_key))
+ return -EINVAL;
+
+ if (copy_from_user(lt_key, u64_to_user_ptr(arg.lt_key_ptr),
+ arg.lt_key_size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = blk_crypto_prepare_key(profile, lt_key, arg.lt_key_size, eph_key);
+ if (ret < 0)
+ goto out;
+ if (ret > arg.eph_key_size) {
+ ret = -EOVERFLOW;
+ goto out;
+ }
+ arg.eph_key_size = ret;
+ if (copy_to_user(u64_to_user_ptr(arg.eph_key_ptr), eph_key,
+ arg.eph_key_size) ||
+ copy_to_user(argp, &arg, sizeof(arg))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = 0;
+
+out:
+ memzero_explicit(lt_key, sizeof(lt_key));
+ memzero_explicit(eph_key, sizeof(eph_key));
+ return ret;
+}
+
+int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd,
+ void __user *argp)
+{
+ struct blk_crypto_profile *profile =
+ bdev_get_queue(bdev)->crypto_profile;
+
+ if (!profile)
+ return -EOPNOTSUPP;
+
+ switch (cmd) {
+ case BLKCRYPTOIMPORTKEY:
+ return blk_crypto_ioctl_import_key(profile, argp);
+ case BLKCRYPTOGENERATEKEY:
+ return blk_crypto_ioctl_generate_key(profile, argp);
+ case BLKCRYPTOPREPAREKEY:
+ return blk_crypto_ioctl_prepare_key(profile, argp);
+ default:
+ return -ENOTTY;
+ }
+}
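With the extended blk_crypto_init_key() signature, callers now pass the key bytes, their size, and the key type explicitly. A minimal sketch of a raw AES-256-XTS key being initialized and activated on a block device; the function name and the chosen dun_bytes/data_unit_size values are illustrative.

#include <linux/blk-crypto.h>

static int example_setup_raw_key(struct block_device *bdev, const u8 raw[64],
				 struct blk_crypto_key *key)
{
	int err;

	/* 64-byte raw key, 8-byte DUNs, 4096-byte crypto data units. */
	err = blk_crypto_init_key(key, raw, 64, BLK_CRYPTO_KEY_TYPE_RAW,
				  BLK_ENCRYPTION_MODE_AES_256_XTS, 8, 4096);
	if (err)
		return err;

	/* May fall back to the crypto API when blk-crypto-fallback is enabled. */
	return blk_crypto_start_using_key(bdev, key);
}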
diff --git a/block/blk-flush.c b/block/blk-flush.c
index b0f314f4bc14..43d6152897a4 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -95,26 +95,9 @@ static void blk_kick_flush(struct request_queue *q,
struct blk_flush_queue *fq, blk_opf_t flags);
static inline struct blk_flush_queue *
-blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
+blk_get_flush_queue(struct blk_mq_ctx *ctx)
{
- return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
-}
-
-static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
-{
- unsigned int policy = 0;
-
- if (blk_rq_sectors(rq))
- policy |= REQ_FSEQ_DATA;
-
- if (fflags & (1UL << QUEUE_FLAG_WC)) {
- if (rq->cmd_flags & REQ_PREFLUSH)
- policy |= REQ_FSEQ_PREFLUSH;
- if (!(fflags & (1UL << QUEUE_FLAG_FUA)) &&
- (rq->cmd_flags & REQ_FUA))
- policy |= REQ_FSEQ_POSTFLUSH;
- }
- return policy;
+ return blk_mq_map_queue(REQ_OP_FLUSH, ctx)->fq;
}
static unsigned int blk_flush_cur_seq(struct request *rq)
@@ -130,6 +113,8 @@ static void blk_flush_restore_request(struct request *rq)
* original @rq->bio. Restore it.
*/
rq->bio = rq->biotail;
+ if (rq->bio)
+ rq->__sector = rq->bio->bi_iter.bi_sector;
/* make @rq a normal request */
rq->rq_flags &= ~RQF_FLUSH_SEQ;
@@ -183,7 +168,7 @@ static void blk_flush_complete_seq(struct request *rq,
/* queue for flush */
if (list_empty(pending))
fq->flush_pending_since = jiffies;
- list_move_tail(&rq->queuelist, pending);
+ list_add_tail(&rq->queuelist, pending);
break;
case REQ_FSEQ_DATA:
@@ -220,7 +205,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
struct list_head *running;
struct request *rq, *n;
unsigned long flags = 0;
- struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
+ struct blk_flush_queue *fq = blk_get_flush_queue(flush_rq->mq_ctx);
/* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags);
@@ -261,6 +246,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
unsigned int seq = blk_flush_cur_seq(rq);
BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
+ list_del_init(&rq->queuelist);
blk_flush_complete_seq(rq, fq, seq, error);
}
@@ -355,7 +341,7 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
struct blk_mq_ctx *ctx = rq->mq_ctx;
unsigned long flags;
- struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);
+ struct blk_flush_queue *fq = blk_get_flush_queue(ctx);
if (q->elevator) {
WARN_ON(rq->tag < 0);
@@ -396,19 +382,32 @@ static void blk_rq_init_flush(struct request *rq)
bool blk_insert_flush(struct request *rq)
{
struct request_queue *q = rq->q;
- unsigned long fflags = q->queue_flags; /* may change, cache */
- unsigned int policy = blk_flush_policy(fflags, rq);
- struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
+ struct blk_flush_queue *fq = blk_get_flush_queue(rq->mq_ctx);
+ bool supports_fua = q->limits.features & BLK_FEAT_FUA;
+ unsigned int policy = 0;
/* FLUSH/FUA request must never be merged */
WARN_ON_ONCE(rq->bio != rq->biotail);
+ if (blk_rq_sectors(rq))
+ policy |= REQ_FSEQ_DATA;
+
+ /*
+ * Check which flushes we need to sequence for this operation.
+ */
+ if (blk_queue_write_cache(q)) {
+ if (rq->cmd_flags & REQ_PREFLUSH)
+ policy |= REQ_FSEQ_PREFLUSH;
+ if ((rq->cmd_flags & REQ_FUA) && !supports_fua)
+ policy |= REQ_FSEQ_POSTFLUSH;
+ }
+
/*
* @policy now records what operations need to be done. Adjust
* REQ_PREFLUSH and FUA for the driver.
*/
rq->cmd_flags &= ~REQ_PREFLUSH;
- if (!(fflags & (1UL << QUEUE_FLAG_FUA)))
+ if (!supports_fua)
rq->cmd_flags &= ~REQ_FUA;
/*
diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c
index c9eb4241e048..d479f5481b66 100644
--- a/block/blk-ia-ranges.c
+++ b/block/blk-ia-ranges.c
@@ -111,7 +111,6 @@ int disk_register_independent_access_ranges(struct gendisk *disk)
struct request_queue *q = disk->queue;
int i, ret;
- lockdep_assert_held(&q->sysfs_dir_lock);
lockdep_assert_held(&q->sysfs_lock);
if (!iars)
@@ -155,7 +154,6 @@ void disk_unregister_independent_access_ranges(struct gendisk *disk)
struct blk_independent_access_ranges *iars = disk->ia_ranges;
int i;
- lockdep_assert_held(&q->sysfs_dir_lock);
lockdep_assert_held(&q->sysfs_lock);
if (!iars)
@@ -289,7 +287,6 @@ void disk_set_independent_access_ranges(struct gendisk *disk,
{
struct request_queue *q = disk->queue;
- mutex_lock(&q->sysfs_dir_lock);
mutex_lock(&q->sysfs_lock);
if (iars && !disk_check_ia_ranges(disk, iars)) {
kfree(iars);
@@ -313,6 +310,5 @@ void disk_set_independent_access_ranges(struct gendisk *disk,
disk_register_independent_access_ranges(disk);
unlock:
mutex_unlock(&q->sysfs_lock);
- mutex_unlock(&q->sysfs_dir_lock);
}
EXPORT_SYMBOL_GPL(disk_set_independent_access_ranges);
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index ccbeb6dfa87a..a1678f0a9f81 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -53,29 +53,28 @@ new_segment:
return segments;
}
-EXPORT_SYMBOL(blk_rq_count_integrity_sg);
/**
* blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
- * @q: request queue
- * @bio: bio with integrity metadata attached
+ * @rq: request to map
* @sglist: target scatterlist
*
* Description: Map the integrity vectors in request into a
* scatterlist. The scatterlist must be big enough to hold all
- * elements. I.e. sized using blk_rq_count_integrity_sg().
+ * elements. I.e. sized using blk_rq_count_integrity_sg() or
+ * rq->nr_integrity_segments.
*/
-int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio,
- struct scatterlist *sglist)
+int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
{
struct bio_vec iv, ivprv = { NULL };
+ struct request_queue *q = rq->q;
struct scatterlist *sg = NULL;
+ struct bio *bio = rq->bio;
unsigned int segments = 0;
struct bvec_iter iter;
int prev = 0;
bio_for_each_integrity_vec(iv, bio, iter) {
-
if (prev) {
if (!biovec_phys_mergeable(q, &ivprv, &iv))
goto new_segment;
@@ -103,63 +102,37 @@ new_segment:
if (sg)
sg_mark_end(sg);
+ /*
+ * Something must have gone wrong if the computed number of segments
+ * is larger than the number of the request's physical integrity segments.
+ */
+ BUG_ON(segments > rq->nr_integrity_segments);
+ BUG_ON(segments > queue_max_integrity_segments(q));
return segments;
}
EXPORT_SYMBOL(blk_rq_map_integrity_sg);
-/**
- * blk_integrity_compare - Compare integrity profile of two disks
- * @gd1: Disk to compare
- * @gd2: Disk to compare
- *
- * Description: Meta-devices like DM and MD need to verify that all
- * sub-devices use the same integrity format before advertising to
- * upper layers that they can send/receive integrity metadata. This
- * function can be used to check whether two gendisk devices have
- * compatible integrity formats.
- */
-int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2)
+int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
+ ssize_t bytes)
{
- struct blk_integrity *b1 = &gd1->queue->integrity;
- struct blk_integrity *b2 = &gd2->queue->integrity;
-
- if (!b1->profile && !b2->profile)
- return 0;
-
- if (!b1->profile || !b2->profile)
- return -1;
-
- if (b1->interval_exp != b2->interval_exp) {
- pr_err("%s: %s/%s protection interval %u != %u\n",
- __func__, gd1->disk_name, gd2->disk_name,
- 1 << b1->interval_exp, 1 << b2->interval_exp);
- return -1;
- }
-
- if (b1->tuple_size != b2->tuple_size) {
- pr_err("%s: %s/%s tuple sz %u != %u\n", __func__,
- gd1->disk_name, gd2->disk_name,
- b1->tuple_size, b2->tuple_size);
- return -1;
- }
-
- if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) {
- pr_err("%s: %s/%s tag sz %u != %u\n", __func__,
- gd1->disk_name, gd2->disk_name,
- b1->tag_size, b2->tag_size);
- return -1;
- }
-
- if (b1->profile != b2->profile) {
- pr_err("%s: %s/%s type %s != %s\n", __func__,
- gd1->disk_name, gd2->disk_name,
- b1->profile->name, b2->profile->name);
- return -1;
- }
+ int ret;
+ struct iov_iter iter;
+ unsigned int direction;
+ if (op_is_write(req_op(rq)))
+ direction = ITER_DEST;
+ else
+ direction = ITER_SOURCE;
+ iov_iter_ubuf(&iter, direction, ubuf, bytes);
+ ret = bio_integrity_map_user(rq->bio, &iter);
+ if (ret)
+ return ret;
+
+ rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, rq->bio);
+ rq->cmd_flags |= REQ_INTEGRITY;
return 0;
}
-EXPORT_SYMBOL(blk_integrity_compare);
+EXPORT_SYMBOL_GPL(blk_rq_integrity_map_user);
bool blk_integrity_merge_rq(struct request_queue *q, struct request *req,
struct request *next)
@@ -188,7 +161,6 @@ bool blk_integrity_merge_bio(struct request_queue *q, struct request *req,
struct bio *bio)
{
int nr_integrity_segs;
- struct bio *next = bio->bi_next;
if (blk_integrity_rq(req) == 0 && bio_integrity(bio) == NULL)
return true;
@@ -199,22 +171,72 @@ bool blk_integrity_merge_bio(struct request_queue *q, struct request *req,
if (bio_integrity(req->bio)->bip_flags != bio_integrity(bio)->bip_flags)
return false;
- bio->bi_next = NULL;
nr_integrity_segs = blk_rq_count_integrity_sg(q, bio);
- bio->bi_next = next;
-
if (req->nr_integrity_segments + nr_integrity_segs >
q->limits.max_integrity_segments)
return false;
- req->nr_integrity_segments += nr_integrity_segs;
-
return true;
}
static inline struct blk_integrity *dev_to_bi(struct device *dev)
{
- return &dev_to_disk(dev)->queue->integrity;
+ return &dev_to_disk(dev)->queue->limits.integrity;
+}
+
+const char *blk_integrity_profile_name(struct blk_integrity *bi)
+{
+ switch (bi->csum_type) {
+ case BLK_INTEGRITY_CSUM_IP:
+ if (bi->flags & BLK_INTEGRITY_REF_TAG)
+ return "T10-DIF-TYPE1-IP";
+ return "T10-DIF-TYPE3-IP";
+ case BLK_INTEGRITY_CSUM_CRC:
+ if (bi->flags & BLK_INTEGRITY_REF_TAG)
+ return "T10-DIF-TYPE1-CRC";
+ return "T10-DIF-TYPE3-CRC";
+ case BLK_INTEGRITY_CSUM_CRC64:
+ if (bi->flags & BLK_INTEGRITY_REF_TAG)
+ return "EXT-DIF-TYPE1-CRC64";
+ return "EXT-DIF-TYPE3-CRC64";
+ case BLK_INTEGRITY_CSUM_NONE:
+ break;
+ }
+
+ return "nop";
+}
+EXPORT_SYMBOL_GPL(blk_integrity_profile_name);
+
+static ssize_t flag_store(struct device *dev, const char *page, size_t count,
+ unsigned char flag)
+{
+ struct request_queue *q = dev_to_disk(dev)->queue;
+ struct queue_limits lim;
+ unsigned long val;
+ int err;
+
+ err = kstrtoul(page, 10, &val);
+ if (err)
+ return err;
+
+ /* note that the flags are inverted vs the values in the sysfs files */
+ lim = queue_limits_start_update(q);
+ if (val)
+ lim.integrity.flags &= ~flag;
+ else
+ lim.integrity.flags |= flag;
+
+ err = queue_limits_commit_update_frozen(q, &lim);
+ if (err)
+ return err;
+ return count;
+}
+
+static ssize_t flag_show(struct device *dev, char *page, unsigned char flag)
+{
+ struct blk_integrity *bi = dev_to_bi(dev);
+
+ return sysfs_emit(page, "%d\n", !(bi->flags & flag));
}
static ssize_t format_show(struct device *dev, struct device_attribute *attr,
@@ -222,9 +244,9 @@ static ssize_t format_show(struct device *dev, struct device_attribute *attr,
{
struct blk_integrity *bi = dev_to_bi(dev);
- if (bi->profile && bi->profile->name)
- return sysfs_emit(page, "%s\n", bi->profile->name);
- return sysfs_emit(page, "none\n");
+ if (!bi->tuple_size)
+ return sysfs_emit(page, "none\n");
+ return sysfs_emit(page, "%s\n", blk_integrity_profile_name(bi));
}
static ssize_t tag_size_show(struct device *dev, struct device_attribute *attr,
@@ -249,49 +271,26 @@ static ssize_t read_verify_store(struct device *dev,
struct device_attribute *attr,
const char *page, size_t count)
{
- struct blk_integrity *bi = dev_to_bi(dev);
- char *p = (char *) page;
- unsigned long val = simple_strtoul(p, &p, 10);
-
- if (val)
- bi->flags |= BLK_INTEGRITY_VERIFY;
- else
- bi->flags &= ~BLK_INTEGRITY_VERIFY;
-
- return count;
+ return flag_store(dev, page, count, BLK_INTEGRITY_NOVERIFY);
}
static ssize_t read_verify_show(struct device *dev,
struct device_attribute *attr, char *page)
{
- struct blk_integrity *bi = dev_to_bi(dev);
-
- return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_VERIFY));
+ return flag_show(dev, page, BLK_INTEGRITY_NOVERIFY);
}
static ssize_t write_generate_store(struct device *dev,
struct device_attribute *attr,
const char *page, size_t count)
{
- struct blk_integrity *bi = dev_to_bi(dev);
-
- char *p = (char *) page;
- unsigned long val = simple_strtoul(p, &p, 10);
-
- if (val)
- bi->flags |= BLK_INTEGRITY_GENERATE;
- else
- bi->flags &= ~BLK_INTEGRITY_GENERATE;
-
- return count;
+ return flag_store(dev, page, count, BLK_INTEGRITY_NOGENERATE);
}
static ssize_t write_generate_show(struct device *dev,
struct device_attribute *attr, char *page)
{
- struct blk_integrity *bi = dev_to_bi(dev);
-
- return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_GENERATE));
+ return flag_show(dev, page, BLK_INTEGRITY_NOGENERATE);
}
static ssize_t device_is_integrity_capable_show(struct device *dev,
@@ -325,81 +324,3 @@ const struct attribute_group blk_integrity_attr_group = {
.name = "integrity",
.attrs = integrity_attrs,
};
-
-static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter)
-{
- return BLK_STS_OK;
-}
-
-static void blk_integrity_nop_prepare(struct request *rq)
-{
-}
-
-static void blk_integrity_nop_complete(struct request *rq,
- unsigned int nr_bytes)
-{
-}
-
-static const struct blk_integrity_profile nop_profile = {
- .name = "nop",
- .generate_fn = blk_integrity_nop_fn,
- .verify_fn = blk_integrity_nop_fn,
- .prepare_fn = blk_integrity_nop_prepare,
- .complete_fn = blk_integrity_nop_complete,
-};
-
-/**
- * blk_integrity_register - Register a gendisk as being integrity-capable
- * @disk: struct gendisk pointer to make integrity-aware
- * @template: block integrity profile to register
- *
- * Description: When a device needs to advertise itself as being able to
- * send/receive integrity metadata it must use this function to register
- * the capability with the block layer. The template is a blk_integrity
- * struct with values appropriate for the underlying hardware. See
- * Documentation/block/data-integrity.rst.
- */
-void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
-{
- struct blk_integrity *bi = &disk->queue->integrity;
-
- bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE |
- template->flags;
- bi->interval_exp = template->interval_exp ? :
- ilog2(queue_logical_block_size(disk->queue));
- bi->profile = template->profile ? template->profile : &nop_profile;
- bi->tuple_size = template->tuple_size;
- bi->tag_size = template->tag_size;
- bi->pi_offset = template->pi_offset;
-
- blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
-
-#ifdef CONFIG_BLK_INLINE_ENCRYPTION
- if (disk->queue->crypto_profile) {
- pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
- disk->queue->crypto_profile = NULL;
- }
-#endif
-}
-EXPORT_SYMBOL(blk_integrity_register);
-
-/**
- * blk_integrity_unregister - Unregister block integrity profile
- * @disk: disk whose integrity profile to unregister
- *
- * Description: This function unregisters the integrity capability from
- * a block device.
- */
-void blk_integrity_unregister(struct gendisk *disk)
-{
- struct blk_integrity *bi = &disk->queue->integrity;
-
- if (!bi->profile)
- return;
-
- /* ensure all bios are off the integrity workqueue */
- blk_flush_integrity();
- blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue);
- memset(bi, 0, sizeof(*bi));
-}
-EXPORT_SYMBOL(blk_integrity_unregister);
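blk_rq_map_integrity_sg() now takes the request itself, and rq->nr_integrity_segments bounds the scatterlist size. A hedged sketch of a driver mapping a request's integrity metadata with the new calling convention; the helper name and the use of a struct sg_table are illustrative.

#include <linux/blk-integrity.h>
#include <linux/scatterlist.h>

static int example_map_integrity(struct request *rq, struct sg_table *sgt)
{
	if (!blk_integrity_rq(rq))
		return 0;

	/* rq->nr_integrity_segments is an upper bound for the table size. */
	if (sg_alloc_table(sgt, rq->nr_integrity_segments, GFP_ATOMIC))
		return -ENOMEM;

	/* Returns the number of scatterlist entries actually used. */
	return blk_rq_map_integrity_sg(rq, sgt->sgl);
}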
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 25dd4db11121..ce82770c72ab 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -32,13 +32,6 @@ static void get_io_context(struct io_context *ioc)
atomic_long_inc(&ioc->refcount);
}
-static void icq_free_icq_rcu(struct rcu_head *head)
-{
- struct io_cq *icq = container_of(head, struct io_cq, __rcu_head);
-
- kmem_cache_free(icq->__rcu_icq_cache, icq);
-}
-
/*
* Exit an icq. Called with ioc locked for blk-mq, and with both ioc
* and queue locked for legacy.
@@ -102,7 +95,7 @@ static void ioc_destroy_icq(struct io_cq *icq)
*/
icq->__rcu_icq_cache = et->icq_cache;
icq->flags |= ICQ_DESTROYED;
- call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
+ kfree_rcu(icq, __rcu_head);
}
/*
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 690ca99dfaca..5bfd70311359 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -648,7 +648,7 @@ static const struct ioc_params autop[] = {
* vrate adjust percentages indexed by ioc->busy_level. We adjust up on
* vtime credit shortage and down on device saturation.
*/
-static u32 vrate_adj_pct[] =
+static const u32 vrate_adj_pct[] =
{ 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@@ -1098,7 +1098,14 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum,
iocg->child_active_sum);
} else {
- inuse = clamp_t(u32, inuse, 1, active);
+ /*
+ * It may be tempting to turn this into a clamp expression with
+ * a lower limit of 1 but active may be 0, which cannot be used
+ * as an upper limit in that situation. This expression allows
+ * active to clamp inuse unless it is 0, in which case inuse
+ * becomes 1.
+ */
+ inuse = min(inuse, active) ?: 1;
}
iocg->last_inuse = iocg->inuse;
@@ -2076,7 +2083,7 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
struct ioc_now *now)
{
struct ioc_gq *iocg;
- u64 dur, usage_pct, nr_cycles;
+ u64 dur, usage_pct, nr_cycles, nr_cycles_shift;
/* if no debtor, reset the cycle */
if (!nr_debtors) {
@@ -2138,10 +2145,12 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
old_debt = iocg->abs_vdebt;
old_delay = iocg->delay;
+ nr_cycles_shift = min_t(u64, nr_cycles, BITS_PER_LONG - 1);
if (iocg->abs_vdebt)
- iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
+ iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles_shift ?: 1;
+
if (iocg->delay)
- iocg->delay = iocg->delay >> nr_cycles ?: 1;
+ iocg->delay = iocg->delay >> nr_cycles_shift ?: 1;
iocg_kick_waitq(iocg, true, now);
@@ -2709,8 +2718,7 @@ retry_lock:
* All waiters are on iocg->waitq and the wait states are
* synchronized using waitq.lock.
*/
- init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
- wait.wait.private = current;
+ init_wait_func(&wait.wait, iocg_wake_fn);
wait.bio = bio;
wait.abs_cost = abs_cost;
wait.committed = false; /* will be set true by waker */
@@ -2995,8 +3003,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd)
iocg->hweight_inuse = WEIGHT_ONE;
init_waitqueue_head(&iocg->waitq);
- hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
- iocg->waitq_timer.function = iocg_waitq_timer_fn;
+ hrtimer_setup(&iocg->waitq_timer, iocg_waitq_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
iocg->level = blkg->blkcg->css.cgroup->level;
@@ -3164,7 +3171,7 @@ static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
if (!dname)
return 0;
- spin_lock_irq(&ioc->lock);
+ spin_lock(&ioc->lock);
seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
ioc->params.qos[QOS_RPPM] / 10000,
@@ -3177,7 +3184,7 @@ static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
ioc->params.qos[QOS_MIN] % 10000 / 100,
ioc->params.qos[QOS_MAX] / 10000,
ioc->params.qos[QOS_MAX] % 10000 / 100);
- spin_unlock_irq(&ioc->lock);
+ spin_unlock(&ioc->lock);
return 0;
}
@@ -3215,13 +3222,16 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
u32 qos[NR_QOS_PARAMS];
bool enable, user;
char *body, *p;
+ unsigned long memflags;
int ret;
blkg_conf_init(&ctx, input);
- ret = blkg_conf_open_bdev(&ctx);
- if (ret)
+ memflags = blkg_conf_open_bdev_frozen(&ctx);
+ if (IS_ERR_VALUE(memflags)) {
+ ret = memflags;
goto err;
+ }
body = ctx.body;
disk = ctx.bdev->bd_disk;
@@ -3238,7 +3248,6 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
ioc = q_to_ioc(disk->queue);
}
- blk_mq_freeze_queue(disk->queue);
blk_mq_quiesce_queue(disk->queue);
spin_lock_irq(&ioc->lock);
@@ -3338,19 +3347,15 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
wbt_enable_default(disk);
blk_mq_unquiesce_queue(disk->queue);
- blk_mq_unfreeze_queue(disk->queue);
- blkg_conf_exit(&ctx);
+ blkg_conf_exit_frozen(&ctx, memflags);
return nbytes;
einval:
spin_unlock_irq(&ioc->lock);
-
blk_mq_unquiesce_queue(disk->queue);
- blk_mq_unfreeze_queue(disk->queue);
-
ret = -EINVAL;
err:
- blkg_conf_exit(&ctx);
+ blkg_conf_exit_frozen(&ctx, memflags);
return ret;
}
@@ -3364,14 +3369,14 @@ static u64 ioc_cost_model_prfill(struct seq_file *sf,
if (!dname)
return 0;
- spin_lock_irq(&ioc->lock);
+ spin_lock(&ioc->lock);
seq_printf(sf, "%s ctrl=%s model=linear "
"rbps=%llu rseqiops=%llu rrandiops=%llu "
"wbps=%llu wseqiops=%llu wrandiops=%llu\n",
dname, ioc->user_cost_model ? "user" : "auto",
u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
- spin_unlock_irq(&ioc->lock);
+ spin_unlock(&ioc->lock);
return 0;
}
@@ -3405,6 +3410,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
{
struct blkg_conf_ctx ctx;
struct request_queue *q;
+ unsigned int memflags;
struct ioc *ioc;
u64 u[NR_I_LCOEFS];
bool user;
@@ -3432,7 +3438,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
ioc = q_to_ioc(q);
}
- blk_mq_freeze_queue(q);
+ memflags = blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
spin_lock_irq(&ioc->lock);
@@ -3484,7 +3490,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
spin_unlock_irq(&ioc->lock);
blk_mq_unquiesce_queue(q);
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
blkg_conf_exit(&ctx);
return nbytes;
@@ -3493,7 +3499,7 @@ einval:
spin_unlock_irq(&ioc->lock);
blk_mq_unquiesce_queue(q);
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
ret = -EINVAL;
err:
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index ebb522788d97..42c1e0b9a68f 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -749,9 +749,11 @@ static void blkiolatency_enable_work_fn(struct work_struct *work)
*/
enabled = atomic_read(&blkiolat->enable_cnt);
if (enabled != blkiolat->enabled) {
- blk_mq_freeze_queue(blkiolat->rqos.disk->queue);
+ unsigned int memflags;
+
+ memflags = blk_mq_freeze_queue(blkiolat->rqos.disk->queue);
blkiolat->enabled = enabled;
- blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue);
+ blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue, memflags);
}
}
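As the iocost and iolatency conversions above show, blk_mq_freeze_queue() now returns the memalloc flags it saved, and blk_mq_unfreeze_queue() takes them back to restore the context. A minimal sketch of the updated pattern; the function name and the elided update are illustrative.

#include <linux/blk-mq.h>

static void example_update_queue_frozen(struct request_queue *q)
{
	unsigned int memflags;

	memflags = blk_mq_freeze_queue(q);
	/* ... change queue state that must not race with in-flight I/O ... */
	blk_mq_unfreeze_queue(q, memflags);
}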
diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c
index 4051fada01f1..13659dc15c3f 100644
--- a/block/blk-ioprio.c
+++ b/block/blk-ioprio.c
@@ -50,14 +50,6 @@ static const char *policy_name[] = {
static struct blkcg_policy ioprio_policy;
/**
- * struct ioprio_blkg - Per (cgroup, request queue) data.
- * @pd: blkg_policy_data structure.
- */
-struct ioprio_blkg {
- struct blkg_policy_data pd;
-};
-
-/**
* struct ioprio_blkcg - Per cgroup data.
* @cpd: blkcg_policy_data structure.
* @prio_policy: One of the IOPRIO_CLASS_* values. See also <linux/ioprio.h>.
@@ -67,11 +59,6 @@ struct ioprio_blkcg {
enum prio_policy prio_policy;
};
-static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd)
-{
- return pd ? container_of(pd, struct ioprio_blkg, pd) : NULL;
-}
-
static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg)
{
return container_of(blkcg_to_cpd(blkcg, &ioprio_policy),
@@ -84,16 +71,6 @@ ioprio_blkcg_from_css(struct cgroup_subsys_state *css)
return blkcg_to_ioprio_blkcg(css_to_blkcg(css));
}
-static struct ioprio_blkcg *ioprio_blkcg_from_bio(struct bio *bio)
-{
- struct blkg_policy_data *pd = blkg_to_pd(bio->bi_blkg, &ioprio_policy);
-
- if (!pd)
- return NULL;
-
- return blkcg_to_ioprio_blkcg(pd->blkg->blkcg);
-}
-
static int ioprio_show_prio_policy(struct seq_file *sf, void *v)
{
struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf));
@@ -118,25 +95,6 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
return nbytes;
}
-static struct blkg_policy_data *
-ioprio_alloc_pd(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp)
-{
- struct ioprio_blkg *ioprio_blkg;
-
- ioprio_blkg = kzalloc(sizeof(*ioprio_blkg), gfp);
- if (!ioprio_blkg)
- return NULL;
-
- return &ioprio_blkg->pd;
-}
-
-static void ioprio_free_pd(struct blkg_policy_data *pd)
-{
- struct ioprio_blkg *ioprio_blkg = pd_to_ioprio(pd);
-
- kfree(ioprio_blkg);
-}
-
static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp)
{
struct ioprio_blkcg *blkcg;
@@ -155,38 +113,26 @@ static void ioprio_free_cpd(struct blkcg_policy_data *cpd)
kfree(blkcg);
}
-#define IOPRIO_ATTRS \
- { \
- .name = "prio.class", \
- .seq_show = ioprio_show_prio_policy, \
- .write = ioprio_set_prio_policy, \
- }, \
- { } /* sentinel */
-
-/* cgroup v2 attributes */
static struct cftype ioprio_files[] = {
- IOPRIO_ATTRS
-};
-
-/* cgroup v1 attributes */
-static struct cftype ioprio_legacy_files[] = {
- IOPRIO_ATTRS
+ {
+ .name = "prio.class",
+ .seq_show = ioprio_show_prio_policy,
+ .write = ioprio_set_prio_policy,
+ },
+ { } /* sentinel */
};
static struct blkcg_policy ioprio_policy = {
.dfl_cftypes = ioprio_files,
- .legacy_cftypes = ioprio_legacy_files,
+ .legacy_cftypes = ioprio_files,
.cpd_alloc_fn = ioprio_alloc_cpd,
.cpd_free_fn = ioprio_free_cpd,
-
- .pd_alloc_fn = ioprio_alloc_pd,
- .pd_free_fn = ioprio_free_pd,
};
void blkcg_set_ioprio(struct bio *bio)
{
- struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio);
+ struct ioprio_blkcg *blkcg = blkcg_to_ioprio_blkcg(bio->bi_blkg->blkcg);
u16 prio;
if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE)
@@ -219,16 +165,6 @@ void blkcg_set_ioprio(struct bio *bio)
bio->bi_ioprio = prio;
}
-void blk_ioprio_exit(struct gendisk *disk)
-{
- blkcg_deactivate_policy(disk, &ioprio_policy);
-}
-
-int blk_ioprio_init(struct gendisk *disk)
-{
- return blkcg_activate_policy(disk, &ioprio_policy);
-}
-
static int __init ioprio_init(void)
{
return blkcg_policy_register(&ioprio_policy);
diff --git a/block/blk-ioprio.h b/block/blk-ioprio.h
index b6afb8e80de0..9265143f9bc9 100644
--- a/block/blk-ioprio.h
+++ b/block/blk-ioprio.h
@@ -9,17 +9,8 @@ struct request_queue;
struct bio;
#ifdef CONFIG_BLK_CGROUP_IOPRIO
-int blk_ioprio_init(struct gendisk *disk);
-void blk_ioprio_exit(struct gendisk *disk);
void blkcg_set_ioprio(struct bio *bio);
#else
-static inline int blk_ioprio_init(struct gendisk *disk)
-{
- return 0;
-}
-static inline void blk_ioprio_exit(struct gendisk *disk)
-{
-}
static inline void blkcg_set_ioprio(struct bio *bio)
{
}
diff --git a/block/blk-lib.c b/block/blk-lib.c
index a6954eafb8c8..4c9f20a689f7 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -35,51 +35,39 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector)
return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT;
}
-int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, struct bio **biop)
+struct bio *blk_alloc_discard_bio(struct block_device *bdev,
+ sector_t *sector, sector_t *nr_sects, gfp_t gfp_mask)
{
- struct bio *bio = *biop;
- sector_t bs_mask;
-
- if (bdev_read_only(bdev))
- return -EPERM;
- if (!bdev_max_discard_sectors(bdev))
- return -EOPNOTSUPP;
-
- /* In case the discard granularity isn't set by buggy device driver */
- if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) {
- pr_err_ratelimited("%pg: Error: discard_granularity is 0.\n",
- bdev);
- return -EOPNOTSUPP;
- }
-
- bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
- if ((sector | nr_sects) & bs_mask)
- return -EINVAL;
+ sector_t bio_sects = min(*nr_sects, bio_discard_limit(bdev, *sector));
+ struct bio *bio;
- if (!nr_sects)
- return -EINVAL;
+ if (!bio_sects)
+ return NULL;
- while (nr_sects) {
- sector_t req_sects =
- min(nr_sects, bio_discard_limit(bdev, sector));
+ bio = bio_alloc(bdev, 0, REQ_OP_DISCARD, gfp_mask);
+ if (!bio)
+ return NULL;
+ bio->bi_iter.bi_sector = *sector;
+ bio->bi_iter.bi_size = bio_sects << SECTOR_SHIFT;
+ *sector += bio_sects;
+ *nr_sects -= bio_sects;
+ /*
+ * We can loop for a long time in here if someone does full device
+ * discards (like mkfs). Be nice and allow us to schedule out to avoid
+ * softlocking if preempt is disabled.
+ */
+ cond_resched();
+ return bio;
+}
- bio = blk_next_bio(bio, bdev, 0, REQ_OP_DISCARD, gfp_mask);
- bio->bi_iter.bi_sector = sector;
- bio->bi_iter.bi_size = req_sects << 9;
- sector += req_sects;
- nr_sects -= req_sects;
-
- /*
- * We can loop for a long time in here, if someone does
- * full device discards (like mkfs). Be nice and allow
- * us to schedule out to avoid softlocking if preempt
- * is disabled.
- */
- cond_resched();
- }
+int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, struct bio **biop)
+{
+ struct bio *bio;
- *biop = bio;
+ while ((bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects,
+ gfp_mask)))
+ *biop = bio_chain_and_submit(*biop, bio);
return 0;
}
EXPORT_SYMBOL(__blkdev_issue_discard);
@@ -115,38 +103,80 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL(blkdev_issue_discard);
-static int __blkdev_issue_write_zeroes(struct block_device *bdev,
- sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
- struct bio **biop, unsigned flags)
+static sector_t bio_write_zeroes_limit(struct block_device *bdev)
{
- struct bio *bio = *biop;
- unsigned int max_sectors;
-
- if (bdev_read_only(bdev))
- return -EPERM;
+ sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
- /* Ensure that max_sectors doesn't overflow bi_size */
- max_sectors = bdev_write_zeroes_sectors(bdev);
+ return min(bdev_write_zeroes_sectors(bdev),
+ (UINT_MAX >> SECTOR_SHIFT) & ~bs_mask);
+}
- if (max_sectors == 0)
- return -EOPNOTSUPP;
+/*
+ * There is no reliable way for the SCSI subsystem to determine whether a
+ * device supports a WRITE SAME operation without actually performing a write
+ * to media. As a result, write_zeroes is enabled by default and will be
+ * disabled if a zeroing operation subsequently fails. This means that this
+ * queue limit is likely to change at runtime.
+ */
+static void __blkdev_issue_write_zeroes(struct block_device *bdev,
+ sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
+ struct bio **biop, unsigned flags, sector_t limit)
+{
while (nr_sects) {
- unsigned int len = min_t(sector_t, nr_sects, max_sectors);
+ unsigned int len = min(nr_sects, limit);
+ struct bio *bio;
+
+ if ((flags & BLKDEV_ZERO_KILLABLE) &&
+ fatal_signal_pending(current))
+ break;
- bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask);
+ bio = bio_alloc(bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask);
bio->bi_iter.bi_sector = sector;
if (flags & BLKDEV_ZERO_NOUNMAP)
bio->bi_opf |= REQ_NOUNMAP;
bio->bi_iter.bi_size = len << SECTOR_SHIFT;
+ *biop = bio_chain_and_submit(*biop, bio);
+
nr_sects -= len;
sector += len;
cond_resched();
}
+}
- *biop = bio;
- return 0;
+static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp, unsigned flags)
+{
+ sector_t limit = bio_write_zeroes_limit(bdev);
+ struct bio *bio = NULL;
+ struct blk_plug plug;
+ int ret = 0;
+
+ blk_start_plug(&plug);
+ __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp, &bio,
+ flags, limit);
+ if (bio) {
+ if ((flags & BLKDEV_ZERO_KILLABLE) &&
+ fatal_signal_pending(current)) {
+ bio_await_chain(bio);
+ blk_finish_plug(&plug);
+ return -EINTR;
+ }
+ ret = submit_bio_wait(bio);
+ bio_put(bio);
+ }
+ blk_finish_plug(&plug);
+
+ /*
+ * For some devices there is no non-destructive way to verify whether
+ * WRITE ZEROES is actually supported. These will clear the capability
+ * on an I/O error, in which case we'll turn any error into
+ * "not supported" here.
+ */
+ if (ret && !bdev_write_zeroes_sectors(bdev))
+ return -EOPNOTSUPP;
+ return ret;
}
/*
@@ -162,35 +192,63 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects)
return min(pages, (sector_t)BIO_MAX_VECS);
}
-static int __blkdev_issue_zero_pages(struct block_device *bdev,
+static void __blkdev_issue_zero_pages(struct block_device *bdev,
sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
- struct bio **biop)
+ struct bio **biop, unsigned int flags)
{
- struct bio *bio = *biop;
- int bi_size = 0;
- unsigned int sz;
-
- if (bdev_read_only(bdev))
- return -EPERM;
+ while (nr_sects) {
+ unsigned int nr_vecs = __blkdev_sectors_to_bio_pages(nr_sects);
+ struct bio *bio;
- while (nr_sects != 0) {
- bio = blk_next_bio(bio, bdev, __blkdev_sectors_to_bio_pages(nr_sects),
- REQ_OP_WRITE, gfp_mask);
+ bio = bio_alloc(bdev, nr_vecs, REQ_OP_WRITE, gfp_mask);
bio->bi_iter.bi_sector = sector;
- while (nr_sects != 0) {
- sz = min((sector_t) PAGE_SIZE, nr_sects << 9);
- bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0);
- nr_sects -= bi_size >> 9;
- sector += bi_size >> 9;
- if (bi_size < sz)
+ if ((flags & BLKDEV_ZERO_KILLABLE) &&
+ fatal_signal_pending(current))
+ break;
+
+ do {
+ unsigned int len, added;
+
+ len = min_t(sector_t,
+ PAGE_SIZE, nr_sects << SECTOR_SHIFT);
+ added = bio_add_page(bio, ZERO_PAGE(0), len, 0);
+ if (added < len)
break;
- }
+ nr_sects -= added >> SECTOR_SHIFT;
+ sector += added >> SECTOR_SHIFT;
+ } while (nr_sects);
+
+ *biop = bio_chain_and_submit(*biop, bio);
cond_resched();
}
+}
- *biop = bio;
- return 0;
+static int blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp, unsigned flags)
+{
+ struct bio *bio = NULL;
+ struct blk_plug plug;
+ int ret = 0;
+
+ if (flags & BLKDEV_ZERO_NOFALLBACK)
+ return -EOPNOTSUPP;
+
+ blk_start_plug(&plug);
+ __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp, &bio, flags);
+ if (bio) {
+ if ((flags & BLKDEV_ZERO_KILLABLE) &&
+ fatal_signal_pending(current)) {
+ bio_await_chain(bio);
+ blk_finish_plug(&plug);
+ return -EINTR;
+ }
+ ret = submit_bio_wait(bio);
+ bio_put(bio);
+ }
+ blk_finish_plug(&plug);
+
+ return ret;
}
/**
@@ -216,20 +274,21 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
unsigned flags)
{
- int ret;
- sector_t bs_mask;
+ sector_t limit = bio_write_zeroes_limit(bdev);
- bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
- if ((sector | nr_sects) & bs_mask)
- return -EINVAL;
-
- ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
- biop, flags);
- if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK))
- return ret;
+ if (bdev_read_only(bdev))
+ return -EPERM;
- return __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask,
- biop);
+ if (limit) {
+ __blkdev_issue_write_zeroes(bdev, sector, nr_sects,
+ gfp_mask, biop, flags, limit);
+ } else {
+ if (flags & BLKDEV_ZERO_NOFALLBACK)
+ return -EOPNOTSUPP;
+ __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask,
+ biop, flags);
+ }
+ return 0;
}
EXPORT_SYMBOL(__blkdev_issue_zeroout);
@@ -249,51 +308,21 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout);
int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, unsigned flags)
{
- int ret = 0;
- sector_t bs_mask;
- struct bio *bio;
- struct blk_plug plug;
- bool try_write_zeroes = !!bdev_write_zeroes_sectors(bdev);
+ int ret;
- bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
- if ((sector | nr_sects) & bs_mask)
+ if ((sector | nr_sects) & ((bdev_logical_block_size(bdev) >> 9) - 1))
return -EINVAL;
+ if (bdev_read_only(bdev))
+ return -EPERM;
-retry:
- bio = NULL;
- blk_start_plug(&plug);
- if (try_write_zeroes) {
- ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects,
- gfp_mask, &bio, flags);
- } else if (!(flags & BLKDEV_ZERO_NOFALLBACK)) {
- ret = __blkdev_issue_zero_pages(bdev, sector, nr_sects,
- gfp_mask, &bio);
- } else {
- /* No zeroing offload support */
- ret = -EOPNOTSUPP;
- }
- if (ret == 0 && bio) {
- ret = submit_bio_wait(bio);
- bio_put(bio);
- }
- blk_finish_plug(&plug);
- if (ret && try_write_zeroes) {
- if (!(flags & BLKDEV_ZERO_NOFALLBACK)) {
- try_write_zeroes = false;
- goto retry;
- }
- if (!bdev_write_zeroes_sectors(bdev)) {
- /*
- * Zeroing offload support was indicated, but the
- * device reported ILLEGAL REQUEST (for some devices
- * there is no non-destructive way to verify whether
- * WRITE ZEROES is actually supported).
- */
- ret = -EOPNOTSUPP;
- }
+ if (bdev_write_zeroes_sectors(bdev)) {
+ ret = blkdev_issue_write_zeroes(bdev, sector, nr_sects,
+ gfp_mask, flags);
+ if (ret != -EOPNOTSUPP)
+ return ret;
}
- return ret;
+ return blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, flags);
}
EXPORT_SYMBOL(blkdev_issue_zeroout);
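
The reworked zeroout path above first tries WRITE ZEROES when the device advertises it and only falls back to writing zero pages when BLKDEV_ZERO_NOFALLBACK is not set. A hedged caller sketch of that behaviour; zero_example_range() is illustrative:

#include <linux/blkdev.h>

/*
 * Hedged sketch: require the offload first, then retry allowing the
 * zero-page fallback if the device cannot do WRITE ZEROES.
 */
static int zero_example_range(struct block_device *bdev, sector_t sector,
			      sector_t nr_sects)
{
	int ret = blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_KERNEL,
				       BLKDEV_ZERO_NOFALLBACK);

	if (ret == -EOPNOTSUPP)
		ret = blkdev_issue_zeroout(bdev, sector, nr_sects,
					   GFP_KERNEL, 0);
	return ret;
}
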
diff --git a/block/blk-map.c b/block/blk-map.c
index 71210cdb3442..23e5d5ebe59e 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -189,7 +189,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
}
}
- if (bio_add_pc_page(rq->q, bio, page, bytes, offset) < bytes) {
+ if (bio_add_page(bio, page, bytes, offset) < bytes) {
if (!map_data)
__free_page(page);
break;
@@ -272,86 +272,27 @@ static struct bio *blk_rq_map_bio_alloc(struct request *rq,
static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
gfp_t gfp_mask)
{
- iov_iter_extraction_t extraction_flags = 0;
- unsigned int max_sectors = queue_max_hw_sectors(rq->q);
unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS);
struct bio *bio;
int ret;
- int j;
if (!iov_iter_count(iter))
return -EINVAL;
bio = blk_rq_map_bio_alloc(rq, nr_vecs, gfp_mask);
- if (bio == NULL)
+ if (!bio)
return -ENOMEM;
-
- if (blk_queue_pci_p2pdma(rq->q))
- extraction_flags |= ITER_ALLOW_P2PDMA;
- if (iov_iter_extract_will_pin(iter))
- bio_set_flag(bio, BIO_PAGE_PINNED);
-
- while (iov_iter_count(iter)) {
- struct page *stack_pages[UIO_FASTIOV];
- struct page **pages = stack_pages;
- ssize_t bytes;
- size_t offs;
- int npages;
-
- if (nr_vecs > ARRAY_SIZE(stack_pages))
- pages = NULL;
-
- bytes = iov_iter_extract_pages(iter, &pages, LONG_MAX,
- nr_vecs, extraction_flags, &offs);
- if (unlikely(bytes <= 0)) {
- ret = bytes ? bytes : -EFAULT;
- goto out_unmap;
- }
-
- npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE);
-
- if (unlikely(offs & queue_dma_alignment(rq->q)))
- j = 0;
- else {
- for (j = 0; j < npages; j++) {
- struct page *page = pages[j];
- unsigned int n = PAGE_SIZE - offs;
- bool same_page = false;
-
- if (n > bytes)
- n = bytes;
-
- if (!bio_add_hw_page(rq->q, bio, page, n, offs,
- max_sectors, &same_page))
- break;
-
- if (same_page)
- bio_release_page(bio, page);
- bytes -= n;
- offs = 0;
- }
- }
- /*
- * release the pages we didn't map into the bio, if any
- */
- while (j < npages)
- bio_release_page(bio, pages[j++]);
- if (pages != stack_pages)
- kvfree(pages);
- /* couldn't stuff something into bio? */
- if (bytes) {
- iov_iter_revert(iter, bytes);
- break;
- }
- }
-
+ ret = bio_iov_iter_get_pages(bio, iter);
+ if (ret)
+ goto out_put;
ret = blk_rq_append_bio(rq, bio);
if (ret)
- goto out_unmap;
+ goto out_release;
return 0;
- out_unmap:
+out_release:
bio_release_pages(bio, false);
+out_put:
blk_mq_map_bio_put(bio);
return ret;
}
@@ -376,65 +317,26 @@ static void bio_map_kern_endio(struct bio *bio)
kfree(bio);
}
-/**
- * bio_map_kern - map kernel address into bio
- * @q: the struct request_queue for the bio
- * @data: pointer to buffer to map
- * @len: length in bytes
- * @gfp_mask: allocation flags for bio allocation
- *
- * Map the kernel address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-static struct bio *bio_map_kern(struct request_queue *q, void *data,
- unsigned int len, gfp_t gfp_mask)
+static struct bio *bio_map_kern(void *data, unsigned int len, enum req_op op,
+ gfp_t gfp_mask)
{
- unsigned long kaddr = (unsigned long)data;
- unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = kaddr >> PAGE_SHIFT;
- const int nr_pages = end - start;
- bool is_vmalloc = is_vmalloc_addr(data);
- struct page *page;
- int offset, i;
+ unsigned int nr_vecs = bio_add_max_vecs(data, len);
struct bio *bio;
- bio = bio_kmalloc(nr_pages, gfp_mask);
+ bio = bio_kmalloc(nr_vecs, gfp_mask);
if (!bio)
return ERR_PTR(-ENOMEM);
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0);
-
- if (is_vmalloc) {
- flush_kernel_vmap_range(data, len);
+ bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, op);
+ if (is_vmalloc_addr(data)) {
bio->bi_private = data;
- }
-
- offset = offset_in_page(kaddr);
- for (i = 0; i < nr_pages; i++) {
- unsigned int bytes = PAGE_SIZE - offset;
-
- if (len <= 0)
- break;
-
- if (bytes > len)
- bytes = len;
-
- if (!is_vmalloc)
- page = virt_to_page(data);
- else
- page = vmalloc_to_page(data);
- if (bio_add_pc_page(q, bio, page, bytes,
- offset) < bytes) {
- /* we don't support partial mappings */
+ if (!bio_add_vmalloc(bio, data, len)) {
bio_uninit(bio);
kfree(bio);
return ERR_PTR(-EINVAL);
}
-
- data += bytes;
- len -= bytes;
- offset = 0;
+ } else {
+ bio_add_virt_nofail(bio, data, len);
}
-
bio->bi_end_io = bio_map_kern_endio;
return bio;
}
@@ -462,17 +364,16 @@ static void bio_copy_kern_endio_read(struct bio *bio)
/**
* bio_copy_kern - copy kernel address into bio
- * @q: the struct request_queue for the bio
* @data: pointer to buffer to copy
* @len: length in bytes
+ * @op: bio/request operation
* @gfp_mask: allocation flags for bio and page allocation
- * @reading: data direction is READ
*
* copy the kernel address into a bio suitable for io to a block
* device. Returns an error pointer in case of error.
*/
-static struct bio *bio_copy_kern(struct request_queue *q, void *data,
- unsigned int len, gfp_t gfp_mask, int reading)
+static struct bio *bio_copy_kern(void *data, unsigned int len, enum req_op op,
+ gfp_t gfp_mask)
{
unsigned long kaddr = (unsigned long)data;
unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -491,7 +392,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
bio = bio_kmalloc(nr_pages, gfp_mask);
if (!bio)
return ERR_PTR(-ENOMEM);
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0);
+ bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, op);
while (len) {
struct page *page;
@@ -504,21 +405,21 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
if (!page)
goto cleanup;
- if (!reading)
+ if (op_is_write(op))
memcpy(page_address(page), p, bytes);
- if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes)
+ if (bio_add_page(bio, page, bytes, 0) < bytes)
break;
len -= bytes;
p += bytes;
}
- if (reading) {
+ if (op_is_write(op)) {
+ bio->bi_end_io = bio_copy_kern_endio;
+ } else {
bio->bi_end_io = bio_copy_kern_endio_read;
bio->bi_private = data;
- } else {
- bio->bi_end_io = bio_copy_kern_endio;
}
return bio;
@@ -536,24 +437,33 @@ cleanup:
*/
int blk_rq_append_bio(struct request *rq, struct bio *bio)
{
- struct bvec_iter iter;
- struct bio_vec bv;
+ const struct queue_limits *lim = &rq->q->limits;
+ unsigned int max_bytes = lim->max_hw_sectors << SECTOR_SHIFT;
unsigned int nr_segs = 0;
+ int ret;
- bio_for_each_bvec(bv, bio, iter)
- nr_segs++;
+ /* check that the data layout matches the hardware restrictions */
+ ret = bio_split_rw_at(bio, lim, &nr_segs, max_bytes);
+ if (ret) {
+ /* if we would have to split the bio, copy instead */
+ if (ret > 0)
+ ret = -EREMOTEIO;
+ return ret;
+ }
- if (!rq->bio) {
- blk_rq_bio_prep(rq, bio, nr_segs);
- } else {
+ if (rq->bio) {
if (!ll_back_merge_fn(rq, bio, nr_segs))
return -EINVAL;
rq->biotail->bi_next = bio;
rq->biotail = bio;
- rq->__data_len += (bio)->bi_iter.bi_size;
+ rq->__data_len += bio->bi_iter.bi_size;
bio_crypt_free_ctx(bio);
+ return 0;
}
+ rq->nr_phys_segments = nr_segs;
+ rq->bio = rq->biotail = bio;
+ rq->__data_len = bio->bi_iter.bi_size;
return 0;
}
EXPORT_SYMBOL(blk_rq_append_bio);
@@ -561,57 +471,23 @@ EXPORT_SYMBOL(blk_rq_append_bio);
/* Prepare bio for passthrough IO given ITER_BVEC iter */
static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
{
- struct request_queue *q = rq->q;
- size_t nr_iter = iov_iter_count(iter);
- size_t nr_segs = iter->nr_segs;
- struct bio_vec *bvecs, *bvprvp = NULL;
- const struct queue_limits *lim = &q->limits;
- unsigned int nsegs = 0, bytes = 0;
+ unsigned int max_bytes = rq->q->limits.max_hw_sectors << SECTOR_SHIFT;
struct bio *bio;
- size_t i;
+ int ret;
- if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q))
- return -EINVAL;
- if (nr_segs > queue_max_segments(q))
+ if (!iov_iter_count(iter) || iov_iter_count(iter) > max_bytes)
return -EINVAL;
- /* no iovecs to alloc, as we already have a BVEC iterator */
+ /* reuse the bvecs from the iterator instead of allocating new ones */
bio = blk_rq_map_bio_alloc(rq, 0, GFP_KERNEL);
- if (bio == NULL)
+ if (!bio)
return -ENOMEM;
+ bio_iov_bvec_set(bio, iter);
- bio_iov_bvec_set(bio, (struct iov_iter *)iter);
- blk_rq_bio_prep(rq, bio, nr_segs);
-
- /* loop to perform a bunch of sanity checks */
- bvecs = (struct bio_vec *)iter->bvec;
- for (i = 0; i < nr_segs; i++) {
- struct bio_vec *bv = &bvecs[i];
-
- /*
- * If the queue doesn't support SG gaps and adding this
- * offset would create a gap, fallback to copy.
- */
- if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv->bv_offset)) {
- blk_mq_map_bio_put(bio);
- return -EREMOTEIO;
- }
- /* check full condition */
- if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len)
- goto put_bio;
- if (bytes + bv->bv_len > nr_iter)
- goto put_bio;
- if (bv->bv_offset + bv->bv_len > PAGE_SIZE)
- goto put_bio;
-
- nsegs++;
- bytes += bv->bv_len;
- bvprvp = bv;
- }
- return 0;
-put_bio:
- blk_mq_map_bio_put(bio);
- return -EINVAL;
+ ret = blk_rq_append_bio(rq, bio);
+ if (ret)
+ blk_mq_map_bio_put(bio);
+ return ret;
}
/**
@@ -634,15 +510,13 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
const struct iov_iter *iter, gfp_t gfp_mask)
{
bool copy = false, map_bvec = false;
- unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
+ unsigned long align = blk_lim_dma_alignment_and_pad(&q->limits);
struct bio *bio = NULL;
struct iov_iter i;
int ret = -EINVAL;
if (map_data)
copy = true;
- else if (blk_queue_may_bounce(q))
- copy = true;
else if (iov_iter_alignment(iter) & align)
copy = true;
else if (iov_iter_is_bvec(iter))
@@ -668,8 +542,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
ret = bio_copy_user_iov(rq, map_data, &i, gfp_mask);
else
ret = bio_map_user_iov(rq, &i, gfp_mask);
- if (ret)
+ if (ret) {
+ if (ret == -EREMOTEIO)
+ ret = -EINVAL;
goto unmap_rq;
+ }
if (!bio)
bio = rq->bio;
} while (iov_iter_count(&i));
@@ -757,6 +634,9 @@ int blk_rq_unmap_user(struct bio *bio)
bio_release_pages(bio, bio_data_dir(bio) == READ);
}
+ if (bio_integrity(bio))
+ bio_integrity_unmap_user(bio);
+
next_bio = bio;
bio = bio->bi_next;
blk_mq_map_bio_put(next_bio);
@@ -768,7 +648,6 @@ EXPORT_SYMBOL(blk_rq_unmap_user);
/**
* blk_rq_map_kern - map kernel data to a request, for passthrough requests
- * @q: request queue where request should be inserted
* @rq: request to fill
* @kbuf: the kernel buffer
* @len: length of user data
@@ -779,31 +658,26 @@ EXPORT_SYMBOL(blk_rq_unmap_user);
* buffer is used. Can be called multiple times to append multiple
* buffers.
*/
-int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
- unsigned int len, gfp_t gfp_mask)
+int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len,
+ gfp_t gfp_mask)
{
- int reading = rq_data_dir(rq) == READ;
unsigned long addr = (unsigned long) kbuf;
struct bio *bio;
int ret;
- if (len > (queue_max_hw_sectors(q) << 9))
+ if (len > (queue_max_hw_sectors(rq->q) << SECTOR_SHIFT))
return -EINVAL;
if (!len || !kbuf)
return -EINVAL;
- if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf) ||
- blk_queue_may_bounce(q))
- bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
+ if (!blk_rq_aligned(rq->q, addr, len) || object_is_on_stack(kbuf))
+ bio = bio_copy_kern(kbuf, len, req_op(rq), gfp_mask);
else
- bio = bio_map_kern(q, kbuf, len, gfp_mask);
+ bio = bio_map_kern(kbuf, len, req_op(rq), gfp_mask);
if (IS_ERR(bio))
return PTR_ERR(bio);
- bio->bi_opf &= ~REQ_OP_MASK;
- bio->bi_opf |= req_op(rq);
-
ret = blk_rq_append_bio(rq, bio);
if (unlikely(ret)) {
bio_uninit(bio);
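
blk_rq_map_kern() drops its request_queue argument in the hunk above; the queue comes from rq->q and the operation from req_op(rq). A hedged sketch of a passthrough caller after the change; every identifier other than the block layer calls is illustrative:

#include <linux/blk-mq.h>

/* Hedged sketch: map a kernel buffer with the new blk_rq_map_kern()
 * signature and execute the passthrough request synchronously. */
static int send_example_passthrough(struct request_queue *q, void *buf,
				    unsigned int len)
{
	struct request *rq;
	blk_status_t status;
	int ret;

	rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	ret = blk_rq_map_kern(rq, buf, len, GFP_KERNEL);	/* no queue argument */
	if (ret) {
		blk_mq_free_request(rq);
		return ret;
	}

	status = blk_execute_rq(rq, false);
	blk_mq_free_request(rq);
	return blk_status_to_errno(status);
}
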
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4e3483a16b75..3af1d284add5 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -7,7 +7,6 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
-#include <linux/scatterlist.h>
#include <linux/part_stat.h>
#include <linux/blk-cgroup.h>
@@ -105,9 +104,38 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim)
return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
}
-static struct bio *bio_split_discard(struct bio *bio,
- const struct queue_limits *lim,
- unsigned *nsegs, struct bio_set *bs)
+static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
+{
+ if (unlikely(split_sectors < 0))
+ goto error;
+
+ if (split_sectors) {
+ struct bio *split;
+
+ split = bio_split(bio, split_sectors, GFP_NOIO,
+ &bio->bi_bdev->bd_disk->bio_split);
+ if (IS_ERR(split)) {
+ split_sectors = PTR_ERR(split);
+ goto error;
+ }
+ split->bi_opf |= REQ_NOMERGE;
+ blkcg_bio_issue_init(split);
+ bio_chain(split, bio);
+ trace_block_split(split, bio->bi_iter.bi_sector);
+ WARN_ON_ONCE(bio_zone_write_plugging(bio));
+ submit_bio_noacct(bio);
+ return split;
+ }
+
+ return bio;
+error:
+ bio->bi_status = errno_to_blk_status(split_sectors);
+ bio_endio(bio);
+ return NULL;
+}
+
+struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
+ unsigned *nsegs)
{
unsigned int max_discard_sectors, granularity;
sector_t tmp;
@@ -121,10 +149,10 @@ static struct bio *bio_split_discard(struct bio *bio,
min(lim->max_discard_sectors, bio_allowed_max_sectors(lim));
max_discard_sectors -= max_discard_sectors % granularity;
if (unlikely(!max_discard_sectors))
- return NULL;
+ return bio;
if (bio_sectors(bio) <= max_discard_sectors)
- return NULL;
+ return bio;
split_sectors = max_discard_sectors;
@@ -139,19 +167,20 @@ static struct bio *bio_split_discard(struct bio *bio,
if (split_sectors > tmp)
split_sectors -= tmp;
- return bio_split(bio, split_sectors, GFP_NOIO, bs);
+ return bio_submit_split(bio, split_sectors);
}
-static struct bio *bio_split_write_zeroes(struct bio *bio,
- const struct queue_limits *lim,
- unsigned *nsegs, struct bio_set *bs)
+static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim,
+ bool is_atomic)
{
- *nsegs = 0;
- if (!lim->max_write_zeroes_sectors)
- return NULL;
- if (bio_sectors(bio) <= lim->max_write_zeroes_sectors)
- return NULL;
- return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
+ /*
+ * chunk_sectors must be a multiple of atomic_write_boundary_sectors if
+ * both non-zero.
+ */
+ if (is_atomic && lim->atomic_write_boundary_sectors)
+ return lim->atomic_write_boundary_sectors;
+
+ return lim->chunk_sectors;
}
/*
@@ -167,12 +196,25 @@ static inline unsigned get_max_io_size(struct bio *bio,
{
unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT;
unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT;
- unsigned max_sectors = lim->max_sectors, start, end;
+ bool is_atomic = bio->bi_opf & REQ_ATOMIC;
+ unsigned boundary_sectors = blk_boundary_sectors(lim, is_atomic);
+ unsigned max_sectors, start, end;
+
+ /*
+	 * We ignore lim->max_sectors for atomic writes because it may be less
+	 * than the actual bio size, which we cannot tolerate.
+ */
+ if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
+ max_sectors = lim->max_write_zeroes_sectors;
+ else if (is_atomic)
+ max_sectors = lim->atomic_write_max_sectors;
+ else
+ max_sectors = lim->max_sectors;
- if (lim->chunk_sectors) {
+ if (boundary_sectors) {
max_sectors = min(max_sectors,
- blk_chunk_sectors_left(bio->bi_iter.bi_sector,
- lim->chunk_sectors));
+ blk_boundary_sectors_left(bio->bi_iter.bi_sector,
+ boundary_sectors));
}
start = bio->bi_iter.bi_sector & (pbs - 1);
@@ -183,28 +225,6 @@ static inline unsigned get_max_io_size(struct bio *bio,
}
/**
- * get_max_segment_size() - maximum number of bytes to add as a single segment
- * @lim: Request queue limits.
- * @start_page: See below.
- * @offset: Offset from @start_page where to add a segment.
- *
- * Returns the maximum number of bytes that can be added as a single segment.
- */
-static inline unsigned get_max_segment_size(const struct queue_limits *lim,
- struct page *start_page, unsigned long offset)
-{
- unsigned long mask = lim->seg_boundary_mask;
-
- offset = mask & (page_to_phys(start_page) + offset);
-
- /*
- * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
- * after having calculated the minimum.
- */
- return min(mask - offset, (unsigned long)lim->max_segment_size - 1) + 1;
-}
-
-/**
* bvec_split_segs - verify whether or not a bvec should be split in the middle
* @lim: [in] queue limits to split based on
* @bv: [in] bvec to examine
@@ -228,15 +248,13 @@ static bool bvec_split_segs(const struct queue_limits *lim,
const struct bio_vec *bv, unsigned *nsegs, unsigned *bytes,
unsigned max_segs, unsigned max_bytes)
{
- unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
+ unsigned max_len = max_bytes - *bytes;
unsigned len = min(bv->bv_len, max_len);
unsigned total_len = 0;
unsigned seg_size = 0;
while (len && *nsegs < max_segs) {
- seg_size = get_max_segment_size(lim, bv->bv_page,
- bv->bv_offset + total_len);
- seg_size = min(seg_size, len);
+ seg_size = get_max_segment_size(lim, bvec_phys(bv) + total_len, len);
(*nsegs)++;
total_len += seg_size;
@@ -252,28 +270,28 @@ static bool bvec_split_segs(const struct queue_limits *lim,
return len > 0 || bv->bv_len > max_len;
}
+static unsigned int bio_split_alignment(struct bio *bio,
+ const struct queue_limits *lim)
+{
+ if (op_is_write(bio_op(bio)) && lim->zone_write_granularity)
+ return lim->zone_write_granularity;
+ return lim->logical_block_size;
+}
+
/**
- * bio_split_rw - split a bio in two bios
+ * bio_split_rw_at - check if and where to split a read/write bio
* @bio: [in] bio to be split
* @lim: [in] queue limits to split based on
* @segs: [out] number of segments in the bio with the first half of the sectors
- * @bs: [in] bio set to allocate the clone from
* @max_bytes: [in] maximum number of bytes per bio
*
- * Clone @bio, update the bi_iter of the clone to represent the first sectors
- * of @bio and update @bio->bi_iter to represent the remaining sectors. The
- * following is guaranteed for the cloned bio:
- * - That it has at most @max_bytes worth of data
- * - That it has at most queue_max_segments(@q) segments.
- *
- * Except for discard requests the cloned bio will point at the bi_io_vec of
- * the original bio. It is the responsibility of the caller to ensure that the
- * original bio is not freed before the cloned bio. The caller is also
- * responsible for ensuring that @bs is only destroyed after processing of the
- * split bio has finished.
+ * Find out if @bio needs to be split to fit the queue limits in @lim and a
+ * maximum size of @max_bytes. Returns a negative error number if @bio can't be
+ * split, 0 if the bio doesn't have to be split, or a positive sector offset if
+ * @bio needs to be split.
*/
-struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
- unsigned *segs, struct bio_set *bs, unsigned max_bytes)
+int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
+ unsigned *segs, unsigned max_bytes)
{
struct bio_vec bv, bvprv, *bvprvp = NULL;
struct bvec_iter iter;
@@ -289,7 +307,7 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
if (nsegs < lim->max_segments &&
bytes + bv.bv_len <= max_bytes &&
- bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
+ bv.bv_offset + bv.bv_len <= lim->min_segment_size) {
nsegs++;
bytes += bv.bv_len;
} else {
@@ -303,17 +321,17 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
}
*segs = nsegs;
- return NULL;
+ return 0;
split:
+ if (bio->bi_opf & REQ_ATOMIC)
+ return -EINVAL;
+
/*
* We can't sanely support splitting for a REQ_NOWAIT bio. End it
* with EAGAIN if splitting is required and return an error pointer.
*/
- if (bio->bi_opf & REQ_NOWAIT) {
- bio->bi_status = BLK_STS_AGAIN;
- bio_endio(bio);
- return ERR_PTR(-EAGAIN);
- }
+ if (bio->bi_opf & REQ_NOWAIT)
+ return -EAGAIN;
*segs = nsegs;
@@ -322,7 +340,7 @@ split:
* split size so that each bio is properly block size aligned, even if
* we do not use the full hardware limits.
*/
- bytes = ALIGN_DOWN(bytes, lim->logical_block_size);
+ bytes = ALIGN_DOWN(bytes, bio_split_alignment(bio, lim));
/*
* Bio splitting may cause subtle trouble such as hang when doing sync
@@ -330,57 +348,55 @@ split:
* big IO can be trival, disable iopoll when split needed.
*/
bio_clear_polled(bio);
- return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs);
+ return bytes >> SECTOR_SHIFT;
}
-EXPORT_SYMBOL_GPL(bio_split_rw);
+EXPORT_SYMBOL_GPL(bio_split_rw_at);
-/**
- * __bio_split_to_limits - split a bio to fit the queue limits
- * @bio: bio to be split
- * @lim: queue limits to split based on
- * @nr_segs: returns the number of segments in the returned bio
- *
- * Check if @bio needs splitting based on the queue limits, and if so split off
- * a bio fitting the limits from the beginning of @bio and return it. @bio is
- * shortened to the remainder and re-submitted.
+struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
+ unsigned *nr_segs)
+{
+ return bio_submit_split(bio,
+ bio_split_rw_at(bio, lim, nr_segs,
+ get_max_io_size(bio, lim) << SECTOR_SHIFT));
+}
+
+/*
+ * REQ_OP_ZONE_APPEND bios must never be split by the block layer.
*
- * The split bio is allocated from @q->bio_split, which is provided by the
- * block layer.
+ * But we want the nr_segs calculation provided by bio_split_rw_at, and having
+ * a good sanity check that the submitter built the bio correctly is nice to
+ * have as well.
*/
-struct bio *__bio_split_to_limits(struct bio *bio,
- const struct queue_limits *lim,
- unsigned int *nr_segs)
+struct bio *bio_split_zone_append(struct bio *bio,
+ const struct queue_limits *lim, unsigned *nr_segs)
{
- struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
- struct bio *split;
+ int split_sectors;
- switch (bio_op(bio)) {
- case REQ_OP_DISCARD:
- case REQ_OP_SECURE_ERASE:
- split = bio_split_discard(bio, lim, nr_segs, bs);
- break;
- case REQ_OP_WRITE_ZEROES:
- split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
- break;
- default:
- split = bio_split_rw(bio, lim, nr_segs, bs,
- get_max_io_size(bio, lim) << SECTOR_SHIFT);
- if (IS_ERR(split))
- return NULL;
- break;
- }
+ split_sectors = bio_split_rw_at(bio, lim, nr_segs,
+ lim->max_zone_append_sectors << SECTOR_SHIFT);
+ if (WARN_ON_ONCE(split_sectors > 0))
+ split_sectors = -EINVAL;
+ return bio_submit_split(bio, split_sectors);
+}
- if (split) {
- /* there isn't chance to merge the split bio */
- split->bi_opf |= REQ_NOMERGE;
+struct bio *bio_split_write_zeroes(struct bio *bio,
+ const struct queue_limits *lim, unsigned *nsegs)
+{
+ unsigned int max_sectors = get_max_io_size(bio, lim);
- blkcg_bio_issue_init(split);
- bio_chain(split, bio);
- trace_block_split(split, bio->bi_iter.bi_sector);
- submit_bio_noacct(bio);
- return split;
- }
- return bio;
+ *nsegs = 0;
+
+ /*
+ * An unset limit should normally not happen, as bio submission is keyed
+ * off having a non-zero limit. But SCSI can clear the limit in the
+ * I/O completion handler, and we can race and see this. Splitting to a
+ * zero limit obviously doesn't make sense, so band-aid it here.
+ */
+ if (!max_sectors)
+ return bio;
+ if (bio_sectors(bio) <= max_sectors)
+ return bio;
+ return bio_submit_split(bio, max_sectors);
}
/**
@@ -396,12 +412,9 @@ struct bio *__bio_split_to_limits(struct bio *bio,
*/
struct bio *bio_split_to_limits(struct bio *bio)
{
- const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
unsigned int nr_segs;
- if (bio_may_exceed_limits(bio, lim))
- return __bio_split_to_limits(bio, lim, &nr_segs);
- return bio;
+ return __bio_split_to_limits(bio, bdev_limits(bio->bi_bdev), &nr_segs);
}
EXPORT_SYMBOL(bio_split_to_limits);
@@ -438,167 +451,26 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
return nr_phys_segs;
}
-static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
- struct scatterlist *sglist)
-{
- if (!*sg)
- return sglist;
-
- /*
- * If the driver previously mapped a shorter list, we could see a
- * termination bit prematurely unless it fully inits the sg table
- * on each mapping. We KNOW that there must be more entries here
- * or the driver would be buggy, so force clear the termination bit
- * to avoid doing a full sg_init_table() in drivers for each command.
- */
- sg_unmark_end(*sg);
- return sg_next(*sg);
-}
-
-static unsigned blk_bvec_map_sg(struct request_queue *q,
- struct bio_vec *bvec, struct scatterlist *sglist,
- struct scatterlist **sg)
-{
- unsigned nbytes = bvec->bv_len;
- unsigned nsegs = 0, total = 0;
-
- while (nbytes > 0) {
- unsigned offset = bvec->bv_offset + total;
- unsigned len = min(get_max_segment_size(&q->limits,
- bvec->bv_page, offset), nbytes);
- struct page *page = bvec->bv_page;
-
- /*
- * Unfortunately a fair number of drivers barf on scatterlists
- * that have an offset larger than PAGE_SIZE, despite other
- * subsystems dealing with that invariant just fine. For now
- * stick to the legacy format where we never present those from
- * the block layer, but the code below should be removed once
- * these offenders (mostly MMC/SD drivers) are fixed.
- */
- page += (offset >> PAGE_SHIFT);
- offset &= ~PAGE_MASK;
-
- *sg = blk_next_sg(sg, sglist);
- sg_set_page(*sg, page, len, offset);
-
- total += len;
- nbytes -= len;
- nsegs++;
- }
-
- return nsegs;
-}
-
-static inline int __blk_bvec_map_sg(struct bio_vec bv,
- struct scatterlist *sglist, struct scatterlist **sg)
-{
- *sg = blk_next_sg(sg, sglist);
- sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset);
- return 1;
-}
-
-/* only try to merge bvecs into one sg if they are from two bios */
-static inline bool
-__blk_segment_map_sg_merge(struct request_queue *q, struct bio_vec *bvec,
- struct bio_vec *bvprv, struct scatterlist **sg)
-{
-
- int nbytes = bvec->bv_len;
-
- if (!*sg)
- return false;
-
- if ((*sg)->length + nbytes > queue_max_segment_size(q))
- return false;
-
- if (!biovec_phys_mergeable(q, bvprv, bvec))
- return false;
-
- (*sg)->length += nbytes;
-
- return true;
-}
-
-static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
- struct scatterlist *sglist,
- struct scatterlist **sg)
-{
- struct bio_vec bvec, bvprv = { NULL };
- struct bvec_iter iter;
- int nsegs = 0;
- bool new_bio = false;
-
- for_each_bio(bio) {
- bio_for_each_bvec(bvec, bio, iter) {
- /*
- * Only try to merge bvecs from two bios given we
- * have done bio internal merge when adding pages
- * to bio
- */
- if (new_bio &&
- __blk_segment_map_sg_merge(q, &bvec, &bvprv, sg))
- goto next_bvec;
-
- if (bvec.bv_offset + bvec.bv_len <= PAGE_SIZE)
- nsegs += __blk_bvec_map_sg(bvec, sglist, sg);
- else
- nsegs += blk_bvec_map_sg(q, &bvec, sglist, sg);
- next_bvec:
- new_bio = false;
- }
- if (likely(bio->bi_iter.bi_size)) {
- bvprv = bvec;
- new_bio = true;
- }
- }
-
- return nsegs;
-}
-
-/*
- * map a request to scatterlist, return number of sg entries setup. Caller
- * must make sure sg can hold rq->nr_phys_segments entries
- */
-int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
- struct scatterlist *sglist, struct scatterlist **last_sg)
-{
- int nsegs = 0;
-
- if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
- nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg);
- else if (rq->bio)
- nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg);
-
- if (*last_sg)
- sg_mark_end(*last_sg);
-
- /*
- * Something must have been wrong if the figured number of
- * segment is bigger than number of req's physical segments
- */
- WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));
-
- return nsegs;
-}
-EXPORT_SYMBOL(__blk_rq_map_sg);
-
static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
sector_t offset)
{
struct request_queue *q = rq->q;
- unsigned int max_sectors;
+ struct queue_limits *lim = &q->limits;
+ unsigned int max_sectors, boundary_sectors;
+ bool is_atomic = rq->cmd_flags & REQ_ATOMIC;
if (blk_rq_is_passthrough(rq))
return q->limits.max_hw_sectors;
- max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
- if (!q->limits.chunk_sectors ||
+ boundary_sectors = blk_boundary_sectors(lim, is_atomic);
+ max_sectors = blk_queue_get_max_sectors(rq);
+
+ if (!boundary_sectors ||
req_op(rq) == REQ_OP_DISCARD ||
req_op(rq) == REQ_OP_SECURE_ERASE)
return max_sectors;
return min(max_sectors,
- blk_chunk_sectors_left(offset, q->limits.chunk_sectors));
+ blk_boundary_sectors_left(offset, boundary_sectors));
}
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
@@ -622,6 +494,9 @@ static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
* counters.
*/
req->nr_phys_segments += nr_phys_segs;
+ if (bio_integrity(bio))
+ req->nr_integrity_segments += blk_rq_count_integrity_sg(req->q,
+ bio);
return 1;
no_merge:
@@ -714,6 +589,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
/* Merge is OK... */
req->nr_phys_segments = total_phys_segments;
+ req->nr_integrity_segments += next->nr_integrity_segments;
return 1;
}
@@ -776,9 +652,11 @@ static inline void blk_update_mixed_merge(struct request *req,
static void blk_account_io_merge_request(struct request *req)
{
- if (blk_do_io_stat(req)) {
+ if (req->rq_flags & RQF_IO_STAT) {
part_stat_lock();
part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+ part_stat_local_dec(req->part,
+ in_flight[op_is_write(req_op(req))]);
part_stat_unlock();
}
}
@@ -794,6 +672,18 @@ static enum elv_merge blk_try_req_merge(struct request *req,
return ELEVATOR_NO_MERGE;
}
+static bool blk_atomic_write_mergeable_rq_bio(struct request *rq,
+ struct bio *bio)
+{
+ return (rq->cmd_flags & REQ_ATOMIC) == (bio->bi_opf & REQ_ATOMIC);
+}
+
+static bool blk_atomic_write_mergeable_rqs(struct request *rq,
+ struct request *next)
+{
+ return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC);
+}
+
/*
* For non-mq, this has to be called with the request spinlock acquired.
* For mq with scheduling, the appropriate queue wide lock should be held.
@@ -807,14 +697,13 @@ static struct request *attempt_merge(struct request_queue *q,
if (req_op(req) != req_op(next))
return NULL;
- if (rq_data_dir(req) != rq_data_dir(next))
+ if (req->bio->bi_write_hint != next->bio->bi_write_hint)
return NULL;
-
- /* Don't merge requests with different write hints. */
- if (req->write_hint != next->write_hint)
+ if (req->bio->bi_write_stream != next->bio->bi_write_stream)
return NULL;
-
- if (req->ioprio != next->ioprio)
+ if (req->bio->bi_ioprio != next->bio->bi_ioprio)
+ return NULL;
+ if (!blk_atomic_write_mergeable_rqs(req, next))
return NULL;
/*
@@ -925,27 +814,19 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (req_op(rq) != bio_op(bio))
return false;
- /* different data direction or already started, don't merge */
- if (bio_data_dir(bio) != rq_data_dir(rq))
- return false;
-
- /* don't merge across cgroup boundaries */
if (!blk_cgroup_mergeable(rq, bio))
return false;
-
- /* only merge integrity protected bio into ditto rq */
if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
return false;
-
- /* Only merge if the crypt contexts are compatible */
if (!bio_crypt_rq_ctx_compatible(rq, bio))
return false;
-
- /* Don't merge requests with different write hints. */
- if (rq->write_hint != bio->bi_write_hint)
+ if (rq->bio->bi_write_hint != bio->bi_write_hint)
return false;
-
- if (rq->ioprio != bio_prio(bio))
+ if (rq->bio->bi_write_stream != bio->bi_write_stream)
+ return false;
+ if (rq->bio->bi_ioprio != bio->bi_ioprio)
+ return false;
+ if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false)
return false;
return true;
@@ -964,21 +845,14 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
static void blk_account_io_merge_bio(struct request *req)
{
- if (!blk_do_io_stat(req))
- return;
-
- part_stat_lock();
- part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
- part_stat_unlock();
+ if (req->rq_flags & RQF_IO_STAT) {
+ part_stat_lock();
+ part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+ part_stat_unlock();
+ }
}
-enum bio_merge_status {
- BIO_MERGE_OK,
- BIO_MERGE_NONE,
- BIO_MERGE_FAILED,
-};
-
-static enum bio_merge_status bio_attempt_back_merge(struct request *req,
+enum bio_merge_status bio_attempt_back_merge(struct request *req,
struct bio *bio, unsigned int nr_segs)
{
const blk_opf_t ff = bio_failfast(bio);
@@ -994,6 +868,9 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req,
blk_update_mixed_merge(req, bio, false);
+ if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
+ blk_zone_write_plug_bio_merged(bio);
+
req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
@@ -1009,6 +886,14 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req,
{
const blk_opf_t ff = bio_failfast(bio);
+ /*
+ * A front merge for writes to sequential zones of a zoned block device
+ * can happen only if the user submitted writes out of order. Do not
+ * merge such write to let it fail.
+ */
+ if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
+ return BIO_MERGE_FAILED;
+
if (!ll_front_merge_fn(req, bio, nr_segs))
return BIO_MERGE_FAILED;
@@ -1107,11 +992,10 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
- struct blk_plug *plug;
+ struct blk_plug *plug = current->plug;
struct request *rq;
- plug = blk_mq_plug(bio);
- if (!plug || rq_list_empty(plug->mq_list))
+ if (!plug || rq_list_empty(&plug->mq_list))
return false;
rq_list_for_each(&plug->mq_list, rq) {
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 9638b25fd521..444798c5374f 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -11,6 +11,7 @@
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/group_cpus.h>
+#include <linux/device/bus.h>
#include "blk.h"
#include "blk-mq.h"
@@ -54,3 +55,38 @@ int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index)
return NUMA_NO_NODE;
}
+
+/**
+ * blk_mq_map_hw_queues - Create CPU to hardware queue mapping
+ * @qmap: CPU to hardware queue map
+ * @dev: The device to map queues
+ * @offset: Queue offset to use for the device
+ *
+ * Create a CPU to hardware queue mapping in @qmap. The struct bus_type
+ * irq_get_affinity callback will be used to retrieve the affinity.
+ */
+void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap,
+ struct device *dev, unsigned int offset)
+
+{
+ const struct cpumask *mask;
+ unsigned int queue, cpu;
+
+ if (!dev->bus->irq_get_affinity)
+ goto fallback;
+
+ for (queue = 0; queue < qmap->nr_queues; queue++) {
+ mask = dev->bus->irq_get_affinity(dev, queue + offset);
+ if (!mask)
+ goto fallback;
+
+ for_each_cpu(cpu, mask)
+ qmap->mq_map[cpu] = qmap->queue_offset + queue;
+ }
+
+ return;
+
+fallback:
+ blk_mq_map_queues(qmap);
+}
+EXPORT_SYMBOL_GPL(blk_mq_map_hw_queues);
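
blk_mq_map_hw_queues() replaces the bus-specific mapping helpers (the PCI variant is removed further below) by going through the bus's irq_get_affinity callback. A hedged sketch of a driver's ->map_queues callback after conversion; the example_* names are placeholders, not code from this patch:

#include <linux/blk-mq.h>
#include <linux/pci.h>

struct example_dev {			/* hypothetical driver-private data */
	struct pci_dev *pdev;
};

/* Hedged sketch: let the generic helper derive the CPU-to-queue map
 * from the device's IRQ affinity instead of blk_mq_pci_map_queues(). */
static void example_map_queues(struct blk_mq_tag_set *set)
{
	struct example_dev *edev = set->driver_data;

	blk_mq_map_hw_queues(&set->map[HCTX_TYPE_DEFAULT],
			     &edev->pdev->dev, 0);
}
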
diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c
deleted file mode 100644
index a77b099c34b7..000000000000
--- a/block/blk-mq-debugfs-zoned.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2017 Western Digital Corporation or its affiliates.
- */
-
-#include <linux/blkdev.h>
-#include "blk-mq-debugfs.h"
-
-int queue_zone_wlock_show(void *data, struct seq_file *m)
-{
- struct request_queue *q = data;
- unsigned int i;
-
- if (!q->disk->seq_zones_wlock)
- return 0;
-
- for (i = 0; i < q->disk->nr_zones; i++)
- if (test_bit(i, q->disk->seq_zones_wlock))
- seq_printf(m, "%u\n", i);
-
- return 0;
-}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 94668e72ab09..29b3540dd180 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -5,6 +5,7 @@
#include <linux/kernel.h>
#include <linux/blkdev.h>
+#include <linux/build_bug.h>
#include <linux/debugfs.h>
#include "blk.h"
@@ -79,33 +80,21 @@ static int queue_pm_only_show(void *data, struct seq_file *m)
#define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name
static const char *const blk_queue_flag_name[] = {
- QUEUE_FLAG_NAME(STOPPED),
QUEUE_FLAG_NAME(DYING),
QUEUE_FLAG_NAME(NOMERGES),
QUEUE_FLAG_NAME(SAME_COMP),
QUEUE_FLAG_NAME(FAIL_IO),
- QUEUE_FLAG_NAME(NONROT),
- QUEUE_FLAG_NAME(IO_STAT),
QUEUE_FLAG_NAME(NOXMERGES),
- QUEUE_FLAG_NAME(ADD_RANDOM),
- QUEUE_FLAG_NAME(SYNCHRONOUS),
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(INIT_DONE),
- QUEUE_FLAG_NAME(STABLE_WRITES),
- QUEUE_FLAG_NAME(POLL),
- QUEUE_FLAG_NAME(WC),
- QUEUE_FLAG_NAME(FUA),
- QUEUE_FLAG_NAME(DAX),
QUEUE_FLAG_NAME(STATS),
QUEUE_FLAG_NAME(REGISTERED),
QUEUE_FLAG_NAME(QUIESCED),
- QUEUE_FLAG_NAME(PCI_P2PDMA),
- QUEUE_FLAG_NAME(ZONE_RESETALL),
QUEUE_FLAG_NAME(RQ_ALLOC_TIME),
QUEUE_FLAG_NAME(HCTX_ACTIVE),
- QUEUE_FLAG_NAME(NOWAIT),
QUEUE_FLAG_NAME(SQ_SCHED),
- QUEUE_FLAG_NAME(SKIP_TAGSET_QUIESCE),
+ QUEUE_FLAG_NAME(DISABLE_WBT_DEF),
+ QUEUE_FLAG_NAME(NO_ELV_SWITCH),
};
#undef QUEUE_FLAG_NAME
@@ -113,6 +102,7 @@ static int queue_state_show(void *data, struct seq_file *m)
{
struct request_queue *q = data;
+ BUILD_BUG_ON(ARRAY_SIZE(blk_queue_flag_name) != QUEUE_FLAG_MAX);
blk_flags_show(m, q->queue_flags, blk_queue_flag_name,
ARRAY_SIZE(blk_queue_flag_name));
seq_puts(m, "\n");
@@ -160,7 +150,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
{ "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
{ "pm_only", 0600, queue_pm_only_show, NULL },
{ "state", 0600, queue_state_show, queue_state_write },
- { "zone_wlock", 0400, queue_zone_wlock_show, NULL },
+ { "zone_wplugs", 0400, queue_zone_wplugs_show, NULL },
{ },
};
@@ -177,45 +167,32 @@ static int hctx_state_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
+ BUILD_BUG_ON(ARRAY_SIZE(hctx_state_name) != BLK_MQ_S_MAX);
blk_flags_show(m, hctx->state, hctx_state_name,
ARRAY_SIZE(hctx_state_name));
seq_puts(m, "\n");
return 0;
}
-#define BLK_TAG_ALLOC_NAME(name) [BLK_TAG_ALLOC_##name] = #name
-static const char *const alloc_policy_name[] = {
- BLK_TAG_ALLOC_NAME(FIFO),
- BLK_TAG_ALLOC_NAME(RR),
-};
-#undef BLK_TAG_ALLOC_NAME
-
#define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name
static const char *const hctx_flag_name[] = {
- HCTX_FLAG_NAME(SHOULD_MERGE),
HCTX_FLAG_NAME(TAG_QUEUE_SHARED),
- HCTX_FLAG_NAME(BLOCKING),
- HCTX_FLAG_NAME(NO_SCHED),
HCTX_FLAG_NAME(STACKING),
HCTX_FLAG_NAME(TAG_HCTX_SHARED),
+ HCTX_FLAG_NAME(BLOCKING),
+ HCTX_FLAG_NAME(TAG_RR),
+ HCTX_FLAG_NAME(NO_SCHED_BY_DEFAULT),
};
#undef HCTX_FLAG_NAME
static int hctx_flags_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
- const int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(hctx->flags);
- seq_puts(m, "alloc_policy=");
- if (alloc_policy < ARRAY_SIZE(alloc_policy_name) &&
- alloc_policy_name[alloc_policy])
- seq_puts(m, alloc_policy_name[alloc_policy]);
- else
- seq_printf(m, "%d", alloc_policy);
- seq_puts(m, " ");
- blk_flags_show(m,
- hctx->flags ^ BLK_ALLOC_POLICY_TO_MQ_FLAG(alloc_policy),
- hctx_flag_name, ARRAY_SIZE(hctx_flag_name));
+ BUILD_BUG_ON(ARRAY_SIZE(hctx_flag_name) != ilog2(BLK_MQ_F_MAX));
+
+ blk_flags_show(m, hctx->flags, hctx_flag_name,
+ ARRAY_SIZE(hctx_flag_name));
seq_puts(m, "\n");
return 0;
}
@@ -236,12 +213,17 @@ static const char *const cmd_flag_name[] = {
CMD_FLAG_NAME(RAHEAD),
CMD_FLAG_NAME(BACKGROUND),
CMD_FLAG_NAME(NOWAIT),
- CMD_FLAG_NAME(NOUNMAP),
CMD_FLAG_NAME(POLLED),
+ CMD_FLAG_NAME(ALLOC_CACHE),
+ CMD_FLAG_NAME(SWAP),
+ CMD_FLAG_NAME(DRV),
+ CMD_FLAG_NAME(FS_PRIVATE),
+ CMD_FLAG_NAME(ATOMIC),
+ CMD_FLAG_NAME(NOUNMAP),
};
#undef CMD_FLAG_NAME
-#define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name
+#define RQF_NAME(name) [__RQF_##name] = #name
static const char *const rqf_name[] = {
RQF_NAME(STARTED),
RQF_NAME(FLUSH_SEQ),
@@ -256,7 +238,7 @@ static const char *const rqf_name[] = {
RQF_NAME(HASHED),
RQF_NAME(STATS),
RQF_NAME(SPECIAL_PAYLOAD),
- RQF_NAME(ZONE_WRITE_LOCKED),
+ RQF_NAME(ZONE_WRITE_PLUGGING),
RQF_NAME(TIMED_OUT),
RQF_NAME(RESV),
};
@@ -282,6 +264,9 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
const enum req_op op = req_op(rq);
const char *op_str = blk_op_str(op);
+ BUILD_BUG_ON(ARRAY_SIZE(cmd_flag_name) != __REQ_NR_BITS);
+ BUILD_BUG_ON(ARRAY_SIZE(rqf_name) != __RQF_BITS);
+
seq_printf(m, "%p {.op=", rq);
if (strcmp(op_str, "UNKNOWN") == 0)
seq_printf(m, "%u", op);
@@ -364,9 +349,14 @@ static int hctx_busy_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
struct show_busy_params params = { .m = m, .hctx = hctx };
+ int res;
+ res = mutex_lock_interruptible(&hctx->queue->elevator_lock);
+ if (res)
+ return res;
blk_mq_tagset_busy_iter(hctx->queue->tag_set, hctx_show_busy_rq,
&params);
+ mutex_unlock(&hctx->queue->elevator_lock);
return 0;
}
@@ -417,15 +407,14 @@ static int hctx_tags_show(void *data, struct seq_file *m)
struct request_queue *q = hctx->queue;
int res;
- res = mutex_lock_interruptible(&q->sysfs_lock);
+ res = mutex_lock_interruptible(&q->elevator_lock);
if (res)
- goto out;
+ return res;
if (hctx->tags)
blk_mq_debugfs_tags_show(m, hctx->tags);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
-out:
- return res;
+ return 0;
}
static int hctx_tags_bitmap_show(void *data, struct seq_file *m)
@@ -434,15 +423,14 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m)
struct request_queue *q = hctx->queue;
int res;
- res = mutex_lock_interruptible(&q->sysfs_lock);
+ res = mutex_lock_interruptible(&q->elevator_lock);
if (res)
- goto out;
+ return res;
if (hctx->tags)
sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
-out:
- return res;
+ return 0;
}
static int hctx_sched_tags_show(void *data, struct seq_file *m)
@@ -451,15 +439,14 @@ static int hctx_sched_tags_show(void *data, struct seq_file *m)
struct request_queue *q = hctx->queue;
int res;
- res = mutex_lock_interruptible(&q->sysfs_lock);
+ res = mutex_lock_interruptible(&q->elevator_lock);
if (res)
- goto out;
+ return res;
if (hctx->sched_tags)
blk_mq_debugfs_tags_show(m, hctx->sched_tags);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
-out:
- return res;
+ return 0;
}
static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
@@ -468,15 +455,14 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
struct request_queue *q = hctx->queue;
int res;
- res = mutex_lock_interruptible(&q->sysfs_lock);
+ res = mutex_lock_interruptible(&q->elevator_lock);
if (res)
- goto out;
+ return res;
if (hctx->sched_tags)
sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
-out:
- return res;
+ return 0;
}
static int hctx_active_show(void *data, struct seq_file *m)
@@ -640,20 +626,9 @@ void blk_mq_debugfs_register(struct request_queue *q)
debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
- /*
- * blk_mq_init_sched() attempted to do this already, but q->debugfs_dir
- * didn't exist yet (because we don't know what to name the directory
- * until the queue is registered to a gendisk).
- */
- if (q->elevator && !q->sched_debugfs_dir)
- blk_mq_debugfs_register_sched(q);
-
- /* Similarly, blk_mq_init_hctx() couldn't do this previously. */
queue_for_each_hw_ctx(q, hctx, i) {
if (!hctx->debugfs_dir)
blk_mq_debugfs_register_hctx(q, hctx);
- if (q->elevator && !hctx->sched_debugfs_dir)
- blk_mq_debugfs_register_sched_hctx(q, hctx);
}
if (q->rq_qos) {
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index 9c7d4b6117d4..c80e453e3014 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -83,10 +83,10 @@ static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
}
#endif
-#ifdef CONFIG_BLK_DEBUG_FS_ZONED
-int queue_zone_wlock_show(void *data, struct seq_file *m);
+#if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS)
+int queue_zone_wplugs_show(void *data, struct seq_file *m);
#else
-static inline int queue_zone_wlock_show(void *data, struct seq_file *m)
+static inline int queue_zone_wplugs_show(void *data, struct seq_file *m)
{
return 0;
}
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
new file mode 100644
index 000000000000..82bae475dfa4
--- /dev/null
+++ b/block/blk-mq-dma.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Christoph Hellwig
+ */
+#include "blk.h"
+
+struct phys_vec {
+ phys_addr_t paddr;
+ u32 len;
+};
+
+static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
+ struct phys_vec *vec)
+{
+ unsigned int max_size;
+ struct bio_vec bv;
+
+ if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
+ if (!iter->bio)
+ return false;
+ vec->paddr = bvec_phys(&req->special_vec);
+ vec->len = req->special_vec.bv_len;
+ iter->bio = NULL;
+ return true;
+ }
+
+ if (!iter->iter.bi_size)
+ return false;
+
+ bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
+ vec->paddr = bvec_phys(&bv);
+ max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
+ bv.bv_len = min(bv.bv_len, max_size);
+ bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len);
+
+ /*
+ * If we are entirely done with this bi_io_vec entry, check if the next
+ * one could be merged into it. This typically happens when moving to
+ * the next bio, but some callers also don't pack bvecs tight.
+ */
+ while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
+ struct bio_vec next;
+
+ if (!iter->iter.bi_size) {
+ if (!iter->bio->bi_next)
+ break;
+ iter->bio = iter->bio->bi_next;
+ iter->iter = iter->bio->bi_iter;
+ }
+
+ next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
+ if (bv.bv_len + next.bv_len > max_size ||
+ !biovec_phys_mergeable(req->q, &bv, &next))
+ break;
+
+ bv.bv_len += next.bv_len;
+ bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len);
+ }
+
+ vec->len = bv.bv_len;
+ return true;
+}
+
+static inline struct scatterlist *
+blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
+{
+ if (!*sg)
+ return sglist;
+
+ /*
+ * If the driver previously mapped a shorter list, we could see a
+ * termination bit prematurely unless it fully inits the sg table
+ * on each mapping. We KNOW that there must be more entries here
+ * or the driver would be buggy, so force clear the termination bit
+ * to avoid doing a full sg_init_table() in drivers for each command.
+ */
+ sg_unmark_end(*sg);
+ return sg_next(*sg);
+}
+
+/*
+ * Map a request to scatterlist, return number of sg entries setup. Caller
+ * must make sure sg can hold rq->nr_phys_segments entries.
+ */
+int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
+ struct scatterlist **last_sg)
+{
+ struct req_iterator iter = {
+ .bio = rq->bio,
+ };
+ struct phys_vec vec;
+ int nsegs = 0;
+
+ /* the internal flush request may not have bio attached */
+ if (iter.bio)
+ iter.iter = iter.bio->bi_iter;
+
+ while (blk_map_iter_next(rq, &iter, &vec)) {
+ *last_sg = blk_next_sg(last_sg, sglist);
+ sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
+ offset_in_page(vec.paddr));
+ nsegs++;
+ }
+
+ if (*last_sg)
+ sg_mark_end(*last_sg);
+
+ /*
+ * Something must be wrong if the computed number of segments is
+ * bigger than the number of the request's physical segments.
+ */
+ WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));
+
+ return nsegs;
+}
+EXPORT_SYMBOL(__blk_rq_map_sg);
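
A hedged sketch of how a driver-side mapping path might consume the new __blk_rq_map_sg() helper; the function name, the output parameters, and the error handling are invented, only the helper's signature and blk_rq_nr_phys_segments() come from the code above:

#include <linux/blk-mq.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>

static blk_status_t example_map_request(struct request *rq,
					struct scatterlist **out_sgl,
					int *out_nsegs)
{
	unsigned short nr = blk_rq_nr_phys_segments(rq);
	struct scatterlist *sgl, *last_sg = NULL;

	if (!nr) {			/* e.g. a flush request without data */
		*out_sgl = NULL;
		*out_nsegs = 0;
		return BLK_STS_OK;
	}

	sgl = kmalloc_array(nr, sizeof(*sgl), GFP_ATOMIC);
	if (!sgl)
		return BLK_STS_RESOURCE;
	sg_init_table(sgl, nr);

	/* fills at most nr entries and marks the last one as list end */
	*out_nsegs = __blk_rq_map_sg(rq, sgl, &last_sg);
	*out_sgl = sgl;
	return BLK_STS_OK;
}
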
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
deleted file mode 100644
index d47b5c73c9eb..000000000000
--- a/block/blk-mq-pci.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2016 Christoph Hellwig.
- */
-#include <linux/kobject.h>
-#include <linux/blkdev.h>
-#include <linux/blk-mq-pci.h>
-#include <linux/pci.h>
-#include <linux/module.h>
-
-#include "blk-mq.h"
-
-/**
- * blk_mq_pci_map_queues - provide a default queue mapping for PCI device
- * @qmap: CPU to hardware queue map.
- * @pdev: PCI device associated with @set.
- * @offset: Offset to use for the pci irq vector
- *
- * This function assumes the PCI device @pdev has at least as many available
- * interrupt vectors as @set has queues. It will then query the vector
- * corresponding to each queue for it's affinity mask and built queue mapping
- * that maps a queue to the CPUs that have irq affinity for the corresponding
- * vector.
- */
-void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
- int offset)
-{
- const struct cpumask *mask;
- unsigned int queue, cpu;
-
- for (queue = 0; queue < qmap->nr_queues; queue++) {
- mask = pci_irq_get_affinity(pdev, queue + offset);
- if (!mask)
- goto fallback;
-
- for_each_cpu(cpu, mask)
- qmap->mq_map[cpu] = qmap->queue_offset + queue;
- }
-
- return;
-
-fallback:
- WARN_ON_ONCE(qmap->nr_queues > 1);
- blk_mq_clear_mq_map(qmap);
-}
-EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 451a2c1f1f32..55a0fd105147 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -59,19 +59,17 @@ static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
struct request *rq;
LIST_HEAD(hctx_list);
- unsigned int count = 0;
list_for_each_entry(rq, rq_list, queuelist) {
if (rq->mq_hctx != hctx) {
list_cut_before(&hctx_list, rq_list, &rq->queuelist);
goto dispatch;
}
- count++;
}
list_splice_tail_init(rq_list, &hctx_list);
dispatch:
- return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
+ return blk_mq_dispatch_rq_list(hctx, &hctx_list, false);
}
#define BLK_MQ_BUDGET_DELAY 3 /* ms units */
@@ -167,7 +165,7 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
} while (!list_empty(&rq_list));
} else {
- dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
+ dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, false);
}
if (busy)
@@ -261,7 +259,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
/* round robin for fair dispatch */
ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
- } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));
+ } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, false));
WRITE_ONCE(hctx->dispatch_from, ctx);
return ret;
@@ -298,7 +296,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
*/
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
- if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0))
+ if (!blk_mq_dispatch_rq_list(hctx, &rq_list, true))
return 0;
need_dispatch = true;
} else {
@@ -312,7 +310,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
if (need_dispatch)
return blk_mq_do_dispatch_ctx(hctx);
blk_mq_flush_busy_ctxs(hctx, &rq_list);
- blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
+ blk_mq_dispatch_rq_list(hctx, &rq_list, true);
return 0;
}
@@ -349,10 +347,9 @@ bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
}
ctx = blk_mq_get_ctx(q);
- hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
+ hctx = blk_mq_map_queue(bio->bi_opf, ctx);
type = hctx->type;
- if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
- list_empty_careful(&ctx->rq_lists[type]))
+ if (list_empty_careful(&ctx->rq_lists[type]))
goto out_put;
/* default per sw-queue merge */
@@ -437,6 +434,30 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
return 0;
}
+void blk_mq_sched_reg_debugfs(struct request_queue *q)
+{
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
+
+ mutex_lock(&q->debugfs_mutex);
+ blk_mq_debugfs_register_sched(q);
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_debugfs_register_sched_hctx(q, hctx);
+ mutex_unlock(&q->debugfs_mutex);
+}
+
+void blk_mq_sched_unreg_debugfs(struct request_queue *q)
+{
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
+
+ mutex_lock(&q->debugfs_mutex);
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_debugfs_unregister_sched_hctx(hctx);
+ blk_mq_debugfs_unregister_sched(q);
+ mutex_unlock(&q->debugfs_mutex);
+}
+
/* caller must have a reference to @e, will grab another one if successful */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
@@ -470,10 +491,6 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
if (ret)
goto err_free_map_and_rqs;
- mutex_lock(&q->debugfs_mutex);
- blk_mq_debugfs_register_sched(q);
- mutex_unlock(&q->debugfs_mutex);
-
queue_for_each_hw_ctx(q, hctx, i) {
if (e->ops.init_hctx) {
ret = e->ops.init_hctx(hctx, i);
@@ -485,11 +502,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
return ret;
}
}
- mutex_lock(&q->debugfs_mutex);
- blk_mq_debugfs_register_sched_hctx(q, hctx);
- mutex_unlock(&q->debugfs_mutex);
}
-
return 0;
err_free_map_and_rqs:
@@ -528,10 +541,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
unsigned int flags = 0;
queue_for_each_hw_ctx(q, hctx, i) {
- mutex_lock(&q->debugfs_mutex);
- blk_mq_debugfs_unregister_sched_hctx(hctx);
- mutex_unlock(&q->debugfs_mutex);
-
if (e->type->ops.exit_hctx && hctx->sched_data) {
e->type->ops.exit_hctx(hctx, i);
hctx->sched_data = NULL;
@@ -539,12 +548,9 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
flags = hctx->flags;
}
- mutex_lock(&q->debugfs_mutex);
- blk_mq_debugfs_unregister_sched(q);
- mutex_unlock(&q->debugfs_mutex);
-
if (e->type->ops.exit_sched)
e->type->ops.exit_sched(e);
blk_mq_sched_tags_teardown(q, flags);
+ set_bit(ELEVATOR_FLAG_DYING, &q->elevator->flags);
q->elevator = NULL;
}
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 156e9bb07abf..24656980f443 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -61,9 +61,9 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
if (!entry->show)
return -EIO;
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->elevator_lock);
res = entry->show(hctx, page);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
return res;
}
@@ -223,30 +223,27 @@ int blk_mq_sysfs_register(struct gendisk *disk)
unsigned long i, j;
int ret;
- lockdep_assert_held(&q->sysfs_dir_lock);
-
ret = kobject_add(q->mq_kobj, &disk_to_dev(disk)->kobj, "mq");
if (ret < 0)
- goto out;
+ return ret;
kobject_uevent(q->mq_kobj, KOBJ_ADD);
+ mutex_lock(&q->tag_set->tag_list_lock);
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_register_hctx(hctx);
if (ret)
- goto unreg;
+ goto out_unreg;
}
+ mutex_unlock(&q->tag_set->tag_list_lock);
+ return 0;
- q->mq_sysfs_init_done = true;
-
-out:
- return ret;
-
-unreg:
+out_unreg:
queue_for_each_hw_ctx(q, hctx, j) {
if (j < i)
blk_mq_unregister_hctx(hctx);
}
+ mutex_unlock(&q->tag_set->tag_list_lock);
kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
kobject_del(q->mq_kobj);
@@ -259,15 +256,13 @@ void blk_mq_sysfs_unregister(struct gendisk *disk)
struct blk_mq_hw_ctx *hctx;
unsigned long i;
- lockdep_assert_held(&q->sysfs_dir_lock);
-
+ mutex_lock(&q->tag_set->tag_list_lock);
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
+ mutex_unlock(&q->tag_set->tag_list_lock);
kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
kobject_del(q->mq_kobj);
-
- q->mq_sysfs_init_done = false;
}
void blk_mq_sysfs_unregister_hctxs(struct request_queue *q)
@@ -275,15 +270,11 @@ void blk_mq_sysfs_unregister_hctxs(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
unsigned long i;
- mutex_lock(&q->sysfs_dir_lock);
- if (!q->mq_sysfs_init_done)
- goto unlock;
+ if (!blk_queue_registered(q))
+ return;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
-
-unlock:
- mutex_unlock(&q->sysfs_dir_lock);
}
int blk_mq_sysfs_register_hctxs(struct request_queue *q)
@@ -292,9 +283,8 @@ int blk_mq_sysfs_register_hctxs(struct request_queue *q)
unsigned long i;
int ret = 0;
- mutex_lock(&q->sysfs_dir_lock);
- if (!q->mq_sysfs_init_done)
- goto unlock;
+ if (!blk_queue_registered(q))
+ goto out;
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_register_hctx(hctx);
@@ -302,8 +292,6 @@ int blk_mq_sysfs_register_hctxs(struct request_queue *q)
break;
}
-unlock:
- mutex_unlock(&q->sysfs_dir_lock);
-
+out:
return ret;
}
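
The error path kept in blk_mq_sysfs_register() unregisters only the hctx entries that were added before the failing one (j < i). A small runnable user-space analog of that partial-unwind idiom, with invented register_item()/unregister_item() stand-ins:

#include <stdio.h>

#define N 4

static int register_item(int i)
{
	return (i == 2) ? -1 : 0;	/* pretend item 2 fails */
}

static void unregister_item(int i)
{
	printf("unregister %d\n", i);
}

int main(void)
{
	int i, j, ret = 0;

	for (i = 0; i < N; i++) {
		ret = register_item(i);
		if (ret)
			break;
	}
	if (ret)
		for (j = 0; j < N; j++)
			if (j < i)
				unregister_item(j);	/* prints 0 and 1 */
	return ret;
}
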
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index cc57e2dd9a0b..d880c50629d6 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -38,6 +38,7 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
unsigned int users;
+ unsigned long flags;
struct blk_mq_tags *tags = hctx->tags;
/*
@@ -56,11 +57,11 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
return;
}
- spin_lock_irq(&tags->lock);
+ spin_lock_irqsave(&tags->lock, flags);
users = tags->active_queues + 1;
WRITE_ONCE(tags->active_queues, users);
blk_mq_update_wake_batch(tags, users);
- spin_unlock_irq(&tags->lock);
+ spin_unlock_irqrestore(&tags->lock, flags);
}
/*
@@ -189,8 +190,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
sbitmap_finish_wait(bt, ws, &wait);
data->ctx = blk_mq_get_ctx(data->q);
- data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
- data->ctx);
+ data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx);
tags = blk_mq_tags_from_data(data);
if (data->flags & BLK_MQ_REQ_RESERVED)
bt = &tags->breserved_tags;
@@ -543,30 +543,11 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
node);
}
-int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
- struct sbitmap_queue *breserved_tags,
- unsigned int queue_depth, unsigned int reserved,
- int node, int alloc_policy)
-{
- unsigned int depth = queue_depth - reserved;
- bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
-
- if (bt_alloc(bitmap_tags, depth, round_robin, node))
- return -ENOMEM;
- if (bt_alloc(breserved_tags, reserved, round_robin, node))
- goto free_bitmap_tags;
-
- return 0;
-
-free_bitmap_tags:
- sbitmap_queue_free(bitmap_tags);
- return -ENOMEM;
-}
-
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
- unsigned int reserved_tags,
- int node, int alloc_policy)
+ unsigned int reserved_tags, unsigned int flags, int node)
{
+ unsigned int depth = total_tags - reserved_tags;
+ bool round_robin = flags & BLK_MQ_F_TAG_RR;
struct blk_mq_tags *tags;
if (total_tags > BLK_MQ_TAG_MAX) {
@@ -581,14 +562,18 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
tags->nr_tags = total_tags;
tags->nr_reserved_tags = reserved_tags;
spin_lock_init(&tags->lock);
+ if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node))
+ goto out_free_tags;
+ if (bt_alloc(&tags->breserved_tags, reserved_tags, round_robin, node))
+ goto out_free_bitmap_tags;
- if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags,
- total_tags, reserved_tags, node,
- alloc_policy) < 0) {
- kfree(tags);
- return NULL;
- }
return tags;
+
+out_free_bitmap_tags:
+ sbitmap_queue_free(&tags->bitmap_tags);
+out_free_tags:
+ kfree(tags);
+ return NULL;
}
void blk_mq_free_tags(struct blk_mq_tags *tags)
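
The switch from spin_lock_irq() to spin_lock_irqsave() in __blk_mq_tag_busy() suggests the helper may now be reached with interrupts already disabled; _irqsave saves and restores the caller's interrupt state instead of unconditionally re-enabling it on unlock. A minimal kernel-style sketch of that pattern, with an invented lock and counter:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);
static unsigned int example_counter;

static void example_update_any_context(void)
{
	unsigned long flags;

	/* safe whether the caller runs with IRQs enabled or disabled */
	spin_lock_irqsave(&example_lock, flags);
	example_counter++;
	spin_unlock_irqrestore(&example_lock, flags);
}
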
diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c
deleted file mode 100644
index 68d0945c0b08..000000000000
--- a/block/blk-mq-virtio.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2016 Christoph Hellwig.
- */
-#include <linux/device.h>
-#include <linux/blk-mq-virtio.h>
-#include <linux/virtio_config.h>
-#include <linux/module.h>
-#include "blk-mq.h"
-
-/**
- * blk_mq_virtio_map_queues - provide a default queue mapping for virtio device
- * @qmap: CPU to hardware queue map.
- * @vdev: virtio device to provide a mapping for.
- * @first_vec: first interrupt vectors to use for queues (usually 0)
- *
- * This function assumes the virtio device @vdev has at least as many available
- * interrupt vectors as @set has queues. It will then query the vector
- * corresponding to each queue for it's affinity mask and built queue mapping
- * that maps a queue to the CPUs that have irq affinity for the corresponding
- * vector.
- */
-void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
- struct virtio_device *vdev, int first_vec)
-{
- const struct cpumask *mask;
- unsigned int queue, cpu;
-
- if (!vdev->config->get_vq_affinity)
- goto fallback;
-
- for (queue = 0; queue < qmap->nr_queues; queue++) {
- mask = vdev->config->get_vq_affinity(vdev, first_vec + queue);
- if (!mask)
- goto fallback;
-
- for_each_cpu(cpu, mask)
- qmap->mq_map[cpu] = qmap->queue_offset + queue;
- }
-
- return;
-
-fallback:
- blk_mq_map_queues(qmap);
-}
-EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues);
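
Both removed helpers built the same kind of mapping: every CPU in an interrupt vector's affinity mask is pointed at the corresponding hardware queue. A runnable user-space analog of that loop, with made-up affinity masks standing in for the pci_irq_get_affinity()/get_vq_affinity() results:

#include <stdio.h>

#define NR_CPUS   8
#define NR_QUEUES 4

int main(void)
{
	/* one bit per CPU, per queue; example masks chosen arbitrarily */
	unsigned int affinity[NR_QUEUES] = { 0x03, 0x0c, 0x30, 0xc0 };
	int mq_map[NR_CPUS];

	for (int queue = 0; queue < NR_QUEUES; queue++)
		for (int cpu = 0; cpu < NR_CPUS; cpu++)
			if (affinity[queue] & (1u << cpu))
				mq_map[cpu] = queue;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu %d -> hw queue %d\n", cpu, mq_map[cpu]);
	return 0;
}
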
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 32afb87efbd0..4806b867e37d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -28,6 +28,7 @@
#include <linux/prefetch.h>
#include <linux/blk-crypto.h>
#include <linux/part_stat.h>
+#include <linux/sched/isolation.h>
#include <trace/events/block.h>
@@ -42,6 +43,7 @@
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd);
+static DEFINE_MUTEX(blk_mq_cpuhp_lock);
static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
static void blk_mq_request_bypass_insert(struct request *rq,
@@ -87,41 +89,83 @@ struct mq_inflight {
unsigned int inflight[2];
};
-static bool blk_mq_check_inflight(struct request *rq, void *priv)
+static bool blk_mq_check_in_driver(struct request *rq, void *priv)
{
struct mq_inflight *mi = priv;
- if (rq->part && blk_do_io_stat(rq) &&
- (!mi->part->bd_partno || rq->part == mi->part) &&
+ if (rq->rq_flags & RQF_IO_STAT &&
+ (!bdev_is_partition(mi->part) || rq->part == mi->part) &&
blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
mi->inflight[rq_data_dir(rq)]++;
return true;
}
-unsigned int blk_mq_in_flight(struct request_queue *q,
- struct block_device *part)
+void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2])
{
struct mq_inflight mi = { .part = part };
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+ blk_mq_queue_tag_busy_iter(bdev_get_queue(part), blk_mq_check_in_driver,
+ &mi);
+ inflight[READ] = mi.inflight[READ];
+ inflight[WRITE] = mi.inflight[WRITE];
+}
+
+#ifdef CONFIG_LOCKDEP
+static bool blk_freeze_set_owner(struct request_queue *q,
+ struct task_struct *owner)
+{
+ if (!owner)
+ return false;
- return mi.inflight[0] + mi.inflight[1];
+ if (!q->mq_freeze_depth) {
+ q->mq_freeze_owner = owner;
+ q->mq_freeze_owner_depth = 1;
+ q->mq_freeze_disk_dead = !q->disk ||
+ test_bit(GD_DEAD, &q->disk->state) ||
+ !blk_queue_registered(q);
+ q->mq_freeze_queue_dying = blk_queue_dying(q);
+ return true;
+ }
+
+ if (owner == q->mq_freeze_owner)
+ q->mq_freeze_owner_depth += 1;
+ return false;
}
-void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
- unsigned int inflight[2])
+/* verify the last unfreeze in owner context */
+static bool blk_unfreeze_check_owner(struct request_queue *q)
{
- struct mq_inflight mi = { .part = part };
+ if (q->mq_freeze_owner != current)
+ return false;
+ if (--q->mq_freeze_owner_depth == 0) {
+ q->mq_freeze_owner = NULL;
+ return true;
+ }
+ return false;
+}
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
- inflight[0] = mi.inflight[0];
- inflight[1] = mi.inflight[1];
+#else
+
+static bool blk_freeze_set_owner(struct request_queue *q,
+ struct task_struct *owner)
+{
+ return false;
}
-void blk_freeze_queue_start(struct request_queue *q)
+static bool blk_unfreeze_check_owner(struct request_queue *q)
{
+ return false;
+}
+#endif
+
+bool __blk_freeze_queue_start(struct request_queue *q,
+ struct task_struct *owner)
+{
+ bool freeze;
+
mutex_lock(&q->mq_freeze_lock);
+ freeze = blk_freeze_set_owner(q, owner);
if (++q->mq_freeze_depth == 1) {
percpu_ref_kill(&q->q_usage_counter);
mutex_unlock(&q->mq_freeze_lock);
@@ -130,6 +174,14 @@ void blk_freeze_queue_start(struct request_queue *q)
} else {
mutex_unlock(&q->mq_freeze_lock);
}
+
+ return freeze;
+}
+
+void blk_freeze_queue_start(struct request_queue *q)
+{
+ if (__blk_freeze_queue_start(q, current))
+ blk_freeze_acquire_lock(q);
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
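
A runnable user-space analog of the owner/depth bookkeeping added under CONFIG_LOCKDEP above: the first freeze records the freezing task as owner, nested freezes by the same task only bump a depth counter, and only the matching final unfreeze by that owner reports true. The struct and the assertions below are invented for illustration:

#include <assert.h>
#include <stddef.h>

struct example_queue {
	int freeze_depth;
	int owner_depth;
	const void *owner;		/* stand-in for struct task_struct * */
};

static int freeze_set_owner(struct example_queue *q, const void *owner)
{
	if (!owner)
		return 0;
	if (!q->freeze_depth) {
		q->owner = owner;
		q->owner_depth = 1;
		return 1;		/* first freeze: caller becomes owner */
	}
	if (owner == q->owner)
		q->owner_depth++;	/* nested freeze by the same task */
	return 0;
}

static int unfreeze_check_owner(struct example_queue *q, const void *task)
{
	if (q->owner != task)
		return 0;
	if (--q->owner_depth == 0) {
		q->owner = NULL;
		return 1;		/* last unfreeze by the owner */
	}
	return 0;
}

int main(void)
{
	struct example_queue q = { 0 };
	const char *task = "task";

	assert(freeze_set_owner(&q, task) == 1);
	q.freeze_depth++;
	assert(freeze_set_owner(&q, task) == 0);	/* nested freeze */
	q.freeze_depth++;
	assert(unfreeze_check_owner(&q, task) == 0);
	q.freeze_depth--;
	assert(unfreeze_check_owner(&q, task) == 1);
	q.freeze_depth--;
	return 0;
}
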
@@ -148,35 +200,17 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
-/*
- * Guarantee no request is in use, so we can change any data structure of
- * the queue afterward.
- */
-void blk_freeze_queue(struct request_queue *q)
+void blk_mq_freeze_queue_nomemsave(struct request_queue *q)
{
- /*
- * In the !blk_mq case we are only calling this to kill the
- * q_usage_counter, otherwise this increases the freeze depth
- * and waits for it to return to zero. For this reason there is
- * no blk_unfreeze_queue(), and blk_freeze_queue() is not
- * exported to drivers as the only user for unfreeze is blk_mq.
- */
blk_freeze_queue_start(q);
blk_mq_freeze_queue_wait(q);
}
+EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_nomemsave);
-void blk_mq_freeze_queue(struct request_queue *q)
+bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
{
- /*
- * ...just an alias to keep freeze and unfreeze actions balanced
- * in the blk_mq_* namespace
- */
- blk_freeze_queue(q);
-}
-EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
+ bool unfreeze;
-void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
-{
mutex_lock(&q->mq_freeze_lock);
if (force_atomic)
q->q_usage_counter.data->force_atomic = true;
@@ -186,14 +220,38 @@ void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
percpu_ref_resurrect(&q->q_usage_counter);
wake_up_all(&q->mq_freeze_wq);
}
+ unfreeze = blk_unfreeze_check_owner(q);
mutex_unlock(&q->mq_freeze_lock);
+
+ return unfreeze;
}
-void blk_mq_unfreeze_queue(struct request_queue *q)
+void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q)
+{
+ if (__blk_mq_unfreeze_queue(q, false))
+ blk_unfreeze_release_lock(q);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_nomemrestore);
+
+/*
+ * non_owner variant of blk_freeze_queue_start
+ *
+ * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen
+ * by the same task. This is fragile and should be avoided if at all
+ * possible.
+ */
+void blk_freeze_queue_start_non_owner(struct request_queue *q)
+{
+ __blk_freeze_queue_start(q, NULL);
+}
+EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner);
+
+/* non_owner variant of blk_mq_unfreeze_queue */
+void blk_mq_unfreeze_queue_non_owner(struct request_queue *q)
{
__blk_mq_unfreeze_queue(q, false);
}
-EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
+EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner);
/*
* FIXME: replace the scsi_internal_device_*block_nowait() calls in the
@@ -282,8 +340,9 @@ void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
if (!blk_queue_skip_tagset_quiesce(q))
blk_mq_quiesce_queue_nowait(q);
}
- blk_mq_wait_quiesce_done(set);
mutex_unlock(&set->tag_list_lock);
+
+ blk_mq_wait_quiesce_done(set);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
@@ -322,7 +381,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = BLK_MQ_NO_TAG;
rq->start_time_ns = blk_time_get_ns();
- rq->part = NULL;
blk_crypto_rq_set_defaults(rq);
}
EXPORT_SYMBOL(blk_rq_init);
@@ -330,14 +388,9 @@ EXPORT_SYMBOL(blk_rq_init);
/* Set start and alloc time when the allocated request is actually used */
static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
{
- if (blk_mq_need_time_stamp(rq))
- rq->start_time_ns = blk_time_get_ns();
- else
- rq->start_time_ns = 0;
-
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
if (blk_queue_rq_alloc_time(rq->q))
- rq->alloc_time_ns = alloc_time_ns ?: rq->start_time_ns;
+ rq->alloc_time_ns = alloc_time_ns;
else
rq->alloc_time_ns = 0;
#endif
@@ -358,8 +411,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
if (data->flags & BLK_MQ_REQ_PM)
data->rq_flags |= RQF_PM;
- if (blk_queue_io_stat(q))
- data->rq_flags |= RQF_IO_STAT;
rq->rq_flags = data->rq_flags;
if (data->rq_flags & RQF_SCHED_TAGS) {
@@ -375,9 +426,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
rq->io_start_time_ns = 0;
rq->stats_sectors = 0;
rq->nr_phys_segments = 0;
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
-#endif
rq->end_io = NULL;
rq->end_io_data = NULL;
@@ -421,7 +470,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
prefetch(tags->static_rqs[tag]);
tag_mask &= ~(1UL << i);
rq = blk_mq_rq_ctx_init(data, tags, tag);
- rq_list_add(data->cached_rq, rq);
+ rq_list_add_head(data->cached_rqs, rq);
nr++;
}
if (!(data->rq_flags & RQF_SCHED_TAGS))
@@ -430,7 +479,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
data->nr_tags -= nr;
- return rq_list_pop(data->cached_rq);
+ return rq_list_pop(data->cached_rqs);
}
static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
@@ -447,6 +496,10 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT;
+retry:
+ data->ctx = blk_mq_get_ctx(q);
+ data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx);
+
if (q->elevator) {
/*
* All requests use scheduler tags when an I/O scheduler is
@@ -468,13 +521,9 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
if (ops->limit_depth)
ops->limit_depth(data->cmd_flags, data);
}
- }
-
-retry:
- data->ctx = blk_mq_get_ctx(q);
- data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
- if (!(data->rq_flags & RQF_SCHED_TAGS))
+ } else {
blk_mq_tag_busy(data->hctx);
+ }
if (data->flags & BLK_MQ_REQ_RESERVED)
data->rq_flags |= RQF_RESV;
@@ -525,9 +574,13 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
+ .shallow_depth = 0,
.cmd_flags = opf,
+ .rq_flags = 0,
.nr_tags = plug->nr_ios,
- .cached_rq = &plug->cached_rq,
+ .cached_rqs = &plug->cached_rqs,
+ .ctx = NULL,
+ .hctx = NULL
};
struct request *rq;
@@ -552,14 +605,14 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
if (!plug)
return NULL;
- if (rq_list_empty(plug->cached_rq)) {
+ if (rq_list_empty(&plug->cached_rqs)) {
if (plug->nr_ios == 1)
return NULL;
rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
if (!rq)
return NULL;
} else {
- rq = rq_list_peek(&plug->cached_rq);
+ rq = rq_list_peek(&plug->cached_rqs);
if (!rq || rq->q != q)
return NULL;
@@ -568,8 +621,8 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
return NULL;
- plug->cached_rq = rq_list_next(rq);
- blk_mq_rq_time_init(rq, 0);
+ rq_list_pop(&plug->cached_rqs);
+ blk_mq_rq_time_init(rq, blk_time_get_ns());
}
rq->cmd_flags = opf;
@@ -587,8 +640,13 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
+ .shallow_depth = 0,
.cmd_flags = opf,
+ .rq_flags = 0,
.nr_tags = 1,
+ .cached_rqs = NULL,
+ .ctx = NULL,
+ .hctx = NULL
};
int ret;
@@ -616,8 +674,13 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
+ .shallow_depth = 0,
.cmd_flags = opf,
+ .rq_flags = 0,
.nr_tags = 1,
+ .cached_rqs = NULL,
+ .ctx = NULL,
+ .hctx = NULL
};
u64 alloc_time_ns = 0;
struct request *rq;
@@ -690,6 +753,8 @@ static void blk_mq_finish_request(struct request *rq)
{
struct request_queue *q = rq->q;
+ blk_zone_finish_request(rq);
+
if (rq->rq_flags & RQF_USE_SCHED) {
q->elevator->type->ops.finish_request(rq);
/*
@@ -743,7 +808,7 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug)
{
struct request *rq;
- while ((rq = rq_list_pop(&plug->cached_rq)) != NULL)
+ while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL)
blk_mq_free_request(rq);
}
@@ -761,34 +826,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
}
EXPORT_SYMBOL(blk_dump_rq_flags);
-static void req_bio_endio(struct request *rq, struct bio *bio,
- unsigned int nbytes, blk_status_t error)
-{
- if (unlikely(error)) {
- bio->bi_status = error;
- } else if (req_op(rq) == REQ_OP_ZONE_APPEND) {
- /*
- * Partial zone append completions cannot be supported as the
- * BIO fragments may end up not being written sequentially.
- */
- if (bio->bi_iter.bi_size != nbytes)
- bio->bi_status = BLK_STS_IOERR;
- else
- bio->bi_iter.bi_sector = rq->__sector;
- }
-
- bio_advance(bio, nbytes);
-
- if (unlikely(rq->rq_flags & RQF_QUIET))
- bio_set_flag(bio, BIO_QUIET);
- /* don't actually finish bio if it's part of flush sequence */
- if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
- bio_endio(bio);
-}
-
static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
- if (req->part && blk_do_io_stat(req)) {
+ if (req->rq_flags & RQF_IO_STAT) {
const int sgrp = op_stat_group(req_op(req));
part_stat_lock();
@@ -808,7 +848,7 @@ static void blk_print_req_error(struct request *req, blk_status_t status)
blk_op_str(req_op(req)),
(__force u32)(req->cmd_flags & ~REQ_OP_MASK),
req->nr_phys_segments,
- IOPRIO_PRIO_CLASS(req->ioprio));
+ IOPRIO_PRIO_CLASS(req_get_ioprio(req)));
}
/*
@@ -826,10 +866,8 @@ static void blk_complete_request(struct request *req)
if (!bio)
return;
-#ifdef CONFIG_BLK_DEV_INTEGRITY
if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ)
- req->q->integrity.profile->complete_fn(req, total_bytes);
-#endif
+ blk_integrity_complete(req, total_bytes);
/*
* Upper layers may call blk_crypto_evict_key() anytime after the last
@@ -845,8 +883,7 @@ static void blk_complete_request(struct request *req)
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- if (req_op(req) == REQ_OP_ZONE_APPEND)
- bio->bi_iter.bi_sector = req->__sector;
+ blk_zone_update_request_bio(req, bio);
if (!is_flush)
bio_endio(bio);
@@ -889,6 +926,8 @@ static void blk_complete_request(struct request *req)
bool blk_update_request(struct request *req, blk_status_t error,
unsigned int nr_bytes)
{
+ bool is_flush = req->rq_flags & RQF_FLUSH_SEQ;
+ bool quiet = req->rq_flags & RQF_QUIET;
int total_bytes;
trace_block_rq_complete(req, error, nr_bytes);
@@ -896,11 +935,9 @@ bool blk_update_request(struct request *req, blk_status_t error,
if (!req->bio)
return false;
-#ifdef CONFIG_BLK_DEV_INTEGRITY
if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
error == BLK_STS_OK)
- req->q->integrity.profile->complete_fn(req, nr_bytes);
-#endif
+ blk_integrity_complete(req, nr_bytes);
/*
* Upper layers may call blk_crypto_evict_key() anytime after the last
@@ -909,9 +946,8 @@ bool blk_update_request(struct request *req, blk_status_t error,
if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
__blk_crypto_rq_put_keyslot(req);
- if (unlikely(error && !blk_rq_is_passthrough(req) &&
- !(req->rq_flags & RQF_QUIET)) &&
- !test_bit(GD_DEAD, &req->q->disk->state)) {
+ if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) &&
+ !test_bit(GD_DEAD, &req->q->disk->state)) {
blk_print_req_error(req, error);
trace_block_rq_error(req, error, nr_bytes);
}
@@ -923,12 +959,33 @@ bool blk_update_request(struct request *req, blk_status_t error,
struct bio *bio = req->bio;
unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
- if (bio_bytes == bio->bi_iter.bi_size)
+ if (unlikely(error))
+ bio->bi_status = error;
+
+ if (bio_bytes == bio->bi_iter.bi_size) {
req->bio = bio->bi_next;
+ } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) {
+ /*
+ * Partial zone append completions cannot be supported
+ * as the BIO fragments may end up not being written
+ * sequentially.
+ */
+ bio->bi_status = BLK_STS_IOERR;
+ }
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- req_bio_endio(req, bio, bio_bytes, error);
+ if (unlikely(quiet))
+ bio_set_flag(bio, BIO_QUIET);
+
+ bio_advance(bio, bio_bytes);
+
+ /* Don't actually finish bio if it's part of flush sequence */
+ if (!bio->bi_iter.bi_size) {
+ blk_zone_update_request_bio(req, bio);
+ if (!is_flush)
+ bio_endio(bio);
+ }
total_bytes += bio_bytes;
nr_bytes -= bio_bytes;
@@ -989,38 +1046,76 @@ static inline void blk_account_io_done(struct request *req, u64 now)
* normal IO on queueing nor completion. Accounting the
* containing request is enough.
*/
- if (blk_do_io_stat(req) && req->part &&
- !(req->rq_flags & RQF_FLUSH_SEQ)) {
+ if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) {
const int sgrp = op_stat_group(req_op(req));
part_stat_lock();
update_io_ticks(req->part, jiffies, true);
part_stat_inc(req->part, ios[sgrp]);
part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+ part_stat_local_dec(req->part,
+ in_flight[op_is_write(req_op(req))]);
part_stat_unlock();
}
}
+static inline bool blk_rq_passthrough_stats(struct request *req)
+{
+ struct bio *bio = req->bio;
+
+ if (!blk_queue_passthrough_stat(req->q))
+ return false;
+
+ /* Requests without a bio do not transfer data. */
+ if (!bio)
+ return false;
+
+ /*
+ * Stats are accumulated in the bdev, so must have one attached to a
+ * bio to track stats. Most drivers do not set the bdev for passthrough
+ * requests, but nvme is one that will set it.
+ */
+ if (!bio->bi_bdev)
+ return false;
+
+ /*
+ * We don't know what a passthrough command does, but we know the
+ * payload size and data direction. Ensuring the size is aligned to the
+ * block size filters out most commands with payloads that don't
+ * represent sector access.
+ */
+ if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1))
+ return false;
+ return true;
+}
+
static inline void blk_account_io_start(struct request *req)
{
trace_block_io_start(req);
- if (blk_do_io_stat(req)) {
- /*
- * All non-passthrough requests are created from a bio with one
- * exception: when a flush command that is part of a flush sequence
- * generated by the state machine in blk-flush.c is cloned onto the
- * lower device by dm-multipath we can get here without a bio.
- */
- if (req->bio)
- req->part = req->bio->bi_bdev;
- else
- req->part = req->q->disk->part0;
+ if (!blk_queue_io_stat(req->q))
+ return;
+ if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req))
+ return;
- part_stat_lock();
- update_io_ticks(req->part, jiffies, false);
- part_stat_unlock();
- }
+ req->rq_flags |= RQF_IO_STAT;
+ req->start_time_ns = blk_time_get_ns();
+
+ /*
+ * All non-passthrough requests are created from a bio with one
+ * exception: when a flush command that is part of a flush sequence
+ * generated by the state machine in blk-flush.c is cloned onto the
+ * lower device by dm-multipath we can get here without a bio.
+ */
+ if (req->bio)
+ req->part = req->bio->bi_bdev;
+ else
+ req->part = req->q->disk->part0;
+
+ part_stat_lock();
+ update_io_ticks(req->part, jiffies, false);
+ part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]);
+ part_stat_unlock();
}
static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
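
The passthrough-stats filter above relies on the logical block size being a power of two, so a simple mask test replaces a division. A runnable illustration of that alignment check:

#include <stdbool.h>
#include <stdio.h>

static bool payload_is_block_aligned(unsigned long bytes, unsigned int lbs)
{
	return (bytes & (lbs - 1)) == 0;	/* lbs must be a power of two */
}

int main(void)
{
	printf("%d\n", payload_is_block_aligned(4096, 512));	/* prints 1 */
	printf("%d\n", payload_is_block_aligned(520, 512));	/* prints 0 */
	return 0;
}
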
@@ -1129,7 +1224,7 @@ static void blk_complete_reqs(struct llist_head *list)
rq->q->mq_ops->complete(rq);
}
-static __latent_entropy void blk_done_softirq(struct softirq_action *h)
+static __latent_entropy void blk_done_softirq(void)
{
blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
}
@@ -1261,10 +1356,9 @@ void blk_mq_start_request(struct request *rq)
WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
rq->mq_hctx->tags->rqs[rq->tag] = rq;
-#ifdef CONFIG_BLK_DEV_INTEGRITY
if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
- q->integrity.profile->prepare_fn(rq);
-#endif
+ blk_integrity_prepare(rq);
+
if (rq->bio && rq->bio->bi_opf & REQ_POLLED)
WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num);
}
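
blk_integrity_prepare()/blk_integrity_complete() let the #ifdef CONFIG_BLK_DEV_INTEGRITY blocks disappear from blk-mq.c. A hedged sketch of the usual header-side shape behind such helpers (assumed, not copied from blk-integrity.h): real declarations under the option and empty inline stubs otherwise, so callers compile unchanged either way:

#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_integrity_prepare(struct request *rq);
void blk_integrity_complete(struct request *rq, unsigned int nr_bytes);
#else
static inline void blk_integrity_prepare(struct request *rq)
{
}
static inline void blk_integrity_complete(struct request *rq,
					  unsigned int nr_bytes)
{
}
#endif
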
@@ -1304,8 +1398,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
*/
if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS))
plug->has_elevator = true;
- rq->rq_next = NULL;
- rq_list_add(&plug->mq_list, rq);
+ rq_list_add_tail(&plug->mq_list, rq);
plug->rq_count++;
}
@@ -1330,11 +1423,6 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head)
blk_account_io_start(rq);
- /*
- * As plugging can be enabled for passthrough requests on a zoned
- * device, directly accessing the plug instead of using blk_mq_plug()
- * should not have any consequences.
- */
if (current->plug && !at_head) {
blk_add_rq_to_plug(current->plug, rq);
return;
@@ -1462,19 +1550,17 @@ static void blk_mq_requeue_work(struct work_struct *work)
while (!list_empty(&rq_list)) {
rq = list_entry(rq_list.next, struct request, queuelist);
+ list_del_init(&rq->queuelist);
/*
- * If RQF_DONTPREP ist set, the request has been started by the
+ * If RQF_DONTPREP is set, the request has been started by the
* driver already and might have driver-specific data allocated
* already. Insert it into the hctx dispatch list to avoid
* block layer merges for the request.
*/
- if (rq->rq_flags & RQF_DONTPREP) {
- list_del_init(&rq->queuelist);
+ if (rq->rq_flags & RQF_DONTPREP)
blk_mq_request_bypass_insert(rq, 0);
- } else {
- list_del_init(&rq->queuelist);
+ else
blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
- }
}
while (!list_empty(&flush_list)) {
@@ -1707,7 +1793,6 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
}
-EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
struct dispatch_rq_data {
struct blk_mq_hw_ctx *hctx;
@@ -1921,19 +2006,6 @@ static void blk_mq_handle_dev_resource(struct request *rq,
__blk_mq_requeue_request(rq);
}
-static void blk_mq_handle_zone_resource(struct request *rq,
- struct list_head *zone_list)
-{
- /*
- * If we end up here it is because we cannot dispatch a request to a
- * specific zone due to LLD level zone-write locking or other zone
- * related resource not being available. In this case, set the request
- * aside in zone_list for retrying it later.
- */
- list_add(&rq->queuelist, zone_list);
- __blk_mq_requeue_request(rq);
-}
-
enum prep_dispatch {
PREP_DISPATCH_OK,
PREP_DISPATCH_NO_TAG,
@@ -2012,14 +2084,13 @@ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued,
* Returns true if we did some work AND can potentially do more.
*/
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
- unsigned int nr_budgets)
+ bool get_budget)
{
enum prep_dispatch prep;
struct request_queue *q = hctx->queue;
struct request *rq;
int queued;
blk_status_t ret = BLK_STS_OK;
- LIST_HEAD(zone_list);
bool needs_resource = false;
if (list_empty(list))
@@ -2035,7 +2106,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
rq = list_first_entry(list, struct request, queuelist);
WARN_ON_ONCE(hctx != rq->mq_hctx);
- prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
+ prep = blk_mq_prep_dispatch_rq(rq, get_budget);
if (prep != PREP_DISPATCH_OK)
break;
@@ -2044,12 +2115,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
bd.rq = rq;
bd.last = list_empty(list);
- /*
- * once the request is queued to lld, no need to cover the
- * budget any more
- */
- if (nr_budgets)
- nr_budgets--;
ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
case BLK_STS_OK:
@@ -2061,23 +2126,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
case BLK_STS_DEV_RESOURCE:
blk_mq_handle_dev_resource(rq, list);
goto out;
- case BLK_STS_ZONE_RESOURCE:
- /*
- * Move the request to zone_list and keep going through
- * the dispatch list to find more requests the drive can
- * accept.
- */
- blk_mq_handle_zone_resource(rq, &zone_list);
- needs_resource = true;
- break;
default:
blk_mq_end_request(rq, ret);
}
} while (!list_empty(list));
out:
- if (!list_empty(&zone_list))
- list_splice_tail_init(&zone_list, list);
-
/* If we didn't flush the entire list, we could have told the driver
* there was more coming, but that turned out to be a lie.
*/
@@ -2095,7 +2148,11 @@ out:
((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) ||
blk_mq_is_shared_tags(hctx->flags));
- if (nr_budgets)
+ /*
+ * If the caller allocated budgets, free the budgets of the
+ * requests that have not yet been passed to the block driver.
+ */
+ if (!get_budget)
blk_mq_release_budgets(q, list);
spin_lock(&hctx->lock);
@@ -2164,6 +2221,15 @@ static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
}
/*
+ * ->next_cpu is always calculated from hctx->cpumask, so simply use
+ * it for speeding up the check
+ */
+static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx)
+{
+ return hctx->next_cpu >= nr_cpu_ids;
+}
+
+/*
* It'd be great if the workqueue API had a way to pass
* in a mask and had some smarts for more clever placement.
* For now we just round-robin here, switching for every
@@ -2174,7 +2240,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
bool tried = false;
int next_cpu = hctx->next_cpu;
- if (hctx->queue->nr_hw_queues == 1)
+ /* Switch to unbound if no allowable CPUs in this hctx */
+ if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx))
return WORK_CPU_UNBOUND;
if (--hctx->next_cpu_batch <= 0) {
@@ -2225,6 +2292,24 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
+static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx)
+{
+ bool need_run;
+
+ /*
+ * When queue is quiesced, we may be switching io scheduler, or
+ * updating nr_hw_queues, or other things, and we can't run queue
+ * any more, even blk_mq_hctx_has_pending() can't be called safely.
+ *
+ * And queue will be rerun in blk_mq_unquiesce_queue() if it is
+ * quiesced.
+ */
+ __blk_mq_run_dispatch_ops(hctx->queue, false,
+ need_run = !blk_queue_quiesced(hctx->queue) &&
+ blk_mq_hctx_has_pending(hctx));
+ return need_run;
+}
+
/**
* blk_mq_run_hw_queue - Start to run a hardware queue.
* @hctx: Pointer to the hardware queue to run.
@@ -2245,20 +2330,23 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING);
- /*
- * When queue is quiesced, we may be switching io scheduler, or
- * updating nr_hw_queues, or other things, and we can't run queue
- * any more, even __blk_mq_hctx_has_pending() can't be called safely.
- *
- * And queue will be rerun in blk_mq_unquiesce_queue() if it is
- * quiesced.
- */
- __blk_mq_run_dispatch_ops(hctx->queue, false,
- need_run = !blk_queue_quiesced(hctx->queue) &&
- blk_mq_hctx_has_pending(hctx));
+ need_run = blk_mq_hw_queue_need_run(hctx);
+ if (!need_run) {
+ unsigned long flags;
- if (!need_run)
- return;
+ /*
+ * Synchronize with blk_mq_unquiesce_queue(): we checked above,
+ * without the lock, whether the hw queue is quiesced, so take
+ * ->queue_lock here to see the up-to-date status and avoid
+ * missing a rerun of the hw queue.
+ */
+ spin_lock_irqsave(&hctx->queue->queue_lock, flags);
+ need_run = blk_mq_hw_queue_need_run(hctx);
+ spin_unlock_irqrestore(&hctx->queue->queue_lock, flags);
+
+ if (!need_run)
+ return;
+ }
if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
blk_mq_delay_run_hw_queue(hctx, 0);
@@ -2415,6 +2503,12 @@ void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
return;
clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+ /*
+ * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the
+ * clearing of BLK_MQ_S_STOPPED above and the checking of dispatch
+ * list in the subsequent routine.
+ */
+ smp_mb__after_atomic();
blk_mq_run_hw_queue(hctx, async);
}
EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
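
The smp_mb__after_atomic() added above, per its comment, pairs with a full barrier in blk_mq_hctx_stopped(). A hedged sketch of the intended ordering; the reader-side shape is assumed from that comment, not copied from the source:

/*
 *   restart path                          stop/dispatch path
 *   ------------                          ------------------
 *   clear_bit(BLK_MQ_S_STOPPED, ...);     add request to dispatch list;
 *   smp_mb__after_atomic();               smp_mb();
 *   check dispatch list / run queue;      test_bit(BLK_MQ_S_STOPPED, ...);
 *
 * With both barriers in place, either the restart path sees the newly
 * added request or the dispatch path sees the cleared STOPPED bit, so a
 * request cannot be left pending on a queue that nobody reruns.
 */
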
@@ -2566,9 +2660,13 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
if (bio->bi_opf & REQ_RAHEAD)
rq->cmd_flags |= REQ_FAILFAST_MASK;
+ rq->bio = rq->biotail = bio;
rq->__sector = bio->bi_iter.bi_sector;
- rq->write_hint = bio->bi_write_hint;
- blk_rq_bio_prep(rq, bio, nr_segs);
+ rq->__data_len = bio->bi_iter.bi_size;
+ rq->nr_phys_segments = nr_segs;
+ if (bio_integrity(bio))
+ rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
+ bio);
/* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
@@ -2642,6 +2740,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
blk_mq_insert_request(rq, 0);
+ blk_mq_run_hw_queue(hctx, false);
return;
}
@@ -2672,6 +2771,7 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
blk_mq_insert_request(rq, 0);
+ blk_mq_run_hw_queue(hctx, false);
return BLK_STS_OK;
}
@@ -2680,15 +2780,15 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
return __blk_mq_issue_directly(hctx, rq, last);
}
-static void blk_mq_plug_issue_direct(struct blk_plug *plug)
+static void blk_mq_issue_direct(struct rq_list *rqs)
{
struct blk_mq_hw_ctx *hctx = NULL;
struct request *rq;
int queued = 0;
blk_status_t ret = BLK_STS_OK;
- while ((rq = rq_list_pop(&plug->mq_list))) {
- bool last = rq_list_empty(plug->mq_list);
+ while ((rq = rq_list_pop(rqs))) {
+ bool last = rq_list_empty(rqs);
if (hctx != rq->mq_hctx) {
if (hctx) {
@@ -2719,26 +2819,74 @@ out:
blk_mq_commit_rqs(hctx, queued, false);
}
-static void __blk_mq_flush_plug_list(struct request_queue *q,
- struct blk_plug *plug)
+static void __blk_mq_flush_list(struct request_queue *q, struct rq_list *rqs)
{
if (blk_queue_quiesced(q))
return;
- q->mq_ops->queue_rqs(&plug->mq_list);
+ q->mq_ops->queue_rqs(rqs);
+}
+
+static unsigned blk_mq_extract_queue_requests(struct rq_list *rqs,
+ struct rq_list *queue_rqs)
+{
+ struct request *rq = rq_list_pop(rqs);
+ struct request_queue *this_q = rq->q;
+ struct request **prev = &rqs->head;
+ struct rq_list matched_rqs = {};
+ struct request *last = NULL;
+ unsigned depth = 1;
+
+ rq_list_add_tail(&matched_rqs, rq);
+ while ((rq = *prev)) {
+ if (rq->q == this_q) {
+ /* move rq from rqs to matched_rqs */
+ *prev = rq->rq_next;
+ rq_list_add_tail(&matched_rqs, rq);
+ depth++;
+ } else {
+ /* leave rq in rqs */
+ prev = &rq->rq_next;
+ last = rq;
+ }
+ }
+
+ rqs->tail = last;
+ *queue_rqs = matched_rqs;
+ return depth;
}
-static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
+static void blk_mq_dispatch_queue_requests(struct rq_list *rqs, unsigned depth)
+{
+ struct request_queue *q = rq_list_peek(rqs)->q;
+
+ trace_block_unplug(q, depth, true);
+
+ /*
+ * Peek first request and see if we have a ->queue_rqs() hook.
+ * If we do, we can dispatch the whole list in one go.
+ * We already know at this point that all requests belong to the
+ * same queue, caller must ensure that's the case.
+ */
+ if (q->mq_ops->queue_rqs) {
+ blk_mq_run_dispatch_ops(q, __blk_mq_flush_list(q, rqs));
+ if (rq_list_empty(rqs))
+ return;
+ }
+
+ blk_mq_run_dispatch_ops(q, blk_mq_issue_direct(rqs));
+}
+
+static void blk_mq_dispatch_list(struct rq_list *rqs, bool from_sched)
{
struct blk_mq_hw_ctx *this_hctx = NULL;
struct blk_mq_ctx *this_ctx = NULL;
- struct request *requeue_list = NULL;
- struct request **requeue_lastp = &requeue_list;
+ struct rq_list requeue_list = {};
unsigned int depth = 0;
bool is_passthrough = false;
LIST_HEAD(list);
do {
- struct request *rq = rq_list_pop(&plug->mq_list);
+ struct request *rq = rq_list_pop(rqs);
if (!this_hctx) {
this_hctx = rq->mq_hctx;
@@ -2746,14 +2894,14 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
is_passthrough = blk_rq_is_passthrough(rq);
} else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx ||
is_passthrough != blk_rq_is_passthrough(rq)) {
- rq_list_add_tail(&requeue_lastp, rq);
+ rq_list_add_tail(&requeue_list, rq);
continue;
}
- list_add(&rq->queuelist, &list);
+ list_add_tail(&rq->queuelist, &list);
depth++;
- } while (!rq_list_empty(plug->mq_list));
+ } while (!rq_list_empty(rqs));
- plug->mq_list = requeue_list;
+ *rqs = requeue_list;
trace_block_unplug(this_hctx->queue, depth, !from_sched);
percpu_ref_get(&this_hctx->queue->q_usage_counter);
@@ -2773,9 +2921,22 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
percpu_ref_put(&this_hctx->queue->q_usage_counter);
}
+static void blk_mq_dispatch_multiple_queue_requests(struct rq_list *rqs)
+{
+ do {
+ struct rq_list queue_rqs;
+ unsigned depth;
+
+ depth = blk_mq_extract_queue_requests(rqs, &queue_rqs);
+ blk_mq_dispatch_queue_requests(&queue_rqs, depth);
+ while (!rq_list_empty(&queue_rqs))
+ blk_mq_dispatch_list(&queue_rqs, false);
+ } while (!rq_list_empty(rqs));
+}
+
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
- struct request *rq;
+ unsigned int depth;
/*
* We may have been called recursively midway through handling
@@ -2786,36 +2947,23 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
*/
if (plug->rq_count == 0)
return;
+ depth = plug->rq_count;
plug->rq_count = 0;
- if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
- struct request_queue *q;
-
- rq = rq_list_peek(&plug->mq_list);
- q = rq->q;
-
- /*
- * Peek first request and see if we have a ->queue_rqs() hook.
- * If we do, we can dispatch the whole plug list in one go. We
- * already know at this point that all requests belong to the
- * same queue, caller must ensure that's the case.
- */
- if (q->mq_ops->queue_rqs) {
- blk_mq_run_dispatch_ops(q,
- __blk_mq_flush_plug_list(q, plug));
- if (rq_list_empty(plug->mq_list))
- return;
+ if (!plug->has_elevator && !from_schedule) {
+ if (plug->multiple_queues) {
+ blk_mq_dispatch_multiple_queue_requests(&plug->mq_list);
+ return;
}
- blk_mq_run_dispatch_ops(q,
- blk_mq_plug_issue_direct(plug));
- if (rq_list_empty(plug->mq_list))
+ blk_mq_dispatch_queue_requests(&plug->mq_list, depth);
+ if (rq_list_empty(&plug->mq_list))
return;
}
do {
- blk_mq_dispatch_plug_list(plug, from_schedule);
- } while (!rq_list_empty(plug->mq_list));
+ blk_mq_dispatch_list(&plug->mq_list, from_schedule);
+ } while (!rq_list_empty(&plug->mq_list));
}
static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
@@ -2865,13 +3013,18 @@ static bool blk_mq_attempt_bio_merge(struct request_queue *q,
static struct request *blk_mq_get_new_requests(struct request_queue *q,
struct blk_plug *plug,
- struct bio *bio,
- unsigned int nsegs)
+ struct bio *bio)
{
struct blk_mq_alloc_data data = {
.q = q,
- .nr_tags = 1,
+ .flags = 0,
+ .shallow_depth = 0,
.cmd_flags = bio->bi_opf,
+ .rq_flags = 0,
+ .nr_tags = 1,
+ .cached_rqs = NULL,
+ .ctx = NULL,
+ .hctx = NULL
};
struct request *rq;
@@ -2880,16 +3033,13 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
if (plug) {
data.nr_tags = plug->nr_ios;
plug->nr_ios = 1;
- data.cached_rq = &plug->cached_rq;
+ data.cached_rqs = &plug->cached_rqs;
}
rq = __blk_mq_alloc_requests(&data);
- if (rq)
- return rq;
- rq_qos_cleanup(q, bio);
- if (bio->bi_opf & REQ_NOWAIT)
- bio_wouldblock_error(bio);
- return NULL;
+ if (unlikely(!rq))
+ rq_qos_cleanup(q, bio);
+ return rq;
}
/*
@@ -2903,7 +3053,7 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
if (!plug)
return NULL;
- rq = rq_list_peek(&plug->cached_rq);
+ rq = rq_list_peek(&plug->cached_rqs);
if (!rq || rq->q != q)
return NULL;
if (type != rq->mq_hctx->type &&
@@ -2917,21 +3067,32 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
struct bio *bio)
{
- WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq);
+ if (rq_list_pop(&plug->cached_rqs) != rq)
+ WARN_ON_ONCE(1);
/*
* If any qos ->throttle() end up blocking, we will have flushed the
* plug and hence killed the cached_rq list as well. Pop this entry
* before we throttle.
*/
- plug->cached_rq = rq_list_next(rq);
rq_qos_throttle(rq->q, bio);
- blk_mq_rq_time_init(rq, 0);
+ blk_mq_rq_time_init(rq, blk_time_get_ns());
rq->cmd_flags = bio->bi_opf;
INIT_LIST_HEAD(&rq->queuelist);
}
+static bool bio_unaligned(const struct bio *bio, struct request_queue *q)
+{
+ unsigned int bs_mask = queue_logical_block_size(q) - 1;
+
+ /* .bi_sector of any zero-sized bio needs to be initialized */
+ if ((bio->bi_iter.bi_size & bs_mask) ||
+ ((bio->bi_iter.bi_sector << SECTOR_SHIFT) & bs_mask))
+ return true;
+ return false;
+}
+
/**
* blk_mq_submit_bio - Create and send a request to block device.
* @bio: Bio pointer.
@@ -2948,44 +3109,79 @@ static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
void blk_mq_submit_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- struct blk_plug *plug = blk_mq_plug(bio);
+ struct blk_plug *plug = current->plug;
const int is_sync = op_is_sync(bio->bi_opf);
struct blk_mq_hw_ctx *hctx;
- unsigned int nr_segs = 1;
+ unsigned int nr_segs;
struct request *rq;
blk_status_t ret;
- bio = blk_queue_bounce(bio, q);
+ /*
+ * If the plug has a cached request for this queue, try to use it.
+ */
+ rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
+
+ /*
+ * A BIO that was released from a zone write plug has already been
+ * through the preparation in this function, already holds a reference
+ * on the queue usage counter, and is the only write BIO in-flight for
+ * the target zone. Go straight to preparing a request for it.
+ */
+ if (bio_zone_write_plugging(bio)) {
+ nr_segs = bio->__bi_nr_segments;
+ if (rq)
+ blk_queue_exit(q);
+ goto new_request;
+ }
/*
- * If the plug has a cached request for this queue, try use it.
- *
* The cached request already holds a q_usage_counter reference and we
* don't have to acquire a new one if we use it.
*/
- rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
if (!rq) {
if (unlikely(bio_queue_enter(bio)))
return;
}
- if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
- bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
- if (!bio)
- goto queue_exit;
+ /*
+ * Device reconfiguration may change logical block size or reduce the
+ * number of poll queues, so the checks for alignment and poll support
+ * have to be done with queue usage counter held.
+ */
+ if (unlikely(bio_unaligned(bio, q))) {
+ bio_io_error(bio);
+ goto queue_exit;
}
+
+ if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) {
+ bio->bi_status = BLK_STS_NOTSUPP;
+ bio_endio(bio);
+ goto queue_exit;
+ }
+
+ bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
+ if (!bio)
+ goto queue_exit;
+
if (!bio_integrity_prep(bio))
goto queue_exit;
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
goto queue_exit;
- if (!rq) {
- rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
- if (unlikely(!rq))
- goto queue_exit;
- } else {
+ if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
+ goto queue_exit;
+
+new_request:
+ if (rq) {
blk_mq_use_cached_rq(rq, plug, bio);
+ } else {
+ rq = blk_mq_get_new_requests(q, plug, bio);
+ if (unlikely(!rq)) {
+ if (bio->bi_opf & REQ_NOWAIT)
+ bio_wouldblock_error(bio);
+ goto queue_exit;
+ }
}
trace_block_getrq(bio);
@@ -3002,6 +3198,9 @@ void blk_mq_submit_bio(struct bio *bio)
return;
}
+ if (bio_zone_write_plugging(bio))
+ blk_zone_write_plug_init_request(rq);
+
if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
return;
@@ -3037,7 +3236,7 @@ queue_exit:
blk_status_t blk_insert_cloned_request(struct request *rq)
{
struct request_queue *q = rq->q;
- unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
+ unsigned int max_sectors = blk_queue_get_max_sectors(rq);
unsigned int max_segments = blk_rq_get_max_segments(rq);
blk_status_t ret;
@@ -3134,19 +3333,21 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
int (*bio_ctr)(struct bio *, struct bio *, void *),
void *data)
{
- struct bio *bio, *bio_src;
+ struct bio *bio_src;
if (!bs)
bs = &fs_bio_set;
__rq_for_each_bio(bio_src, rq_src) {
- bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask,
- bs);
+ struct bio *bio = bio_alloc_clone(rq->q->disk->part0, bio_src,
+ gfp_mask, bs);
if (!bio)
goto free_and_out;
- if (bio_ctr && bio_ctr(bio, bio_src, data))
+ if (bio_ctr && bio_ctr(bio, bio_src, data)) {
+ bio_put(bio);
goto free_and_out;
+ }
if (rq->bio) {
rq->biotail->bi_next = bio;
@@ -3154,7 +3355,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
} else {
rq->bio = rq->biotail = bio;
}
- bio = NULL;
}
/* Copy attributes of the original request to the clone request. */
@@ -3165,8 +3365,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
rq->special_vec = rq_src->special_vec;
}
rq->nr_phys_segments = rq_src->nr_phys_segments;
- rq->ioprio = rq_src->ioprio;
- rq->write_hint = rq_src->write_hint;
+ rq->nr_integrity_segments = rq_src->nr_integrity_segments;
if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
goto free_and_out;
@@ -3174,8 +3373,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
return 0;
free_and_out:
- if (bio)
- bio_put(bio);
blk_rq_unprep_clone(rq);
return -ENOMEM;
@@ -3338,8 +3535,7 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
if (node == NUMA_NO_NODE)
node = set->numa_node;
- tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
- BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
+ tags = blk_mq_init_tags(nr_tags, reserved_tags, set->flags, node);
if (!tags)
return NULL;
@@ -3483,14 +3679,30 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
return data.has_rq;
}
-static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
- struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx,
+ unsigned int this_cpu)
{
- if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu)
- return false;
- if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
- return false;
- return true;
+ enum hctx_type type = hctx->type;
+ int cpu;
+
+ /*
+ * hctx->cpumask has to rule out isolated CPUs, but userspace still
+ * might submit IOs on these isolated CPUs, so use the queue map to
+ * check if all CPUs mapped to this hctx are offline
+ */
+ for_each_online_cpu(cpu) {
+ struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue,
+ type, cpu);
+
+ if (h != hctx)
+ continue;
+
+ /* this hctx has at least one online CPU */
+ if (this_cpu != cpu)
+ return true;
+ }
+
+ return false;
}
static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
@@ -3498,8 +3710,7 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
struct blk_mq_hw_ctx, cpuhp_online);
- if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
- !blk_mq_last_cpu_in_hctx(cpu, hctx))
+ if (blk_mq_hctx_has_online_cpu(hctx, cpu))
return 0;
/*
@@ -3526,12 +3737,28 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
return 0;
}
+/*
+ * Check if one CPU is mapped to the specified hctx
+ *
+ * Isolated CPUs have been ruled out of hctx->cpumask, which is supposed
+ * to be used for scheduling kworkers only. For any other usage, call this
+ * helper to check whether a CPU belongs to the specified hctx.
+ */
+static bool blk_mq_cpu_mapped_to_hctx(unsigned int cpu,
+ const struct blk_mq_hw_ctx *hctx)
+{
+ struct blk_mq_hw_ctx *mapped_hctx = blk_mq_map_queue_type(hctx->queue,
+ hctx->type, cpu);
+
+ return mapped_hctx == hctx;
+}
+
static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
{
struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
struct blk_mq_hw_ctx, cpuhp_online);
- if (cpumask_test_cpu(cpu, hctx->cpumask))
+ if (blk_mq_cpu_mapped_to_hctx(cpu, hctx))
clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
return 0;
}
@@ -3549,7 +3776,7 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
enum hctx_type type;
hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
- if (!cpumask_test_cpu(cpu, hctx->cpumask))
+ if (!blk_mq_cpu_mapped_to_hctx(cpu, hctx))
return 0;
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
@@ -3573,13 +3800,91 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
return 0;
}
-static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
+static void __blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
{
- if (!(hctx->flags & BLK_MQ_F_STACKING))
+ lockdep_assert_held(&blk_mq_cpuhp_lock);
+
+ if (!(hctx->flags & BLK_MQ_F_STACKING) &&
+ !hlist_unhashed(&hctx->cpuhp_online)) {
cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
&hctx->cpuhp_online);
- cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
- &hctx->cpuhp_dead);
+ INIT_HLIST_NODE(&hctx->cpuhp_online);
+ }
+
+ if (!hlist_unhashed(&hctx->cpuhp_dead)) {
+ cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
+ &hctx->cpuhp_dead);
+ INIT_HLIST_NODE(&hctx->cpuhp_dead);
+ }
+}
+
+static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
+{
+ mutex_lock(&blk_mq_cpuhp_lock);
+ __blk_mq_remove_cpuhp(hctx);
+ mutex_unlock(&blk_mq_cpuhp_lock);
+}
+
+static void __blk_mq_add_cpuhp(struct blk_mq_hw_ctx *hctx)
+{
+ lockdep_assert_held(&blk_mq_cpuhp_lock);
+
+ if (!(hctx->flags & BLK_MQ_F_STACKING) &&
+ hlist_unhashed(&hctx->cpuhp_online))
+ cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+ &hctx->cpuhp_online);
+
+ if (hlist_unhashed(&hctx->cpuhp_dead))
+ cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD,
+ &hctx->cpuhp_dead);
+}
+
+static void __blk_mq_remove_cpuhp_list(struct list_head *head)
+{
+ struct blk_mq_hw_ctx *hctx;
+
+ lockdep_assert_held(&blk_mq_cpuhp_lock);
+
+ list_for_each_entry(hctx, head, hctx_list)
+ __blk_mq_remove_cpuhp(hctx);
+}
+
+/*
+ * Unregister cpuhp callbacks from exited hw queues
+ *
+ * Safe to call if this `request_queue` is live
+ */
+static void blk_mq_remove_hw_queues_cpuhp(struct request_queue *q)
+{
+ LIST_HEAD(hctx_list);
+
+ spin_lock(&q->unused_hctx_lock);
+ list_splice_init(&q->unused_hctx_list, &hctx_list);
+ spin_unlock(&q->unused_hctx_lock);
+
+ mutex_lock(&blk_mq_cpuhp_lock);
+ __blk_mq_remove_cpuhp_list(&hctx_list);
+ mutex_unlock(&blk_mq_cpuhp_lock);
+
+ spin_lock(&q->unused_hctx_lock);
+ list_splice(&hctx_list, &q->unused_hctx_list);
+ spin_unlock(&q->unused_hctx_lock);
+}
+
+/*
+ * Register cpuhp callbacks for all hw queues
+ *
+ * Safe to call if this `request_queue` is live
+ */
+static void blk_mq_add_hw_queues_cpuhp(struct request_queue *q)
+{
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
+
+ mutex_lock(&blk_mq_cpuhp_lock);
+ queue_for_each_hw_ctx(q, hctx, i)
+ __blk_mq_add_cpuhp(hctx);
+ mutex_unlock(&blk_mq_cpuhp_lock);
}
/*
@@ -3630,8 +3935,6 @@ static void blk_mq_exit_hctx(struct request_queue *q,
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
- blk_mq_remove_cpuhp(hctx);
-
xa_erase(&q->hctx_table, hctx_idx);
spin_lock(&q->unused_hctx_lock);
@@ -3648,6 +3951,7 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
queue_for_each_hw_ctx(q, hctx, i) {
if (i == nr_queue)
break;
+ blk_mq_remove_cpuhp(hctx);
blk_mq_exit_hctx(q, set, hctx, i);
}
}
@@ -3658,16 +3962,11 @@ static int blk_mq_init_hctx(struct request_queue *q,
{
hctx->queue_num = hctx_idx;
- if (!(hctx->flags & BLK_MQ_F_STACKING))
- cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
- &hctx->cpuhp_online);
- cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
-
hctx->tags = set->tags[hctx_idx];
if (set->ops->init_hctx &&
set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
- goto unregister_cpu_notifier;
+ goto fail;
if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
hctx->numa_node))
@@ -3684,8 +3983,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
exit_hctx:
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
- unregister_cpu_notifier:
- blk_mq_remove_cpuhp(hctx);
+ fail:
return -1;
}
@@ -3711,6 +4009,8 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
spin_lock_init(&hctx->lock);
INIT_LIST_HEAD(&hctx->dispatch);
+ INIT_HLIST_NODE(&hctx->cpuhp_dead);
+ INIT_HLIST_NODE(&hctx->cpuhp_online);
hctx->queue = q;
hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
@@ -3907,6 +4207,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
}
queue_for_each_hw_ctx(q, hctx, i) {
+ int cpu;
+
/*
* If no software queues are mapped to this hardware queue,
* disable it and free the request entries.
@@ -3934,6 +4236,15 @@ static void blk_mq_map_swqueue(struct request_queue *q)
sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
/*
+ * Rule out isolated CPUs from hctx->cpumask to avoid
+ * running block kworker on isolated CPUs
+ */
+ for_each_cpu(cpu, hctx->cpumask) {
+ if (cpu_is_isolated(cpu))
+ cpumask_clear_cpu(cpu, hctx->cpumask);
+ }
+
+ /*
* Initialize batch roundrobin counts
*/
hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
@@ -3964,13 +4275,14 @@ static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
bool shared)
{
struct request_queue *q;
+ unsigned int memflags;
lockdep_assert_held(&set->tag_list_lock);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
- blk_mq_freeze_queue(q);
+ memflags = blk_mq_freeze_queue(q);
queue_set_hctx_shared(q, shared);
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
}
}
@@ -4075,7 +4387,13 @@ struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
struct request_queue *q;
int ret;
- q = blk_alloc_queue(lim ? lim : &default_lim, set->numa_node);
+ if (!lim)
+ lim = &default_lim;
+ lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
+ if (set->nr_maps > HCTX_TYPE_POLL)
+ lim->features |= BLK_FEAT_POLL;
+
+ q = blk_alloc_queue(lim, set->numa_node);
if (IS_ERR(q))
return q;
q->queuedata = queuedata;
@@ -4151,6 +4469,15 @@ struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
}
EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue);
+/*
+ * Only an hctx that has been removed from the cpuhp lists can be reused
+ */
+static bool blk_mq_hctx_is_reusable(struct blk_mq_hw_ctx *hctx)
+{
+ return hlist_unhashed(&hctx->cpuhp_online) &&
+ hlist_unhashed(&hctx->cpuhp_dead);
+}
+
static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
struct blk_mq_tag_set *set, struct request_queue *q,
int hctx_idx, int node)
@@ -4160,7 +4487,7 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
/* reuse dead hctx first */
spin_lock(&q->unused_hctx_lock);
list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
- if (tmp->numa_node == node) {
+ if (tmp->numa_node == node && blk_mq_hctx_is_reusable(tmp)) {
hctx = tmp;
break;
}
@@ -4185,14 +4512,12 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
return NULL;
}
-static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
- struct request_queue *q)
+static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
+ struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
unsigned long i, j;
- /* protect against switching io scheduler */
- mutex_lock(&q->sysfs_lock);
for (i = 0; i < set->nr_hw_queues; i++) {
int old_node;
int node = blk_mq_get_hctx_node(set, i);
@@ -4225,18 +4550,18 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
xa_for_each_start(&q->hctx_table, j, hctx, j)
blk_mq_exit_hctx(q, set, hctx, j);
- mutex_unlock(&q->sysfs_lock);
}
-static void blk_mq_update_poll_flag(struct request_queue *q)
+static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
+ struct request_queue *q)
{
- struct blk_mq_tag_set *set = q->tag_set;
+ __blk_mq_realloc_hw_ctxs(set, q);
- if (set->nr_maps > HCTX_TYPE_POLL &&
- set->map[HCTX_TYPE_POLL].nr_queues)
- blk_queue_flag_set(QUEUE_FLAG_POLL, q);
- else
- blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
+ /* unregister cpuhp callbacks for exited hctxs */
+ blk_mq_remove_hw_queues_cpuhp(q);
+
+ /* register cpuhp for new initialized hctxs */
+ blk_mq_add_hw_queues_cpuhp(q);
}
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
@@ -4245,6 +4570,12 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
/* mark the queue as mq asap */
q->mq_ops = set->ops;
+ /*
+ * ->tag_set has to be set up before initializing hctxs, because the
+ * cpuhp handlers need it to check the queue mapping
+ */
+ q->tag_set = set;
+
if (blk_mq_alloc_ctxs(q))
goto err_exit;
@@ -4263,10 +4594,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
- q->tag_set = set;
-
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
- blk_mq_update_poll_flag(q);
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
INIT_LIST_HEAD(&q->flush_list);
@@ -4276,8 +4604,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
q->nr_requests = set->queue_depth;
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
- blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
+ blk_mq_add_queue_tag_set(set, q);
return 0;
err_hctxs:
@@ -4497,6 +4825,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
goto out_free_srcu;
}
+ init_rwsem(&set->update_nr_hwq_lock);
+
ret = -ENOMEM;
set->tags = kcalloc_node(set->nr_hw_queues,
sizeof(struct blk_mq_tags *), GFP_KERNEL,
@@ -4590,13 +4920,15 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
int ret;
unsigned long i;
+ if (WARN_ON_ONCE(!q->mq_freeze_depth))
+ return -EINVAL;
+
if (!set)
return -EINVAL;
if (q->nr_requests == nr)
return 0;
- blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
ret = 0;
@@ -4630,95 +4962,16 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
}
blk_mq_unquiesce_queue(q);
- blk_mq_unfreeze_queue(q);
return ret;
}
-/*
- * request_queue and elevator_type pair.
- * It is just used by __blk_mq_update_nr_hw_queues to cache
- * the elevator_type associated with a request_queue.
- */
-struct blk_mq_qe_pair {
- struct list_head node;
- struct request_queue *q;
- struct elevator_type *type;
-};
-
-/*
- * Cache the elevator_type in qe pair list and switch the
- * io scheduler to 'none'
- */
-static bool blk_mq_elv_switch_none(struct list_head *head,
- struct request_queue *q)
-{
- struct blk_mq_qe_pair *qe;
-
- qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
- if (!qe)
- return false;
-
- /* q->elevator needs protection from ->sysfs_lock */
- mutex_lock(&q->sysfs_lock);
-
- /* the check has to be done with holding sysfs_lock */
- if (!q->elevator) {
- kfree(qe);
- goto unlock;
- }
-
- INIT_LIST_HEAD(&qe->node);
- qe->q = q;
- qe->type = q->elevator->type;
- /* keep a reference to the elevator module as we'll switch back */
- __elevator_get(qe->type);
- list_add(&qe->node, head);
- elevator_disable(q);
-unlock:
- mutex_unlock(&q->sysfs_lock);
-
- return true;
-}
-
-static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head,
- struct request_queue *q)
-{
- struct blk_mq_qe_pair *qe;
-
- list_for_each_entry(qe, head, node)
- if (qe->q == q)
- return qe;
-
- return NULL;
-}
-
-static void blk_mq_elv_switch_back(struct list_head *head,
- struct request_queue *q)
-{
- struct blk_mq_qe_pair *qe;
- struct elevator_type *t;
-
- qe = blk_lookup_qe_pair(head, q);
- if (!qe)
- return;
- t = qe->type;
- list_del(&qe->node);
- kfree(qe);
-
- mutex_lock(&q->sysfs_lock);
- elevator_switch(q, t);
- /* drop the reference acquired in blk_mq_elv_switch_none */
- elevator_put(t);
- mutex_unlock(&q->sysfs_lock);
-}
-
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int nr_hw_queues)
{
struct request_queue *q;
- LIST_HEAD(head);
int prev_nr_hw_queues = set->nr_hw_queues;
+ unsigned int memflags;
int i;
lockdep_assert_held(&set->tag_list_lock);
@@ -4730,30 +4983,26 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
return;
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_freeze_queue(q);
- /*
- * Switch IO scheduler to 'none', cleaning up the data associated
- * with the previous scheduler. We will switch back once we are done
- * updating the new sw to hw queue mappings.
- */
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- if (!blk_mq_elv_switch_none(&head, q))
- goto switch_back;
-
+ memflags = memalloc_noio_save();
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_debugfs_unregister_hctxs(q);
blk_mq_sysfs_unregister_hctxs(q);
}
- if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ blk_mq_freeze_queue_nomemsave(q);
+
+ if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) {
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ blk_mq_unfreeze_queue_nomemrestore(q);
goto reregister;
+ }
fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
- blk_mq_realloc_hw_ctxs(set, q);
- blk_mq_update_poll_flag(q);
+ __blk_mq_realloc_hw_ctxs(set, q);
+
if (q->nr_hw_queues != set->nr_hw_queues) {
int i = prev_nr_hw_queues;
@@ -4768,18 +5017,19 @@ fallback:
blk_mq_map_swqueue(q);
}
+ /* elv_update_nr_hw_queues() unfreezes the queue for us */
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ elv_update_nr_hw_queues(q);
+
reregister:
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_sysfs_register_hctxs(q);
blk_mq_debugfs_register_hctxs(q);
- }
-switch_back:
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_elv_switch_back(&head, q);
-
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_unfreeze_queue(q);
+ blk_mq_remove_hw_queues_cpuhp(q);
+ blk_mq_add_hw_queues_cpuhp(q);
+ }
+ memalloc_noio_restore(memflags);
/* Free the excess tags when nr_hw_queues shrink. */
for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++)
@@ -4788,9 +5038,11 @@ switch_back:
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{
+ down_write(&set->update_nr_hwq_lock);
mutex_lock(&set->tag_list_lock);
__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
mutex_unlock(&set->tag_list_lock);
+ up_write(&set->update_nr_hwq_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
@@ -4824,9 +5076,9 @@ static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie,
struct io_comp_batch *iob, unsigned int flags)
{
- struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie);
-
- return blk_hctx_poll(q, hctx, iob, flags);
+ if (!blk_mq_can_poll(q))
+ return 0;
+ return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags);
}
int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
diff --git a/block/blk-mq.h b/block/blk-mq.h
index f75a9ecfebde..affb2e14b56e 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -36,6 +36,8 @@ enum {
BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1,
};
+#define BLK_MQ_CPU_WORK_BATCH (8)
+
typedef unsigned int __bitwise blk_insert_t;
#define BLK_MQ_INSERT_AT_HEAD ((__force blk_insert_t)0x01)
@@ -46,7 +48,7 @@ void blk_mq_exit_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *,
- unsigned int);
+ bool);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *start);
@@ -98,12 +100,10 @@ static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf)
/*
* blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
- * @q: request queue
* @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED).
* @ctx: software queue cpu ctx
*/
-static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
- blk_opf_t opf,
+static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf,
struct blk_mq_ctx *ctx)
{
return ctx->hctxs[blk_mq_get_hctx_type(opf)];
@@ -153,7 +153,7 @@ struct blk_mq_alloc_data {
/* allocate multiple requests/tags in one go */
unsigned int nr_tags;
- struct request **cached_rq;
+ struct rq_list *cached_rqs;
/* input & output parameter */
struct blk_mq_ctx *ctx;
@@ -161,11 +161,8 @@ struct blk_mq_alloc_data {
};
struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
- unsigned int reserved_tags, int node, int alloc_policy);
+ unsigned int reserved_tags, unsigned int flags, int node);
void blk_mq_free_tags(struct blk_mq_tags *tags);
-int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
- struct sbitmap_queue *breserved_tags, unsigned int queue_depth,
- unsigned int reserved, int node, int alloc_policy);
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
@@ -228,6 +225,19 @@ static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data
static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
{
+ /* Fast path: hardware queue is not stopped most of the time. */
+ if (likely(!test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
+ return false;
+
+ /*
+ * This barrier orders adding requests to the dispatch list before the
+ * test of BLK_MQ_S_STOPPED below. It pairs with the memory barrier in
+ * blk_mq_start_stopped_hw_queue() so that the dispatch code either sees
+ * that BLK_MQ_S_STOPPED has been cleared or sees a non-empty dispatch
+ * list, avoiding missed dispatches.
+ */
+ smp_mb();
+
return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
@@ -236,10 +246,7 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
return hctx->nr_ctx && hctx->tags;
}
-unsigned int blk_mq_in_flight(struct request_queue *q,
- struct block_device *part);
-void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
- unsigned int inflight[2]);
+void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]);
static inline void blk_mq_put_dispatch_budget(struct request_queue *q,
int budget_token)
@@ -365,37 +372,6 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
qmap->mq_map[cpu] = 0;
}
-/*
- * blk_mq_plug() - Get caller context plug
- * @bio : the bio being submitted by the caller context
- *
- * Plugging, by design, may delay the insertion of BIOs into the elevator in
- * order to increase BIO merging opportunities. This however can cause BIO
- * insertion order to change from the order in which submit_bio() is being
- * executed in the case of multiple contexts concurrently issuing BIOs to a
- * device, even if these context are synchronized to tightly control BIO issuing
- * order. While this is not a problem with regular block devices, this ordering
- * change can cause write BIO failures with zoned block devices as these
- * require sequential write patterns to zones. Prevent this from happening by
- * ignoring the plug state of a BIO issuing context if it is for a zoned block
- * device and the BIO to plug is a write operation.
- *
- * Return current->plug if the bio can be plugged and NULL otherwise
- */
-static inline struct blk_plug *blk_mq_plug( struct bio *bio)
-{
- /* Zoned block device write operation case: do not plug the BIO */
- if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
- bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio)))
- return NULL;
-
- /*
- * For regular block devices or read operations, use the context plug
- * which may be NULL if blk_start_plug() was not executed.
- */
- return current->plug;
-}
-
/* Free all requests on the list */
static inline void blk_mq_free_requests(struct list_head *list)
{
@@ -467,4 +443,10 @@ do { \
#define blk_mq_run_dispatch_ops(q, dispatch_ops) \
__blk_mq_run_dispatch_ops(q, true, dispatch_ops) \
+static inline bool blk_mq_can_poll(struct request_queue *q)
+{
+ return (q->limits.features & BLK_FEAT_POLL) &&
+ q->tag_set->map[HCTX_TYPE_POLL].nr_queues;
+}
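
blk_mq_can_poll() simply requires both the BLK_FEAT_POLL feature bit and at least one mapped poll queue. A tiny standalone sketch of that check, with an illustrative flag value (the real bit lives in include/linux/blkdev.h; the struct names here are stand-ins):

#include <stdbool.h>
#include <stdio.h>

/* illustrative stand-in; not the real kernel value */
#define BLK_FEAT_POLL	(1u << 0)

struct queue_limits { unsigned int features; };
struct poll_map { unsigned int nr_queues; };

/* polling needs both the feature flag and at least one mapped poll queue */
static bool can_poll(const struct queue_limits *lim, const struct poll_map *map)
{
	return (lim->features & BLK_FEAT_POLL) && map->nr_queues;
}

int main(void)
{
	struct queue_limits lim = { .features = BLK_FEAT_POLL };
	struct poll_map none = { 0 }, two = { 2 };

	printf("%d %d\n", can_poll(&lim, &none), can_poll(&lim, &two)); /* 0 1 */
	return 0;
}
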
+
#endif
diff --git a/block/blk-pm.c b/block/blk-pm.c
index 42e842074715..8d3e052f91da 100644
--- a/block/blk-pm.c
+++ b/block/blk-pm.c
@@ -89,7 +89,7 @@ int blk_pre_runtime_suspend(struct request_queue *q)
if (percpu_ref_is_zero(&q->q_usage_counter))
ret = 0;
/* Switch q_usage_counter back to per-cpu mode. */
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue_nomemrestore(q);
if (ret < 0) {
spin_lock_irq(&q->queue_lock);
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index dd7310c94713..848591fb3c57 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -2,6 +2,8 @@
#include "blk-rq-qos.h"
+__read_mostly DEFINE_STATIC_KEY_FALSE(block_rq_qos);
+
/*
* Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
* false if 'v' + 1 would be bigger than 'below'.
@@ -196,7 +198,6 @@ bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
struct rq_qos_wait_data {
struct wait_queue_entry wq;
- struct task_struct *task;
struct rq_wait *rqw;
acquire_inflight_cb_t *cb;
void *private_data;
@@ -218,9 +219,21 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr,
return -1;
data->got_token = true;
- smp_wmb();
- list_del_init(&curr->entry);
- wake_up_process(data->task);
+ /*
+ * autoremove_wake_function() removes the wait entry only when it
+ * actually changed the task state. We want the wait always removed.
+ * Remove explicitly and use default_wake_function().
+ */
+ default_wake_function(curr, mode, wake_flags, key);
+ /*
+ * Note that the order of operations matters: finish_wait() tests
+ * whether @curr has been removed without grabbing the lock, so this
+ * must be the last step to avoid a use-after-free of @data. The
+ * memory barrier implied here also ensures the waiter sees the
+ * latest @data->got_token once list_empty_careful() in finish_wait()
+ * returns true.
+ */
+ list_del_init_careful(&curr->entry);
return 1;
}
@@ -245,42 +258,55 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
cleanup_cb_t *cleanup_cb)
{
struct rq_qos_wait_data data = {
- .wq = {
- .func = rq_qos_wake_function,
- .entry = LIST_HEAD_INIT(data.wq.entry),
- },
- .task = current,
- .rqw = rqw,
- .cb = acquire_inflight_cb,
- .private_data = private_data,
+ .rqw = rqw,
+ .cb = acquire_inflight_cb,
+ .private_data = private_data,
+ .got_token = false,
};
- bool has_sleeper;
+ bool first_waiter;
- has_sleeper = wq_has_sleeper(&rqw->wait);
- if (!has_sleeper && acquire_inflight_cb(rqw, private_data))
+ /*
+ * If there are no waiters in the waiting queue, try to increase the
+ * inflight counter if we can. Otherwise, prepare for adding ourselves
+ * to the waiting queue.
+ */
+ if (!waitqueue_active(&rqw->wait) && acquire_inflight_cb(rqw, private_data))
return;
- has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq,
+ init_wait_func(&data.wq, rq_qos_wake_function);
+ first_waiter = prepare_to_wait_exclusive(&rqw->wait, &data.wq,
TASK_UNINTERRUPTIBLE);
+ /*
+ * Make sure there is at least one inflight process; otherwise, waiters
+ * will never be woken up. Since there may be no inflight process before
+ * adding ourselves to the waiting queue above, we need to try to
+ * increase the inflight counter for ourselves. It is sufficient that
+ * at least the first waiter to enter the waiting queue re-checks the
+ * waiting condition before going to sleep, which ensures forward
+ * progress.
+ */
+ if (!data.got_token && first_waiter && acquire_inflight_cb(rqw, private_data)) {
+ finish_wait(&rqw->wait, &data.wq);
+ /*
+ * We raced with rq_qos_wake_function() getting a token,
+ * which means we now have two. Put our local token
+ * and wake anyone else potentially waiting for one.
+ *
+ * The memory barrier in list_empty_careful() in
+ * finish_wait() pairs with list_del_init_careful()
+ * in rq_qos_wake_function() to make sure we see
+ * the latest @data->got_token.
+ */
+ if (data.got_token)
+ cleanup_cb(rqw, private_data);
+ return;
+ }
+
+ /* we are now relying on the waker to increase our inflight counter. */
do {
- /* The memory barrier in set_task_state saves us here. */
if (data.got_token)
break;
- if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) {
- finish_wait(&rqw->wait, &data.wq);
-
- /*
- * We raced with rq_qos_wake_function() getting a token,
- * which means we now have two. Put our local token
- * and wake anyone else potentially waiting for one.
- */
- smp_rmb();
- if (data.got_token)
- cleanup_cb(rqw, private_data);
- break;
- }
io_schedule();
- has_sleeper = true;
set_current_state(TASK_UNINTERRUPTIBLE);
} while (1);
finish_wait(&rqw->wait, &data.wq);
@@ -293,6 +319,7 @@ void rq_qos_exit(struct request_queue *q)
struct rq_qos *rqos = q->rq_qos;
q->rq_qos = rqos->next;
rqos->ops->exit(rqos);
+ static_branch_dec(&block_rq_qos);
}
mutex_unlock(&q->rq_qos_mutex);
}
@@ -301,6 +328,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
const struct rq_qos_ops *ops)
{
struct request_queue *q = disk->queue;
+ unsigned int memflags;
lockdep_assert_held(&q->rq_qos_mutex);
@@ -312,14 +340,15 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
* No IO can be in-flight when adding rqos, so freeze queue, which
* is fine since we only support rq_qos for blk-mq queue.
*/
- blk_mq_freeze_queue(q);
+ memflags = blk_mq_freeze_queue(q);
if (rq_qos_id(q, rqos->id))
goto ebusy;
rqos->next = q->rq_qos;
q->rq_qos = rqos;
+ static_branch_inc(&block_rq_qos);
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
if (rqos->ops->debugfs_attrs) {
mutex_lock(&q->debugfs_mutex);
@@ -329,7 +358,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
return 0;
ebusy:
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
return -EBUSY;
}
@@ -337,17 +366,18 @@ void rq_qos_del(struct rq_qos *rqos)
{
struct request_queue *q = rqos->disk->queue;
struct rq_qos **cur;
+ unsigned int memflags;
lockdep_assert_held(&q->rq_qos_mutex);
- blk_mq_freeze_queue(q);
+ memflags = blk_mq_freeze_queue(q);
for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
if (*cur == rqos) {
*cur = rqos->next;
break;
}
}
- blk_mq_unfreeze_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_unregister_rqos(rqos);
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 37245c97ee61..39749f4066fb 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -12,6 +12,7 @@
#include "blk-mq-debugfs.h"
struct blk_mq_debugfs_attr;
+extern struct static_key_false block_rq_qos;
enum rq_qos_id {
RQ_QOS_WBT,
@@ -112,31 +113,33 @@ void __rq_qos_queue_depth_changed(struct rq_qos *rqos);
static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
{
- if (q->rq_qos)
+ if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
__rq_qos_cleanup(q->rq_qos, bio);
}
static inline void rq_qos_done(struct request_queue *q, struct request *rq)
{
- if (q->rq_qos && !blk_rq_is_passthrough(rq))
+ if (static_branch_unlikely(&block_rq_qos) && q->rq_qos &&
+ !blk_rq_is_passthrough(rq))
__rq_qos_done(q->rq_qos, rq);
}
static inline void rq_qos_issue(struct request_queue *q, struct request *rq)
{
- if (q->rq_qos)
+ if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
__rq_qos_issue(q->rq_qos, rq);
}
static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
{
- if (q->rq_qos)
+ if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
__rq_qos_requeue(q->rq_qos, rq);
}
static inline void rq_qos_done_bio(struct bio *bio)
{
- if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) ||
+ if (static_branch_unlikely(&block_rq_qos) &&
+ bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) ||
bio_flagged(bio, BIO_QOS_MERGED))) {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
if (q->rq_qos)
@@ -146,7 +149,7 @@ static inline void rq_qos_done_bio(struct bio *bio)
static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
{
- if (q->rq_qos) {
+ if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) {
bio_set_flag(bio, BIO_QOS_THROTTLED);
__rq_qos_throttle(q->rq_qos, bio);
}
@@ -155,14 +158,14 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
static inline void rq_qos_track(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- if (q->rq_qos)
+ if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
__rq_qos_track(q->rq_qos, rq, bio);
}
static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- if (q->rq_qos) {
+ if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) {
bio_set_flag(bio, BIO_QOS_MERGED);
__rq_qos_merge(q->rq_qos, rq, bio);
}
@@ -170,7 +173,7 @@ static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
static inline void rq_qos_queue_depth_changed(struct request_queue *q)
{
- if (q->rq_qos)
+ if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
__rq_qos_queue_depth_changed(q->rq_qos);
}
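
All of these hooks are now gated on the block_rq_qos static key, so queues without any rq_qos policy attached pay essentially no cost. A rough userspace sketch of the gating pattern, using a plain boolean in place of the static key (real static keys patch the branch at runtime; the struct and function names here are stand-ins):

#include <stdbool.h>
#include <stdio.h>

/* plain boolean standing in for the block_rq_qos static key */
static bool block_rq_qos_enabled;

struct rq_qos { const char *name; };
struct request_queue { struct rq_qos *rq_qos; };

static void __rq_qos_issue(struct rq_qos *rqos)
{
	printf("issue hook for %s\n", rqos->name);
}

/* the hook body is skipped unless a policy is attached somewhere */
static void rq_qos_issue(struct request_queue *q)
{
	if (block_rq_qos_enabled && q->rq_qos)
		__rq_qos_issue(q->rq_qos);
}

int main(void)
{
	struct rq_qos wbt = { .name = "wbt" };
	struct request_queue q = { .rq_qos = NULL };

	rq_qos_issue(&q);		/* no policy, key off: nothing happens */

	q.rq_qos = &wbt;
	block_rq_qos_enabled = true;	/* rq_qos_add() increments the key */
	rq_qos_issue(&q);		/* now the hook runs */
	return 0;
}
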
diff --git a/block/blk-settings.c b/block/blk-settings.c
index d2731843f2fc..a000daafbfb4 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -6,7 +6,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
-#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/pagemap.h>
#include <linux/backing-dev-defs.h>
#include <linux/gcd.h>
@@ -21,7 +21,7 @@
void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
{
- q->rq_timeout = timeout;
+ WRITE_ONCE(q->rq_timeout, timeout);
}
EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
@@ -50,25 +50,31 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_sectors = UINT_MAX;
lim->max_dev_sectors = UINT_MAX;
lim->max_write_zeroes_sectors = UINT_MAX;
- lim->max_zone_append_sectors = UINT_MAX;
+ lim->max_hw_zone_append_sectors = UINT_MAX;
lim->max_user_discard_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
-static void blk_apply_bdi_limits(struct backing_dev_info *bdi,
+void blk_apply_bdi_limits(struct backing_dev_info *bdi,
struct queue_limits *lim)
{
/*
* For read-ahead of large files to be effective, we need to read ahead
* at least twice the optimal I/O size.
+ *
+ * There is no hardware limitation for the read-ahead size and the user
+ * might have increased the read-ahead size through sysfs, so don't ever
+ * decrease it.
*/
- bdi->ra_pages = max(lim->io_opt * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
+ bdi->ra_pages = max3(bdi->ra_pages,
+ lim->io_opt * 2 / PAGE_SIZE,
+ VM_READAHEAD_PAGES);
bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT;
}
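
blk_apply_bdi_limits() now only ever raises ra_pages. A small userspace sketch of that arithmetic, assuming 4 KiB pages and the default VM_READAHEAD_PAGES of 32; the numbers are illustrative:

#include <stdio.h>

#define PAGE_SIZE		4096u
#define VM_READAHEAD_PAGES	32u	/* 128 KiB with 4 KiB pages */

static unsigned int max3u(unsigned int a, unsigned int b, unsigned int c)
{
	unsigned int m = a > b ? a : b;
	return m > c ? m : c;
}

int main(void)
{
	unsigned int ra_pages = 256;		/* user already raised it via sysfs */
	unsigned int io_opt = 64 * 1024;	/* 64 KiB optimal I/O from the device */

	/* never shrink an existing (possibly user-set) read-ahead window */
	ra_pages = max3u(ra_pages, io_opt * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
	printf("ra_pages = %u (%u KiB)\n", ra_pages, ra_pages * PAGE_SIZE / 1024);
	return 0;
}
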
static int blk_validate_zoned_limits(struct queue_limits *lim)
{
- if (!lim->zoned) {
+ if (!(lim->features & BLK_FEAT_ZONED)) {
if (WARN_ON_ONCE(lim->max_open_zones) ||
WARN_ON_ONCE(lim->max_active_zones) ||
WARN_ON_ONCE(lim->zone_write_granularity) ||
@@ -80,30 +86,175 @@ static int blk_validate_zoned_limits(struct queue_limits *lim)
if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)))
return -EINVAL;
+ /*
+ * Given that active zones include open zones, the maximum number of
+ * open zones cannot be larger than the maximum number of active zones.
+ */
+ if (lim->max_active_zones &&
+ lim->max_open_zones > lim->max_active_zones)
+ return -EINVAL;
+
if (lim->zone_write_granularity < lim->logical_block_size)
lim->zone_write_granularity = lim->logical_block_size;
- if (lim->max_zone_append_sectors) {
+ /*
+ * The Zone Append size is limited by the maximum I/O size and the zone
+ * size given that it can't span zones.
+ *
+ * If no max_hw_zone_append_sectors limit is provided, the block layer
+ * will emulate it; otherwise we're also bound by the hardware limit.
+ */
+ lim->max_zone_append_sectors =
+ min_not_zero(lim->max_hw_zone_append_sectors,
+ min(lim->chunk_sectors, lim->max_hw_sectors));
+ return 0;
+}
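
In other words, the effective zone-append limit is the hardware limit (when one is reported) capped by what a single I/O can span inside one zone. A standalone sketch of the min_not_zero()/min() combination with made-up sector counts:

#include <stdio.h>

/* like the kernel's min_not_zero(): a zero limit means "no limit" */
static unsigned int min_not_zero(unsigned int a, unsigned int b)
{
	if (!a)
		return b;
	if (!b)
		return a;
	return a < b ? a : b;
}

static unsigned int minu(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned int max_hw_zone_append = 0;	/* device can't do zone append: emulated */
	unsigned int chunk_sectors = 524288;	/* 256 MiB zone size */
	unsigned int max_hw_sectors = 2048;	/* 1 MiB max I/O */

	unsigned int max_zone_append =
		min_not_zero(max_hw_zone_append, minu(chunk_sectors, max_hw_sectors));
	printf("max_zone_append_sectors = %u\n", max_zone_append);	/* 2048 */
	return 0;
}
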
+
+static int blk_validate_integrity_limits(struct queue_limits *lim)
+{
+ struct blk_integrity *bi = &lim->integrity;
+
+ if (!bi->tuple_size) {
+ if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE ||
+ bi->tag_size || (bi->flags & BLK_INTEGRITY_REF_TAG)) {
+ pr_warn("invalid PI settings.\n");
+ return -EINVAL;
+ }
+ bi->flags |= BLK_INTEGRITY_NOGENERATE | BLK_INTEGRITY_NOVERIFY;
+ return 0;
+ }
+
+ if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) {
+ pr_warn("integrity support disabled.\n");
+ return -EINVAL;
+ }
+
+ if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE &&
+ (bi->flags & BLK_INTEGRITY_REF_TAG)) {
+ pr_warn("ref tag not support without checksum.\n");
+ return -EINVAL;
+ }
+
+ if (!bi->interval_exp)
+ bi->interval_exp = ilog2(lim->logical_block_size);
+
+ return 0;
+}
+
+/*
+ * Returns the maximum number of bytes guaranteed to fit in a bio.
+ *
+ * We require that an atomic_write is an ITER_UBUF iov_iter (so a single
+ * vector), and we assume that each segment can hold at least PAGE_SIZE,
+ * apart from the first and last segments.
+ */
+static unsigned int blk_queue_max_guaranteed_bio(struct queue_limits *lim)
+{
+ unsigned int max_segments = min(BIO_MAX_VECS, lim->max_segments);
+ unsigned int length;
+
+ length = min(max_segments, 2) * lim->logical_block_size;
+ if (max_segments > 2)
+ length += (max_segments - 2) * PAGE_SIZE;
+
+ return length;
+}
+
+static void blk_atomic_writes_update_limits(struct queue_limits *lim)
+{
+ unsigned int unit_limit = min(lim->max_hw_sectors << SECTOR_SHIFT,
+ blk_queue_max_guaranteed_bio(lim));
+
+ unit_limit = rounddown_pow_of_two(unit_limit);
+
+ lim->atomic_write_max_sectors =
+ min(lim->atomic_write_hw_max >> SECTOR_SHIFT,
+ lim->max_hw_sectors);
+ lim->atomic_write_unit_min =
+ min(lim->atomic_write_hw_unit_min, unit_limit);
+ lim->atomic_write_unit_max =
+ min(lim->atomic_write_hw_unit_max, unit_limit);
+ lim->atomic_write_boundary_sectors =
+ lim->atomic_write_hw_boundary >> SECTOR_SHIFT;
+}
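
A sketch of how the guaranteed-bio length and the derived atomic write unit limit come out for one made-up set of limits, assuming BIO_MAX_VECS = 256 and a 4 KiB page; the helpers mirror the ones above but are plain userspace code:

#include <stdio.h>

#define PAGE_SIZE	4096u
#define SECTOR_SHIFT	9
#define BIO_MAX_VECS	256u

static unsigned int minu(unsigned int a, unsigned int b) { return a < b ? a : b; }

/* highest power of two <= v, for v >= 1 */
static unsigned int rounddown_pow_of_two(unsigned int v)
{
	unsigned int p = 1;
	while (p * 2 <= v && p * 2 > p)
		p *= 2;
	return p;
}

int main(void)
{
	unsigned int logical_block_size = 512;
	unsigned int max_segments = 128;
	unsigned int max_hw_sectors = 2048;		/* 1 MiB */
	unsigned int atomic_write_hw_unit_max = 65536;	/* 64 KiB from the device */

	/* first + last segment only guarantee a logical block each, the rest a page */
	unsigned int segs = minu(BIO_MAX_VECS, max_segments);
	unsigned int guaranteed = minu(segs, 2) * logical_block_size;
	if (segs > 2)
		guaranteed += (segs - 2) * PAGE_SIZE;

	unsigned int unit_limit = rounddown_pow_of_two(
		minu(max_hw_sectors << SECTOR_SHIFT, guaranteed));

	printf("guaranteed bio bytes = %u\n", guaranteed);
	printf("atomic_write_unit_max = %u\n",
	       minu(atomic_write_hw_unit_max, unit_limit));
	return 0;
}
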
+
+static void blk_validate_atomic_write_limits(struct queue_limits *lim)
+{
+ unsigned int boundary_sectors;
+
+ if (!(lim->features & BLK_FEAT_ATOMIC_WRITES))
+ goto unsupported;
+
+ if (!lim->atomic_write_hw_max)
+ goto unsupported;
+
+ if (WARN_ON_ONCE(!is_power_of_2(lim->atomic_write_hw_unit_min)))
+ goto unsupported;
+
+ if (WARN_ON_ONCE(!is_power_of_2(lim->atomic_write_hw_unit_max)))
+ goto unsupported;
+
+ if (WARN_ON_ONCE(lim->atomic_write_hw_unit_min >
+ lim->atomic_write_hw_unit_max))
+ goto unsupported;
+
+ if (WARN_ON_ONCE(lim->atomic_write_hw_unit_max >
+ lim->atomic_write_hw_max))
+ goto unsupported;
+
+ boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT;
+
+ if (boundary_sectors) {
+ if (WARN_ON_ONCE(lim->atomic_write_hw_max >
+ lim->atomic_write_hw_boundary))
+ goto unsupported;
/*
- * The Zone Append size is limited by the maximum I/O size
- * and the zone size given that it can't span zones.
+ * Boundary support disallows merging bios when the merged
+ * request would cross either a chunk sector boundary or the
+ * atomic write HW boundary, even though chunk sectors may be
+ * set purely for performance.
+ * For simplicity, disallow atomic writes when the chunk sector
+ * size is non-zero and smaller than the atomic write HW boundary.
+ * Furthermore, chunk sectors must be a multiple of the atomic
+ * write HW boundary; otherwise boundary support becomes
+ * complicated.
+ * Devices which do not conform to these rules can be dealt
+ * with if and when they show up.
*/
- lim->max_zone_append_sectors =
- min3(lim->max_hw_sectors,
- lim->max_zone_append_sectors,
- lim->chunk_sectors);
+ if (WARN_ON_ONCE(lim->chunk_sectors % boundary_sectors))
+ goto unsupported;
+
+ /*
+ * The boundary size just needs to be a multiple of unit_max
+ * (and not necessarily a power-of-2), so the following check
+ * could be relaxed in the future.
+ * Furthermore, if needed, unit_max could even be reduced so
+ * that it is compliant with a non-power-of-2 boundary.
+ */
+ if (!is_power_of_2(boundary_sectors))
+ goto unsupported;
}
- return 0;
+ blk_atomic_writes_update_limits(lim);
+ return;
+
+unsupported:
+ lim->atomic_write_max_sectors = 0;
+ lim->atomic_write_boundary_sectors = 0;
+ lim->atomic_write_unit_min = 0;
+ lim->atomic_write_unit_max = 0;
}
/*
* Check that the limits in lim are valid, initialize defaults for unset
* values, and cap values based on others where needed.
*/
-static int blk_validate_limits(struct queue_limits *lim)
+int blk_validate_limits(struct queue_limits *lim)
{
unsigned int max_hw_sectors;
+ unsigned int logical_block_sectors;
+ unsigned long seg_size;
+ int err;
/*
* Unless otherwise specified, default to 512 byte logical blocks and a
@@ -111,6 +262,10 @@ static int blk_validate_limits(struct queue_limits *lim)
*/
if (!lim->logical_block_size)
lim->logical_block_size = SECTOR_SIZE;
+ else if (blk_validate_block_size(lim->logical_block_size)) {
+ pr_warn("Invalid logical block size (%d)\n", lim->logical_block_size);
+ return -EINVAL;
+ }
if (lim->physical_block_size < lim->logical_block_size)
lim->physical_block_size = lim->logical_block_size;
@@ -122,6 +277,13 @@ static int blk_validate_limits(struct queue_limits *lim)
lim->io_min = lim->physical_block_size;
/*
+ * The optimal I/O size may not be aligned to physical block size
+ * (because it may be limited by dma engines which have no clue about
+ * block size of the disks attached to them), so we round it down here.
+ */
+ lim->io_opt = round_down(lim->io_opt, lim->physical_block_size);
+
+ /*
* max_hw_sectors has a somewhat weird default for historical reason,
* but driver really should set their own instead of relying on this
* value.
@@ -134,8 +296,11 @@ static int blk_validate_limits(struct queue_limits *lim)
lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SECTORS))
return -EINVAL;
+ logical_block_sectors = lim->logical_block_size >> SECTOR_SHIFT;
+ if (WARN_ON_ONCE(logical_block_sectors > lim->max_hw_sectors))
+ return -EINVAL;
lim->max_hw_sectors = round_down(lim->max_hw_sectors,
- lim->logical_block_size >> SECTOR_SHIFT);
+ logical_block_sectors);
/*
* The actual max_sectors value is a complex beast and also takes the
@@ -146,14 +311,20 @@ static int blk_validate_limits(struct queue_limits *lim)
max_hw_sectors = min_not_zero(lim->max_hw_sectors,
lim->max_dev_sectors);
if (lim->max_user_sectors) {
- if (lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE)
+ if (lim->max_user_sectors < BLK_MIN_SEGMENT_SIZE / SECTOR_SIZE)
return -EINVAL;
lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors);
+ } else if (lim->io_opt > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) {
+ lim->max_sectors =
+ min(max_hw_sectors, lim->io_opt >> SECTOR_SHIFT);
+ } else if (lim->io_min > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) {
+ lim->max_sectors =
+ min(max_hw_sectors, lim->io_min >> SECTOR_SHIFT);
} else {
lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP);
}
lim->max_sectors = round_down(lim->max_sectors,
- lim->logical_block_size >> SECTOR_SHIFT);
+ logical_block_sectors);
/*
* Random default for the maximum number of segments. Driver should not
@@ -178,7 +349,7 @@ static int blk_validate_limits(struct queue_limits *lim)
*/
if (!lim->seg_boundary_mask)
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
- if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1))
+ if (WARN_ON_ONCE(lim->seg_boundary_mask < BLK_MIN_SEGMENT_SIZE - 1))
return -EINVAL;
/*
@@ -188,7 +359,10 @@ static int blk_validate_limits(struct queue_limits *lim)
* bvec and lower layer bio splitting is supposed to handle the two
* correctly.
*/
- if (!lim->virt_boundary_mask) {
+ if (lim->virt_boundary_mask) {
+ if (!lim->max_segment_size)
+ lim->max_segment_size = UINT_MAX;
+ } else {
/*
* The maximum segment size has an odd historic 64k default that
* drivers probably should override. Just like the I/O size we
@@ -196,10 +370,17 @@ static int blk_validate_limits(struct queue_limits *lim)
*/
if (!lim->max_segment_size)
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
- if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE))
+ if (WARN_ON_ONCE(lim->max_segment_size < BLK_MIN_SEGMENT_SIZE))
return -EINVAL;
}
+ /* setup min segment size for building new segment in fast path */
+ if (lim->seg_boundary_mask > lim->max_segment_size - 1)
+ seg_size = lim->max_segment_size;
+ else
+ seg_size = lim->seg_boundary_mask + 1;
+ lim->min_segment_size = min_t(unsigned int, seg_size, PAGE_SIZE);
+
/*
* We require drivers to at least do logical block aligned I/O, but
* historically could not check for that due to the separate calls
@@ -213,11 +394,20 @@ static int blk_validate_limits(struct queue_limits *lim)
if (lim->alignment_offset) {
lim->alignment_offset &= (lim->physical_block_size - 1);
- lim->misaligned = 0;
+ lim->flags &= ~BLK_FLAG_MISALIGNED;
}
+ if (!(lim->features & BLK_FEAT_WRITE_CACHE))
+ lim->features &= ~BLK_FEAT_FUA;
+
+ blk_validate_atomic_write_limits(lim);
+
+ err = blk_validate_integrity_limits(lim);
+ if (err)
+ return err;
return blk_validate_zoned_limits(lim);
}
+EXPORT_SYMBOL_GPL(blk_validate_limits);
/*
* Set the default limits for a newly allocated queue. @lim contains the
@@ -241,27 +431,63 @@ int blk_set_default_limits(struct queue_limits *lim)
* @lim: limits to apply
*
* Apply the limits in @lim that were obtained from queue_limits_start_update()
- * and updated by the caller to @q.
+ * and updated by the caller to @q. The caller must have frozen the queue or
+ * ensure that there are no outstanding I/Os by other means.
*
* Returns 0 if successful, else a negative error code.
*/
int queue_limits_commit_update(struct request_queue *q,
struct queue_limits *lim)
- __releases(q->limits_lock)
{
- int error = blk_validate_limits(lim);
+ int error;
+
+ error = blk_validate_limits(lim);
+ if (error)
+ goto out_unlock;
- if (!error) {
- q->limits = *lim;
- if (q->disk)
- blk_apply_bdi_limits(q->disk->bdi, lim);
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+ if (q->crypto_profile && lim->integrity.tag_size) {
+ pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together.\n");
+ error = -EINVAL;
+ goto out_unlock;
}
+#endif
+
+ q->limits = *lim;
+ if (q->disk)
+ blk_apply_bdi_limits(q->disk->bdi, lim);
+out_unlock:
mutex_unlock(&q->limits_lock);
return error;
}
EXPORT_SYMBOL_GPL(queue_limits_commit_update);
/**
+ * queue_limits_commit_update_frozen - commit an atomic update of queue limits
+ * @q: queue to update
+ * @lim: limits to apply
+ *
+ * Apply the limits in @lim that were obtained from queue_limits_start_update()
+ * and updated with the new values by the caller to @q. Freezes the queue
+ * before the update and unfreezes it after.
+ *
+ * Returns 0 if successful, else a negative error code.
+ */
+int queue_limits_commit_update_frozen(struct request_queue *q,
+ struct queue_limits *lim)
+{
+ unsigned int memflags;
+ int ret;
+
+ memflags = blk_mq_freeze_queue(q);
+ ret = queue_limits_commit_update(q, lim);
+ blk_mq_unfreeze_queue(q, memflags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(queue_limits_commit_update_frozen);
+
+/**
* queue_limits_set - apply queue limits to queue
* @q: queue to update
* @lim: limits to apply
@@ -279,446 +505,159 @@ int queue_limits_set(struct request_queue *q, struct queue_limits *lim)
}
EXPORT_SYMBOL_GPL(queue_limits_set);
-/**
- * blk_queue_bounce_limit - set bounce buffer limit for queue
- * @q: the request queue for the device
- * @bounce: bounce limit to enforce
- *
- * Description:
- * Force bouncing for ISA DMA ranges or highmem.
- *
- * DEPRECATED, don't use in new code.
- **/
-void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce bounce)
+static int queue_limit_alignment_offset(const struct queue_limits *lim,
+ sector_t sector)
{
- q->limits.bounce = bounce;
+ unsigned int granularity = max(lim->physical_block_size, lim->io_min);
+ unsigned int alignment = sector_div(sector, granularity >> SECTOR_SHIFT)
+ << SECTOR_SHIFT;
+
+ return (granularity + lim->alignment_offset - alignment) % granularity;
}
-EXPORT_SYMBOL(blk_queue_bounce_limit);
-/**
- * blk_queue_max_hw_sectors - set max sectors for a request for this queue
- * @q: the request queue for the device
- * @max_hw_sectors: max hardware sectors in the usual 512b unit
- *
- * Description:
- * Enables a low level driver to set a hard upper limit,
- * max_hw_sectors, on the size of requests. max_hw_sectors is set by
- * the device driver based upon the capabilities of the I/O
- * controller.
- *
- * max_dev_sectors is a hard limit imposed by the storage device for
- * READ/WRITE requests. It is set by the disk driver.
- *
- * max_sectors is a soft limit imposed by the block layer for
- * filesystem type requests. This value can be overridden on a
- * per-device basis in /sys/block/<device>/queue/max_sectors_kb.
- * The soft limit can not exceed max_hw_sectors.
- **/
-void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
+static unsigned int queue_limit_discard_alignment(
+ const struct queue_limits *lim, sector_t sector)
{
- struct queue_limits *limits = &q->limits;
- unsigned int max_sectors;
-
- if ((max_hw_sectors << 9) < PAGE_SIZE) {
- max_hw_sectors = 1 << (PAGE_SHIFT - 9);
- pr_info("%s: set to minimum %u\n", __func__, max_hw_sectors);
- }
+ unsigned int alignment, granularity, offset;
- max_hw_sectors = round_down(max_hw_sectors,
- limits->logical_block_size >> SECTOR_SHIFT);
- limits->max_hw_sectors = max_hw_sectors;
+ if (!lim->max_discard_sectors)
+ return 0;
- max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
+ /* Why are these in bytes, not sectors? */
+ alignment = lim->discard_alignment >> SECTOR_SHIFT;
+ granularity = lim->discard_granularity >> SECTOR_SHIFT;
- if (limits->max_user_sectors)
- max_sectors = min(max_sectors, limits->max_user_sectors);
- else
- max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS_CAP);
+ /* Offset of the partition start in 'granularity' sectors */
+ offset = sector_div(sector, granularity);
- max_sectors = round_down(max_sectors,
- limits->logical_block_size >> SECTOR_SHIFT);
- limits->max_sectors = max_sectors;
+ /* And why do we do this modulus *again* in blkdev_issue_discard()? */
+ offset = (granularity + alignment - offset) % granularity;
- if (!q->disk)
- return;
- q->disk->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9);
+ /* Turn it back into bytes, gaah */
+ return offset << SECTOR_SHIFT;
}
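
queue_limit_alignment_offset() is plain modular arithmetic on the partition start sector. A userspace sketch with illustrative numbers (a 4 KiB-physical-block device reporting a 3584-byte alignment offset and a 1 MiB-aligned partition):

#include <stdio.h>

#define SECTOR_SHIFT 9

int main(void)
{
	unsigned int physical_block_size = 4096;	/* bytes */
	unsigned int io_min = 4096;			/* bytes */
	unsigned int alignment_offset = 3584;		/* reported by the device */
	unsigned long long part_start_sector = 2048;	/* 1 MiB-aligned partition */

	unsigned int granularity = physical_block_size > io_min ?
				   physical_block_size : io_min;
	unsigned int alignment = (part_start_sector % (granularity >> SECTOR_SHIFT))
				 << SECTOR_SHIFT;
	unsigned int off = (granularity + alignment_offset - alignment) % granularity;

	printf("partition alignment offset = %u bytes\n", off);	/* 3584 */
	return 0;
}
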
-EXPORT_SYMBOL(blk_queue_max_hw_sectors);
-/**
- * blk_queue_chunk_sectors - set size of the chunk for this queue
- * @q: the request queue for the device
- * @chunk_sectors: chunk sectors in the usual 512b unit
- *
- * Description:
- * If a driver doesn't want IOs to cross a given chunk size, it can set
- * this limit and prevent merging across chunks. Note that the block layer
- * must accept a page worth of data at any offset. So if the crossing of
- * chunks is a hard limitation in the driver, it must still be prepared
- * to split single page bios.
- **/
-void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors)
+static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lbs)
{
- q->limits.chunk_sectors = chunk_sectors;
+ sectors = round_down(sectors, lbs >> SECTOR_SHIFT);
+ if (sectors < PAGE_SIZE >> SECTOR_SHIFT)
+ sectors = PAGE_SIZE >> SECTOR_SHIFT;
+ return sectors;
}
-EXPORT_SYMBOL(blk_queue_chunk_sectors);
-/**
- * blk_queue_max_discard_sectors - set max sectors for a single discard
- * @q: the request queue for the device
- * @max_discard_sectors: maximum number of sectors to discard
- **/
-void blk_queue_max_discard_sectors(struct request_queue *q,
- unsigned int max_discard_sectors)
+/* Check if second and later bottom devices are compliant */
+static bool blk_stack_atomic_writes_tail(struct queue_limits *t,
+ struct queue_limits *b)
{
- struct queue_limits *lim = &q->limits;
+ /* We're not going to support different boundary sizes.. yet */
+ if (t->atomic_write_hw_boundary != b->atomic_write_hw_boundary)
+ return false;
- lim->max_hw_discard_sectors = max_discard_sectors;
- lim->max_discard_sectors =
- min(max_discard_sectors, lim->max_user_discard_sectors);
-}
-EXPORT_SYMBOL(blk_queue_max_discard_sectors);
+ /* Can't support this */
+ if (t->atomic_write_hw_unit_min > b->atomic_write_hw_unit_max)
+ return false;
-/**
- * blk_queue_max_secure_erase_sectors - set max sectors for a secure erase
- * @q: the request queue for the device
- * @max_sectors: maximum number of sectors to secure_erase
- **/
-void blk_queue_max_secure_erase_sectors(struct request_queue *q,
- unsigned int max_sectors)
-{
- q->limits.max_secure_erase_sectors = max_sectors;
-}
-EXPORT_SYMBOL(blk_queue_max_secure_erase_sectors);
+ /* Or this */
+ if (t->atomic_write_hw_unit_max < b->atomic_write_hw_unit_min)
+ return false;
-/**
- * blk_queue_max_write_zeroes_sectors - set max sectors for a single
- * write zeroes
- * @q: the request queue for the device
- * @max_write_zeroes_sectors: maximum number of sectors to write per command
- **/
-void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
- unsigned int max_write_zeroes_sectors)
-{
- q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors;
+ t->atomic_write_hw_max = min(t->atomic_write_hw_max,
+ b->atomic_write_hw_max);
+ t->atomic_write_hw_unit_min = max(t->atomic_write_hw_unit_min,
+ b->atomic_write_hw_unit_min);
+ t->atomic_write_hw_unit_max = min(t->atomic_write_hw_unit_max,
+ b->atomic_write_hw_unit_max);
+ return true;
}
-EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
-/**
- * blk_queue_max_zone_append_sectors - set max sectors for a single zone append
- * @q: the request queue for the device
- * @max_zone_append_sectors: maximum number of sectors to write per command
- **/
-void blk_queue_max_zone_append_sectors(struct request_queue *q,
- unsigned int max_zone_append_sectors)
+/* Check for valid boundary of first bottom device */
+static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t,
+ struct queue_limits *b)
{
- unsigned int max_sectors;
-
- if (WARN_ON(!blk_queue_is_zoned(q)))
- return;
-
- max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors);
- max_sectors = min(q->limits.chunk_sectors, max_sectors);
-
/*
- * Signal eventual driver bugs resulting in the max_zone_append sectors limit
- * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set,
- * or the max_hw_sectors limit not set.
+ * Ensure atomic write boundary is aligned with chunk sectors. Stacked
+ * devices store chunk sectors in t->io_min.
*/
- WARN_ON(!max_sectors);
+ if (b->atomic_write_hw_boundary > t->io_min &&
+ b->atomic_write_hw_boundary % t->io_min)
+ return false;
+ if (t->io_min > b->atomic_write_hw_boundary &&
+ t->io_min % b->atomic_write_hw_boundary)
+ return false;
- q->limits.max_zone_append_sectors = max_sectors;
+ t->atomic_write_hw_boundary = b->atomic_write_hw_boundary;
+ return true;
}
-EXPORT_SYMBOL_GPL(blk_queue_max_zone_append_sectors);
-/**
- * blk_queue_max_segments - set max hw segments for a request for this queue
- * @q: the request queue for the device
- * @max_segments: max number of segments
- *
- * Description:
- * Enables a low level driver to set an upper limit on the number of
- * hw data segments in a request.
- **/
-void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments)
-{
- if (!max_segments) {
- max_segments = 1;
- pr_info("%s: set to minimum %u\n", __func__, max_segments);
- }
-
- q->limits.max_segments = max_segments;
-}
-EXPORT_SYMBOL(blk_queue_max_segments);
-/**
- * blk_queue_max_discard_segments - set max segments for discard requests
- * @q: the request queue for the device
- * @max_segments: max number of segments
- *
- * Description:
- * Enables a low level driver to set an upper limit on the number of
- * segments in a discard request.
- **/
-void blk_queue_max_discard_segments(struct request_queue *q,
- unsigned short max_segments)
+/* Check stacking of first bottom device */
+static bool blk_stack_atomic_writes_head(struct queue_limits *t,
+ struct queue_limits *b)
{
- q->limits.max_discard_segments = max_segments;
-}
-EXPORT_SYMBOL_GPL(blk_queue_max_discard_segments);
+ if (b->atomic_write_hw_boundary &&
+ !blk_stack_atomic_writes_boundary_head(t, b))
+ return false;
-/**
- * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
- * @q: the request queue for the device
- * @max_size: max size of segment in bytes
- *
- * Description:
- * Enables a low level driver to set an upper limit on the size of a
- * coalesced segment
- **/
-void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
-{
- if (max_size < PAGE_SIZE) {
- max_size = PAGE_SIZE;
- pr_info("%s: set to minimum %u\n", __func__, max_size);
+ if (t->io_min <= SECTOR_SIZE) {
+ /* No chunk sectors, so use bottom device values directly */
+ t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
+ t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min;
+ t->atomic_write_hw_max = b->atomic_write_hw_max;
+ return true;
}
- /* see blk_queue_virt_boundary() for the explanation */
- WARN_ON_ONCE(q->limits.virt_boundary_mask);
-
- q->limits.max_segment_size = max_size;
-}
-EXPORT_SYMBOL(blk_queue_max_segment_size);
-
-/**
- * blk_queue_logical_block_size - set logical block size for the queue
- * @q: the request queue for the device
- * @size: the logical block size, in bytes
- *
- * Description:
- * This should be set to the lowest possible block size that the
- * storage device can address. The default of 512 covers most
- * hardware.
- **/
-void blk_queue_logical_block_size(struct request_queue *q, unsigned int size)
-{
- struct queue_limits *limits = &q->limits;
-
- limits->logical_block_size = size;
-
- if (limits->discard_granularity < limits->logical_block_size)
- limits->discard_granularity = limits->logical_block_size;
-
- if (limits->physical_block_size < size)
- limits->physical_block_size = size;
-
- if (limits->io_min < limits->physical_block_size)
- limits->io_min = limits->physical_block_size;
-
- limits->max_hw_sectors =
- round_down(limits->max_hw_sectors, size >> SECTOR_SHIFT);
- limits->max_sectors =
- round_down(limits->max_sectors, size >> SECTOR_SHIFT);
-}
-EXPORT_SYMBOL(blk_queue_logical_block_size);
-
-/**
- * blk_queue_physical_block_size - set physical block size for the queue
- * @q: the request queue for the device
- * @size: the physical block size, in bytes
- *
- * Description:
- * This should be set to the lowest possible sector size that the
- * hardware can operate on without reverting to read-modify-write
- * operations.
- */
-void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
-{
- q->limits.physical_block_size = size;
-
- if (q->limits.physical_block_size < q->limits.logical_block_size)
- q->limits.physical_block_size = q->limits.logical_block_size;
-
- if (q->limits.discard_granularity < q->limits.physical_block_size)
- q->limits.discard_granularity = q->limits.physical_block_size;
-
- if (q->limits.io_min < q->limits.physical_block_size)
- q->limits.io_min = q->limits.physical_block_size;
-}
-EXPORT_SYMBOL(blk_queue_physical_block_size);
-
-/**
- * blk_queue_zone_write_granularity - set zone write granularity for the queue
- * @q: the request queue for the zoned device
- * @size: the zone write granularity size, in bytes
- *
- * Description:
- * This should be set to the lowest possible size allowing to write in
- * sequential zones of a zoned block device.
- */
-void blk_queue_zone_write_granularity(struct request_queue *q,
- unsigned int size)
-{
- if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
- return;
-
- q->limits.zone_write_granularity = size;
-
- if (q->limits.zone_write_granularity < q->limits.logical_block_size)
- q->limits.zone_write_granularity = q->limits.logical_block_size;
-}
-EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity);
+ /*
+ * Find limit values which work with the chunk size.
+ * b->atomic_write_hw_unit_{min, max} may not be aligned with the chunk
+ * size (t->io_min), as the chunk size is not restricted to a power-of-2.
+ * So we need to find the highest power-of-2 that divides the chunk size.
+ * As an example, we could have b->unit_max = 16K and t->io_min = 24K.
+ * In that case, reduce t->unit_max to a value aligned with both limits,
+ * i.e. 8K in this example.
+ */
+ t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
+ while (t->io_min % t->atomic_write_hw_unit_max)
+ t->atomic_write_hw_unit_max /= 2;
-/**
- * blk_queue_alignment_offset - set physical block alignment offset
- * @q: the request queue for the device
- * @offset: alignment offset in bytes
- *
- * Description:
- * Some devices are naturally misaligned to compensate for things like
- * the legacy DOS partition table 63-sector offset. Low-level drivers
- * should call this function for devices whose first sector is not
- * naturally aligned.
- */
-void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset)
-{
- q->limits.alignment_offset =
- offset & (q->limits.physical_block_size - 1);
- q->limits.misaligned = 0;
-}
-EXPORT_SYMBOL(blk_queue_alignment_offset);
+ t->atomic_write_hw_unit_min = min(b->atomic_write_hw_unit_min,
+ t->atomic_write_hw_unit_max);
+ t->atomic_write_hw_max = min(b->atomic_write_hw_max, t->io_min);
-void disk_update_readahead(struct gendisk *disk)
-{
- blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
+ return true;
}
-EXPORT_SYMBOL_GPL(disk_update_readahead);
-/**
- * blk_limits_io_min - set minimum request size for a device
- * @limits: the queue limits
- * @min: smallest I/O size in bytes
- *
- * Description:
- * Some devices have an internal block size bigger than the reported
- * hardware sector size. This function can be used to signal the
- * smallest I/O the device can perform without incurring a performance
- * penalty.
- */
-void blk_limits_io_min(struct queue_limits *limits, unsigned int min)
+static void blk_stack_atomic_writes_limits(struct queue_limits *t,
+ struct queue_limits *b, sector_t start)
{
- limits->io_min = min;
+ if (!(b->features & BLK_FEAT_ATOMIC_WRITES))
+ goto unsupported;
- if (limits->io_min < limits->logical_block_size)
- limits->io_min = limits->logical_block_size;
+ if (!b->atomic_write_hw_unit_min)
+ goto unsupported;
- if (limits->io_min < limits->physical_block_size)
- limits->io_min = limits->physical_block_size;
-}
-EXPORT_SYMBOL(blk_limits_io_min);
+ if (!blk_atomic_write_start_sect_aligned(start, b))
+ goto unsupported;
-/**
- * blk_queue_io_min - set minimum request size for the queue
- * @q: the request queue for the device
- * @min: smallest I/O size in bytes
- *
- * Description:
- * Storage devices may report a granularity or preferred minimum I/O
- * size which is the smallest request the device can perform without
- * incurring a performance penalty. For disk drives this is often the
- * physical block size. For RAID arrays it is often the stripe chunk
- * size. A properly aligned multiple of minimum_io_size is the
- * preferred request size for workloads where a high number of I/O
- * operations is desired.
- */
-void blk_queue_io_min(struct request_queue *q, unsigned int min)
-{
- blk_limits_io_min(&q->limits, min);
-}
-EXPORT_SYMBOL(blk_queue_io_min);
-
-/**
- * blk_limits_io_opt - set optimal request size for a device
- * @limits: the queue limits
- * @opt: smallest I/O size in bytes
- *
- * Description:
- * Storage devices may report an optimal I/O size, which is the
- * device's preferred unit for sustained I/O. This is rarely reported
- * for disk drives. For RAID arrays it is usually the stripe width or
- * the internal track size. A properly aligned multiple of
- * optimal_io_size is the preferred request size for workloads where
- * sustained throughput is desired.
- */
-void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt)
-{
- limits->io_opt = opt;
-}
-EXPORT_SYMBOL(blk_limits_io_opt);
-
-/**
- * blk_queue_io_opt - set optimal request size for the queue
- * @q: the request queue for the device
- * @opt: optimal request size in bytes
- *
- * Description:
- * Storage devices may report an optimal I/O size, which is the
- * device's preferred unit for sustained I/O. This is rarely reported
- * for disk drives. For RAID arrays it is usually the stripe width or
- * the internal track size. A properly aligned multiple of
- * optimal_io_size is the preferred request size for workloads where
- * sustained throughput is desired.
- */
-void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
-{
- blk_limits_io_opt(&q->limits, opt);
- if (!q->disk)
+ /*
+	 * If atomic_write_hw_max is set, we have already stacked at least one
+	 * bottom device, so check that this device's limits are compatible.
+ */
+ if (t->atomic_write_hw_max) {
+ if (!blk_stack_atomic_writes_tail(t, b))
+ goto unsupported;
return;
- q->disk->bdi->ra_pages =
- max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
-}
-EXPORT_SYMBOL(blk_queue_io_opt);
-
-static int queue_limit_alignment_offset(const struct queue_limits *lim,
- sector_t sector)
-{
- unsigned int granularity = max(lim->physical_block_size, lim->io_min);
- unsigned int alignment = sector_div(sector, granularity >> SECTOR_SHIFT)
- << SECTOR_SHIFT;
-
- return (granularity + lim->alignment_offset - alignment) % granularity;
-}
-
-static unsigned int queue_limit_discard_alignment(
- const struct queue_limits *lim, sector_t sector)
-{
- unsigned int alignment, granularity, offset;
-
- if (!lim->max_discard_sectors)
- return 0;
-
- /* Why are these in bytes, not sectors? */
- alignment = lim->discard_alignment >> SECTOR_SHIFT;
- granularity = lim->discard_granularity >> SECTOR_SHIFT;
- if (!granularity)
- return 0;
-
- /* Offset of the partition start in 'granularity' sectors */
- offset = sector_div(sector, granularity);
-
- /* And why do we do this modulus *again* in blkdev_issue_discard()? */
- offset = (granularity + alignment - offset) % granularity;
+ }
- /* Turn it back into bytes, gaah */
- return offset << SECTOR_SHIFT;
-}
+ if (!blk_stack_atomic_writes_head(t, b))
+ goto unsupported;
+ return;
-static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lbs)
-{
- sectors = round_down(sectors, lbs >> SECTOR_SHIFT);
- if (sectors < PAGE_SIZE >> SECTOR_SHIFT)
- sectors = PAGE_SIZE >> SECTOR_SHIFT;
- return sectors;
+unsupported:
+ t->atomic_write_hw_max = 0;
+ t->atomic_write_hw_unit_max = 0;
+ t->atomic_write_hw_unit_min = 0;
+ t->atomic_write_hw_boundary = 0;
}
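
A rough user-space sketch of the halving loop in blk_stack_atomic_writes_tail() above; the helper name and the 16K/24K values are illustrative only and not part of the patch:

/* Minimal sketch: halve a power-of-2 unit size until it divides io_min. */
#include <assert.h>

static unsigned int reduce_unit_max(unsigned int unit_max, unsigned int io_min)
{
	while (io_min % unit_max)
		unit_max /= 2;
	return unit_max;
}

int main(void)
{
	/* Example from the comment above: 16K unit max vs. 24K chunk -> 8K. */
	assert(reduce_unit_max(16 * 1024, 24 * 1024) == 8 * 1024);
	return 0;
}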
/**
@@ -747,14 +686,30 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
{
unsigned int top, bottom, alignment, ret = 0;
+ t->features |= (b->features & BLK_FEAT_INHERIT_MASK);
+
+ /*
+	 * Some features need to be supported by both the stacking driver and all
+	 * underlying devices. The stacking driver sets these flags before
+	 * stacking the limits, and this will clear the flags if any of the
+	 * underlying devices does not support them.
+ */
+ if (!(b->features & BLK_FEAT_NOWAIT))
+ t->features &= ~BLK_FEAT_NOWAIT;
+ if (!(b->features & BLK_FEAT_POLL))
+ t->features &= ~BLK_FEAT_POLL;
+
+ t->flags |= (b->flags & BLK_FLAG_MISALIGNED);
+
t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
+ t->max_user_sectors = min_not_zero(t->max_user_sectors,
+ b->max_user_sectors);
t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
b->max_write_zeroes_sectors);
- t->max_zone_append_sectors = min(t->max_zone_append_sectors,
- b->max_zone_append_sectors);
- t->bounce = max(t->bounce, b->bounce);
+ t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors,
+ b->max_hw_zone_append_sectors);
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
b->seg_boundary_mask);
@@ -770,8 +725,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_segment_size = min_not_zero(t->max_segment_size,
b->max_segment_size);
- t->misaligned |= b->misaligned;
-
alignment = queue_limit_alignment_offset(b, start);
/* Bottom device has different alignment. Check that it is
@@ -785,7 +738,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
/* Verify that top and bottom intervals line up */
if (max(top, bottom) % min(top, bottom)) {
- t->misaligned = 1;
+ t->flags |= BLK_FLAG_MISALIGNED;
ret = -1;
}
}
@@ -807,42 +760,38 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
/* Physical block size a multiple of the logical block size? */
if (t->physical_block_size & (t->logical_block_size - 1)) {
t->physical_block_size = t->logical_block_size;
- t->misaligned = 1;
+ t->flags |= BLK_FLAG_MISALIGNED;
ret = -1;
}
/* Minimum I/O a multiple of the physical block size? */
if (t->io_min & (t->physical_block_size - 1)) {
t->io_min = t->physical_block_size;
- t->misaligned = 1;
+ t->flags |= BLK_FLAG_MISALIGNED;
ret = -1;
}
/* Optimal I/O a multiple of the physical block size? */
if (t->io_opt & (t->physical_block_size - 1)) {
t->io_opt = 0;
- t->misaligned = 1;
+ t->flags |= BLK_FLAG_MISALIGNED;
ret = -1;
}
/* chunk_sectors a multiple of the physical block size? */
if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) {
t->chunk_sectors = 0;
- t->misaligned = 1;
+ t->flags |= BLK_FLAG_MISALIGNED;
ret = -1;
}
- t->raid_partial_stripes_expensive =
- max(t->raid_partial_stripes_expensive,
- b->raid_partial_stripes_expensive);
-
/* Find lowest common alignment_offset */
t->alignment_offset = lcm_not_zero(t->alignment_offset, alignment)
% max(t->physical_block_size, t->io_min);
/* Verify that new alignment_offset is on a logical block boundary */
if (t->alignment_offset & (t->logical_block_size - 1)) {
- t->misaligned = 1;
+ t->flags |= BLK_FLAG_MISALIGNED;
ret = -1;
}
@@ -854,16 +803,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
if (b->discard_granularity) {
alignment = queue_limit_discard_alignment(b, start);
- if (t->discard_granularity != 0 &&
- t->discard_alignment != alignment) {
- top = t->discard_granularity + t->discard_alignment;
- bottom = b->discard_granularity + alignment;
-
- /* Verify that top and bottom intervals line up */
- if ((max(top, bottom) % min(top, bottom)) != 0)
- t->discard_misaligned = 1;
- }
-
t->max_discard_sectors = min_not_zero(t->max_discard_sectors,
b->max_discard_sectors);
t->max_hw_discard_sectors = min_not_zero(t->max_hw_discard_sectors,
@@ -877,11 +816,12 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
b->max_secure_erase_sectors);
t->zone_write_granularity = max(t->zone_write_granularity,
b->zone_write_granularity);
- t->zoned = max(t->zoned, b->zoned);
- if (!t->zoned) {
+ if (!(t->features & BLK_FEAT_ZONED)) {
t->zone_write_granularity = 0;
t->max_zone_append_sectors = 0;
}
+ blk_stack_atomic_writes_limits(t, b, start);
+
return ret;
}
EXPORT_SYMBOL(blk_stack_limits);
@@ -904,7 +844,7 @@ EXPORT_SYMBOL(blk_stack_limits);
void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev,
sector_t offset, const char *pfx)
{
- if (blk_stack_limits(t, &bdev_get_queue(bdev)->limits,
+ if (blk_stack_limits(t, bdev_limits(bdev),
get_start_sect(bdev) + offset))
pr_notice("%s: Warning: Device %pg is misaligned\n",
pfx, bdev);
@@ -912,96 +852,57 @@ void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev,
EXPORT_SYMBOL_GPL(queue_limits_stack_bdev);
/**
- * blk_queue_update_dma_pad - update pad mask
- * @q: the request queue for the device
- * @mask: pad mask
- *
- * Update dma pad mask.
- *
- * Appending pad buffer to a request modifies the last entry of a
- * scatter list such that it includes the pad buffer.
- **/
-void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask)
-{
- if (mask > q->dma_pad_mask)
- q->dma_pad_mask = mask;
-}
-EXPORT_SYMBOL(blk_queue_update_dma_pad);
-
-/**
- * blk_queue_segment_boundary - set boundary rules for segment merging
- * @q: the request queue for the device
- * @mask: the memory boundary mask
- **/
-void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
-{
- if (mask < PAGE_SIZE - 1) {
- mask = PAGE_SIZE - 1;
- pr_info("%s: set to minimum %lx\n", __func__, mask);
- }
-
- q->limits.seg_boundary_mask = mask;
-}
-EXPORT_SYMBOL(blk_queue_segment_boundary);
-
-/**
- * blk_queue_virt_boundary - set boundary rules for bio merging
- * @q: the request queue for the device
- * @mask: the memory boundary mask
- **/
-void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask)
-{
- q->limits.virt_boundary_mask = mask;
-
- /*
- * Devices that require a virtual boundary do not support scatter/gather
- * I/O natively, but instead require a descriptor list entry for each
- * page (which might not be idential to the Linux PAGE_SIZE). Because
- * of that they are not limited by our notion of "segment size".
- */
- if (mask)
- q->limits.max_segment_size = UINT_MAX;
-}
-EXPORT_SYMBOL(blk_queue_virt_boundary);
-
-/**
- * blk_queue_dma_alignment - set dma length and memory alignment
- * @q: the request queue for the device
- * @mask: alignment mask
+ * queue_limits_stack_integrity - stack integrity profile
+ * @t: target queue limits
+ * @b: base queue limits
*
- * description:
- * set required memory and length alignment for direct dma transactions.
- * this is used when building direct io requests for the queue.
+ * Check if the integrity profile in @b can be stacked into the
+ * target @t. Stacking is possible if either:
*
- **/
-void blk_queue_dma_alignment(struct request_queue *q, int mask)
-{
- q->limits.dma_alignment = mask;
-}
-EXPORT_SYMBOL(blk_queue_dma_alignment);
-
-/**
- * blk_queue_update_dma_alignment - update dma length and memory alignment
- * @q: the request queue for the device
- * @mask: alignment mask
+ * a) @t does not have any integrity information stacked into it yet
+ * b) the integrity profile in @b is identical to the one in @t
*
- * description:
- * update required memory and length alignment for direct dma transactions.
- * If the requested alignment is larger than the current alignment, then
- * the current queue alignment is updated to the new value, otherwise it
- * is left alone. The design of this is to allow multiple objects
- * (driver, device, transport etc) to set their respective
- * alignments without having them interfere.
- *
- **/
-void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
-{
- BUG_ON(mask > PAGE_SIZE);
+ * If @b can be stacked into @t, return %true. Else return %false and clear the
+ * integrity information in @t.
+ */
+bool queue_limits_stack_integrity(struct queue_limits *t,
+ struct queue_limits *b)
+{
+ struct blk_integrity *ti = &t->integrity;
+ struct blk_integrity *bi = &b->integrity;
+
+ if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY))
+ return true;
+
+ if (ti->flags & BLK_INTEGRITY_STACKED) {
+ if (ti->tuple_size != bi->tuple_size)
+ goto incompatible;
+ if (ti->interval_exp != bi->interval_exp)
+ goto incompatible;
+ if (ti->tag_size != bi->tag_size)
+ goto incompatible;
+ if (ti->csum_type != bi->csum_type)
+ goto incompatible;
+ if ((ti->flags & BLK_INTEGRITY_REF_TAG) !=
+ (bi->flags & BLK_INTEGRITY_REF_TAG))
+ goto incompatible;
+ } else {
+ ti->flags = BLK_INTEGRITY_STACKED;
+ ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) |
+ (bi->flags & BLK_INTEGRITY_REF_TAG);
+ ti->csum_type = bi->csum_type;
+ ti->tuple_size = bi->tuple_size;
+ ti->pi_offset = bi->pi_offset;
+ ti->interval_exp = bi->interval_exp;
+ ti->tag_size = bi->tag_size;
+ }
+ return true;
- if (mask > q->limits.dma_alignment)
- q->limits.dma_alignment = mask;
+incompatible:
+ memset(ti, 0, sizeof(*ti));
+ return false;
}
-EXPORT_SYMBOL(blk_queue_update_dma_alignment);
+EXPORT_SYMBOL_GPL(queue_limits_stack_integrity);
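
A hedged sketch of the stacking rule implemented above, using simplified stand-in types rather than the kernel's struct blk_integrity: the first bottom device establishes the profile, and any later device must match it field for field or the target's integrity information is cleared.

#include <stdbool.h>
#include <string.h>

/* Simplified stand-in for the integrity fields compared above. */
struct toy_integrity {
	bool	stacked;
	int	csum_type;
	int	tuple_size;
	int	interval_exp;
	int	tag_size;
};

static bool toy_stack_integrity(struct toy_integrity *t,
				const struct toy_integrity *b)
{
	if (!t->stacked) {
		/* First bottom device: adopt its profile wholesale. */
		*t = *b;
		t->stacked = true;
		return true;
	}
	/* Later devices must match the already-stacked profile. */
	if (t->csum_type != b->csum_type || t->tuple_size != b->tuple_size ||
	    t->interval_exp != b->interval_exp || t->tag_size != b->tag_size) {
		memset(t, 0, sizeof(*t));	/* incompatible: drop integrity */
		return false;
	}
	return true;
}

int main(void)
{
	struct toy_integrity t = { 0 };
	struct toy_integrity b = { .csum_type = 1, .tuple_size = 8 };

	toy_stack_integrity(&t, &b);			/* adopts b's profile */
	return toy_stack_integrity(&t, &b) ? 0 : 1;	/* identical: still ok */
}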
/**
* blk_set_queue_depth - tell the block layer about the device queue depth
@@ -1016,92 +917,11 @@ void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
}
EXPORT_SYMBOL(blk_set_queue_depth);
-/**
- * blk_queue_write_cache - configure queue's write cache
- * @q: the request queue for the device
- * @wc: write back cache on or off
- * @fua: device supports FUA writes, if true
- *
- * Tell the block layer about the write cache of @q.
- */
-void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
-{
- if (wc) {
- blk_queue_flag_set(QUEUE_FLAG_HW_WC, q);
- blk_queue_flag_set(QUEUE_FLAG_WC, q);
- } else {
- blk_queue_flag_clear(QUEUE_FLAG_HW_WC, q);
- blk_queue_flag_clear(QUEUE_FLAG_WC, q);
- }
- if (fua)
- blk_queue_flag_set(QUEUE_FLAG_FUA, q);
- else
- blk_queue_flag_clear(QUEUE_FLAG_FUA, q);
-}
-EXPORT_SYMBOL_GPL(blk_queue_write_cache);
-
-/**
- * blk_queue_required_elevator_features - Set a queue required elevator features
- * @q: the request queue for the target device
- * @features: Required elevator features OR'ed together
- *
- * Tell the block layer that for the device controlled through @q, only the
- * only elevators that can be used are those that implement at least the set of
- * features specified by @features.
- */
-void blk_queue_required_elevator_features(struct request_queue *q,
- unsigned int features)
-{
- q->required_elevator_features = features;
-}
-EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features);
-
-/**
- * blk_queue_can_use_dma_map_merging - configure queue for merging segments.
- * @q: the request queue for the device
- * @dev: the device pointer for dma
- *
- * Tell the block layer about merging the segments by dma map of @q.
- */
-bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
- struct device *dev)
-{
- unsigned long boundary = dma_get_merge_boundary(dev);
-
- if (!boundary)
- return false;
-
- /* No need to update max_segment_size. see blk_queue_virt_boundary() */
- blk_queue_virt_boundary(q, boundary);
-
- return true;
-}
-EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
-
-/**
- * disk_set_zoned - inidicate a zoned device
- * @disk: gendisk to configure
- */
-void disk_set_zoned(struct gendisk *disk)
-{
- struct request_queue *q = disk->queue;
-
- WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED));
-
- /*
- * Set the zone write granularity to the device logical block
- * size by default. The driver can change this value if needed.
- */
- q->limits.zoned = true;
- blk_queue_zone_write_granularity(q, queue_logical_block_size(q));
-}
-EXPORT_SYMBOL_GPL(disk_set_zoned);
-
int bdev_alignment_offset(struct block_device *bdev)
{
struct request_queue *q = bdev_get_queue(bdev);
- if (q->limits.misaligned)
+ if (q->limits.flags & BLK_FLAG_MISALIGNED)
return -1;
if (bdev_is_partition(bdev))
return queue_limit_alignment_offset(&q->limits,
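
Purely as an arithmetic illustration of the alignment computation that bdev_alignment_offset() relies on for partitions (based on the queue_limit_alignment_offset() logic removed from this file above); the standalone helper and the example values are illustrative only:

#include <stdio.h>

#define SECTOR_SHIFT 9

/* User-space sketch of the queue_limit_alignment_offset() arithmetic. */
static unsigned int alignment_offset(unsigned int granularity,
				     unsigned int dev_alignment_offset,
				     unsigned long long start_sector)
{
	unsigned int alignment =
		(start_sector % (granularity >> SECTOR_SHIFT)) << SECTOR_SHIFT;

	return (granularity + dev_alignment_offset - alignment) % granularity;
}

int main(void)
{
	/* 4KiB granularity, aligned parent device, partition at sector 63. */
	printf("%u\n", alignment_offset(4096, 0, 63));	/* prints 512 */
	return 0;
}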
diff --git a/block/blk-stat.c b/block/blk-stat.c
index e42c263e53fb..46449da856f8 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -57,9 +57,6 @@ void blk_stat_add(struct request *rq, u64 now)
value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
- if (req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE)
- blk_throtl_stat_add(rq, value);
-
rcu_read_lock();
cpu = get_cpu();
list_for_each_entry_rcu(cb, &q->stats->callbacks, list) {
@@ -165,7 +162,7 @@ void blk_stat_remove_callback(struct request_queue *q,
blk_queue_flag_clear(QUEUE_FLAG_STATS, q);
spin_unlock_irqrestore(&q->stats->lock, flags);
- del_timer_sync(&cb->timer);
+ timer_delete_sync(&cb->timer);
}
static void blk_stat_free_callback_rcu(struct rcu_head *head)
diff --git a/block/blk-stat.h b/block/blk-stat.h
index 17e1eb4ec7e2..9e05bf18d1be 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -64,7 +64,6 @@ struct blk_stat_callback {
struct blk_queue_stats *blk_alloc_queue_stats(void);
void blk_free_queue_stats(struct blk_queue_stats *);
-bool blk_stats_alloc_enable(struct request_queue *q);
void blk_stat_add(struct request *rq, u64 now);
@@ -149,7 +148,7 @@ static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb,
static inline void blk_stat_deactivate(struct blk_stat_callback *cb)
{
- del_timer_sync(&cb->timer);
+ timer_delete_sync(&cb->timer);
}
/**
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8c8f69d8ba48..b2b9b89d6967 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -22,14 +22,18 @@
struct queue_sysfs_entry {
struct attribute attr;
- ssize_t (*show)(struct request_queue *, char *);
- ssize_t (*store)(struct request_queue *, const char *, size_t);
+ ssize_t (*show)(struct gendisk *disk, char *page);
+ ssize_t (*show_limit)(struct gendisk *disk, char *page);
+
+ ssize_t (*store)(struct gendisk *disk, const char *page, size_t count);
+ int (*store_limit)(struct gendisk *disk, const char *page,
+ size_t count, struct queue_limits *lim);
};
static ssize_t
queue_var_show(unsigned long var, char *page)
{
- return sprintf(page, "%lu\n", var);
+ return sysfs_emit(page, "%lu\n", var);
}
static ssize_t
@@ -47,16 +51,23 @@ queue_var_store(unsigned long *var, const char *page, size_t count)
return count;
}
-static ssize_t queue_requests_show(struct request_queue *q, char *page)
+static ssize_t queue_requests_show(struct gendisk *disk, char *page)
{
- return queue_var_show(q->nr_requests, page);
+ ssize_t ret;
+
+ mutex_lock(&disk->queue->elevator_lock);
+ ret = queue_var_show(disk->queue->nr_requests, page);
+ mutex_unlock(&disk->queue->elevator_lock);
+ return ret;
}
static ssize_t
-queue_requests_store(struct request_queue *q, const char *page, size_t count)
+queue_requests_store(struct gendisk *disk, const char *page, size_t count)
{
unsigned long nr;
int ret, err;
+ unsigned int memflags;
+ struct request_queue *q = disk->queue;
if (!queue_is_mq(q))
return -EINVAL;
@@ -65,307 +76,294 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
if (ret < 0)
return ret;
+ memflags = blk_mq_freeze_queue(q);
+ mutex_lock(&q->elevator_lock);
if (nr < BLKDEV_MIN_RQ)
nr = BLKDEV_MIN_RQ;
- err = blk_mq_update_nr_requests(q, nr);
+ err = blk_mq_update_nr_requests(disk->queue, nr);
if (err)
- return err;
-
+ ret = err;
+ mutex_unlock(&q->elevator_lock);
+ blk_mq_unfreeze_queue(q, memflags);
return ret;
}
-static ssize_t queue_ra_show(struct request_queue *q, char *page)
+static ssize_t queue_ra_show(struct gendisk *disk, char *page)
{
- unsigned long ra_kb;
+ ssize_t ret;
- if (!q->disk)
- return -EINVAL;
- ra_kb = q->disk->bdi->ra_pages << (PAGE_SHIFT - 10);
- return queue_var_show(ra_kb, page);
+ mutex_lock(&disk->queue->limits_lock);
+ ret = queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page);
+ mutex_unlock(&disk->queue->limits_lock);
+
+ return ret;
}
static ssize_t
-queue_ra_store(struct request_queue *q, const char *page, size_t count)
+queue_ra_store(struct gendisk *disk, const char *page, size_t count)
{
unsigned long ra_kb;
ssize_t ret;
+ unsigned int memflags;
+ struct request_queue *q = disk->queue;
- if (!q->disk)
- return -EINVAL;
ret = queue_var_store(&ra_kb, page, count);
if (ret < 0)
return ret;
- q->disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10);
- return ret;
-}
-
-static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
-{
- int max_sectors_kb = queue_max_sectors(q) >> 1;
-
- return queue_var_show(max_sectors_kb, page);
-}
-
-static ssize_t queue_max_segments_show(struct request_queue *q, char *page)
-{
- return queue_var_show(queue_max_segments(q), page);
-}
-
-static ssize_t queue_max_discard_segments_show(struct request_queue *q,
- char *page)
-{
- return queue_var_show(queue_max_discard_segments(q), page);
-}
-
-static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
-{
- return queue_var_show(q->limits.max_integrity_segments, page);
-}
-
-static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
-{
- return queue_var_show(queue_max_segment_size(q), page);
-}
-
-static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page)
-{
- return queue_var_show(queue_logical_block_size(q), page);
-}
-
-static ssize_t queue_physical_block_size_show(struct request_queue *q, char *page)
-{
- return queue_var_show(queue_physical_block_size(q), page);
-}
+ /*
+ * ->ra_pages is protected by ->limits_lock because it is usually
+ * calculated from the queue limits by queue_limits_commit_update.
+ */
+ mutex_lock(&q->limits_lock);
+ memflags = blk_mq_freeze_queue(q);
+ disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10);
+ mutex_unlock(&q->limits_lock);
+ blk_mq_unfreeze_queue(q, memflags);
-static ssize_t queue_chunk_sectors_show(struct request_queue *q, char *page)
-{
- return queue_var_show(q->limits.chunk_sectors, page);
+ return ret;
}
-static ssize_t queue_io_min_show(struct request_queue *q, char *page)
-{
- return queue_var_show(queue_io_min(q), page);
+#define QUEUE_SYSFS_LIMIT_SHOW(_field) \
+static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \
+{ \
+ return queue_var_show(disk->queue->limits._field, page); \
+}
+
+QUEUE_SYSFS_LIMIT_SHOW(max_segments)
+QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments)
+QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments)
+QUEUE_SYSFS_LIMIT_SHOW(max_segment_size)
+QUEUE_SYSFS_LIMIT_SHOW(max_write_streams)
+QUEUE_SYSFS_LIMIT_SHOW(write_stream_granularity)
+QUEUE_SYSFS_LIMIT_SHOW(logical_block_size)
+QUEUE_SYSFS_LIMIT_SHOW(physical_block_size)
+QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors)
+QUEUE_SYSFS_LIMIT_SHOW(io_min)
+QUEUE_SYSFS_LIMIT_SHOW(io_opt)
+QUEUE_SYSFS_LIMIT_SHOW(discard_granularity)
+QUEUE_SYSFS_LIMIT_SHOW(zone_write_granularity)
+QUEUE_SYSFS_LIMIT_SHOW(virt_boundary_mask)
+QUEUE_SYSFS_LIMIT_SHOW(dma_alignment)
+QUEUE_SYSFS_LIMIT_SHOW(max_open_zones)
+QUEUE_SYSFS_LIMIT_SHOW(max_active_zones)
+QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_min)
+QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_max)
+
+#define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(_field) \
+static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \
+{ \
+ return sysfs_emit(page, "%llu\n", \
+ (unsigned long long)disk->queue->limits._field << \
+ SECTOR_SHIFT); \
}
-static ssize_t queue_io_opt_show(struct request_queue *q, char *page)
-{
- return queue_var_show(queue_io_opt(q), page);
-}
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_discard_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors)
-static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page)
-{
- return queue_var_show(q->limits.discard_granularity, page);
+#define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(_field) \
+static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \
+{ \
+ return queue_var_show(disk->queue->limits._field >> 1, page); \
}
-static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page)
-{
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_hw_sectors)
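
For readability, a hand-expanded form of the max_sectors instance of the SECTORS_TO_KB helper above (shown for illustration only, not an additional hunk in the patch):

/* Hand-expanded form of QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_sectors). */
static ssize_t queue_max_sectors_show(struct gendisk *disk, char *page)
{
	/* limits.max_sectors is in 512-byte sectors; >> 1 converts to KiB. */
	return queue_var_show(disk->queue->limits.max_sectors >> 1, page);
}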
- return sprintf(page, "%llu\n",
- (unsigned long long)q->limits.max_hw_discard_sectors << 9);
+#define QUEUE_SYSFS_SHOW_CONST(_name, _val) \
+static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \
+{ \
+ return sysfs_emit(page, "%d\n", _val); \
}
-static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
-{
- return sprintf(page, "%llu\n",
- (unsigned long long)q->limits.max_discard_sectors << 9);
-}
+/* deprecated fields */
+QUEUE_SYSFS_SHOW_CONST(discard_zeroes_data, 0)
+QUEUE_SYSFS_SHOW_CONST(write_same_max, 0)
+QUEUE_SYSFS_SHOW_CONST(poll_delay, -1)
-static ssize_t queue_discard_max_store(struct request_queue *q,
- const char *page, size_t count)
+static int queue_max_discard_sectors_store(struct gendisk *disk,
+ const char *page, size_t count, struct queue_limits *lim)
{
unsigned long max_discard_bytes;
- struct queue_limits lim;
ssize_t ret;
- int err;
ret = queue_var_store(&max_discard_bytes, page, count);
if (ret < 0)
return ret;
- if (max_discard_bytes & (q->limits.discard_granularity - 1))
+ if (max_discard_bytes & (disk->queue->limits.discard_granularity - 1))
return -EINVAL;
if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX)
return -EINVAL;
- blk_mq_freeze_queue(q);
- lim = queue_limits_start_update(q);
- lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT;
- err = queue_limits_commit_update(q, &lim);
- blk_mq_unfreeze_queue(q);
-
- if (err)
- return err;
- return ret;
-}
-
-static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
-{
- return queue_var_show(0, page);
-}
-
-static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
-{
- return queue_var_show(0, page);
-}
-
-static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
-{
- return sprintf(page, "%llu\n",
- (unsigned long long)q->limits.max_write_zeroes_sectors << 9);
-}
-
-static ssize_t queue_zone_write_granularity_show(struct request_queue *q,
- char *page)
-{
- return queue_var_show(queue_zone_write_granularity(q), page);
-}
-
-static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
-{
- unsigned long long max_sectors = q->limits.max_zone_append_sectors;
-
- return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT);
+ lim->max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT;
+ return 0;
}
-static ssize_t
-queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
+static int
+queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count,
+ struct queue_limits *lim)
{
unsigned long max_sectors_kb;
- struct queue_limits lim;
ssize_t ret;
- int err;
ret = queue_var_store(&max_sectors_kb, page, count);
if (ret < 0)
return ret;
- blk_mq_freeze_queue(q);
- lim = queue_limits_start_update(q);
- lim.max_user_sectors = max_sectors_kb << 1;
- err = queue_limits_commit_update(q, &lim);
- blk_mq_unfreeze_queue(q);
- if (err)
- return err;
- return ret;
+ lim->max_user_sectors = max_sectors_kb << 1;
+ return 0;
}
-static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
+static ssize_t queue_feature_store(struct gendisk *disk, const char *page,
+ size_t count, struct queue_limits *lim, blk_features_t feature)
{
- int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1;
-
- return queue_var_show(max_hw_sectors_kb, page);
-}
+ unsigned long val;
+ ssize_t ret;
-static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page)
-{
- return queue_var_show(q->limits.virt_boundary_mask, page);
-}
+ ret = queue_var_store(&val, page, count);
+ if (ret < 0)
+ return ret;
-static ssize_t queue_dma_alignment_show(struct request_queue *q, char *page)
-{
- return queue_var_show(queue_dma_alignment(q), page);
+ if (val)
+ lim->features |= feature;
+ else
+ lim->features &= ~feature;
+ return 0;
}
-#define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \
-static ssize_t \
-queue_##name##_show(struct request_queue *q, char *page) \
+#define QUEUE_SYSFS_FEATURE(_name, _feature) \
+static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \
{ \
- int bit; \
- bit = test_bit(QUEUE_FLAG_##flag, &q->queue_flags); \
- return queue_var_show(neg ? !bit : bit, page); \
+ return sysfs_emit(page, "%u\n", \
+ !!(disk->queue->limits.features & _feature)); \
} \
-static ssize_t \
-queue_##name##_store(struct request_queue *q, const char *page, size_t count) \
+static int queue_##_name##_store(struct gendisk *disk, \
+ const char *page, size_t count, struct queue_limits *lim) \
+{ \
+ return queue_feature_store(disk, page, count, lim, _feature); \
+}
+
+QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL)
+QUEUE_SYSFS_FEATURE(add_random, BLK_FEAT_ADD_RANDOM)
+QUEUE_SYSFS_FEATURE(iostats, BLK_FEAT_IO_STAT)
+QUEUE_SYSFS_FEATURE(stable_writes, BLK_FEAT_STABLE_WRITES);
+
+#define QUEUE_SYSFS_FEATURE_SHOW(_name, _feature) \
+static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \
{ \
- unsigned long val; \
- ssize_t ret; \
- ret = queue_var_store(&val, page, count); \
- if (ret < 0) \
- return ret; \
- if (neg) \
- val = !val; \
- \
- if (val) \
- blk_queue_flag_set(QUEUE_FLAG_##flag, q); \
- else \
- blk_queue_flag_clear(QUEUE_FLAG_##flag, q); \
- return ret; \
+ return sysfs_emit(page, "%u\n", \
+ !!(disk->queue->limits.features & _feature)); \
}
-QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1);
-QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
-QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
-QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
-#undef QUEUE_SYSFS_BIT_FNS
+QUEUE_SYSFS_FEATURE_SHOW(fua, BLK_FEAT_FUA);
+QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX);
+
+static ssize_t queue_poll_show(struct gendisk *disk, char *page)
+{
+ if (queue_is_mq(disk->queue))
+ return sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue));
+
+ return sysfs_emit(page, "%u\n",
+ !!(disk->queue->limits.features & BLK_FEAT_POLL));
+}
-static ssize_t queue_zoned_show(struct request_queue *q, char *page)
+static ssize_t queue_zoned_show(struct gendisk *disk, char *page)
{
- if (blk_queue_is_zoned(q))
- return sprintf(page, "host-managed\n");
- return sprintf(page, "none\n");
+ if (blk_queue_is_zoned(disk->queue))
+ return sysfs_emit(page, "host-managed\n");
+ return sysfs_emit(page, "none\n");
}
-static ssize_t queue_nr_zones_show(struct request_queue *q, char *page)
+static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page)
{
- return queue_var_show(disk_nr_zones(q->disk), page);
+ return queue_var_show(disk_nr_zones(disk), page);
}
-static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page)
+static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page)
{
- return queue_var_show(bdev_max_open_zones(q->disk->part0), page);
+ return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page);
}
-static ssize_t queue_max_active_zones_show(struct request_queue *q, char *page)
+static int queue_iostats_passthrough_store(struct gendisk *disk,
+ const char *page, size_t count, struct queue_limits *lim)
{
- return queue_var_show(bdev_max_active_zones(q->disk->part0), page);
+ unsigned long ios;
+ ssize_t ret;
+
+ ret = queue_var_store(&ios, page, count);
+ if (ret < 0)
+ return ret;
+
+ if (ios)
+ lim->flags |= BLK_FLAG_IOSTATS_PASSTHROUGH;
+ else
+ lim->flags &= ~BLK_FLAG_IOSTATS_PASSTHROUGH;
+ return 0;
}
-static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
+static ssize_t queue_nomerges_show(struct gendisk *disk, char *page)
{
- return queue_var_show((blk_queue_nomerges(q) << 1) |
- blk_queue_noxmerges(q), page);
+ return queue_var_show((blk_queue_nomerges(disk->queue) << 1) |
+ blk_queue_noxmerges(disk->queue), page);
}
-static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
+static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page,
size_t count)
{
unsigned long nm;
+ unsigned int memflags;
+ struct request_queue *q = disk->queue;
ssize_t ret = queue_var_store(&nm, page, count);
if (ret < 0)
return ret;
+ memflags = blk_mq_freeze_queue(q);
blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
if (nm == 2)
blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
else if (nm)
blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
+ blk_mq_unfreeze_queue(q, memflags);
return ret;
}
-static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
+static ssize_t queue_rq_affinity_show(struct gendisk *disk, char *page)
{
- bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
- bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags);
+ bool set = test_bit(QUEUE_FLAG_SAME_COMP, &disk->queue->queue_flags);
+ bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &disk->queue->queue_flags);
return queue_var_show(set << force, page);
}
static ssize_t
-queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
+queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count)
{
ssize_t ret = -EINVAL;
#ifdef CONFIG_SMP
+ struct request_queue *q = disk->queue;
unsigned long val;
+ unsigned int memflags;
ret = queue_var_store(&val, page, count);
if (ret < 0)
return ret;
+	/*
+	 * Here we update two queue flags, each using atomic bitops. Although
+	 * updating the two flags isn't atomic as a pair, it should be harmless
+	 * as those flags are accessed individually using atomic test_bit
+	 * operations, so we don't grab any lock while updating these flags.
+	 */
+ memflags = blk_mq_freeze_queue(q);
if (val == 2) {
blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
@@ -376,89 +374,87 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
}
+ blk_mq_unfreeze_queue(q, memflags);
#endif
return ret;
}
-static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
-{
- return sprintf(page, "%d\n", -1);
-}
-
-static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
+static ssize_t queue_poll_delay_store(struct gendisk *disk, const char *page,
size_t count)
{
return count;
}
-static ssize_t queue_poll_show(struct request_queue *q, char *page)
-{
- return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page);
-}
-
-static ssize_t queue_poll_store(struct request_queue *q, const char *page,
+static ssize_t queue_poll_store(struct gendisk *disk, const char *page,
size_t count)
{
- if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
- return -EINVAL;
+ unsigned int memflags;
+ ssize_t ret = count;
+ struct request_queue *q = disk->queue;
+
+ memflags = blk_mq_freeze_queue(q);
+ if (!(q->limits.features & BLK_FEAT_POLL)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
pr_info_ratelimited("writes to the poll attribute are ignored.\n");
pr_info_ratelimited("please use driver specific parameters instead.\n");
- return count;
+out:
+ blk_mq_unfreeze_queue(q, memflags);
+ return ret;
}
-static ssize_t queue_io_timeout_show(struct request_queue *q, char *page)
+static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page)
{
- return sprintf(page, "%u\n", jiffies_to_msecs(q->rq_timeout));
+ return sysfs_emit(page, "%u\n",
+ jiffies_to_msecs(READ_ONCE(disk->queue->rq_timeout)));
}
-static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page,
+static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page,
size_t count)
{
- unsigned int val;
+ unsigned int val, memflags;
int err;
+ struct request_queue *q = disk->queue;
err = kstrtou32(page, 10, &val);
if (err || val == 0)
return -EINVAL;
+ memflags = blk_mq_freeze_queue(q);
blk_queue_rq_timeout(q, msecs_to_jiffies(val));
+ blk_mq_unfreeze_queue(q, memflags);
return count;
}
-static ssize_t queue_wc_show(struct request_queue *q, char *page)
+static ssize_t queue_wc_show(struct gendisk *disk, char *page)
{
- if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
- return sprintf(page, "write back\n");
-
- return sprintf(page, "write through\n");
+ if (blk_queue_write_cache(disk->queue))
+ return sysfs_emit(page, "write back\n");
+ return sysfs_emit(page, "write through\n");
}
-static ssize_t queue_wc_store(struct request_queue *q, const char *page,
- size_t count)
+static int queue_wc_store(struct gendisk *disk, const char *page,
+ size_t count, struct queue_limits *lim)
{
+ bool disable;
+
if (!strncmp(page, "write back", 10)) {
- if (!test_bit(QUEUE_FLAG_HW_WC, &q->queue_flags))
- return -EINVAL;
- blk_queue_flag_set(QUEUE_FLAG_WC, q);
+ disable = false;
} else if (!strncmp(page, "write through", 13) ||
- !strncmp(page, "none", 4)) {
- blk_queue_flag_clear(QUEUE_FLAG_WC, q);
+ !strncmp(page, "none", 4)) {
+ disable = true;
} else {
return -EINVAL;
}
- return count;
-}
-
-static ssize_t queue_fua_show(struct request_queue *q, char *page)
-{
- return sprintf(page, "%u\n", test_bit(QUEUE_FLAG_FUA, &q->queue_flags));
-}
-
-static ssize_t queue_dax_show(struct request_queue *q, char *page)
-{
- return queue_var_show(blk_queue_dax(q), page);
+ if (disable)
+ lim->flags |= BLK_FLAG_WRITE_CACHE_DISABLED;
+ else
+ lim->flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED;
+ return 0;
}
#define QUEUE_RO_ENTRY(_prefix, _name) \
@@ -474,62 +470,80 @@ static struct queue_sysfs_entry _prefix##_entry = { \
.store = _prefix##_store, \
};
+#define QUEUE_LIM_RO_ENTRY(_prefix, _name) \
+static struct queue_sysfs_entry _prefix##_entry = { \
+ .attr = { .name = _name, .mode = 0444 }, \
+ .show_limit = _prefix##_show, \
+}
+
+#define QUEUE_LIM_RW_ENTRY(_prefix, _name) \
+static struct queue_sysfs_entry _prefix##_entry = { \
+ .attr = { .name = _name, .mode = 0644 }, \
+ .show_limit = _prefix##_show, \
+ .store_limit = _prefix##_store, \
+}
+
QUEUE_RW_ENTRY(queue_requests, "nr_requests");
QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb");
-QUEUE_RW_ENTRY(queue_max_sectors, "max_sectors_kb");
-QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
-QUEUE_RO_ENTRY(queue_max_segments, "max_segments");
-QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
-QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size");
+QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb");
+QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
+QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments");
+QUEUE_LIM_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
+QUEUE_LIM_RO_ENTRY(queue_max_segment_size, "max_segment_size");
+QUEUE_LIM_RO_ENTRY(queue_max_write_streams, "max_write_streams");
+QUEUE_LIM_RO_ENTRY(queue_write_stream_granularity, "write_stream_granularity");
QUEUE_RW_ENTRY(elv_iosched, "scheduler");
-QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size");
-QUEUE_RO_ENTRY(queue_physical_block_size, "physical_block_size");
-QUEUE_RO_ENTRY(queue_chunk_sectors, "chunk_sectors");
-QUEUE_RO_ENTRY(queue_io_min, "minimum_io_size");
-QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size");
+QUEUE_LIM_RO_ENTRY(queue_logical_block_size, "logical_block_size");
+QUEUE_LIM_RO_ENTRY(queue_physical_block_size, "physical_block_size");
+QUEUE_LIM_RO_ENTRY(queue_chunk_sectors, "chunk_sectors");
+QUEUE_LIM_RO_ENTRY(queue_io_min, "minimum_io_size");
+QUEUE_LIM_RO_ENTRY(queue_io_opt, "optimal_io_size");
-QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments");
-QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity");
-QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes");
-QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes");
+QUEUE_LIM_RO_ENTRY(queue_max_discard_segments, "max_discard_segments");
+QUEUE_LIM_RO_ENTRY(queue_discard_granularity, "discard_granularity");
+QUEUE_LIM_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes");
+QUEUE_LIM_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes");
QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
+QUEUE_LIM_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes");
+QUEUE_LIM_RO_ENTRY(queue_atomic_write_boundary_sectors,
+ "atomic_write_boundary_bytes");
+QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes");
+QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");
+
QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
-QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
-QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
-QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
+QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes");
+QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
+QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
-QUEUE_RO_ENTRY(queue_zoned, "zoned");
+QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned");
QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
-QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones");
-QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones");
+QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones");
+QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones");
QUEUE_RW_ENTRY(queue_nomerges, "nomerges");
+QUEUE_LIM_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough");
QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity");
QUEUE_RW_ENTRY(queue_poll, "io_poll");
QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay");
-QUEUE_RW_ENTRY(queue_wc, "write_cache");
-QUEUE_RO_ENTRY(queue_fua, "fua");
-QUEUE_RO_ENTRY(queue_dax, "dax");
+QUEUE_LIM_RW_ENTRY(queue_wc, "write_cache");
+QUEUE_LIM_RO_ENTRY(queue_fua, "fua");
+QUEUE_LIM_RO_ENTRY(queue_dax, "dax");
QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
-QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
-QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment");
-
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time");
-#endif
+QUEUE_LIM_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
+QUEUE_LIM_RO_ENTRY(queue_dma_alignment, "dma_alignment");
/* legacy alias for logical_block_size: */
static struct queue_sysfs_entry queue_hw_sector_size_entry = {
- .attr = {.name = "hw_sector_size", .mode = 0444 },
- .show = queue_logical_block_size_show,
+ .attr = {.name = "hw_sector_size", .mode = 0444 },
+ .show_limit = queue_logical_block_size_show,
};
-QUEUE_RW_ENTRY(queue_nonrot, "rotational");
-QUEUE_RW_ENTRY(queue_iostats, "iostats");
-QUEUE_RW_ENTRY(queue_random, "add_random");
-QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes");
+QUEUE_LIM_RW_ENTRY(queue_rotational, "rotational");
+QUEUE_LIM_RW_ENTRY(queue_iostats, "iostats");
+QUEUE_LIM_RW_ENTRY(queue_add_random, "add_random");
+QUEUE_LIM_RW_ENTRY(queue_stable_writes, "stable_writes");
#ifdef CONFIG_BLK_WBT
static ssize_t queue_var_store64(s64 *var, const char *page)
@@ -545,23 +559,36 @@ static ssize_t queue_var_store64(s64 *var, const char *page)
return 0;
}
-static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
+static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page)
{
- if (!wbt_rq_qos(q))
- return -EINVAL;
+ ssize_t ret;
+ struct request_queue *q = disk->queue;
- if (wbt_disabled(q))
- return sprintf(page, "0\n");
+ mutex_lock(&disk->rqos_state_mutex);
+ if (!wbt_rq_qos(q)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (wbt_disabled(q)) {
+ ret = sysfs_emit(page, "0\n");
+ goto out;
+ }
- return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
+ ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
+out:
+ mutex_unlock(&disk->rqos_state_mutex);
+ return ret;
}
-static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
+static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
size_t count)
{
+ struct request_queue *q = disk->queue;
struct rq_qos *rqos;
ssize_t ret;
s64 val;
+ unsigned int memflags;
ret = queue_var_store64(&val, page);
if (ret < 0)
@@ -569,35 +596,40 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
if (val < -1)
return -EINVAL;
+ memflags = blk_mq_freeze_queue(q);
+
rqos = wbt_rq_qos(q);
if (!rqos) {
- ret = wbt_init(q->disk);
+ ret = wbt_init(disk);
if (ret)
- return ret;
+ goto out;
}
+ ret = count;
if (val == -1)
val = wbt_default_latency_nsec(q);
else if (val >= 0)
val *= 1000ULL;
if (wbt_get_min_lat(q) == val)
- return count;
+ goto out;
/*
* Ensure that the queue is idled, in case the latency update
* ends up either enabling or disabling wbt completely. We can't
* have IO inflight if that happens.
*/
- blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
+ mutex_lock(&disk->rqos_state_mutex);
wbt_set_min_lat(q, val);
+ mutex_unlock(&disk->rqos_state_mutex);
blk_mq_unquiesce_queue(q);
- blk_mq_unfreeze_queue(q);
+out:
+ blk_mq_unfreeze_queue(q, memflags);
- return count;
+ return ret;
}
QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
@@ -605,13 +637,17 @@ QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
/* Common attributes for bio-based and request-based queues. */
static struct attribute *queue_attrs[] = {
- &queue_ra_entry.attr,
+ /*
+	 * Attributes which are protected by q->limits_lock.
+ */
&queue_max_hw_sectors_entry.attr,
&queue_max_sectors_entry.attr,
&queue_max_segments_entry.attr,
&queue_max_discard_segments_entry.attr,
&queue_max_integrity_segments_entry.attr,
&queue_max_segment_size_entry.attr,
+ &queue_max_write_streams_entry.attr,
+ &queue_write_stream_granularity_entry.attr,
&queue_hw_sector_size_entry.attr,
&queue_logical_block_size_entry.attr,
&queue_physical_block_size_entry.attr,
@@ -619,44 +655,60 @@ static struct attribute *queue_attrs[] = {
&queue_io_min_entry.attr,
&queue_io_opt_entry.attr,
&queue_discard_granularity_entry.attr,
- &queue_discard_max_entry.attr,
- &queue_discard_max_hw_entry.attr,
- &queue_discard_zeroes_data_entry.attr,
- &queue_write_same_max_entry.attr,
- &queue_write_zeroes_max_entry.attr,
- &queue_zone_append_max_entry.attr,
+ &queue_max_discard_sectors_entry.attr,
+ &queue_max_hw_discard_sectors_entry.attr,
+ &queue_atomic_write_max_sectors_entry.attr,
+ &queue_atomic_write_boundary_sectors_entry.attr,
+ &queue_atomic_write_unit_min_entry.attr,
+ &queue_atomic_write_unit_max_entry.attr,
+ &queue_max_write_zeroes_sectors_entry.attr,
+ &queue_max_zone_append_sectors_entry.attr,
&queue_zone_write_granularity_entry.attr,
- &queue_nonrot_entry.attr,
+ &queue_rotational_entry.attr,
&queue_zoned_entry.attr,
- &queue_nr_zones_entry.attr,
&queue_max_open_zones_entry.attr,
&queue_max_active_zones_entry.attr,
- &queue_nomerges_entry.attr,
+ &queue_iostats_passthrough_entry.attr,
&queue_iostats_entry.attr,
&queue_stable_writes_entry.attr,
- &queue_random_entry.attr,
- &queue_poll_entry.attr,
+ &queue_add_random_entry.attr,
&queue_wc_entry.attr,
&queue_fua_entry.attr,
&queue_dax_entry.attr,
- &queue_poll_delay_entry.attr,
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- &blk_throtl_sample_time_entry.attr,
-#endif
&queue_virt_boundary_mask_entry.attr,
&queue_dma_alignment_entry.attr,
+ &queue_ra_entry.attr,
+
+ /*
+ * Attributes which don't require locking.
+ */
+ &queue_discard_zeroes_data_entry.attr,
+ &queue_write_same_max_entry.attr,
+ &queue_nr_zones_entry.attr,
+ &queue_nomerges_entry.attr,
+ &queue_poll_entry.attr,
+ &queue_poll_delay_entry.attr,
+
NULL,
};
/* Request-based queue attributes that are not relevant for bio-based queues. */
static struct attribute *blk_mq_queue_attrs[] = {
- &queue_requests_entry.attr,
+ /*
+ * Attributes which require some form of locking other than
+ * q->sysfs_lock.
+ */
&elv_iosched_entry.attr,
- &queue_rq_affinity_entry.attr,
- &queue_io_timeout_entry.attr,
+ &queue_requests_entry.attr,
#ifdef CONFIG_BLK_WBT
&queue_wb_lat_entry.attr,
#endif
+ /*
+ * Attributes which don't require locking.
+ */
+ &queue_rq_affinity_entry.attr,
+ &queue_io_timeout_entry.attr,
+
NULL,
};
@@ -706,15 +758,20 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
struct queue_sysfs_entry *entry = to_queue(attr);
struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
- struct request_queue *q = disk->queue;
- ssize_t res;
- if (!entry->show)
+ if (!entry->show && !entry->show_limit)
return -EIO;
- mutex_lock(&q->sysfs_lock);
- res = entry->show(q, page);
- mutex_unlock(&q->sysfs_lock);
- return res;
+
+ if (entry->show_limit) {
+ ssize_t res;
+
+ mutex_lock(&disk->queue->limits_lock);
+ res = entry->show_limit(disk, page);
+ mutex_unlock(&disk->queue->limits_lock);
+ return res;
+ }
+
+ return entry->show(disk, page);
}
static ssize_t
@@ -724,15 +781,28 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
struct queue_sysfs_entry *entry = to_queue(attr);
struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
struct request_queue *q = disk->queue;
- ssize_t res;
- if (!entry->store)
+ if (!entry->store_limit && !entry->store)
return -EIO;
- mutex_lock(&q->sysfs_lock);
- res = entry->store(q, page, length);
- mutex_unlock(&q->sysfs_lock);
- return res;
+ if (entry->store_limit) {
+ ssize_t res;
+
+ struct queue_limits lim = queue_limits_start_update(q);
+
+ res = entry->store_limit(disk, page, length, &lim);
+ if (res < 0) {
+ queue_limits_cancel_update(q);
+ return res;
+ }
+
+ res = queue_limits_commit_update_frozen(q, &lim);
+ if (res)
+ return res;
+ return length;
+ }
+
+ return entry->store(disk, page, length);
}
static const struct sysfs_ops queue_sysfs_ops = {
@@ -779,7 +849,6 @@ int blk_register_queue(struct gendisk *disk)
struct request_queue *q = disk->queue;
int ret;
- mutex_lock(&q->sysfs_dir_lock);
kobject_init(&disk->queue_kobj, &blk_queue_ktype);
ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue");
if (ret < 0)
@@ -802,26 +871,21 @@ int blk_register_queue(struct gendisk *disk)
if (ret)
goto out_debugfs_remove;
- if (q->elevator) {
- ret = elv_register_queue(q, false);
- if (ret)
- goto out_unregister_ia_ranges;
- }
-
ret = blk_crypto_sysfs_register(disk);
if (ret)
- goto out_elv_unregister;
+ goto out_unregister_ia_ranges;
- blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
+ if (queue_is_mq(q))
+ elevator_set_default(q);
wbt_enable_default(disk);
- blk_throtl_register(disk);
+
+ blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
/* Now everything is ready and send out KOBJ_ADD uevent */
kobject_uevent(&disk->queue_kobj, KOBJ_ADD);
if (q->elevator)
kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
mutex_unlock(&q->sysfs_lock);
- mutex_unlock(&q->sysfs_dir_lock);
/*
* SCSI probing may synchronously create and destroy a lot of
@@ -832,23 +896,20 @@ int blk_register_queue(struct gendisk *disk)
* faster to shut down and is made fully functional here as
* request_queues for non-existent devices never get registered.
*/
- if (!blk_queue_init_done(q)) {
- blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q);
- percpu_ref_switch_to_percpu(&q->q_usage_counter);
- }
+ blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q);
+ percpu_ref_switch_to_percpu(&q->q_usage_counter);
return ret;
-out_elv_unregister:
- elv_unregister_queue(q);
out_unregister_ia_ranges:
disk_unregister_independent_access_ranges(disk);
out_debugfs_remove:
blk_debugfs_remove(disk);
mutex_unlock(&q->sysfs_lock);
+ if (queue_is_mq(q))
+ blk_mq_sysfs_unregister(disk);
out_put_queue_kobj:
kobject_put(&disk->queue_kobj);
- mutex_unlock(&q->sysfs_dir_lock);
return ret;
}
@@ -879,7 +940,6 @@ void blk_unregister_queue(struct gendisk *disk)
blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
mutex_unlock(&q->sysfs_lock);
- mutex_lock(&q->sysfs_dir_lock);
/*
* Remove the sysfs attributes before unregistering the queue data
* structures that can be modified through sysfs.
@@ -889,14 +949,15 @@ void blk_unregister_queue(struct gendisk *disk)
blk_crypto_sysfs_unregister(disk);
mutex_lock(&q->sysfs_lock);
- elv_unregister_queue(q);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
/* Now that we've deleted all child objects, we can delete the queue. */
kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE);
kobject_del(&disk->queue_kobj);
- mutex_unlock(&q->sysfs_dir_lock);
+
+ if (queue_is_mq(q))
+ elevator_set_none(q);
blk_debugfs_remove(disk);
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index f4850a6f860b..bd15357f23bd 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -25,37 +25,12 @@
#define DFL_THROTL_SLICE_HD (HZ / 10)
#define DFL_THROTL_SLICE_SSD (HZ / 50)
#define MAX_THROTL_SLICE (HZ)
-#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
-#define MIN_THROTL_BPS (320 * 1024)
-#define MIN_THROTL_IOPS (10)
-#define DFL_LATENCY_TARGET (-1L)
-#define DFL_IDLE_THRESHOLD (0)
-#define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */
-#define LATENCY_FILTERED_SSD (0)
-/*
- * For HD, very small latency comes from sequential IO. Such IO is helpless to
- * help determine if its IO is impacted by others, hence we ignore the IO
- */
-#define LATENCY_FILTERED_HD (1000L) /* 1ms */
/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
-/* We measure latency for request size from <= 4k to >= 1M */
-#define LATENCY_BUCKET_SIZE 9
-
-struct latency_bucket {
- unsigned long total_latency; /* ns / 1024 */
- int samples;
-};
-
-struct avg_latency_bucket {
- unsigned long latency; /* ns / 1024 */
- bool valid;
-};
-
struct throtl_data
{
/* service tree for active throtl groups */
@@ -70,19 +45,6 @@ struct throtl_data
/* Work for dispatching throttled bios */
struct work_struct dispatch_work;
- unsigned int limit_index;
- bool limit_valid[LIMIT_CNT];
-
- unsigned long low_upgrade_time;
- unsigned long low_downgrade_time;
-
- unsigned int scale;
-
- struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
- struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
- struct latency_bucket __percpu *latency_buckets[2];
- unsigned long last_calculate_time;
- unsigned long filtered_latency;
bool track_bio_latency;
};
@@ -126,89 +88,26 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
return container_of(sq, struct throtl_data, service_queue);
}
-/*
- * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
- * make the IO dispatch more smooth.
- * Scale up: linearly scale up according to elapsed time since upgrade. For
- * every throtl_slice, the limit scales up 1/2 .low limit till the
- * limit hits .max limit
- * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit
- */
-static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)
-{
- /* arbitrary value to avoid too big scale */
- if (td->scale < 4096 && time_after_eq(jiffies,
- td->low_upgrade_time + td->scale * td->throtl_slice))
- td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice;
-
- return low + (low >> 1) * td->scale;
-}
-
static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
{
struct blkcg_gq *blkg = tg_to_blkg(tg);
- struct throtl_data *td;
- uint64_t ret;
if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
return U64_MAX;
- td = tg->td;
- ret = tg->bps[rw][td->limit_index];
- if (ret == 0 && td->limit_index == LIMIT_LOW) {
- /* intermediate node or iops isn't 0 */
- if (!list_empty(&blkg->blkcg->css.children) ||
- tg->iops[rw][td->limit_index])
- return U64_MAX;
- else
- return MIN_THROTL_BPS;
- }
-
- if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
- tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
- uint64_t adjusted;
-
- adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td);
- ret = min(tg->bps[rw][LIMIT_MAX], adjusted);
- }
- return ret;
+ return tg->bps[rw];
}
static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
{
struct blkcg_gq *blkg = tg_to_blkg(tg);
- struct throtl_data *td;
- unsigned int ret;
if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
return UINT_MAX;
- td = tg->td;
- ret = tg->iops[rw][td->limit_index];
- if (ret == 0 && tg->td->limit_index == LIMIT_LOW) {
- /* intermediate node or bps isn't 0 */
- if (!list_empty(&blkg->blkcg->css.children) ||
- tg->bps[rw][td->limit_index])
- return UINT_MAX;
- else
- return MIN_THROTL_IOPS;
- }
-
- if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
- tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
- uint64_t adjusted;
-
- adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td);
- if (adjusted > UINT_MAX)
- adjusted = UINT_MAX;
- ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted);
- }
- return ret;
+ return tg->iops[rw];
}
-#define request_bucket_index(sectors) \
- clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1)
-
/**
* throtl_log - log debug message via blktrace
* @sq: the service_queue being reported
@@ -244,7 +143,8 @@ static inline unsigned int throtl_bio_data_size(struct bio *bio)
static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
{
INIT_LIST_HEAD(&qn->node);
- bio_list_init(&qn->bios);
+ bio_list_init(&qn->bios_bps);
+ bio_list_init(&qn->bios_iops);
qn->tg = tg;
}
@@ -252,18 +152,32 @@ static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
* throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
* @bio: bio being added
* @qn: qnode to add bio to
- * @queued: the service_queue->queued[] list @qn belongs to
+ * @sq: the service_queue @qn belongs to
*
- * Add @bio to @qn and put @qn on @queued if it's not already on.
+ * Add @bio to @qn and put @qn on @sq->queued if it's not already on.
* @qn->tg's reference count is bumped when @qn is activated. See the
* comment on top of throtl_qnode definition for details.
*/
static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
- struct list_head *queued)
+ struct throtl_service_queue *sq)
{
- bio_list_add(&qn->bios, bio);
+ bool rw = bio_data_dir(bio);
+
+ /*
+ * Split bios have already been throttled by bps, so they are
+ * directly queued into the iops path.
+ */
+ if (bio_flagged(bio, BIO_TG_BPS_THROTTLED) ||
+ bio_flagged(bio, BIO_BPS_THROTTLED)) {
+ bio_list_add(&qn->bios_iops, bio);
+ sq->nr_queued_iops[rw]++;
+ } else {
+ bio_list_add(&qn->bios_bps, bio);
+ sq->nr_queued_bps[rw]++;
+ }
+
if (list_empty(&qn->node)) {
- list_add_tail(&qn->node, queued);
+ list_add_tail(&qn->node, &sq->queued[rw]);
blkg_get(tg_to_blkg(qn->tg));
}
}
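
The split queues can be pictured with a small standalone C mock (not kernel code; the struct and field names below are invented for illustration): a bio that has already been charged for bandwidth goes straight onto the iops list, everything else waits on the bps list, and the two counters mirror nr_queued_iops/nr_queued_bps.

#include <stdbool.h>
#include <stdio.h>

struct mock_bio { bool bps_done; };            /* already bps-throttled? */

struct mock_qnode {
	struct mock_bio *bios_bps[16];         /* waiting for bps budget */
	struct mock_bio *bios_iops[16];        /* waiting for iops budget */
	int nr_bps, nr_iops;
};

static void qnode_add(struct mock_qnode *qn, struct mock_bio *bio)
{
	if (bio->bps_done)                     /* split/charged bio: iops only */
		qn->bios_iops[qn->nr_iops++] = bio;
	else                                   /* fresh bio: bps stage first */
		qn->bios_bps[qn->nr_bps++] = bio;
}

int main(void)
{
	struct mock_qnode qn = {0};
	struct mock_bio split = { .bps_done = true };
	struct mock_bio fresh = { .bps_done = false };

	qnode_add(&qn, &split);
	qnode_add(&qn, &fresh);
	printf("iops-queued=%d bps-queued=%d\n", qn.nr_iops, qn.nr_bps);
	return 0;
}
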
@@ -271,6 +185,10 @@ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
/**
* throtl_peek_queued - peek the first bio on a qnode list
* @queued: the qnode list to peek
+ *
+ * Always take a bio from the head of the iops queue first; if that queue is
+ * empty, fall back to the bps queue so that bios are still fetched from the
+ * head overall.
*/
static struct bio *throtl_peek_queued(struct list_head *queued)
{
@@ -281,28 +199,33 @@ static struct bio *throtl_peek_queued(struct list_head *queued)
return NULL;
qn = list_first_entry(queued, struct throtl_qnode, node);
- bio = bio_list_peek(&qn->bios);
+ bio = bio_list_peek(&qn->bios_iops);
+ if (!bio)
+ bio = bio_list_peek(&qn->bios_bps);
WARN_ON_ONCE(!bio);
return bio;
}
/**
 * throtl_pop_queued - pop the first bio from a qnode list
- * @queued: the qnode list to pop a bio from
+ * @sq: the service_queue to pop a bio from
* @tg_to_put: optional out argument for throtl_grp to put
+ * @rw: read/write
*
- * Pop the first bio from the qnode list @queued. After popping, the first
- * qnode is removed from @queued if empty or moved to the end of @queued so
- * that the popping order is round-robin.
+ * Pop the first bio from the qnode list @sq->queued. Note that the iops
+ * list is checked first, because bios are ultimately dispatched from it.
+ * After popping, the first qnode is removed from @sq->queued if empty or moved
+ * to the end of @sq->queued so that the popping order is round-robin.
*
* When the first qnode is removed, its associated throtl_grp should be put
* too. If @tg_to_put is NULL, this function automatically puts it;
* otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
* responsible for putting it.
*/
-static struct bio *throtl_pop_queued(struct list_head *queued,
- struct throtl_grp **tg_to_put)
+static struct bio *throtl_pop_queued(struct throtl_service_queue *sq,
+ struct throtl_grp **tg_to_put, bool rw)
{
+ struct list_head *queued = &sq->queued[rw];
struct throtl_qnode *qn;
struct bio *bio;
@@ -310,10 +233,17 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
return NULL;
qn = list_first_entry(queued, struct throtl_qnode, node);
- bio = bio_list_pop(&qn->bios);
+ bio = bio_list_pop(&qn->bios_iops);
+ if (bio) {
+ sq->nr_queued_iops[rw]--;
+ } else {
+ bio = bio_list_pop(&qn->bios_bps);
+ if (bio)
+ sq->nr_queued_bps[rw]--;
+ }
WARN_ON_ONCE(!bio);
- if (bio_list_empty(&qn->bios)) {
+ if (bio_list_empty(&qn->bios_bps) && bio_list_empty(&qn->bios_iops)) {
list_del_init(&qn->node);
if (tg_to_put)
*tg_to_put = qn->tg;
@@ -359,20 +289,10 @@ static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk,
}
RB_CLEAR_NODE(&tg->rb_node);
- tg->bps[READ][LIMIT_MAX] = U64_MAX;
- tg->bps[WRITE][LIMIT_MAX] = U64_MAX;
- tg->iops[READ][LIMIT_MAX] = UINT_MAX;
- tg->iops[WRITE][LIMIT_MAX] = UINT_MAX;
- tg->bps_conf[READ][LIMIT_MAX] = U64_MAX;
- tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
- tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
- tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
- /* LIMIT_LOW will have default value 0 */
-
- tg->latency_target = DFL_LATENCY_TARGET;
- tg->latency_target_conf = DFL_LATENCY_TARGET;
- tg->idletime_threshold = DFL_IDLE_THRESHOLD;
- tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;
+ tg->bps[READ] = U64_MAX;
+ tg->bps[WRITE] = U64_MAX;
+ tg->iops[READ] = UINT_MAX;
+ tg->iops[WRITE] = UINT_MAX;
return &tg->pd;
@@ -418,18 +338,15 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
static void tg_update_has_rules(struct throtl_grp *tg)
{
struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
- struct throtl_data *td = tg->td;
int rw;
for (rw = READ; rw <= WRITE; rw++) {
tg->has_rules_iops[rw] =
(parent_tg && parent_tg->has_rules_iops[rw]) ||
- (td->limit_valid[td->limit_index] &&
- tg_iops_limit(tg, rw) != UINT_MAX);
+ tg_iops_limit(tg, rw) != UINT_MAX;
tg->has_rules_bps[rw] =
(parent_tg && parent_tg->has_rules_bps[rw]) ||
- (td->limit_valid[td->limit_index] &&
- (tg_bps_limit(tg, rw) != U64_MAX));
+ tg_bps_limit(tg, rw) != U64_MAX;
}
}
@@ -443,54 +360,11 @@ static void throtl_pd_online(struct blkg_policy_data *pd)
tg_update_has_rules(tg);
}
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-static void blk_throtl_update_limit_valid(struct throtl_data *td)
-{
- struct cgroup_subsys_state *pos_css;
- struct blkcg_gq *blkg;
- bool low_valid = false;
-
- rcu_read_lock();
- blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
- struct throtl_grp *tg = blkg_to_tg(blkg);
-
- if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
- tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) {
- low_valid = true;
- break;
- }
- }
- rcu_read_unlock();
-
- td->limit_valid[LIMIT_LOW] = low_valid;
-}
-#else
-static inline void blk_throtl_update_limit_valid(struct throtl_data *td)
-{
-}
-#endif
-
-static void throtl_upgrade_state(struct throtl_data *td);
-static void throtl_pd_offline(struct blkg_policy_data *pd)
-{
- struct throtl_grp *tg = pd_to_tg(pd);
-
- tg->bps[READ][LIMIT_LOW] = 0;
- tg->bps[WRITE][LIMIT_LOW] = 0;
- tg->iops[READ][LIMIT_LOW] = 0;
- tg->iops[WRITE][LIMIT_LOW] = 0;
-
- blk_throtl_update_limit_valid(tg->td);
-
- if (!tg->td->limit_valid[tg->td->limit_index])
- throtl_upgrade_state(tg->td);
-}
-
static void throtl_pd_free(struct blkg_policy_data *pd)
{
struct throtl_grp *tg = pd_to_tg(pd);
- del_timer_sync(&tg->service_queue.pending_timer);
+ timer_delete_sync(&tg->service_queue.pending_timer);
blkg_rwstat_exit(&tg->stat_bytes);
blkg_rwstat_exit(&tg->stat_ios);
kfree(tg);
@@ -635,8 +509,6 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
{
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
- tg->carryover_bytes[rw] = 0;
- tg->carryover_ios[rw] = 0;
/*
* Previous slice has expired. We must have trimmed it after last
@@ -655,16 +527,14 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
}
static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw,
- bool clear_carryover)
+ bool clear)
{
- tg->bytes_disp[rw] = 0;
- tg->io_disp[rw] = 0;
+ if (clear) {
+ tg->bytes_disp[rw] = 0;
+ tg->io_disp[rw] = 0;
+ }
tg->slice_start[rw] = jiffies;
tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
- if (clear_carryover) {
- tg->carryover_bytes[rw] = 0;
- tg->carryover_ios[rw] = 0;
- }
throtl_log(&tg->service_queue,
"[%c] new slice start=%lu end=%lu jiffies=%lu",
@@ -681,6 +551,9 @@ static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end)
{
+ if (!time_before(tg->slice_end[rw], jiffy_end))
+ return;
+
throtl_set_slice_end(tg, rw, jiffy_end);
throtl_log(&tg->service_queue,
"[%c] extend slice start=%lu end=%lu jiffies=%lu",
@@ -697,6 +570,11 @@ static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
return true;
}
+static unsigned int sq_queued(struct throtl_service_queue *sq, int type)
+{
+ return sq->nr_queued_bps[type] + sq->nr_queued_iops[type];
+}
+
static unsigned int calculate_io_allowed(u32 iops_limit,
unsigned long jiffy_elapsed)
{
@@ -732,6 +610,48 @@ static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed)
return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ);
}
+static long long throtl_trim_bps(struct throtl_grp *tg, bool rw,
+ unsigned long time_elapsed)
+{
+ u64 bps_limit = tg_bps_limit(tg, rw);
+ long long bytes_trim;
+
+ if (bps_limit == U64_MAX)
+ return 0;
+
+ /* Need to consider the case of bytes_allowed overflow. */
+ bytes_trim = calculate_bytes_allowed(bps_limit, time_elapsed);
+ if (bytes_trim <= 0 || tg->bytes_disp[rw] < bytes_trim) {
+ bytes_trim = tg->bytes_disp[rw];
+ tg->bytes_disp[rw] = 0;
+ } else {
+ tg->bytes_disp[rw] -= bytes_trim;
+ }
+
+ return bytes_trim;
+}
+
+static int throtl_trim_iops(struct throtl_grp *tg, bool rw,
+ unsigned long time_elapsed)
+{
+ u32 iops_limit = tg_iops_limit(tg, rw);
+ int io_trim;
+
+ if (iops_limit == UINT_MAX)
+ return 0;
+
+ /* Need to consider the case of io_allowed overflow. */
+ io_trim = calculate_io_allowed(iops_limit, time_elapsed);
+ if (io_trim <= 0 || tg->io_disp[rw] < io_trim) {
+ io_trim = tg->io_disp[rw];
+ tg->io_disp[rw] = 0;
+ } else {
+ tg->io_disp[rw] -= io_trim;
+ }
+
+ return io_trim;
+}
+
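
The budget arithmetic behind both trim helpers is just limit * elapsed / HZ, mirroring calculate_bytes_allowed(). A standalone sketch with an assumed HZ of 1000 and made-up numbers:

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_HZ 1000ULL	/* assumption for the example only */

/* same shape as calculate_bytes_allowed(): budget for the elapsed time */
static uint64_t bytes_allowed(uint64_t bps_limit, uint64_t jiffies_elapsed)
{
	return bps_limit * jiffies_elapsed / EXAMPLE_HZ;
}

int main(void)
{
	uint64_t bps = 1024 * 1024;		/* 1 MiB/s limit */
	uint64_t disp = 300000;			/* bytes already dispatched */
	uint64_t trim = bytes_allowed(bps, 250);	/* 250 jiffies elapsed */

	if (trim > disp)		/* never trim more than was dispatched */
		trim = disp;
	printf("budget=%llu trimmed=%llu remaining=%llu\n",
	       (unsigned long long)bytes_allowed(bps, 250),
	       (unsigned long long)trim,
	       (unsigned long long)(disp - trim));
	return 0;
}
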
/* Trim the used slices and adjust slice start accordingly */
static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
{
@@ -756,34 +676,28 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
* sooner, then we need to reduce slice_end. A high bogus slice_end
* is bad because it does not allow new slice to start.
*/
-
throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
time_elapsed = rounddown(jiffies - tg->slice_start[rw],
tg->td->throtl_slice);
- if (!time_elapsed)
+ /* Don't trim slice until at least 2 slices are used */
+ if (time_elapsed < tg->td->throtl_slice * 2)
return;
- bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw),
- time_elapsed) +
- tg->carryover_bytes[rw];
- io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed) +
- tg->carryover_ios[rw];
- if (bytes_trim <= 0 && io_trim <= 0)
+ /*
+	 * The bio submission time may be a few jiffies more than the expected
+	 * waiting time, because 'extra_bytes' can't be divided evenly in
+	 * tg_within_bps_limit(), and also because of timer wakeup delay. In
+	 * that case, adjusting slice_start would discard the extra wait time
+	 * and cause a lower rate than expected. Therefore, in addition to the
+	 * rounddown above, one extra slice is preserved to absorb the deviation.
+ */
+ time_elapsed -= tg->td->throtl_slice;
+ bytes_trim = throtl_trim_bps(tg, rw, time_elapsed);
+ io_trim = throtl_trim_iops(tg, rw, time_elapsed);
+ if (!bytes_trim && !io_trim)
return;
- tg->carryover_bytes[rw] = 0;
- if ((long long)tg->bytes_disp[rw] >= bytes_trim)
- tg->bytes_disp[rw] -= bytes_trim;
- else
- tg->bytes_disp[rw] = 0;
-
- tg->carryover_ios[rw] = 0;
- if ((int)tg->io_disp[rw] >= io_trim)
- tg->io_disp[rw] -= io_trim;
- else
- tg->io_disp[rw] = 0;
-
tg->slice_start[rw] += time_elapsed;
throtl_log(&tg->service_queue,
@@ -793,39 +707,60 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
jiffies);
}
-static void __tg_update_carryover(struct throtl_grp *tg, bool rw)
+static void __tg_update_carryover(struct throtl_grp *tg, bool rw,
+ long long *bytes, int *ios)
{
unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw];
u64 bps_limit = tg_bps_limit(tg, rw);
u32 iops_limit = tg_iops_limit(tg, rw);
+ long long bytes_allowed;
+ int io_allowed;
+
+ /*
+ * If the queue is empty, carryover handling is not needed. In such cases,
+ * tg->[bytes/io]_disp should be reset to 0 to avoid impacting the dispatch
+ * of subsequent bios. The same handling applies when the previous BPS/IOPS
+ * limit was set to max.
+ */
+ if (sq_queued(&tg->service_queue, rw) == 0) {
+ tg->bytes_disp[rw] = 0;
+ tg->io_disp[rw] = 0;
+ return;
+ }
/*
* If config is updated while bios are still throttled, calculate and
- * accumulate how many bytes/ios are waited across changes. And
- * carryover_bytes/ios will be used to calculate new wait time under new
- * configuration.
+	 * accumulate how many bytes/ios were waited for across the changes, and
+	 * use the calculated carryover (@bytes/@ios) to update [bytes/io]_disp,
+	 * which will then be used to calculate the new wait time under the new
+	 * configuration. Possible bytes/io_allowed overflow also needs handling.
*/
- if (bps_limit != U64_MAX)
- tg->carryover_bytes[rw] +=
- calculate_bytes_allowed(bps_limit, jiffy_elapsed) -
- tg->bytes_disp[rw];
- if (iops_limit != UINT_MAX)
- tg->carryover_ios[rw] +=
- calculate_io_allowed(iops_limit, jiffy_elapsed) -
- tg->io_disp[rw];
+ if (bps_limit != U64_MAX) {
+ bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed);
+ if (bytes_allowed > 0)
+ *bytes = bytes_allowed - tg->bytes_disp[rw];
+ }
+ if (iops_limit != UINT_MAX) {
+ io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed);
+ if (io_allowed > 0)
+ *ios = io_allowed - tg->io_disp[rw];
+ }
+
+ tg->bytes_disp[rw] = -*bytes;
+ tg->io_disp[rw] = -*ios;
}
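
A made-up example of how the rewritten helper folds the carryover into the dispatch counter: with 262144 bytes allowed under the old limit but only 100000 dispatched, bytes_disp is set to -(262144 - 100000) = -162144, so the unused budget becomes credit under the new limit. Minimal standalone sketch of just that arithmetic:

#include <stdio.h>

int main(void)
{
	long long allowed = 262144;	/* budget under the old limit */
	long long dispatched = 100000;	/* bytes actually sent so far */
	long long carryover = allowed - dispatched;

	/* as in the hunk above: bytes_disp becomes the negated carryover */
	long long new_bytes_disp = -carryover;

	printf("carryover=%lld bytes_disp=%lld\n", carryover, new_bytes_disp);
	return 0;
}
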
static void tg_update_carryover(struct throtl_grp *tg)
{
- if (tg->service_queue.nr_queued[READ])
- __tg_update_carryover(tg, READ);
- if (tg->service_queue.nr_queued[WRITE])
- __tg_update_carryover(tg, WRITE);
+ long long bytes[2] = {0};
+ int ios[2] = {0};
- /* see comments in struct throtl_grp for meaning of these fields. */
+ __tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]);
+ __tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]);
+
+ /* see comments in struct throtl_grp for meaning of carryover. */
throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__,
- tg->carryover_bytes[READ], tg->carryover_bytes[WRITE],
- tg->carryover_ios[READ], tg->carryover_ios[WRITE]);
+ bytes[READ], bytes[WRITE], ios[READ], ios[WRITE]);
}
static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
@@ -835,21 +770,19 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio
int io_allowed;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
- if (iops_limit == UINT_MAX) {
- return 0;
- }
-
jiffy_elapsed = jiffies - tg->slice_start[rw];
/* Round up to the next throttle slice, wait time must be nonzero */
jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
- io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) +
- tg->carryover_ios[rw];
+ io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd);
if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed)
return 0;
/* Calc approx time to dispatch */
jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed;
+
+ /* make sure at least one io can be dispatched after waiting */
+ jiffy_wait = max(jiffy_wait, HZ / iops_limit + 1);
return jiffy_wait;
}
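
A quick worked check of the new floor on the wait: with an assumed HZ of 1000 and an iops limit of 10, at least HZ / iops_limit + 1 = 101 jiffies must pass before the timer fires, so that one more IO genuinely fits into the budget. Compilable sketch with made-up inputs:

#include <stdio.h>

#define EXAMPLE_HZ 1000UL	/* assumption for the example only */

int main(void)
{
	unsigned int iops_limit = 10;
	unsigned long slice_wait = 40;	/* wait derived from slice rounding */
	unsigned long floor = EXAMPLE_HZ / iops_limit + 1;
	unsigned long wait = slice_wait > floor ? slice_wait : floor;

	printf("slice_wait=%lu floor=%lu wait=%lu jiffies\n",
	       slice_wait, floor, wait);
	return 0;
}
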
@@ -862,11 +795,6 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
unsigned int bio_size = throtl_bio_data_size(bio);
- /* no need to throttle if this bio's bytes have been accounted */
- if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) {
- return 0;
- }
-
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
/* Slice has just started. Consider one slice interval */
@@ -874,9 +802,10 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
jiffy_elapsed_rnd = tg->td->throtl_slice;
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
- bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) +
- tg->carryover_bytes[rw];
- if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed)
+ bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd);
+ /* Need to consider the case of bytes_allowed overflow. */
+ if ((bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed)
+ || bytes_allowed < 0)
return 0;
/* Calc approx time to dispatch */
@@ -894,17 +823,82 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
return jiffy_wait;
}
+static void throtl_charge_bps_bio(struct throtl_grp *tg, struct bio *bio)
+{
+ unsigned int bio_size = throtl_bio_data_size(bio);
+
+ /* Charge the bio to the group */
+ if (!bio_flagged(bio, BIO_BPS_THROTTLED) &&
+ !bio_flagged(bio, BIO_TG_BPS_THROTTLED)) {
+ bio_set_flag(bio, BIO_TG_BPS_THROTTLED);
+ tg->bytes_disp[bio_data_dir(bio)] += bio_size;
+ }
+}
+
+static void throtl_charge_iops_bio(struct throtl_grp *tg, struct bio *bio)
+{
+ bio_clear_flag(bio, BIO_TG_BPS_THROTTLED);
+ tg->io_disp[bio_data_dir(bio)]++;
+}
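
The flag handshake between the two charge helpers can be mocked in plain C (illustrative only; bps_charged stands in for BIO_TG_BPS_THROTTLED): the flag is set when a bio is charged for bandwidth and cleared once it is charged for iops, so charging bps twice is a no-op.

#include <stdbool.h>
#include <stdio.h>

struct mock_tg { long long bytes_disp; int io_disp; };
struct mock_bio { unsigned int size; bool bps_charged; };

static void charge_bps(struct mock_tg *tg, struct mock_bio *bio)
{
	if (!bio->bps_charged) {	/* only charge bandwidth once */
		bio->bps_charged = true;
		tg->bytes_disp += bio->size;
	}
}

static void charge_iops(struct mock_tg *tg, struct mock_bio *bio)
{
	bio->bps_charged = false;	/* the flag has served its purpose */
	tg->io_disp++;
}

int main(void)
{
	struct mock_tg tg = {0};
	struct mock_bio bio = { .size = 4096, .bps_charged = false };

	charge_bps(&tg, &bio);
	charge_bps(&tg, &bio);		/* second call is a no-op */
	charge_iops(&tg, &bio);
	printf("bytes=%lld ios=%d\n", tg.bytes_disp, tg.io_disp);
	return 0;
}
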
+
/*
- * Returns whether one can dispatch a bio or not. Also returns approx number
- * of jiffies to wait before this bio is with-in IO rate and can be dispatched
+ * If the previous slice expired, start a new one; otherwise renew/extend the
+ * existing slice so that it is at least throtl_slice long from now. A new
+ * slice is started only for an empty throttle group. If there are queued
+ * bios, there should already be an active slice, which is extended instead.
*/
-static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
- unsigned long *wait)
+static void tg_update_slice(struct throtl_grp *tg, bool rw)
+{
+ if (throtl_slice_used(tg, rw) &&
+ sq_queued(&tg->service_queue, rw) == 0)
+ throtl_start_new_slice(tg, rw, true);
+ else
+ throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice);
+}
+
+static unsigned long tg_dispatch_bps_time(struct throtl_grp *tg, struct bio *bio)
{
bool rw = bio_data_dir(bio);
- unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
u64 bps_limit = tg_bps_limit(tg, rw);
+ unsigned long bps_wait;
+
+ /* no need to throttle if this bio's bytes have been accounted */
+ if (bps_limit == U64_MAX || tg->flags & THROTL_TG_CANCELING ||
+ bio_flagged(bio, BIO_BPS_THROTTLED) ||
+ bio_flagged(bio, BIO_TG_BPS_THROTTLED))
+ return 0;
+
+ tg_update_slice(tg, rw);
+ bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
+ throtl_extend_slice(tg, rw, jiffies + bps_wait);
+
+ return bps_wait;
+}
+
+static unsigned long tg_dispatch_iops_time(struct throtl_grp *tg, struct bio *bio)
+{
+ bool rw = bio_data_dir(bio);
u32 iops_limit = tg_iops_limit(tg, rw);
+ unsigned long iops_wait;
+
+ if (iops_limit == UINT_MAX || tg->flags & THROTL_TG_CANCELING)
+ return 0;
+
+ tg_update_slice(tg, rw);
+ iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
+ throtl_extend_slice(tg, rw, jiffies + iops_wait);
+
+ return iops_wait;
+}
+
+/*
+ * Returns the approximate number of jiffies to wait before this bio is within
+ * the IO rate and can be moved to the other queue or dispatched.
+ */
+static unsigned long tg_dispatch_time(struct throtl_grp *tg, struct bio *bio)
+{
+ bool rw = bio_data_dir(bio);
+ unsigned long wait;
/*
* Currently whole state machine of group depends on first bio
@@ -912,65 +906,20 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
* this function with a different bio if there are other bios
* queued.
*/
- BUG_ON(tg->service_queue.nr_queued[rw] &&
+ BUG_ON(sq_queued(&tg->service_queue, rw) &&
bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
- /* If tg->bps = -1, then BW is unlimited */
- if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) ||
- tg->flags & THROTL_TG_CANCELING) {
- if (wait)
- *wait = 0;
- return true;
- }
+ wait = tg_dispatch_bps_time(tg, bio);
+ if (wait != 0)
+ return wait;
/*
- * If previous slice expired, start a new one otherwise renew/extend
- * existing slice to make sure it is at least throtl_slice interval
- * long since now. New slice is started only for empty throttle group.
- * If there is queued bio, that means there should be an active
- * slice and it should be extended instead.
+ * Charge bps here because @bio will be directly placed into the
+ * iops queue afterward.
*/
- if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
- throtl_start_new_slice(tg, rw, true);
- else {
- if (time_before(tg->slice_end[rw],
- jiffies + tg->td->throtl_slice))
- throtl_extend_slice(tg, rw,
- jiffies + tg->td->throtl_slice);
- }
+ throtl_charge_bps_bio(tg, bio);
- bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
- iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
- if (bps_wait + iops_wait == 0) {
- if (wait)
- *wait = 0;
- return true;
- }
-
- max_wait = max(bps_wait, iops_wait);
-
- if (wait)
- *wait = max_wait;
-
- if (time_before(tg->slice_end[rw], jiffies + max_wait))
- throtl_extend_slice(tg, rw, jiffies + max_wait);
-
- return false;
-}
-
-static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
-{
- bool rw = bio_data_dir(bio);
- unsigned int bio_size = throtl_bio_data_size(bio);
-
- /* Charge the bio to the group */
- if (!bio_flagged(bio, BIO_BPS_THROTTLED)) {
- tg->bytes_disp[rw] += bio_size;
- tg->last_bytes_disp[rw] += bio_size;
- }
-
- tg->io_disp[rw]++;
- tg->last_io_disp[rw]++;
+ return tg_dispatch_iops_time(tg, bio);
}
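
The two-stage flow above boils down to: return the bps wait if it is non-zero, otherwise charge bps and return the iops wait. A trivially compilable sketch with stubbed wait values (the numbers are arbitrary):

#include <stdio.h>

static unsigned long bps_wait_stub(void)  { return 0; }		/* bps budget ok */
static unsigned long iops_wait_stub(void) { return 25; }	/* iops says wait */

static unsigned long dispatch_time(void)
{
	unsigned long wait = bps_wait_stub();

	if (wait)
		return wait;	/* still throttled by bandwidth */
	/* bps cleared: charge it now, the bio moves on to the iops stage */
	return iops_wait_stub();
}

int main(void)
{
	printf("wait=%lu jiffies\n", dispatch_time());
	return 0;
}
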
/**
@@ -997,28 +946,36 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
* dispatched. Mark that @tg was empty. This is automatically
* cleared on the next tg_update_disptime().
*/
- if (!sq->nr_queued[rw])
+ if (sq_queued(sq, rw) == 0)
tg->flags |= THROTL_TG_WAS_EMPTY;
- throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
+ throtl_qnode_add_bio(bio, qn, sq);
+
+ /*
+	 * Since the queues are now split, when the iops queue was
+	 * previously empty and a new @bio is added to the first @qn,
+	 * we also need to update @tg->disptime.
+ */
+ if (bio_flagged(bio, BIO_BPS_THROTTLED) &&
+ bio == throtl_peek_queued(&sq->queued[rw]))
+ tg->flags |= THROTL_TG_IOPS_WAS_EMPTY;
- sq->nr_queued[rw]++;
throtl_enqueue_tg(tg);
}
static void tg_update_disptime(struct throtl_grp *tg)
{
struct throtl_service_queue *sq = &tg->service_queue;
- unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
+ unsigned long read_wait = -1, write_wait = -1, min_wait, disptime;
struct bio *bio;
bio = throtl_peek_queued(&sq->queued[READ]);
if (bio)
- tg_may_dispatch(tg, bio, &read_wait);
+ read_wait = tg_dispatch_time(tg, bio);
bio = throtl_peek_queued(&sq->queued[WRITE]);
if (bio)
- tg_may_dispatch(tg, bio, &write_wait);
+ write_wait = tg_dispatch_time(tg, bio);
min_wait = min(read_wait, write_wait);
disptime = jiffies + min_wait;
@@ -1030,6 +987,7 @@ static void tg_update_disptime(struct throtl_grp *tg)
/* see throtl_add_bio_tg() */
tg->flags &= ~THROTL_TG_WAS_EMPTY;
+ tg->flags &= ~THROTL_TG_IOPS_WAS_EMPTY;
}
static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
@@ -1056,10 +1014,9 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
* getting released prematurely. Remember the tg to put and put it
* after @bio is transferred to @parent_sq.
*/
- bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
- sq->nr_queued[rw]--;
+ bio = throtl_pop_queued(sq, &tg_to_put, rw);
- throtl_charge_bio(tg, bio);
+ throtl_charge_iops_bio(tg, bio);
/*
* If our parent is another tg, we just need to transfer @bio to
@@ -1074,7 +1031,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
} else {
bio_set_flag(bio, BIO_BPS_THROTTLED);
throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
- &parent_sq->queued[rw]);
+ parent_sq);
BUG_ON(tg->td->nr_queued[rw] <= 0);
tg->td->nr_queued[rw]--;
}
@@ -1096,7 +1053,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
/* Try to dispatch 75% READS and 25% WRITES */
while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
- tg_may_dispatch(tg, bio, NULL)) {
+ tg_dispatch_time(tg, bio) == 0) {
tg_dispatch_one_bio(tg, READ);
nr_reads++;
@@ -1106,7 +1063,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
}
while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
- tg_may_dispatch(tg, bio, NULL)) {
+ tg_dispatch_time(tg, bio) == 0) {
tg_dispatch_one_bio(tg, WRITE);
nr_writes++;
@@ -1139,7 +1096,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
nr_disp += throtl_dispatch_tg(tg);
sq = &tg->service_queue;
- if (sq->nr_queued[READ] || sq->nr_queued[WRITE])
+ if (sq_queued(sq, READ) || sq_queued(sq, WRITE))
tg_update_disptime(tg);
else
throtl_dequeue_tg(tg);
@@ -1151,8 +1108,6 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
return nr_disp;
}
-static bool throtl_can_upgrade(struct throtl_data *td,
- struct throtl_grp *this_tg);
/**
* throtl_pending_timer_fn - timer function for service_queue->pending_timer
* @t: the pending_timer member of the throtl_service_queue being serviced
@@ -1189,17 +1144,16 @@ static void throtl_pending_timer_fn(struct timer_list *t)
if (!q->root_blkg)
goto out_unlock;
- if (throtl_can_upgrade(td, NULL))
- throtl_upgrade_state(td);
-
again:
parent_sq = sq->parent_sq;
dispatched = false;
while (true) {
+ unsigned int __maybe_unused bio_cnt_r = sq_queued(sq, READ);
+ unsigned int __maybe_unused bio_cnt_w = sq_queued(sq, WRITE);
+
throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
- sq->nr_queued[READ] + sq->nr_queued[WRITE],
- sq->nr_queued[READ], sq->nr_queued[WRITE]);
+ bio_cnt_r + bio_cnt_w, bio_cnt_r, bio_cnt_w);
ret = throtl_select_dispatch(sq);
if (ret) {
@@ -1221,7 +1175,8 @@ again:
if (parent_sq) {
/* @parent_sq is another throl_grp, propagate dispatch */
- if (tg->flags & THROTL_TG_WAS_EMPTY) {
+ if (tg->flags & THROTL_TG_WAS_EMPTY ||
+ tg->flags & THROTL_TG_IOPS_WAS_EMPTY) {
tg_update_disptime(tg);
if (!throtl_schedule_next_dispatch(parent_sq, false)) {
/* window is already open, repeat dispatching */
@@ -1261,7 +1216,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
spin_lock_irq(&q->queue_lock);
for (rw = READ; rw <= WRITE; rw++)
- while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
+ while ((bio = throtl_pop_queued(td_sq, NULL, rw)))
bio_list_add(&bio_list_on_stack, bio);
spin_unlock_irq(&q->queue_lock);
@@ -1331,22 +1286,12 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
blkg_for_each_descendant_pre(blkg, pos_css,
global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) {
struct throtl_grp *this_tg = blkg_to_tg(blkg);
- struct throtl_grp *parent_tg;
tg_update_has_rules(this_tg);
/* ignore root/second level */
if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent ||
!blkg->parent->parent)
continue;
- parent_tg = blkg_to_tg(blkg->parent);
- /*
- * make sure all children has lower idle time threshold and
- * higher latency target
- */
- this_tg->idletime_threshold = min(this_tg->idletime_threshold,
- parent_tg->idletime_threshold);
- this_tg->latency_target = max(this_tg->latency_target,
- parent_tg->latency_target);
}
rcu_read_unlock();
@@ -1367,6 +1312,54 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
}
}
+static int blk_throtl_init(struct gendisk *disk)
+{
+ struct request_queue *q = disk->queue;
+ struct throtl_data *td;
+ unsigned int memflags;
+ int ret;
+
+ td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
+ if (!td)
+ return -ENOMEM;
+
+ INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
+ throtl_service_queue_init(&td->service_queue);
+
+ /*
+ * Freeze queue before activating policy, to synchronize with IO path,
+ * which is protected by 'q_usage_counter'.
+ */
+ memflags = blk_mq_freeze_queue(disk->queue);
+ blk_mq_quiesce_queue(disk->queue);
+
+ q->td = td;
+ td->queue = q;
+
+ /* activate policy */
+ ret = blkcg_activate_policy(disk, &blkcg_policy_throtl);
+ if (ret) {
+ q->td = NULL;
+ kfree(td);
+ goto out;
+ }
+
+ if (blk_queue_nonrot(q))
+ td->throtl_slice = DFL_THROTL_SLICE_SSD;
+ else
+ td->throtl_slice = DFL_THROTL_SLICE_HD;
+ td->track_bio_latency = !queue_is_mq(q);
+ if (!td->track_bio_latency)
+ blk_stat_enable_accounting(q);
+
+out:
+ blk_mq_unquiesce_queue(disk->queue);
+ blk_mq_unfreeze_queue(disk->queue, memflags);
+
+ return ret;
+}
+
static ssize_t tg_set_conf(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off, bool is_u64)
{
@@ -1378,6 +1371,16 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
blkg_conf_init(&ctx, buf);
+ ret = blkg_conf_open_bdev(&ctx);
+ if (ret)
+ goto out_finish;
+
+ if (!blk_throtl_activated(ctx.bdev->bd_queue)) {
+ ret = blk_throtl_init(ctx.bdev->bd_disk);
+ if (ret)
+ goto out_finish;
+ }
+
ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx);
if (ret)
goto out_finish;
@@ -1444,25 +1447,25 @@ static int tg_print_rwstat_recursive(struct seq_file *sf, void *v)
static struct cftype throtl_legacy_files[] = {
{
.name = "throttle.read_bps_device",
- .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]),
+ .private = offsetof(struct throtl_grp, bps[READ]),
.seq_show = tg_print_conf_u64,
.write = tg_set_conf_u64,
},
{
.name = "throttle.write_bps_device",
- .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]),
+ .private = offsetof(struct throtl_grp, bps[WRITE]),
.seq_show = tg_print_conf_u64,
.write = tg_set_conf_u64,
},
{
.name = "throttle.read_iops_device",
- .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]),
+ .private = offsetof(struct throtl_grp, iops[READ]),
.seq_show = tg_print_conf_uint,
.write = tg_set_conf_uint,
},
{
.name = "throttle.write_iops_device",
- .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]),
+ .private = offsetof(struct throtl_grp, iops[WRITE]),
.seq_show = tg_print_conf_uint,
.write = tg_set_conf_uint,
},
@@ -1494,61 +1497,43 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
{
struct throtl_grp *tg = pd_to_tg(pd);
const char *dname = blkg_dev_name(pd->blkg);
- char bufs[4][21] = { "max", "max", "max", "max" };
u64 bps_dft;
unsigned int iops_dft;
- char idle_time[26] = "";
- char latency_time[26] = "";
if (!dname)
return 0;
- if (off == LIMIT_LOW) {
- bps_dft = 0;
- iops_dft = 0;
- } else {
- bps_dft = U64_MAX;
- iops_dft = UINT_MAX;
- }
+ bps_dft = U64_MAX;
+ iops_dft = UINT_MAX;
- if (tg->bps_conf[READ][off] == bps_dft &&
- tg->bps_conf[WRITE][off] == bps_dft &&
- tg->iops_conf[READ][off] == iops_dft &&
- tg->iops_conf[WRITE][off] == iops_dft &&
- (off != LIMIT_LOW ||
- (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD &&
- tg->latency_target_conf == DFL_LATENCY_TARGET)))
+ if (tg->bps[READ] == bps_dft &&
+ tg->bps[WRITE] == bps_dft &&
+ tg->iops[READ] == iops_dft &&
+ tg->iops[WRITE] == iops_dft)
return 0;
- if (tg->bps_conf[READ][off] != U64_MAX)
- snprintf(bufs[0], sizeof(bufs[0]), "%llu",
- tg->bps_conf[READ][off]);
- if (tg->bps_conf[WRITE][off] != U64_MAX)
- snprintf(bufs[1], sizeof(bufs[1]), "%llu",
- tg->bps_conf[WRITE][off]);
- if (tg->iops_conf[READ][off] != UINT_MAX)
- snprintf(bufs[2], sizeof(bufs[2]), "%u",
- tg->iops_conf[READ][off]);
- if (tg->iops_conf[WRITE][off] != UINT_MAX)
- snprintf(bufs[3], sizeof(bufs[3]), "%u",
- tg->iops_conf[WRITE][off]);
- if (off == LIMIT_LOW) {
- if (tg->idletime_threshold_conf == ULONG_MAX)
- strcpy(idle_time, " idle=max");
- else
- snprintf(idle_time, sizeof(idle_time), " idle=%lu",
- tg->idletime_threshold_conf);
+ seq_printf(sf, "%s", dname);
+ if (tg->bps[READ] == U64_MAX)
+ seq_printf(sf, " rbps=max");
+ else
+ seq_printf(sf, " rbps=%llu", tg->bps[READ]);
- if (tg->latency_target_conf == ULONG_MAX)
- strcpy(latency_time, " latency=max");
- else
- snprintf(latency_time, sizeof(latency_time),
- " latency=%lu", tg->latency_target_conf);
- }
+ if (tg->bps[WRITE] == U64_MAX)
+ seq_printf(sf, " wbps=max");
+ else
+ seq_printf(sf, " wbps=%llu", tg->bps[WRITE]);
- seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n",
- dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time,
- latency_time);
+ if (tg->iops[READ] == UINT_MAX)
+ seq_printf(sf, " riops=max");
+ else
+ seq_printf(sf, " riops=%u", tg->iops[READ]);
+
+ if (tg->iops[WRITE] == UINT_MAX)
+ seq_printf(sf, " wiops=max");
+ else
+ seq_printf(sf, " wiops=%u", tg->iops[WRITE]);
+
+ seq_printf(sf, "\n");
return 0;
}
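
For reference, with the low-limit settings gone, the line printed here (and accepted by tg_set_limit() below) only ever carries the four tokens; a cgroup v2 io.max entry might look like the following, where 253:0 is an assumed device number and "max" means unlimited:

	253:0 rbps=10485760 wbps=max riops=1000 wiops=max
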
@@ -1566,13 +1551,20 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
struct blkg_conf_ctx ctx;
struct throtl_grp *tg;
u64 v[4];
- unsigned long idle_time;
- unsigned long latency_time;
int ret;
- int index = of_cft(of)->private;
blkg_conf_init(&ctx, buf);
+ ret = blkg_conf_open_bdev(&ctx);
+ if (ret)
+ goto out_finish;
+
+ if (!blk_throtl_activated(ctx.bdev->bd_queue)) {
+ ret = blk_throtl_init(ctx.bdev->bd_disk);
+ if (ret)
+ goto out_finish;
+ }
+
ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx);
if (ret)
goto out_finish;
@@ -1580,13 +1572,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
tg = blkg_to_tg(ctx.blkg);
tg_update_carryover(tg);
- v[0] = tg->bps_conf[READ][index];
- v[1] = tg->bps_conf[WRITE][index];
- v[2] = tg->iops_conf[READ][index];
- v[3] = tg->iops_conf[WRITE][index];
+ v[0] = tg->bps[READ];
+ v[1] = tg->bps[WRITE];
+ v[2] = tg->iops[READ];
+ v[3] = tg->iops[WRITE];
- idle_time = tg->idletime_threshold_conf;
- latency_time = tg->latency_target_conf;
while (true) {
char tok[27]; /* wiops=18446744073709551616 */
char *p;
@@ -1610,68 +1600,24 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
goto out_finish;
ret = -EINVAL;
- if (!strcmp(tok, "rbps") && val > 1)
+ if (!strcmp(tok, "rbps"))
v[0] = val;
- else if (!strcmp(tok, "wbps") && val > 1)
+ else if (!strcmp(tok, "wbps"))
v[1] = val;
- else if (!strcmp(tok, "riops") && val > 1)
+ else if (!strcmp(tok, "riops"))
v[2] = min_t(u64, val, UINT_MAX);
- else if (!strcmp(tok, "wiops") && val > 1)
+ else if (!strcmp(tok, "wiops"))
v[3] = min_t(u64, val, UINT_MAX);
- else if (off == LIMIT_LOW && !strcmp(tok, "idle"))
- idle_time = val;
- else if (off == LIMIT_LOW && !strcmp(tok, "latency"))
- latency_time = val;
else
goto out_finish;
}
- tg->bps_conf[READ][index] = v[0];
- tg->bps_conf[WRITE][index] = v[1];
- tg->iops_conf[READ][index] = v[2];
- tg->iops_conf[WRITE][index] = v[3];
+ tg->bps[READ] = v[0];
+ tg->bps[WRITE] = v[1];
+ tg->iops[READ] = v[2];
+ tg->iops[WRITE] = v[3];
- if (index == LIMIT_MAX) {
- tg->bps[READ][index] = v[0];
- tg->bps[WRITE][index] = v[1];
- tg->iops[READ][index] = v[2];
- tg->iops[WRITE][index] = v[3];
- }
- tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW],
- tg->bps_conf[READ][LIMIT_MAX]);
- tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW],
- tg->bps_conf[WRITE][LIMIT_MAX]);
- tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW],
- tg->iops_conf[READ][LIMIT_MAX]);
- tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
- tg->iops_conf[WRITE][LIMIT_MAX]);
- tg->idletime_threshold_conf = idle_time;
- tg->latency_target_conf = latency_time;
-
- /* force user to configure all settings for low limit */
- if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] ||
- tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) ||
- tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD ||
- tg->latency_target_conf == DFL_LATENCY_TARGET) {
- tg->bps[READ][LIMIT_LOW] = 0;
- tg->bps[WRITE][LIMIT_LOW] = 0;
- tg->iops[READ][LIMIT_LOW] = 0;
- tg->iops[WRITE][LIMIT_LOW] = 0;
- tg->idletime_threshold = DFL_IDLE_THRESHOLD;
- tg->latency_target = DFL_LATENCY_TARGET;
- } else if (index == LIMIT_LOW) {
- tg->idletime_threshold = tg->idletime_threshold_conf;
- tg->latency_target = tg->latency_target_conf;
- }
-
- blk_throtl_update_limit_valid(tg->td);
- if (tg->td->limit_valid[LIMIT_LOW]) {
- if (index == LIMIT_LOW)
- tg->td->limit_index = LIMIT_LOW;
- } else
- tg->td->limit_index = LIMIT_MAX;
- tg_conf_updated(tg, index == LIMIT_LOW &&
- tg->td->limit_valid[LIMIT_LOW]);
+ tg_conf_updated(tg, false);
ret = 0;
out_finish:
blkg_conf_exit(&ctx);
@@ -1679,21 +1625,11 @@ out_finish:
}
static struct cftype throtl_files[] = {
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- {
- .name = "low",
- .flags = CFTYPE_NOT_ON_ROOT,
- .seq_show = tg_print_limit,
- .write = tg_set_limit,
- .private = LIMIT_LOW,
- },
-#endif
{
.name = "max",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = tg_print_limit,
.write = tg_set_limit,
- .private = LIMIT_MAX,
},
{ } /* terminate */
};
@@ -1705,6 +1641,42 @@ static void throtl_shutdown_wq(struct request_queue *q)
cancel_work_sync(&td->dispatch_work);
}
+static void tg_flush_bios(struct throtl_grp *tg)
+{
+ struct throtl_service_queue *sq = &tg->service_queue;
+
+ if (tg->flags & THROTL_TG_CANCELING)
+ return;
+ /*
+ * Set the flag to make sure throtl_pending_timer_fn() won't
+ * stop until all throttled bios are dispatched.
+ */
+ tg->flags |= THROTL_TG_CANCELING;
+
+ /*
+	 * Do not dispatch a cgroup without THROTL_TG_PENDING, or the
+	 * cgroup will be inserted into the service queue without
+	 * THROTL_TG_PENDING set in tg_update_disptime() below. IO
+	 * dispatched from a child in tg_dispatch_one_bio() would then
+	 * trigger a double insertion and corrupt the tree.
+ */
+ if (!(tg->flags & THROTL_TG_PENDING))
+ return;
+
+ /*
+ * Update disptime after setting the above flag to make sure
+ * throtl_select_dispatch() won't exit without dispatching.
+ */
+ tg_update_disptime(tg);
+
+ throtl_schedule_pending_timer(sq, jiffies + 1);
+}
+
+static void throtl_pd_offline(struct blkg_policy_data *pd)
+{
+ tg_flush_bios(pd_to_tg(pd));
+}
+
struct blkcg_policy blkcg_policy_throtl = {
.dfl_cftypes = throtl_files,
.legacy_cftypes = throtl_legacy_files,
@@ -1722,6 +1694,9 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
struct cgroup_subsys_state *pos_css;
struct blkcg_gq *blkg;
+ if (!blk_throtl_activated(q))
+ return;
+
spin_lock_irq(&q->queue_lock);
/*
* queue_lock is held, rcu lock is not needed here technically.
@@ -1730,449 +1705,48 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
*/
rcu_read_lock();
blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
- struct throtl_grp *tg = blkg_to_tg(blkg);
- struct throtl_service_queue *sq = &tg->service_queue;
-
- /*
- * Set the flag to make sure throtl_pending_timer_fn() won't
- * stop until all throttled bios are dispatched.
- */
- tg->flags |= THROTL_TG_CANCELING;
-
- /*
- * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup
- * will be inserted to service queue without THROTL_TG_PENDING
- * set in tg_update_disptime below. Then IO dispatched from
- * child in tg_dispatch_one_bio will trigger double insertion
- * and corrupt the tree.
- */
- if (!(tg->flags & THROTL_TG_PENDING))
- continue;
-
/*
- * Update disptime after setting the above flag to make sure
- * throtl_select_dispatch() won't exit without dispatching.
+		 * disk_release will call pd_offline_fn to cancel bios.
+		 * However, disk_release can't be called if someone still
+		 * holds a reference to the device and has issued bios
+		 * that remain inflight after del_gendisk.
+		 * Cancel bios here to ensure none are inflight after
+		 * del_gendisk.
*/
- tg_update_disptime(tg);
-
- throtl_schedule_pending_timer(sq, jiffies + 1);
+ tg_flush_bios(blkg_to_tg(blkg));
}
rcu_read_unlock();
spin_unlock_irq(&q->queue_lock);
}
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
-{
- unsigned long rtime = jiffies, wtime = jiffies;
-
- if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
- rtime = tg->last_low_overflow_time[READ];
- if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
- wtime = tg->last_low_overflow_time[WRITE];
- return min(rtime, wtime);
-}
-
-static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
-{
- struct throtl_service_queue *parent_sq;
- struct throtl_grp *parent = tg;
- unsigned long ret = __tg_last_low_overflow_time(tg);
-
- while (true) {
- parent_sq = parent->service_queue.parent_sq;
- parent = sq_to_tg(parent_sq);
- if (!parent)
- break;
-
- /*
- * The parent doesn't have low limit, it always reaches low
- * limit. Its overflow time is useless for children
- */
- if (!parent->bps[READ][LIMIT_LOW] &&
- !parent->iops[READ][LIMIT_LOW] &&
- !parent->bps[WRITE][LIMIT_LOW] &&
- !parent->iops[WRITE][LIMIT_LOW])
- continue;
- if (time_after(__tg_last_low_overflow_time(parent), ret))
- ret = __tg_last_low_overflow_time(parent);
- }
- return ret;
-}
-
-static bool throtl_tg_is_idle(struct throtl_grp *tg)
-{
- /*
- * cgroup is idle if:
- * - single idle is too long, longer than a fixed value (in case user
- * configure a too big threshold) or 4 times of idletime threshold
- * - average think time is more than threshold
- * - IO latency is largely below threshold
- */
- unsigned long time;
- bool ret;
-
- time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);
- ret = tg->latency_target == DFL_LATENCY_TARGET ||
- tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
- (blk_time_get_ns() >> 10) - tg->last_finish_time > time ||
- tg->avg_idletime > tg->idletime_threshold ||
- (tg->latency_target && tg->bio_cnt &&
- tg->bad_bio_cnt * 5 < tg->bio_cnt);
- throtl_log(&tg->service_queue,
- "avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d",
- tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt,
- tg->bio_cnt, ret, tg->td->scale);
- return ret;
-}
-
-static bool throtl_low_limit_reached(struct throtl_grp *tg, int rw)
+static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
{
struct throtl_service_queue *sq = &tg->service_queue;
- bool limit = tg->bps[rw][LIMIT_LOW] || tg->iops[rw][LIMIT_LOW];
/*
- * if low limit is zero, low limit is always reached.
- * if low limit is non-zero, we can check if there is any request
- * is queued to determine if low limit is reached as we throttle
- * request according to limit.
+ * For a split bio, we need to specifically distinguish whether the
+ * iops queue is empty.
*/
- return !limit || sq->nr_queued[rw];
-}
+ if (bio_flagged(bio, BIO_BPS_THROTTLED))
+ return sq->nr_queued_iops[rw] == 0 &&
+ tg_dispatch_iops_time(tg, bio) == 0;
-static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
-{
/*
- * cgroup reaches low limit when low limit of READ and WRITE are
- * both reached, it's ok to upgrade to next limit if cgroup reaches
- * low limit
+	 * Throtl is FIFO - if bios are already queued, @bio must be queued too.
+ * If the bps queue is empty and @bio is within the bps limit, charge
+ * bps here for direct placement into the iops queue.
*/
- if (throtl_low_limit_reached(tg, READ) &&
- throtl_low_limit_reached(tg, WRITE))
- return true;
-
- if (time_after_eq(jiffies,
- tg_last_low_overflow_time(tg) + tg->td->throtl_slice) &&
- throtl_tg_is_idle(tg))
- return true;
- return false;
-}
+ if (sq_queued(&tg->service_queue, rw)) {
+ if (sq->nr_queued_bps[rw] == 0 &&
+ tg_dispatch_bps_time(tg, bio) == 0)
+ throtl_charge_bps_bio(tg, bio);
-static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
-{
- while (true) {
- if (throtl_tg_can_upgrade(tg))
- return true;
- tg = sq_to_tg(tg->service_queue.parent_sq);
- if (!tg || !tg_to_blkg(tg)->parent)
- return false;
- }
- return false;
-}
-
-static bool throtl_can_upgrade(struct throtl_data *td,
- struct throtl_grp *this_tg)
-{
- struct cgroup_subsys_state *pos_css;
- struct blkcg_gq *blkg;
-
- if (td->limit_index != LIMIT_LOW)
- return false;
-
- if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice))
return false;
-
- rcu_read_lock();
- blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
- struct throtl_grp *tg = blkg_to_tg(blkg);
-
- if (tg == this_tg)
- continue;
- if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
- continue;
- if (!throtl_hierarchy_can_upgrade(tg)) {
- rcu_read_unlock();
- return false;
- }
- }
- rcu_read_unlock();
- return true;
-}
-
-static void throtl_upgrade_check(struct throtl_grp *tg)
-{
- unsigned long now = jiffies;
-
- if (tg->td->limit_index != LIMIT_LOW)
- return;
-
- if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
- return;
-
- tg->last_check_time = now;
-
- if (!time_after_eq(now,
- __tg_last_low_overflow_time(tg) + tg->td->throtl_slice))
- return;
-
- if (throtl_can_upgrade(tg->td, NULL))
- throtl_upgrade_state(tg->td);
-}
-
-static void throtl_upgrade_state(struct throtl_data *td)
-{
- struct cgroup_subsys_state *pos_css;
- struct blkcg_gq *blkg;
-
- throtl_log(&td->service_queue, "upgrade to max");
- td->limit_index = LIMIT_MAX;
- td->low_upgrade_time = jiffies;
- td->scale = 0;
- rcu_read_lock();
- blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
- struct throtl_grp *tg = blkg_to_tg(blkg);
- struct throtl_service_queue *sq = &tg->service_queue;
-
- tg->disptime = jiffies - 1;
- throtl_select_dispatch(sq);
- throtl_schedule_next_dispatch(sq, true);
- }
- rcu_read_unlock();
- throtl_select_dispatch(&td->service_queue);
- throtl_schedule_next_dispatch(&td->service_queue, true);
- queue_work(kthrotld_workqueue, &td->dispatch_work);
-}
-
-static void throtl_downgrade_state(struct throtl_data *td)
-{
- td->scale /= 2;
-
- throtl_log(&td->service_queue, "downgrade, scale %d", td->scale);
- if (td->scale) {
- td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
- return;
- }
-
- td->limit_index = LIMIT_LOW;
- td->low_downgrade_time = jiffies;
-}
-
-static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
-{
- struct throtl_data *td = tg->td;
- unsigned long now = jiffies;
-
- /*
- * If cgroup is below low limit, consider downgrade and throttle other
- * cgroups
- */
- if (time_after_eq(now, tg_last_low_overflow_time(tg) +
- td->throtl_slice) &&
- (!throtl_tg_is_idle(tg) ||
- !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
- return true;
- return false;
-}
-
-static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
-{
- struct throtl_data *td = tg->td;
-
- if (time_before(jiffies, td->low_upgrade_time + td->throtl_slice))
- return false;
-
- while (true) {
- if (!throtl_tg_can_downgrade(tg))
- return false;
- tg = sq_to_tg(tg->service_queue.parent_sq);
- if (!tg || !tg_to_blkg(tg)->parent)
- break;
- }
- return true;
-}
-
-static void throtl_downgrade_check(struct throtl_grp *tg)
-{
- uint64_t bps;
- unsigned int iops;
- unsigned long elapsed_time;
- unsigned long now = jiffies;
-
- if (tg->td->limit_index != LIMIT_MAX ||
- !tg->td->limit_valid[LIMIT_LOW])
- return;
- if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
- return;
- if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
- return;
-
- elapsed_time = now - tg->last_check_time;
- tg->last_check_time = now;
-
- if (time_before(now, tg_last_low_overflow_time(tg) +
- tg->td->throtl_slice))
- return;
-
- if (tg->bps[READ][LIMIT_LOW]) {
- bps = tg->last_bytes_disp[READ] * HZ;
- do_div(bps, elapsed_time);
- if (bps >= tg->bps[READ][LIMIT_LOW])
- tg->last_low_overflow_time[READ] = now;
- }
-
- if (tg->bps[WRITE][LIMIT_LOW]) {
- bps = tg->last_bytes_disp[WRITE] * HZ;
- do_div(bps, elapsed_time);
- if (bps >= tg->bps[WRITE][LIMIT_LOW])
- tg->last_low_overflow_time[WRITE] = now;
- }
-
- if (tg->iops[READ][LIMIT_LOW]) {
- iops = tg->last_io_disp[READ] * HZ / elapsed_time;
- if (iops >= tg->iops[READ][LIMIT_LOW])
- tg->last_low_overflow_time[READ] = now;
- }
-
- if (tg->iops[WRITE][LIMIT_LOW]) {
- iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
- if (iops >= tg->iops[WRITE][LIMIT_LOW])
- tg->last_low_overflow_time[WRITE] = now;
- }
-
- /*
- * If cgroup is below low limit, consider downgrade and throttle other
- * cgroups
- */
- if (throtl_hierarchy_can_downgrade(tg))
- throtl_downgrade_state(tg->td);
-
- tg->last_bytes_disp[READ] = 0;
- tg->last_bytes_disp[WRITE] = 0;
- tg->last_io_disp[READ] = 0;
- tg->last_io_disp[WRITE] = 0;
-}
-
-static void blk_throtl_update_idletime(struct throtl_grp *tg)
-{
- unsigned long now;
- unsigned long last_finish_time = tg->last_finish_time;
-
- if (last_finish_time == 0)
- return;
-
- now = blk_time_get_ns() >> 10;
- if (now <= last_finish_time ||
- last_finish_time == tg->checked_last_finish_time)
- return;
-
- tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
- tg->checked_last_finish_time = last_finish_time;
-}
-
-static void throtl_update_latency_buckets(struct throtl_data *td)
-{
- struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
- int i, cpu, rw;
- unsigned long last_latency[2] = { 0 };
- unsigned long latency[2];
-
- if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW])
- return;
- if (time_before(jiffies, td->last_calculate_time + HZ))
- return;
- td->last_calculate_time = jiffies;
-
- memset(avg_latency, 0, sizeof(avg_latency));
- for (rw = READ; rw <= WRITE; rw++) {
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
- struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
-
- for_each_possible_cpu(cpu) {
- struct latency_bucket *bucket;
-
- /* this isn't race free, but ok in practice */
- bucket = per_cpu_ptr(td->latency_buckets[rw],
- cpu);
- tmp->total_latency += bucket[i].total_latency;
- tmp->samples += bucket[i].samples;
- bucket[i].total_latency = 0;
- bucket[i].samples = 0;
- }
-
- if (tmp->samples >= 32) {
- int samples = tmp->samples;
-
- latency[rw] = tmp->total_latency;
-
- tmp->total_latency = 0;
- tmp->samples = 0;
- latency[rw] /= samples;
- if (latency[rw] == 0)
- continue;
- avg_latency[rw][i].latency = latency[rw];
- }
- }
}
- for (rw = READ; rw <= WRITE; rw++) {
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
- if (!avg_latency[rw][i].latency) {
- if (td->avg_buckets[rw][i].latency < last_latency[rw])
- td->avg_buckets[rw][i].latency =
- last_latency[rw];
- continue;
- }
-
- if (!td->avg_buckets[rw][i].valid)
- latency[rw] = avg_latency[rw][i].latency;
- else
- latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
- avg_latency[rw][i].latency) >> 3;
-
- td->avg_buckets[rw][i].latency = max(latency[rw],
- last_latency[rw]);
- td->avg_buckets[rw][i].valid = true;
- last_latency[rw] = td->avg_buckets[rw][i].latency;
- }
- }
-
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
- throtl_log(&td->service_queue,
- "Latency bucket %d: read latency=%ld, read valid=%d, "
- "write latency=%ld, write valid=%d", i,
- td->avg_buckets[READ][i].latency,
- td->avg_buckets[READ][i].valid,
- td->avg_buckets[WRITE][i].latency,
- td->avg_buckets[WRITE][i].valid);
-}
-#else
-static inline void throtl_update_latency_buckets(struct throtl_data *td)
-{
-}
-
-static void blk_throtl_update_idletime(struct throtl_grp *tg)
-{
+ return tg_dispatch_time(tg, bio) == 0;
}
-static void throtl_downgrade_check(struct throtl_grp *tg)
-{
-}
-
-static void throtl_upgrade_check(struct throtl_grp *tg)
-{
-}
-
-static bool throtl_can_upgrade(struct throtl_data *td,
- struct throtl_grp *this_tg)
-{
- return false;
-}
-
-static void throtl_upgrade_state(struct throtl_data *td)
-{
-}
-#endif
-
bool __blk_throtl_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
@@ -2185,51 +1759,42 @@ bool __blk_throtl_bio(struct bio *bio)
struct throtl_data *td = tg->td;
rcu_read_lock();
-
spin_lock_irq(&q->queue_lock);
-
- throtl_update_latency_buckets(td);
-
- blk_throtl_update_idletime(tg);
-
sq = &tg->service_queue;
-again:
while (true) {
- if (tg->last_low_overflow_time[rw] == 0)
- tg->last_low_overflow_time[rw] = jiffies;
- throtl_downgrade_check(tg);
- throtl_upgrade_check(tg);
- /* throtl is FIFO - if bios are already queued, should queue */
- if (sq->nr_queued[rw])
- break;
-
- /* if above limits, break to queue */
- if (!tg_may_dispatch(tg, bio, NULL)) {
- tg->last_low_overflow_time[rw] = jiffies;
- if (throtl_can_upgrade(td, tg)) {
- throtl_upgrade_state(td);
- goto again;
- }
+ if (tg_within_limit(tg, bio, rw)) {
+ /* within limits, let's charge and dispatch directly */
+ throtl_charge_iops_bio(tg, bio);
+
+ /*
+			 * We need to trim the slice even when bios are not
+			 * being queued; otherwise a bio might not be queued
+			 * for a long time while the slice keeps extending and
+			 * trim is never called. If the limits are then reduced
+			 * suddenly, all the IO dispatched so far is accounted
+			 * at the new low rate and newly queued IO gets a
+			 * really long dispatch time.
+ *
+ * So keep on trimming slice even if bio is not queued.
+ */
+ throtl_trim_slice(tg, rw);
+ } else if (bio_issue_as_root_blkg(bio)) {
+ /*
+ * IOs which may cause priority inversions are
+ * dispatched directly, even if they're over limit.
+ *
+			 * Charge and dispatch directly; the throttle control
+			 * algorithm is adaptive, so the extra IO bytes will be
+			 * throttled later to pay back the debt.
+ */
+ throtl_charge_bps_bio(tg, bio);
+ throtl_charge_iops_bio(tg, bio);
+ } else {
+ /* if above limits, break to queue */
break;
}
- /* within limits, let's charge and dispatch directly */
- throtl_charge_bio(tg, bio);
-
- /*
- * We need to trim slice even when bios are not being queued
- * otherwise it might happen that a bio is not queued for
- * a long time and slice keeps on extending and trim is not
- * called for a long time. Now if limits are reduced suddenly
- * we take into account all the IO dispatched so far at new
- * low rate and * newly queued IO gets a really long dispatch
- * time.
- *
- * So keep on trimming slice even if bio is not queued.
- */
- throtl_trim_slice(tg, rw);
-
/*
* @bio passed through this layer without being throttled.
* Climb up the ladder. If we're already at the top, it
@@ -2250,9 +1815,7 @@ again:
tg->bytes_disp[rw], bio->bi_iter.bi_size,
tg_bps_limit(tg, rw),
tg->io_disp[rw], tg_iops_limit(tg, rw),
- sq->nr_queued[READ], sq->nr_queued[WRITE]);
-
- tg->last_low_overflow_time[rw] = jiffies;
+ sq_queued(sq, READ), sq_queued(sq, WRITE));
td->nr_queued[rw]++;
throtl_add_bio_tg(bio, qn, tg);
@@ -2260,225 +1823,37 @@ again:
/*
* Update @tg's dispatch time and force schedule dispatch if @tg
- * was empty before @bio. The forced scheduling isn't likely to
- * cause undue delay as @bio is likely to be dispatched directly if
- * its @tg's disptime is not in the future.
+	 * was empty before @bio, or the iops queue that @bio is added to
+	 * was empty. The forced scheduling isn't likely to cause undue
+ * delay as @bio is likely to be dispatched directly if its @tg's
+ * disptime is not in the future.
*/
- if (tg->flags & THROTL_TG_WAS_EMPTY) {
+ if (tg->flags & THROTL_TG_WAS_EMPTY ||
+ tg->flags & THROTL_TG_IOPS_WAS_EMPTY) {
tg_update_disptime(tg);
throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
}
out_unlock:
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- if (throttled || !td->track_bio_latency)
- bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
-#endif
spin_unlock_irq(&q->queue_lock);
rcu_read_unlock();
return throttled;
}
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-static void throtl_track_latency(struct throtl_data *td, sector_t size,
- enum req_op op, unsigned long time)
-{
- const bool rw = op_is_write(op);
- struct latency_bucket *latency;
- int index;
-
- if (!td || td->limit_index != LIMIT_LOW ||
- !(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
- !blk_queue_nonrot(td->queue))
- return;
-
- index = request_bucket_index(size);
-
- latency = get_cpu_ptr(td->latency_buckets[rw]);
- latency[index].total_latency += time;
- latency[index].samples++;
- put_cpu_ptr(td->latency_buckets[rw]);
-}
-
-void blk_throtl_stat_add(struct request *rq, u64 time_ns)
-{
- struct request_queue *q = rq->q;
- struct throtl_data *td = q->td;
-
- throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq),
- time_ns >> 10);
-}
-
-void blk_throtl_bio_endio(struct bio *bio)
-{
- struct blkcg_gq *blkg;
- struct throtl_grp *tg;
- u64 finish_time_ns;
- unsigned long finish_time;
- unsigned long start_time;
- unsigned long lat;
- int rw = bio_data_dir(bio);
-
- blkg = bio->bi_blkg;
- if (!blkg)
- return;
- tg = blkg_to_tg(blkg);
- if (!tg->td->limit_valid[LIMIT_LOW])
- return;
-
- finish_time_ns = blk_time_get_ns();
- tg->last_finish_time = finish_time_ns >> 10;
-
- start_time = bio_issue_time(&bio->bi_issue) >> 10;
- finish_time = __bio_issue_time(finish_time_ns) >> 10;
- if (!start_time || finish_time <= start_time)
- return;
-
- lat = finish_time - start_time;
- /* this is only for bio based driver */
- if (!(bio->bi_issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY))
- throtl_track_latency(tg->td, bio_issue_size(&bio->bi_issue),
- bio_op(bio), lat);
-
- if (tg->latency_target && lat >= tg->td->filtered_latency) {
- int bucket;
- unsigned int threshold;
-
- bucket = request_bucket_index(bio_issue_size(&bio->bi_issue));
- threshold = tg->td->avg_buckets[rw][bucket].latency +
- tg->latency_target;
- if (lat > threshold)
- tg->bad_bio_cnt++;
- /*
- * Not race free, could get wrong count, which means cgroups
- * will be throttled
- */
- tg->bio_cnt++;
- }
-
- if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {
- tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
- tg->bio_cnt /= 2;
- tg->bad_bio_cnt /= 2;
- }
-}
-#endif
-
-int blk_throtl_init(struct gendisk *disk)
-{
- struct request_queue *q = disk->queue;
- struct throtl_data *td;
- int ret;
-
- td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
- if (!td)
- return -ENOMEM;
- td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
- LATENCY_BUCKET_SIZE, __alignof__(u64));
- if (!td->latency_buckets[READ]) {
- kfree(td);
- return -ENOMEM;
- }
- td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
- LATENCY_BUCKET_SIZE, __alignof__(u64));
- if (!td->latency_buckets[WRITE]) {
- free_percpu(td->latency_buckets[READ]);
- kfree(td);
- return -ENOMEM;
- }
-
- INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
- throtl_service_queue_init(&td->service_queue);
-
- q->td = td;
- td->queue = q;
-
- td->limit_valid[LIMIT_MAX] = true;
- td->limit_index = LIMIT_MAX;
- td->low_upgrade_time = jiffies;
- td->low_downgrade_time = jiffies;
-
- /* activate policy */
- ret = blkcg_activate_policy(disk, &blkcg_policy_throtl);
- if (ret) {
- free_percpu(td->latency_buckets[READ]);
- free_percpu(td->latency_buckets[WRITE]);
- kfree(td);
- }
- return ret;
-}
-
void blk_throtl_exit(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
- BUG_ON(!q->td);
- del_timer_sync(&q->td->service_queue.pending_timer);
+ if (!blk_throtl_activated(q))
+ return;
+
+ timer_delete_sync(&q->td->service_queue.pending_timer);
throtl_shutdown_wq(q);
blkcg_deactivate_policy(disk, &blkcg_policy_throtl);
- free_percpu(q->td->latency_buckets[READ]);
- free_percpu(q->td->latency_buckets[WRITE]);
kfree(q->td);
}
-void blk_throtl_register(struct gendisk *disk)
-{
- struct request_queue *q = disk->queue;
- struct throtl_data *td;
- int i;
-
- td = q->td;
- BUG_ON(!td);
-
- if (blk_queue_nonrot(q)) {
- td->throtl_slice = DFL_THROTL_SLICE_SSD;
- td->filtered_latency = LATENCY_FILTERED_SSD;
- } else {
- td->throtl_slice = DFL_THROTL_SLICE_HD;
- td->filtered_latency = LATENCY_FILTERED_HD;
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
- td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
- td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
- }
- }
-#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
- /* if no low limit, use previous default */
- td->throtl_slice = DFL_THROTL_SLICE_HD;
-
-#else
- td->track_bio_latency = !queue_is_mq(q);
- if (!td->track_bio_latency)
- blk_stat_enable_accounting(q);
-#endif
-}
-
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
-{
- if (!q->td)
- return -EINVAL;
- return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
-}
-
-ssize_t blk_throtl_sample_time_store(struct request_queue *q,
- const char *page, size_t count)
-{
- unsigned long v;
- unsigned long t;
-
- if (!q->td)
- return -EINVAL;
- if (kstrtoul(page, 10, &v))
- return -EINVAL;
- t = msecs_to_jiffies(v);
- if (t == 0 || t > MAX_THROTL_SLICE)
- return -EINVAL;
- q->td->throtl_slice = t;
- return count;
-}
-#endif
-
static int __init throtl_init(void)
{
kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index bffbc9cfc8ab..3b27755bfbff 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_THROTTLE_H
#define BLK_THROTTLE_H
@@ -28,7 +29,8 @@
*/
struct throtl_qnode {
struct list_head node; /* service_queue->queued[] */
- struct bio_list bios; /* queued bios */
+ struct bio_list bios_bps; /* queued bios for bps limit */
+ struct bio_list bios_iops; /* queued bios for iops limit */
struct throtl_grp *tg; /* tg this qnode belongs to */
};
@@ -40,7 +42,8 @@ struct throtl_service_queue {
* children throtl_grp's.
*/
struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
- unsigned int nr_queued[2]; /* number of queued bios */
+ unsigned int nr_queued_bps[2]; /* number of queued bps bios */
+ unsigned int nr_queued_iops[2]; /* number of queued iops bios */
/*
* RB tree of active children throtl_grp's, which are sorted by
@@ -53,15 +56,14 @@ struct throtl_service_queue {
};
enum tg_state_flags {
- THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
- THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
- THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */
-};
-
-enum {
- LIMIT_LOW,
- LIMIT_MAX,
- LIMIT_CNT,
+ THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
+ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
+ /*
+ * The sq's iops queue is empty, and a bio is about to be enqueued
+ * to the first qnode's bios_iops list.
+ */
+ THROTL_TG_IOPS_WAS_EMPTY = 1 << 2,
+ THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */
};
struct throtl_grp {
@@ -101,53 +103,29 @@ struct throtl_grp {
bool has_rules_bps[2];
bool has_rules_iops[2];
- /* internally used bytes per second rate limits */
- uint64_t bps[2][LIMIT_CNT];
- /* user configured bps limits */
- uint64_t bps_conf[2][LIMIT_CNT];
+ /* bytes per second rate limits */
+ uint64_t bps[2];
- /* internally used IOPS limits */
- unsigned int iops[2][LIMIT_CNT];
- /* user configured IOPS limits */
- unsigned int iops_conf[2][LIMIT_CNT];
-
- /* Number of bytes dispatched in current slice */
- uint64_t bytes_disp[2];
- /* Number of bio's dispatched in current slice */
- unsigned int io_disp[2];
-
- unsigned long last_low_overflow_time[2];
-
- uint64_t last_bytes_disp[2];
- unsigned int last_io_disp[2];
+ /* IOPS limits */
+ unsigned int iops[2];
/*
- * The following two fields are updated when new configuration is
- * submitted while some bios are still throttled, they record how many
- * bytes/ios are waited already in previous configuration, and they will
- * be used to calculate wait time under new configuration.
+	 * Number of bytes/bios dispatched in the current slice.
+	 * When a new configuration is submitted while some bios are still throttled,
+ * first calculate the carryover: the amount of bytes/IOs already waited
+ * under the previous configuration. Then, [bytes/io]_disp are represented
+ * as the negative of the carryover, and they will be used to calculate the
+ * wait time under the new configuration.
*/
- long long carryover_bytes[2];
- int carryover_ios[2];
+ int64_t bytes_disp[2];
+ int io_disp[2];
unsigned long last_check_time;
- unsigned long latency_target; /* us */
- unsigned long latency_target_conf; /* us */
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
- unsigned long last_finish_time; /* ns / 1024 */
- unsigned long checked_last_finish_time; /* ns / 1024 */
- unsigned long avg_idletime; /* ns / 1024 */
- unsigned long idletime_threshold; /* us */
- unsigned long idletime_threshold_conf; /* us */
-
- unsigned int bio_cnt; /* total bios */
- unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
- unsigned long bio_cnt_reset_time;
-
struct blkg_rwstat stat_bytes;
struct blkg_rwstat stat_ios;
};
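
A worked example of the carryover encoding described in the bytes_disp/io_disp comment above (illustrative numbers, not taken from the patch): if the throttled bios of a group have already waited long enough under the old bps limit to be owed 8 MiB of budget when a new configuration is submitted, the new accounting starts with bytes_disp[rw] = -8 MiB. The wait time computed against the new limit then credits the waiting already served instead of restarting from zero.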
@@ -168,23 +146,33 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
* Internal throttling interface
*/
#ifndef CONFIG_BLK_DEV_THROTTLING
-static inline int blk_throtl_init(struct gendisk *disk) { return 0; }
static inline void blk_throtl_exit(struct gendisk *disk) { }
-static inline void blk_throtl_register(struct gendisk *disk) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
static inline void blk_throtl_cancel_bios(struct gendisk *disk) { }
#else /* CONFIG_BLK_DEV_THROTTLING */
-int blk_throtl_init(struct gendisk *disk);
void blk_throtl_exit(struct gendisk *disk);
-void blk_throtl_register(struct gendisk *disk);
bool __blk_throtl_bio(struct bio *bio);
void blk_throtl_cancel_bios(struct gendisk *disk);
+static inline bool blk_throtl_activated(struct request_queue *q)
+{
+ return q->td != NULL;
+}
+
static inline bool blk_should_throtl(struct bio *bio)
{
- struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);
+ struct throtl_grp *tg;
int rw = bio_data_dir(bio);
+ /*
+ * This is called under bio_queue_enter(), and it's synchronized with
+ * the activation of blk-throtl, which is protected by
+ * blk_mq_freeze_queue().
+ */
+ if (!blk_throtl_activated(bio->bi_bdev->bd_queue))
+ return false;
+
+ tg = blkg_to_tg(bio->bi_blkg);
if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
bio_set_flag(bio, BIO_CGROUP_ACCT);
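
The comment in blk_should_throtl() above depends on blk-throttle now being activated lazily. As a hedged illustration of the caller-side pattern this enables (the function below is hypothetical; only blk_throtl_activated(), blkg_to_tg() and the reworked tg->bps[] field are taken from this patch), any path that can run before a limit was ever configured checks for activation before dereferencing q->td:

/* Hypothetical helper: read the root group write bps limit, if any. */
static u64 example_root_bps_limit(struct request_queue *q)
{
	if (!blk_throtl_activated(q))
		return U64_MAX;		/* no throtl_data, nothing is limited */

	return blkg_to_tg(q->root_blkg)->bps[WRITE];
}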
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 64472134dd26..a50d4cd55f41 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -37,7 +37,7 @@
enum wbt_flags {
WBT_TRACKED = 1, /* write, tracked for throttling */
WBT_READ = 2, /* read */
- WBT_KSWAPD = 4, /* write, from kswapd */
+ WBT_SWAP = 4, /* write, from swap_writeout() */
WBT_DISCARD = 8, /* discard */
WBT_NR_BITS = 4, /* number of bits */
@@ -45,7 +45,7 @@ enum wbt_flags {
enum {
WBT_RWQ_BG = 0,
- WBT_RWQ_KSWAPD,
+ WBT_RWQ_SWAP,
WBT_RWQ_DISCARD,
WBT_NUM_RWQ,
};
@@ -136,8 +136,9 @@ enum {
RWB_MIN_WRITE_SAMPLES = 3,
/*
- * If we have this number of consecutive windows with not enough
- * information to scale up or down, scale up.
+ * If we have this number of consecutive windows without enough
+ * information to scale up or down, slowly return to center state
+ * (step == 0).
*/
RWB_UNKNOWN_BUMP = 5,
};
@@ -172,8 +173,8 @@ static bool wb_recent_wait(struct rq_wb *rwb)
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
enum wbt_flags wb_acct)
{
- if (wb_acct & WBT_KSWAPD)
- return &rwb->rq_wait[WBT_RWQ_KSWAPD];
+ if (wb_acct & WBT_SWAP)
+ return &rwb->rq_wait[WBT_RWQ_SWAP];
else if (wb_acct & WBT_DISCARD)
return &rwb->rq_wait[WBT_RWQ_DISCARD];
@@ -206,8 +207,8 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
*/
if (wb_acct & WBT_DISCARD)
limit = rwb->wb_background;
- else if (test_bit(QUEUE_FLAG_WC, &rwb->rqos.disk->queue->queue_flags) &&
- !wb_recent_wait(rwb))
+ else if (blk_queue_write_cache(rwb->rqos.disk->queue) &&
+ !wb_recent_wait(rwb))
limit = 0;
else
limit = rwb->wb_normal;
@@ -446,9 +447,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
break;
case LAT_UNKNOWN_WRITES:
/*
- * We started a the center step, but don't have a valid
- * read/write sample, but we do have writes going on.
- * Allow step to go negative, to increase write perf.
+ * We don't have a valid read/write sample, but we do have
+ * writes going on. Allow step to go negative, to increase
+ * write performance.
*/
scale_up(rwb);
break;
@@ -528,7 +529,7 @@ static bool close_io(struct rq_wb *rwb)
time_before(now, rwb->last_comp + HZ / 10);
}
-#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO)
+#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO | REQ_SWAP)
static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf)
{
@@ -539,13 +540,13 @@ static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf)
/*
* At this point we know it's a buffered write. If this is
- * kswapd trying to free memory, or REQ_SYNC is set, then
+ * swap trying to free memory, or REQ_SYNC is set, then
* it's WB_SYNC_ALL writeback, and we'll use the max limit for
* that. If the write is marked as a background write, then use
* the idle limit, or go to normal if we haven't had competing
* IO for a bit.
*/
- if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
+ if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb))
limit = rwb->rq_depth.max_depth;
else if ((opf & REQ_BACKGROUND) || close_io(rwb)) {
/*
@@ -622,8 +623,8 @@ static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
if (bio_op(bio) == REQ_OP_READ) {
flags = WBT_READ;
} else if (wbt_should_throttle(bio)) {
- if (current_is_kswapd())
- flags |= WBT_KSWAPD;
+ if (bio->bi_opf & REQ_SWAP)
+ flags |= WBT_SWAP;
if (bio_op(bio) == REQ_OP_DISCARD)
flags |= WBT_DISCARD;
flags |= WBT_TRACKED;
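
Tying the two hunks above together (a descriptive sketch, not code from the patch; the helper name is hypothetical): the task submitting the BIO no longer matters, only the REQ_SWAP flag that swap_writeout() sets on its writeback BIOs.

/* Hypothetical illustration of the new classification rule. */
static bool example_is_swap_writeout(struct bio *bio)
{
	/* A swap writeout is a write carrying REQ_SWAP, whoever submits it. */
	return op_is_write(bio->bi_opf) && (bio->bi_opf & REQ_SWAP);
}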
@@ -638,11 +639,7 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
__wbt_done(rqos, flags);
}
-/*
- * May sleep, if we have exceeded the writeback limits. Caller can pass
- * in an irq held spinlock, if it holds one when calling this function.
- * If we do sleep, we'll release and re-grab it.
- */
+/* May sleep, if we have exceeded the writeback limits. */
static void wbt_wait(struct rq_qos *rqos, struct bio *bio)
{
struct rq_wb *rwb = RQWB(rqos);
@@ -707,8 +704,9 @@ void wbt_enable_default(struct gendisk *disk)
struct rq_qos *rqos;
bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ);
- if (q->elevator &&
- test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags))
+ mutex_lock(&disk->rqos_state_mutex);
+
+ if (blk_queue_disable_wbt(q))
enable = false;
/* Throttling already enabled? */
@@ -716,8 +714,10 @@ void wbt_enable_default(struct gendisk *disk)
if (rqos) {
if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
+ mutex_unlock(&disk->rqos_state_mutex);
return;
}
+ mutex_unlock(&disk->rqos_state_mutex);
/* Queue not registered? Maybe shutting down... */
if (!blk_queue_registered(q))
@@ -777,11 +777,13 @@ void wbt_disable_default(struct gendisk *disk)
struct rq_wb *rwb;
if (!rqos)
return;
+ mutex_lock(&disk->rqos_state_mutex);
rwb = RQWB(rqos);
if (rwb->enable_state == WBT_STATE_ON_DEFAULT) {
blk_stat_deactivate(rwb->cb);
rwb->enable_state = WBT_STATE_OFF_DEFAULT;
}
+ mutex_unlock(&disk->rqos_state_mutex);
}
EXPORT_SYMBOL_GPL(wbt_disable_default);
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index da0f4b2a8fa0..8f15d1aa6eb8 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -7,17 +7,19 @@
*
* Copyright (c) 2016, Damien Le Moal
* Copyright (c) 2016, Western Digital
+ * Copyright (c) 2024, Western Digital Corporation or its affiliates.
*/
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <linux/sched/mm.h>
+#include <linux/spinlock.h>
+#include <linux/refcount.h>
+#include <linux/mempool.h>
#include "blk.h"
+#include "blk-mq-sched.h"
+#include "blk-mq-debugfs.h"
#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
@@ -32,6 +34,60 @@ static const char *const zone_cond_name[] = {
};
#undef ZONE_COND_NAME
+/*
+ * Per-zone write plug.
+ * @node: hlist_node structure for managing the plug using a hash table.
+ * @ref: Zone write plug reference counter. A zone write plug reference is
+ * always at least 1 when the plug is hashed in the disk plug hash table.
+ * The reference is incremented whenever a new BIO needing plugging is
+ * submitted and when a function needs to manipulate a plug. The
+ * reference count is decremented whenever a plugged BIO completes and
+ * when a function that referenced the plug returns. The initial
+ *	 reference is dropped whenever the zone of the zone write plug is
+ *	 reset or finished, and when the zone becomes full (the last write
+ *	 BIO to the zone completes).
+ * @lock: Spinlock to atomically manipulate the plug.
+ * @flags: Flags indicating the plug state.
+ * @zone_no: The number of the zone the plug is managing.
+ * @wp_offset: The zone write pointer location relative to the start of the zone
+ * as a number of 512B sectors.
+ * @bio_list: The list of BIOs that are currently plugged.
+ * @bio_work: Work struct to handle issuing of plugged BIOs
+ * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
+ * @disk: The gendisk the plug belongs to.
+ */
+struct blk_zone_wplug {
+ struct hlist_node node;
+ refcount_t ref;
+ spinlock_t lock;
+ unsigned int flags;
+ unsigned int zone_no;
+ unsigned int wp_offset;
+ struct bio_list bio_list;
+ struct work_struct bio_work;
+ struct rcu_head rcu_head;
+ struct gendisk *disk;
+};
+
+/*
+ * Zone write plug flags bits:
+ * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
+ * that is, that write BIOs are being throttled due to a write BIO already
+ * being executed or the zone write plug bio list is not empty.
+ * - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
+ * write pointer offset and need to update it.
+ * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
+ * from the disk hash table and that the initial reference to the zone
+ * write plug set when the plug was first added to the hash table has been
+ *   dropped. This flag is set when a zone is reset, finished or becomes full,
+ *   to prevent new references to the zone write plug from being taken for
+ * newly incoming BIOs. A zone write plug flagged with this flag will be
+ * freed once all remaining references from BIOs or functions are dropped.
+ */
+#define BLK_ZONE_WPLUG_PLUGGED (1U << 0)
+#define BLK_ZONE_WPLUG_NEED_WP_UPDATE (1U << 1)
+#define BLK_ZONE_WPLUG_UNHASHED (1U << 2)
+
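
To make the reference-counting rules above concrete, here is a hedged sketch (not part of the patch; the function is hypothetical, while disk_get_zone_wplug(), disk_put_zone_wplug() and the zwplug fields are the ones introduced below) of a typical lookup/use/put cycle:

/* Hypothetical helper: peek at the write pointer offset of a zone's plug. */
static unsigned int example_peek_wp_offset(struct gendisk *disk, sector_t sector)
{
	struct blk_zone_wplug *zwplug;
	unsigned long flags;
	unsigned int wp = UINT_MAX;

	zwplug = disk_get_zone_wplug(disk, sector);	/* takes a reference */
	if (!zwplug)
		return wp;	/* no plug: zone is empty, full or conventional */

	spin_lock_irqsave(&zwplug->lock, flags);
	wp = zwplug->wp_offset;
	spin_unlock_irqrestore(&zwplug->lock, flags);

	disk_put_zone_wplug(zwplug);	/* drops the reference taken above */
	return wp;
}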
/**
* blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
* @zone_cond: BLK_ZONE_COND_XXX.
@@ -51,69 +107,29 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);
-/*
- * Return true if a request is a write requests that needs zone write locking.
- */
-bool blk_req_needs_zone_write_lock(struct request *rq)
-{
- if (!rq->q->disk->seq_zones_wlock)
- return false;
-
- return blk_rq_is_seq_zoned_write(rq);
-}
-EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
-
-bool blk_req_zone_write_trylock(struct request *rq)
-{
- unsigned int zno = blk_rq_zone_no(rq);
-
- if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock))
- return false;
-
- WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
- rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
-
- return true;
-}
-EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock);
-
-void __blk_req_zone_write_lock(struct request *rq)
-{
- if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
- rq->q->disk->seq_zones_wlock)))
- return;
+struct disk_report_zones_cb_args {
+ struct gendisk *disk;
+ report_zones_cb user_cb;
+ void *user_data;
+};
- WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
- rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
-}
-EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
+static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
+ struct blk_zone *zone);
-void __blk_req_zone_write_unlock(struct request *rq)
+static int disk_report_zones_cb(struct blk_zone *zone, unsigned int idx,
+ void *data)
{
- rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
- if (rq->q->disk->seq_zones_wlock)
- WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
- rq->q->disk->seq_zones_wlock));
-}
-EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
+ struct disk_report_zones_cb_args *args = data;
+ struct gendisk *disk = args->disk;
-/**
- * bdev_nr_zones - Get number of zones
- * @bdev: Target device
- *
- * Return the total number of zones of a zoned block device. For a block
- * device without zone capabilities, the number of zones is always 0.
- */
-unsigned int bdev_nr_zones(struct block_device *bdev)
-{
- sector_t zone_sectors = bdev_zone_sectors(bdev);
+ if (disk->zone_wplugs_hash)
+ disk_zone_wplug_sync_wp_offset(disk, zone);
- if (!bdev_is_zoned(bdev))
+ if (!args->user_cb)
return 0;
- return (bdev_nr_sectors(bdev) + zone_sectors - 1) >>
- ilog2(zone_sectors);
+
+ return args->user_cb(zone, idx, args->user_data);
}
-EXPORT_SYMBOL_GPL(bdev_nr_zones);
/**
* blkdev_report_zones - Get zones information
@@ -139,6 +155,11 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
{
struct gendisk *disk = bdev->bd_disk;
sector_t capacity = get_capacity(disk);
+ struct disk_report_zones_cb_args args = {
+ .disk = disk,
+ .user_cb = cb,
+ .user_data = data,
+ };
if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
return -EOPNOTSUPP;
@@ -146,81 +167,11 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
if (!nr_zones || sector >= capacity)
return 0;
- return disk->fops->report_zones(disk, sector, nr_zones, cb, data);
+ return disk->fops->report_zones(disk, sector, nr_zones,
+ disk_report_zones_cb, &args);
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
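
For context, a minimal usage example of this interface seen from a (hypothetical) caller; blkdev_report_zones(), bdev_nr_zones() and struct blk_zone are existing kernel APIs, the example_* names are made up:

/* Hypothetical callback: count zones that are currently empty. */
static int example_count_empty_zone(struct blk_zone *zone, unsigned int idx,
				    void *data)
{
	unsigned int *nr_empty = data;

	if (zone->cond == BLK_ZONE_COND_EMPTY)
		(*nr_empty)++;
	return 0;
}

/* Hypothetical caller: report all zones and tally the empty ones. */
static int example_nr_empty_zones(struct block_device *bdev, unsigned int *nr_empty)
{
	*nr_empty = 0;
	return blkdev_report_zones(bdev, 0, bdev_nr_zones(bdev),
				   example_count_empty_zone, nr_empty);
}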
-static inline unsigned long *blk_alloc_zone_bitmap(int node,
- unsigned int nr_zones)
-{
- return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
- GFP_NOIO, node);
-}
-
-static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
- void *data)
-{
- /*
- * For an all-zones reset, ignore conventional, empty, read-only
- * and offline zones.
- */
- switch (zone->cond) {
- case BLK_ZONE_COND_NOT_WP:
- case BLK_ZONE_COND_EMPTY:
- case BLK_ZONE_COND_READONLY:
- case BLK_ZONE_COND_OFFLINE:
- return 0;
- default:
- set_bit(idx, (unsigned long *)data);
- return 0;
- }
-}
-
-static int blkdev_zone_reset_all_emulated(struct block_device *bdev)
-{
- struct gendisk *disk = bdev->bd_disk;
- sector_t capacity = bdev_nr_sectors(bdev);
- sector_t zone_sectors = bdev_zone_sectors(bdev);
- unsigned long *need_reset;
- struct bio *bio = NULL;
- sector_t sector = 0;
- int ret;
-
- need_reset = blk_alloc_zone_bitmap(disk->queue->node, disk->nr_zones);
- if (!need_reset)
- return -ENOMEM;
-
- ret = disk->fops->report_zones(disk, 0, disk->nr_zones,
- blk_zone_need_reset_cb, need_reset);
- if (ret < 0)
- goto out_free_need_reset;
-
- ret = 0;
- while (sector < capacity) {
- if (!test_bit(disk_zone_no(disk, sector), need_reset)) {
- sector += zone_sectors;
- continue;
- }
-
- bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC,
- GFP_KERNEL);
- bio->bi_iter.bi_sector = sector;
- sector += zone_sectors;
-
- /* This may take a while, so be nice to others */
- cond_resched();
- }
-
- if (bio) {
- ret = submit_bio_wait(bio);
- bio_put(bio);
- }
-
-out_free_need_reset:
- kfree(need_reset);
- return ret;
-}
-
static int blkdev_zone_reset_all(struct block_device *bdev)
{
struct bio bio;
@@ -247,7 +198,6 @@ static int blkdev_zone_reset_all(struct block_device *bdev)
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
sector_t sector, sector_t nr_sectors)
{
- struct request_queue *q = bdev_get_queue(bdev);
sector_t zone_sectors = bdev_zone_sectors(bdev);
sector_t capacity = bdev_nr_sectors(bdev);
sector_t end_sector = sector + nr_sectors;
@@ -275,16 +225,11 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
return -EINVAL;
/*
- * In the case of a zone reset operation over all zones,
- * REQ_OP_ZONE_RESET_ALL can be used with devices supporting this
- * command. For other devices, we emulate this command behavior by
- * identifying the zones needing a reset.
+ * In the case of a zone reset operation over all zones, use
+ * REQ_OP_ZONE_RESET_ALL.
*/
- if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) {
- if (!blk_queue_zone_resetall(q))
- return blkdev_zone_reset_all_emulated(bdev);
+ if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity)
return blkdev_zone_reset_all(bdev);
- }
while (sector < end_sector) {
bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
@@ -398,7 +343,8 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
op = REQ_OP_ZONE_RESET;
/* Invalidate the page cache, including dirty pages. */
- filemap_invalidate_lock(bdev->bd_inode->i_mapping);
+ inode_lock(bdev->bd_mapping->host);
+ filemap_invalidate_lock(bdev->bd_mapping);
ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
if (ret)
goto fail;
@@ -419,29 +365,1279 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
fail:
- if (cmd == BLKRESETZONE)
- filemap_invalidate_unlock(bdev->bd_inode->i_mapping);
+ if (cmd == BLKRESETZONE) {
+ filemap_invalidate_unlock(bdev->bd_mapping);
+ inode_unlock(bdev->bd_mapping->host);
+ }
return ret;
}
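
To make the simplified reset-all path above concrete, a hedged example of a caller (the helper is hypothetical; blkdev_zone_mgmt(), bdev_is_zoned() and bdev_nr_sectors() are existing APIs): with sector 0 and a range covering the whole capacity, the operation now always maps to a single REQ_OP_ZONE_RESET_ALL bio.

/* Hypothetical helper: reset every zone of @bdev in one operation. */
static int example_reset_all_zones(struct block_device *bdev)
{
	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, 0,
				bdev_nr_sectors(bdev));
}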
-void disk_free_zone_bitmaps(struct gendisk *disk)
+static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
+{
+ return zone->start + zone->len >= get_capacity(disk);
+}
+
+static bool disk_zone_is_full(struct gendisk *disk,
+ unsigned int zno, unsigned int offset_in_zone)
+{
+ if (zno < disk->nr_zones - 1)
+ return offset_in_zone >= disk->zone_capacity;
+ return offset_in_zone >= disk->last_zone_capacity;
+}
+
+static bool disk_zone_wplug_is_full(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset);
+}
+
+static bool disk_insert_zone_wplug(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ struct blk_zone_wplug *zwplg;
+ unsigned long flags;
+ unsigned int idx =
+ hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);
+
+ /*
+ * Add the new zone write plug to the hash table, but carefully as we
+ * are racing with other submission context, so we may already have a
+ * zone write plug for the same zone.
+ */
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
+ if (zwplg->zone_no == zwplug->zone_no) {
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+ return false;
+ }
+ }
+ hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
+ atomic_inc(&disk->nr_zone_wplugs);
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+
+ return true;
+}
+
+static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk,
+ sector_t sector)
+{
+ unsigned int zno = disk_zone_no(disk, sector);
+ unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
+ struct blk_zone_wplug *zwplug;
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
+ if (zwplug->zone_no == zno &&
+ refcount_inc_not_zero(&zwplug->ref)) {
+ rcu_read_unlock();
+ return zwplug;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
+ sector_t sector)
+{
+ if (!atomic_read(&disk->nr_zone_wplugs))
+ return NULL;
+
+ return disk_get_hashed_zone_wplug(disk, sector);
+}
+
+static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
+{
+ struct blk_zone_wplug *zwplug =
+ container_of(rcu_head, struct blk_zone_wplug, rcu_head);
+
+ mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
+}
+
+static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
+{
+ if (refcount_dec_and_test(&zwplug->ref)) {
+ WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
+ WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
+ WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));
+
+ call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
+ }
+}
+
+static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ lockdep_assert_held(&zwplug->lock);
+
+ /* If the zone write plug was already removed, we are done. */
+ if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
+ return false;
+
+ /* If the zone write plug is still plugged, it cannot be removed. */
+ if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
+ return false;
+
+ /*
+ * Completions of BIOs with blk_zone_write_plug_bio_endio() may
+ * happen after handling a request completion with
+ * blk_zone_write_plug_finish_request() (e.g. with split BIOs
+ * that are chained). In such case, disk_zone_wplug_unplug_bio()
+ * should not attempt to remove the zone write plug until all BIO
+ * completions are seen. Check by looking at the zone write plug
+ * reference count, which is 2 when the plug is unused (one reference
+ * taken when the plug was allocated and another reference taken by the
+ * caller context).
+ */
+ if (refcount_read(&zwplug->ref) > 2)
+ return false;
+
+ /* We can remove zone write plugs for zones that are empty or full. */
+ return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug);
+}
+
+static void disk_remove_zone_wplug(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ unsigned long flags;
+
+ /* If the zone write plug was already removed, we have nothing to do. */
+ if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
+ return;
+
+ /*
+ * Mark the zone write plug as unhashed and drop the extra reference we
+ * took when the plug was inserted in the hash table.
+ */
+ zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ hlist_del_init_rcu(&zwplug->node);
+ atomic_dec(&disk->nr_zone_wplugs);
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+ disk_put_zone_wplug(zwplug);
+}
+
+static void blk_zone_wplug_bio_work(struct work_struct *work);
+
+/*
+ * Get a reference on the write plug for the zone containing @sector.
+ * If the plug does not exist, it is allocated and hashed.
+ * Return a pointer to the zone write plug with the plug spinlock held.
+ */
+static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
+ sector_t sector, gfp_t gfp_mask,
+ unsigned long *flags)
+{
+ unsigned int zno = disk_zone_no(disk, sector);
+ struct blk_zone_wplug *zwplug;
+
+again:
+ zwplug = disk_get_zone_wplug(disk, sector);
+ if (zwplug) {
+ /*
+ * Check that a BIO completion or a zone reset or finish
+ * operation has not already removed the zone write plug from
+ * the hash table and dropped its reference count. In such case,
+ * we need to get a new plug so start over from the beginning.
+ */
+ spin_lock_irqsave(&zwplug->lock, *flags);
+ if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
+ spin_unlock_irqrestore(&zwplug->lock, *flags);
+ disk_put_zone_wplug(zwplug);
+ goto again;
+ }
+ return zwplug;
+ }
+
+ /*
+ * Allocate and initialize a zone write plug with an extra reference
+ * so that it is not freed when the zone write plug becomes idle without
+ * the zone being full.
+ */
+ zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
+ if (!zwplug)
+ return NULL;
+
+ INIT_HLIST_NODE(&zwplug->node);
+ refcount_set(&zwplug->ref, 2);
+ spin_lock_init(&zwplug->lock);
+ zwplug->flags = 0;
+ zwplug->zone_no = zno;
+ zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
+ bio_list_init(&zwplug->bio_list);
+ INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
+ zwplug->disk = disk;
+
+ spin_lock_irqsave(&zwplug->lock, *flags);
+
+ /*
+ * Insert the new zone write plug in the hash table. This can fail only
+ * if another context already inserted a plug. Retry from the beginning
+ * in such case.
+ */
+ if (!disk_insert_zone_wplug(disk, zwplug)) {
+ spin_unlock_irqrestore(&zwplug->lock, *flags);
+ mempool_free(zwplug, disk->zone_wplugs_pool);
+ goto again;
+ }
+
+ return zwplug;
+}
+
+static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
+ struct bio *bio)
+{
+ struct request_queue *q = zwplug->disk->queue;
+
+ bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+ bio_io_error(bio);
+ disk_put_zone_wplug(zwplug);
+	/* Drop the reference taken by disk_zone_wplug_add_bio() */
+ blk_queue_exit(q);
+}
+
+/*
+ * Abort (fail) all plugged BIOs of a zone write plug.
+ */
+static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
+{
+ struct bio *bio;
+
+ if (bio_list_empty(&zwplug->bio_list))
+ return;
+
+ pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n",
+ zwplug->disk->disk_name, zwplug->zone_no);
+ while ((bio = bio_list_pop(&zwplug->bio_list)))
+ blk_zone_wplug_bio_io_error(zwplug, bio);
+}
+
+/*
+ * Set a zone write plug write pointer offset to the specified value.
+ * This aborts all plugged BIOs, which is fine as this function is called for
+ * a zone reset operation, a zone finish operation or if the zone needs a wp
+ * update from a report zone after a write error.
+ */
+static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug,
+ unsigned int wp_offset)
+{
+ lockdep_assert_held(&zwplug->lock);
+
+ /* Update the zone write pointer and abort all plugged BIOs. */
+ zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE;
+ zwplug->wp_offset = wp_offset;
+ disk_zone_wplug_abort(zwplug);
+
+ /*
+ * The zone write plug now has no BIO plugged: remove it from the
+ * hash table so that it cannot be seen. The plug will be freed
+ * when the last reference is dropped.
+ */
+ if (disk_should_remove_zone_wplug(disk, zwplug))
+ disk_remove_zone_wplug(disk, zwplug);
+}
+
+static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
+{
+ switch (zone->cond) {
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ return zone->wp - zone->start;
+ case BLK_ZONE_COND_FULL:
+ return zone->len;
+ case BLK_ZONE_COND_EMPTY:
+ return 0;
+ case BLK_ZONE_COND_NOT_WP:
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ default:
+ /*
+ * Conventional, offline and read-only zones do not have a valid
+ * write pointer.
+ */
+ return UINT_MAX;
+ }
+}
+
+static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
+ struct blk_zone *zone)
+{
+ struct blk_zone_wplug *zwplug;
+ unsigned long flags;
+
+ zwplug = disk_get_zone_wplug(disk, zone->start);
+ if (!zwplug)
+ return;
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+ if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
+ disk_zone_wplug_set_wp_offset(disk, zwplug,
+ blk_zone_wp_offset(zone));
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ disk_put_zone_wplug(zwplug);
+}
+
+static int disk_zone_sync_wp_offset(struct gendisk *disk, sector_t sector)
+{
+ struct disk_report_zones_cb_args args = {
+ .disk = disk,
+ };
+
+ return disk->fops->report_zones(disk, sector, 1,
+ disk_report_zones_cb, &args);
+}
+
+static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
+ unsigned int wp_offset)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ sector_t sector = bio->bi_iter.bi_sector;
+ struct blk_zone_wplug *zwplug;
+ unsigned long flags;
+
+ /* Conventional zones cannot be reset nor finished. */
+ if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
+ bio_io_error(bio);
+ return true;
+ }
+
+ /*
+ * No-wait reset or finish BIOs do not make much sense as the callers
+	 * issue these as blocking operations in most cases. To avoid the BIO
+	 * execution potentially failing with BLK_STS_AGAIN, warn about
+	 * REQ_NOWAIT being set and ignore that flag.
+ */
+ if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT))
+ bio->bi_opf &= ~REQ_NOWAIT;
+
+ /*
+ * If we have a zone write plug, set its write pointer offset to 0
+ * (reset case) or to the zone size (finish case). This will abort all
+ * BIOs plugged for the target zone. It is fine as resetting or
+ * finishing zones while writes are still in-flight will result in the
+ * writes failing anyway.
+ */
+ zwplug = disk_get_zone_wplug(disk, sector);
+ if (zwplug) {
+ spin_lock_irqsave(&zwplug->lock, flags);
+ disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ disk_put_zone_wplug(zwplug);
+ }
+
+ return false;
+}
+
+static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ struct blk_zone_wplug *zwplug;
+ unsigned long flags;
+ sector_t sector;
+
+ /*
+ * Set the write pointer offset of all zone write plugs to 0. This will
+ * abort all plugged BIOs. It is fine as resetting zones while writes
+ * are still in-flight will result in the writes failing anyway.
+ */
+ for (sector = 0; sector < get_capacity(disk);
+ sector += disk->queue->limits.chunk_sectors) {
+ zwplug = disk_get_zone_wplug(disk, sector);
+ if (zwplug) {
+ spin_lock_irqsave(&zwplug->lock, flags);
+ disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ disk_put_zone_wplug(zwplug);
+ }
+ }
+
+ return false;
+}
+
+static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ /*
+ * Take a reference on the zone write plug and schedule the submission
+ * of the next plugged BIO. blk_zone_wplug_bio_work() will release the
+ * reference we take here.
+ */
+ WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
+ refcount_inc(&zwplug->ref);
+ queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
+}
+
+static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug,
+ struct bio *bio, unsigned int nr_segs)
+{
+ bool schedule_bio_work = false;
+
+ /*
+ * Grab an extra reference on the BIO request queue usage counter.
+ * This reference will be reused to submit a request for the BIO for
+ * blk-mq devices and dropped when the BIO is failed and after
+ * it is issued in the case of BIO-based devices.
+ */
+ percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);
+
+ /*
+ * The BIO is being plugged and thus will have to wait for the on-going
+ * write and for all other writes already plugged. So polling makes
+ * no sense.
+ */
+ bio_clear_polled(bio);
+
+ /*
+ * REQ_NOWAIT BIOs are always handled using the zone write plug BIO
+ * work, which can block. So clear the REQ_NOWAIT flag and schedule the
+ * work if this is the first BIO we are plugging.
+ */
+ if (bio->bi_opf & REQ_NOWAIT) {
+ schedule_bio_work = !(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
+ bio->bi_opf &= ~REQ_NOWAIT;
+ }
+
+ /*
+ * Reuse the poll cookie field to store the number of segments when
+ * split to the hardware limits.
+ */
+ bio->__bi_nr_segments = nr_segs;
+
+ /*
+ * We always receive BIOs after they are split and ready to be issued.
+ * The block layer passes the parts of a split BIO in order, and the
+	 * user must also issue writes sequentially. So simply add the new BIO
+ * at the tail of the list to preserve the sequential write order.
+ */
+ bio_list_add(&zwplug->bio_list, bio);
+
+ zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
+
+ if (schedule_bio_work)
+ disk_zone_wplug_schedule_bio_work(disk, zwplug);
+}
+
+/*
+ * Called from bio_attempt_back_merge() when a BIO was merged with a request.
+ */
+void blk_zone_write_plug_bio_merged(struct bio *bio)
+{
+ struct blk_zone_wplug *zwplug;
+ unsigned long flags;
+
+ /*
+ * If the BIO was already plugged, then we were called through
+ * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
+ * For this case, we already hold a reference on the zone write plug for
+ * the BIO and blk_zone_write_plug_init_request() will handle the
+ * zone write pointer offset update.
+ */
+ if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
+ return;
+
+ bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+
+ /*
+ * Get a reference on the zone write plug of the target zone and advance
+ * the zone write pointer offset. Given that this is a merge, we already
+ * have at least one request and one BIO referencing the zone write
+ * plug. So this should not fail.
+ */
+ zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk,
+ bio->bi_iter.bi_sector);
+ if (WARN_ON_ONCE(!zwplug))
+ return;
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+ zwplug->wp_offset += bio_sectors(bio);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+/*
+ * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
+ * already went through zone write plugging (either a new BIO or one that was
+ * unplugged).
+ */
+void blk_zone_write_plug_init_request(struct request *req)
+{
+ sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
+ struct request_queue *q = req->q;
+ struct gendisk *disk = q->disk;
+ struct blk_zone_wplug *zwplug =
+ disk_get_zone_wplug(disk, blk_rq_pos(req));
+ unsigned long flags;
+ struct bio *bio;
+
+ if (WARN_ON_ONCE(!zwplug))
+ return;
+
+ /*
+ * Indicate that completion of this request needs to be handled with
+ * blk_zone_write_plug_finish_request(), which will drop the reference
+ * on the zone write plug we took above on entry to this function.
+ */
+ req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;
+
+ if (blk_queue_nomerges(q))
+ return;
+
+ /*
+ * Walk through the list of plugged BIOs to check if they can be merged
+ * into the back of the request.
+ */
+ spin_lock_irqsave(&zwplug->lock, flags);
+ while (!disk_zone_wplug_is_full(disk, zwplug)) {
+ bio = bio_list_peek(&zwplug->bio_list);
+ if (!bio)
+ break;
+
+ if (bio->bi_iter.bi_sector != req_back_sector ||
+ !blk_rq_merge_ok(req, bio))
+ break;
+
+ WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
+ !bio->__bi_nr_segments);
+
+ bio_list_pop(&zwplug->bio_list);
+ if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
+ BIO_MERGE_OK) {
+ bio_list_add_head(&zwplug->bio_list, bio);
+ break;
+ }
+
+ /* Drop the reference taken by disk_zone_wplug_add_bio(). */
+ blk_queue_exit(q);
+ zwplug->wp_offset += bio_sectors(bio);
+
+ req_back_sector += bio_sectors(bio);
+ }
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+/*
+ * Check and prepare a BIO for submission by incrementing the write pointer
+ * offset of its zone write plug and changing zone append operations into
+ * regular write when zone append emulation is needed.
+ */
+static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
+ struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+
+ lockdep_assert_held(&zwplug->lock);
+
+ /*
+ * If we lost track of the zone write pointer due to a write error,
+	 * the user must either execute a report zones, reset the zone or finish
+	 * the zone to recover a reliable write pointer position. Fail BIOs if the
+ * user did not do that as we cannot handle emulated zone append
+ * otherwise.
+ */
+ if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
+ return false;
+
+ /*
+ * Check that the user is not attempting to write to a full zone.
+ * We know such BIO will fail, and that would potentially overflow our
+ * write pointer offset beyond the end of the zone.
+ */
+ if (disk_zone_wplug_is_full(disk, zwplug))
+ return false;
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ /*
+ * Use a regular write starting at the current write pointer.
+ * Similarly to native zone append operations, do not allow
+ * merging.
+ */
+ bio->bi_opf &= ~REQ_OP_MASK;
+ bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
+ bio->bi_iter.bi_sector += zwplug->wp_offset;
+
+ /*
+ * Remember that this BIO is in fact a zone append operation
+ * so that we can restore its operation code on completion.
+ */
+ bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
+ } else {
+ /*
+ * Check for non-sequential writes early as we know that BIOs
+		 * with a start sector not aligned with the zone write pointer
+ * will fail.
+ */
+ if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
+ return false;
+ }
+
+ /* Advance the zone write pointer offset. */
+ zwplug->wp_offset += bio_sectors(bio);
+
+ return true;
+}
+
+static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ sector_t sector = bio->bi_iter.bi_sector;
+ struct blk_zone_wplug *zwplug;
+ gfp_t gfp_mask = GFP_NOIO;
+ unsigned long flags;
+
+ /*
+ * BIOs must be fully contained within a zone so that we use the correct
+ * zone write plug for the entire BIO. For blk-mq devices, the block
+ * layer should already have done any splitting required to ensure this
+ * and this BIO should thus not be straddling zone boundaries. For
+ * BIO-based devices, it is the responsibility of the driver to split
+ * the bio before submitting it.
+ */
+ if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
+ bio_io_error(bio);
+ return true;
+ }
+
+ /* Conventional zones do not need write plugging. */
+ if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
+ /* Zone append to conventional zones is not allowed. */
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ bio_io_error(bio);
+ return true;
+ }
+ return false;
+ }
+
+ if (bio->bi_opf & REQ_NOWAIT)
+ gfp_mask = GFP_NOWAIT;
+
+ zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
+ if (!zwplug) {
+ if (bio->bi_opf & REQ_NOWAIT)
+ bio_wouldblock_error(bio);
+ else
+ bio_io_error(bio);
+ return true;
+ }
+
+ /* Indicate that this BIO is being handled using zone write plugging. */
+ bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+
+ /*
+ * If the zone is already plugged, add the BIO to the plug BIO list.
+ * Do the same for REQ_NOWAIT BIOs to ensure that we will not see a
+ * BLK_STS_AGAIN failure if we let the BIO execute.
+ * Otherwise, plug and let the BIO execute.
+ */
+ if ((zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) ||
+ (bio->bi_opf & REQ_NOWAIT))
+ goto plug;
+
+ if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ bio_io_error(bio);
+ return true;
+ }
+
+ zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ return false;
+
+plug:
+ disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs);
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ return true;
+}
+
+static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ struct blk_zone_wplug *zwplug;
+ unsigned long flags;
+
+ /*
+ * We have native support for zone append operations, so we are not
+ * going to handle @bio through plugging. However, we may already have a
+ * zone write plug for the target zone if that zone was previously
+ * partially written using regular writes. In such case, we risk leaving
+ * the plug in the disk hash table if the zone is fully written using
+ * zone append operations. Avoid this by removing the zone write plug.
+ */
+ zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
+ if (likely(!zwplug))
+ return;
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+
+ /*
+ * We are about to remove the zone write plug. But if the user
+ * (mistakenly) has issued regular writes together with native zone
+	 * append, we must abort the writes as otherwise the plugged BIOs would
+ * not be executed by the plug BIO work as disk_get_zone_wplug() will
+ * return NULL after the plug is removed. Aborting the plugged write
+ * BIOs is consistent with the fact that these writes will most likely
+	 * fail anyway as there are no ordering guarantees between zone append
+ * operations and regular write operations.
+ */
+ if (!bio_list_empty(&zwplug->bio_list)) {
+ pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n",
+ disk->disk_name, zwplug->zone_no);
+ disk_zone_wplug_abort(zwplug);
+ }
+ disk_remove_zone_wplug(disk, zwplug);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ disk_put_zone_wplug(zwplug);
+}
+
+/**
+ * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
+ * @bio: The BIO being submitted
+ * @nr_segs: The number of physical segments of @bio
+ *
+ * Handle write, write zeroes and zone append operations requiring emulation
+ * using zone write plugging.
+ *
+ * Return true whenever @bio execution needs to be delayed through the zone
+ * write plug. Otherwise, return false to let the submission path process
+ * @bio normally.
+ */
+bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
+{
+ struct block_device *bdev = bio->bi_bdev;
+
+ if (!bdev->bd_disk->zone_wplugs_hash)
+ return false;
+
+ /*
+ * If the BIO already has the plugging flag set, then it was already
+ * handled through this path and this is a submission from the zone
+ * plug bio submit work.
+ */
+ if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
+ return false;
+
+ /*
+	 * We do not need to do anything special for empty flush BIOs, e.g.
+	 * BIOs such as those issued by blkdev_issue_flush(). This is because it is
+ * the responsibility of the user to first wait for the completion of
+ * write operations for flush to have any effect on the persistence of
+ * the written data.
+ */
+ if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
+ return false;
+
+ /*
+ * Regular writes and write zeroes need to be handled through the target
+ * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
+ * which may need to go through the flush machinery depending on the
+ * target device capabilities. Plugging such writes is fine as the flush
+ * machinery operates at the request level, below the plug, and
+ * completion of the flush sequence will go through the regular BIO
+ * completion, which will handle zone write plugging.
+ * Zone append operations for devices that requested emulation must
+ * also be plugged so that these BIOs can be changed into regular
+ * write BIOs.
+ * Zone reset, reset all and finish commands need special treatment
+ * to correctly track the write pointer offset of zones. These commands
+ * are not plugged as we do not need serialization with write
+ * operations. It is the responsibility of the user to not issue reset
+ * and finish commands when write operations are in flight.
+ */
+ switch (bio_op(bio)) {
+ case REQ_OP_ZONE_APPEND:
+ if (!bdev_emulates_zone_append(bdev)) {
+ blk_zone_wplug_handle_native_zone_append(bio);
+ return false;
+ }
+ fallthrough;
+ case REQ_OP_WRITE:
+ case REQ_OP_WRITE_ZEROES:
+ return blk_zone_wplug_handle_write(bio, nr_segs);
+ case REQ_OP_ZONE_RESET:
+ return blk_zone_wplug_handle_reset_or_finish(bio, 0);
+ case REQ_OP_ZONE_FINISH:
+ return blk_zone_wplug_handle_reset_or_finish(bio,
+ bdev_zone_sectors(bdev));
+ case REQ_OP_ZONE_RESET_ALL:
+ return blk_zone_wplug_handle_reset_all(bio);
+ default:
+ return false;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
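
As a usage sketch (the driver function below is hypothetical; only blk_zone_plug_bio() itself is taken from this patch), a BIO-based driver is expected to let zone write plugging intercept the BIO before doing its own processing:

/* Hypothetical bio-based driver entry point. */
static void example_submit_bio(struct bio *bio)
{
	/*
	 * nr_segs is not meaningful for BIO-based drivers, so 0 is passed.
	 * If the BIO must wait behind other plugged writes for its zone,
	 * blk_zone_plug_bio() returns true and the driver must not touch it:
	 * it will be resubmitted later by the zone write plug BIO work.
	 */
	if (blk_zone_plug_bio(bio, 0))
		return;

	example_handle_bio(bio);	/* hypothetical driver-specific work */
}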
+
+static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+
+ /* Schedule submission of the next plugged BIO if we have one. */
+ if (!bio_list_empty(&zwplug->bio_list)) {
+ disk_zone_wplug_schedule_bio_work(disk, zwplug);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ return;
+ }
+
+ zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
+
+ /*
+	 * If the zone is full (it was fully written or finished), or empty
+ * (it was reset), remove its zone write plug from the hash table.
+ */
+ if (disk_should_remove_zone_wplug(disk, zwplug))
+ disk_remove_zone_wplug(disk, zwplug);
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+void blk_zone_write_plug_bio_endio(struct bio *bio)
+{
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
+ struct blk_zone_wplug *zwplug =
+ disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(!zwplug))
+ return;
+
+ /* Make sure we do not see this BIO again by clearing the plug flag. */
+ bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+
+ /*
+ * If this is a regular write emulating a zone append operation,
+ * restore the original operation code.
+ */
+ if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
+ bio->bi_opf &= ~REQ_OP_MASK;
+ bio->bi_opf |= REQ_OP_ZONE_APPEND;
+ }
+
+ /*
+ * If the BIO failed, abort all plugged BIOs and mark the plug as
+ * needing a write pointer update.
+ */
+ if (bio->bi_status != BLK_STS_OK) {
+ spin_lock_irqsave(&zwplug->lock, flags);
+ disk_zone_wplug_abort(zwplug);
+ zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE;
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ }
+
+ /* Drop the reference we took when the BIO was issued. */
+ disk_put_zone_wplug(zwplug);
+
+ /*
+ * For BIO-based devices, blk_zone_write_plug_finish_request()
+ * is not called. So we need to schedule execution of the next
+ * plugged BIO here.
+ */
+ if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
+ disk_zone_wplug_unplug_bio(disk, zwplug);
+
+ /* Drop the reference we took when entering this function. */
+ disk_put_zone_wplug(zwplug);
+}
+
+void blk_zone_write_plug_finish_request(struct request *req)
+{
+ struct gendisk *disk = req->q->disk;
+ struct blk_zone_wplug *zwplug;
+
+ zwplug = disk_get_zone_wplug(disk, req->__sector);
+ if (WARN_ON_ONCE(!zwplug))
+ return;
+
+ req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;
+
+ /*
+ * Drop the reference we took when the request was initialized in
+ * blk_zone_write_plug_init_request().
+ */
+ disk_put_zone_wplug(zwplug);
+
+ disk_zone_wplug_unplug_bio(disk, zwplug);
+
+ /* Drop the reference we took when entering this function. */
+ disk_put_zone_wplug(zwplug);
+}
+
+static void blk_zone_wplug_bio_work(struct work_struct *work)
+{
+ struct blk_zone_wplug *zwplug =
+ container_of(work, struct blk_zone_wplug, bio_work);
+ struct block_device *bdev;
+ unsigned long flags;
+ struct bio *bio;
+
+ /*
+ * Submit the next plugged BIO. If we do not have any, clear
+ * the plugged flag.
+ */
+ spin_lock_irqsave(&zwplug->lock, flags);
+
+again:
+ bio = bio_list_pop(&zwplug->bio_list);
+ if (!bio) {
+ zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ goto put_zwplug;
+ }
+
+ if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
+ blk_zone_wplug_bio_io_error(zwplug, bio);
+ goto again;
+ }
+
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ bdev = bio->bi_bdev;
+ submit_bio_noacct_nocheck(bio);
+
+ /*
+ * blk-mq devices will reuse the extra reference on the request queue
+ * usage counter we took when the BIO was plugged, but the submission
+ * path for BIO-based devices will not do that. So drop this extra
+ * reference here.
+ */
+ if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO))
+ blk_queue_exit(bdev->bd_disk->queue);
+
+put_zwplug:
+ /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
+ disk_put_zone_wplug(zwplug);
+}
+
+static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
+{
+ return 1U << disk->zone_wplugs_hash_bits;
+}
+
+void disk_init_zone_resources(struct gendisk *disk)
+{
+ spin_lock_init(&disk->zone_wplugs_lock);
+}
+
+/*
+ * For the size of a disk zone write plug hash table, use the size of the
+ * zone write plug mempool, which is the maximum of the disk open zones and
+ * active zones limits. But do not exceed 4KB (512 hlist head entries), that is,
+ * 9 bits. For a disk that has no limits, mempool size defaults to 128.
+ */
+#define BLK_ZONE_WPLUG_MAX_HASH_BITS 9
+#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE 128
+
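
As a worked example of the sizing rule above (numbers are illustrative): a disk reporting max_open_zones = 128 and no active-zone limit gets a 128-entry mempool and hash_bits = min(ilog2(128) + 1, 9) = 8, i.e. 256 hash buckets; a disk with no limits at all falls back to the 128-entry default pool (capped by the number of zones) and ends up with the same 8-bit hash.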
+static int disk_alloc_zone_resources(struct gendisk *disk,
+ unsigned int pool_size)
+{
+ unsigned int i;
+
+ atomic_set(&disk->nr_zone_wplugs, 0);
+ disk->zone_wplugs_hash_bits =
+ min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);
+
+ disk->zone_wplugs_hash =
+ kcalloc(disk_zone_wplugs_hash_size(disk),
+ sizeof(struct hlist_head), GFP_KERNEL);
+ if (!disk->zone_wplugs_hash)
+ return -ENOMEM;
+
+ for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
+ INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);
+
+ disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
+ sizeof(struct blk_zone_wplug));
+ if (!disk->zone_wplugs_pool)
+ goto free_hash;
+
+ disk->zone_wplugs_wq =
+ alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
+ pool_size, disk->disk_name);
+ if (!disk->zone_wplugs_wq)
+ goto destroy_pool;
+
+ return 0;
+
+destroy_pool:
+ mempool_destroy(disk->zone_wplugs_pool);
+ disk->zone_wplugs_pool = NULL;
+free_hash:
+ kfree(disk->zone_wplugs_hash);
+ disk->zone_wplugs_hash = NULL;
+ disk->zone_wplugs_hash_bits = 0;
+ return -ENOMEM;
+}
+
+static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
+{
+ struct blk_zone_wplug *zwplug;
+ unsigned int i;
+
+ if (!disk->zone_wplugs_hash)
+ return;
+
+ /* Free all the zone write plugs we have. */
+ for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
+ while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
+ zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
+ struct blk_zone_wplug, node);
+ refcount_inc(&zwplug->ref);
+ disk_remove_zone_wplug(disk, zwplug);
+ disk_put_zone_wplug(zwplug);
+ }
+ }
+
+ WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs));
+ kfree(disk->zone_wplugs_hash);
+ disk->zone_wplugs_hash = NULL;
+ disk->zone_wplugs_hash_bits = 0;
+}
+
+static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk,
+ unsigned long *bitmap)
+{
+ unsigned int nr_conv_zones = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ if (bitmap)
+ nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones);
+ bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap,
+ lockdep_is_held(&disk->zone_wplugs_lock));
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+
+ kfree_rcu_mightsleep(bitmap);
+
+ return nr_conv_zones;
+}
+
+void disk_free_zone_resources(struct gendisk *disk)
+{
+ if (!disk->zone_wplugs_pool)
+ return;
+
+ if (disk->zone_wplugs_wq) {
+ destroy_workqueue(disk->zone_wplugs_wq);
+ disk->zone_wplugs_wq = NULL;
+ }
+
+ disk_destroy_zone_wplugs_hash_table(disk);
+
+ /*
+ * Wait for the zone write plugs to be RCU-freed before
+	 * destroying the mempool.
+ */
+ rcu_barrier();
+
+ mempool_destroy(disk->zone_wplugs_pool);
+ disk->zone_wplugs_pool = NULL;
+
+ disk_set_conv_zones_bitmap(disk, NULL);
+ disk->zone_capacity = 0;
+ disk->last_zone_capacity = 0;
+ disk->nr_zones = 0;
+}
+
+static inline bool disk_need_zone_resources(struct gendisk *disk)
{
- kfree(disk->conv_zones_bitmap);
- disk->conv_zones_bitmap = NULL;
- kfree(disk->seq_zones_wlock);
- disk->seq_zones_wlock = NULL;
+ /*
+ * All mq zoned devices need zone resources so that the block layer
+ * can automatically handle write BIO plugging. BIO-based device drivers
+ * (e.g. DM devices) are normally responsible for handling zone write
+ * ordering and do not need zone resources, unless the driver requires
+ * zone append emulation.
+ */
+ return queue_is_mq(disk->queue) ||
+ queue_emulates_zone_append(disk->queue);
+}
+
+static int disk_revalidate_zone_resources(struct gendisk *disk,
+ unsigned int nr_zones)
+{
+ struct queue_limits *lim = &disk->queue->limits;
+ unsigned int pool_size;
+
+ if (!disk_need_zone_resources(disk))
+ return 0;
+
+ /*
+ * If the device has no limit on the maximum number of open and active
+ * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
+ */
+ pool_size = max(lim->max_open_zones, lim->max_active_zones);
+ if (!pool_size)
+ pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones);
+
+ if (!disk->zone_wplugs_hash)
+ return disk_alloc_zone_resources(disk, pool_size);
+
+ return 0;
}
struct blk_revalidate_zone_args {
struct gendisk *disk;
unsigned long *conv_zones_bitmap;
- unsigned long *seq_zones_wlock;
unsigned int nr_zones;
+ unsigned int zone_capacity;
+ unsigned int last_zone_capacity;
sector_t sector;
};
/*
+ * Update the disk zone resources information and device queue limits.
+ * The disk queue is frozen when this is executed.
+ */
+static int disk_update_zone_resources(struct gendisk *disk,
+ struct blk_revalidate_zone_args *args)
+{
+ struct request_queue *q = disk->queue;
+ unsigned int nr_seq_zones, nr_conv_zones;
+ unsigned int pool_size;
+ struct queue_limits lim;
+
+ disk->nr_zones = args->nr_zones;
+ disk->zone_capacity = args->zone_capacity;
+ disk->last_zone_capacity = args->last_zone_capacity;
+ nr_conv_zones =
+ disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap);
+ if (nr_conv_zones >= disk->nr_zones) {
+ pr_warn("%s: Invalid number of conventional zones %u / %u\n",
+ disk->disk_name, nr_conv_zones, disk->nr_zones);
+ return -ENODEV;
+ }
+
+ lim = queue_limits_start_update(q);
+
+ /*
+	 * Some devices can advertise zone resource limits that are larger than
+ * the number of sequential zones of the zoned block device, e.g. a
+ * small ZNS namespace. For such case, assume that the zoned device has
+ * no zone resource limits.
+ */
+ nr_seq_zones = disk->nr_zones - nr_conv_zones;
+ if (lim.max_open_zones >= nr_seq_zones)
+ lim.max_open_zones = 0;
+ if (lim.max_active_zones >= nr_seq_zones)
+ lim.max_active_zones = 0;
+
+ if (!disk->zone_wplugs_pool)
+ goto commit;
+
+ /*
+ * If the device has no limit on the maximum number of open and active
+ * zones, set its max open zone limit to the mempool size to indicate
+ * to the user that there is a potential performance impact due to
+ * dynamic zone write plug allocation when simultaneously writing to
+ * more zones than the size of the mempool.
+ */
+ pool_size = max(lim.max_open_zones, lim.max_active_zones);
+ if (!pool_size)
+ pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);
+
+ mempool_resize(disk->zone_wplugs_pool, pool_size);
+
+ if (!lim.max_open_zones && !lim.max_active_zones) {
+ if (pool_size < nr_seq_zones)
+ lim.max_open_zones = pool_size;
+ else
+ lim.max_open_zones = 0;
+ }
+
+commit:
+ return queue_limits_commit_update_frozen(q, &lim);
+}
+
+static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
+ struct blk_revalidate_zone_args *args)
+{
+ struct gendisk *disk = args->disk;
+
+ if (zone->capacity != zone->len) {
+ pr_warn("%s: Invalid conventional zone capacity\n",
+ disk->disk_name);
+ return -ENODEV;
+ }
+
+ if (disk_zone_is_last(disk, zone))
+ args->last_zone_capacity = zone->capacity;
+
+ if (!disk_need_zone_resources(disk))
+ return 0;
+
+ if (!args->conv_zones_bitmap) {
+ args->conv_zones_bitmap =
+ bitmap_zalloc(args->nr_zones, GFP_NOIO);
+ if (!args->conv_zones_bitmap)
+ return -ENOMEM;
+ }
+
+ set_bit(idx, args->conv_zones_bitmap);
+
+ return 0;
+}
+
+static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
+ struct blk_revalidate_zone_args *args)
+{
+ struct gendisk *disk = args->disk;
+ struct blk_zone_wplug *zwplug;
+ unsigned int wp_offset;
+ unsigned long flags;
+
+ /*
+ * Remember the capacity of the first sequential zone and check
+ * if it is constant for all zones, ignoring the last zone as it can be
+ * smaller.
+ */
+ if (!args->zone_capacity)
+ args->zone_capacity = zone->capacity;
+ if (disk_zone_is_last(disk, zone)) {
+ args->last_zone_capacity = zone->capacity;
+ } else if (zone->capacity != args->zone_capacity) {
+ pr_warn("%s: Invalid variable zone capacity\n",
+ disk->disk_name);
+ return -ENODEV;
+ }
+
+ /*
+ * If the device needs zone append emulation, we need to track the
+	 * write pointer of all zones that are neither empty nor full. So make sure
+ * we have a zone write plug for such zone if the device has a zone
+ * write plug hash table.
+ */
+ if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash)
+ return 0;
+
+ disk_zone_wplug_sync_wp_offset(disk, zone);
+
+ wp_offset = blk_zone_wp_offset(zone);
+ if (!wp_offset || wp_offset >= zone->capacity)
+ return 0;
+
+ zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags);
+ if (!zwplug)
+ return -ENOMEM;
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+ disk_put_zone_wplug(zwplug);
+
+ return 0;
+}
+
+/*
* Helper function to check the validity of zones of a zoned block device.
*/
static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
@@ -449,9 +1645,8 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
{
struct blk_revalidate_zone_args *args = data;
struct gendisk *disk = args->disk;
- struct request_queue *q = disk->queue;
- sector_t capacity = get_capacity(disk);
- sector_t zone_sectors = q->limits.chunk_sectors;
+ sector_t zone_sectors = disk->queue->limits.chunk_sectors;
+ int ret;
/* Check for bad zones and holes in the zone report */
if (zone->start != args->sector) {
@@ -460,7 +1655,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
return -ENODEV;
}
- if (zone->start >= capacity || !zone->len) {
+ if (zone->start >= get_capacity(disk) || !zone->len) {
pr_warn("%s: Invalid zone start %llu, length %llu\n",
disk->disk_name, zone->start, zone->len);
return -ENODEV;
@@ -470,7 +1665,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
* All zones must have the same size, with the exception on an eventual
* smaller last zone.
*/
- if (zone->start + zone->len < capacity) {
+ if (!disk_zone_is_last(disk, zone)) {
if (zone->len != zone_sectors) {
pr_warn("%s: Invalid zoned device with non constant zone size\n",
disk->disk_name);
@@ -482,66 +1677,57 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
return -ENODEV;
}
+ if (!zone->capacity || zone->capacity > zone->len) {
+ pr_warn("%s: Invalid zone capacity\n",
+ disk->disk_name);
+ return -ENODEV;
+ }
+
/* Check zone type */
switch (zone->type) {
case BLK_ZONE_TYPE_CONVENTIONAL:
- if (!args->conv_zones_bitmap) {
- args->conv_zones_bitmap =
- blk_alloc_zone_bitmap(q->node, args->nr_zones);
- if (!args->conv_zones_bitmap)
- return -ENOMEM;
- }
- set_bit(idx, args->conv_zones_bitmap);
+ ret = blk_revalidate_conv_zone(zone, idx, args);
break;
case BLK_ZONE_TYPE_SEQWRITE_REQ:
- if (!args->seq_zones_wlock) {
- args->seq_zones_wlock =
- blk_alloc_zone_bitmap(q->node, args->nr_zones);
- if (!args->seq_zones_wlock)
- return -ENOMEM;
- }
+ ret = blk_revalidate_seq_zone(zone, idx, args);
break;
case BLK_ZONE_TYPE_SEQWRITE_PREF:
default:
pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
disk->disk_name, (int)zone->type, zone->start);
- return -ENODEV;
+ ret = -ENODEV;
}
- args->sector += zone->len;
- return 0;
+ if (!ret)
+ args->sector += zone->len;
+
+ return ret;
}
/**
- * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
+ * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
* @disk: Target disk
- * @update_driver_data: Callback to update driver data on the frozen disk
*
- * Helper function for low-level device drivers to check and (re) allocate and
- * initialize a disk request queue zone bitmaps. This functions should normally
- * be called within the disk ->revalidate method for blk-mq based drivers.
+ * Helper function for low-level device drivers to check, (re) allocate and
+ * initialize resources used for managing zoned disks. This function should
+ * normally be called by blk-mq based drivers when a zoned gendisk is probed
+ * and when the zone configuration of the gendisk changes (e.g. after a format).
* Before calling this function, the device driver must already have set the
* device zone size (chunk_sector limit) and the max zone append limit.
- * For BIO based drivers, this function cannot be used. BIO based device drivers
- * only need to set disk->nr_zones so that the sysfs exposed value is correct.
- * If the @update_driver_data callback function is not NULL, the callback is
- * executed with the device request queue frozen after all zones have been
- * checked.
+ * BIO based drivers can also use this function as long as the device queue
+ * can be safely frozen.
*/
-int blk_revalidate_disk_zones(struct gendisk *disk,
- void (*update_driver_data)(struct gendisk *disk))
+int blk_revalidate_disk_zones(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
sector_t zone_sectors = q->limits.chunk_sectors;
sector_t capacity = get_capacity(disk);
struct blk_revalidate_zone_args args = { };
unsigned int noio_flag;
- int ret;
+ int ret = -ENOMEM;
if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
return -EIO;
- if (WARN_ON_ONCE(!queue_is_mq(q)))
- return -EIO;
if (!capacity)
return -ENODEV;
@@ -556,12 +1742,6 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
return -ENODEV;
}
- if (!q->limits.max_zone_append_sectors) {
- pr_warn("%s: Invalid 0 maximum zone append limit\n",
- disk->disk_name);
- return -ENODEV;
- }
-
/*
* Ensure that all memory allocations in this context are done as if
* GFP_NOIO was specified.
@@ -569,6 +1749,12 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
args.disk = disk;
args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
noio_flag = memalloc_noio_save();
+ ret = disk_revalidate_zone_resources(disk, args.nr_zones);
+ if (ret) {
+ memalloc_noio_restore(noio_flag);
+ return ret;
+ }
+
ret = disk->fops->report_zones(disk, 0, UINT_MAX,
blk_revalidate_zone_cb, &args);
if (!ret) {
@@ -588,26 +1774,105 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
}
/*
- * Install the new bitmaps and update nr_zones only once the queue is
- * stopped and all I/Os are completed (i.e. a scheduler is not
- * referencing the bitmaps).
+ * Set the new disk zone parameters only once the queue is frozen and
+ * all I/Os are completed.
*/
- blk_mq_freeze_queue(q);
- if (ret > 0) {
- disk->nr_zones = args.nr_zones;
- swap(disk->seq_zones_wlock, args.seq_zones_wlock);
- swap(disk->conv_zones_bitmap, args.conv_zones_bitmap);
- if (update_driver_data)
- update_driver_data(disk);
- ret = 0;
- } else {
+ if (ret > 0)
+ ret = disk_update_zone_resources(disk, &args);
+ else
pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
- disk_free_zone_bitmaps(disk);
+ if (ret) {
+ unsigned int memflags = blk_mq_freeze_queue(q);
+
+ disk_free_zone_resources(disk);
+ blk_mq_unfreeze_queue(q, memflags);
}
- blk_mq_unfreeze_queue(q);
- kfree(args.seq_zones_wlock);
- kfree(args.conv_zones_bitmap);
return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
+
+/**
+ * blk_zone_issue_zeroout - zero-fill a block range in a zone
+ * @bdev: blockdev to write
+ * @sector: start sector
+ * @nr_sects: number of sectors to write
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ *
+ * Description:
+ * Zero-fill a block range in a zone (@sector must be equal to the zone write
+ * pointer), handling potential errors due to the (initially unknown) lack of
+ * hardware offload (See blkdev_issue_zeroout()).
+ */
+int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask)
+{
+ int ret;
+
+ if (WARN_ON_ONCE(!bdev_is_zoned(bdev)))
+ return -EIO;
+
+ ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
+ BLKDEV_ZERO_NOFALLBACK);
+ if (ret != -EOPNOTSUPP)
+ return ret;
+
+ /*
+ * The failed call to blkdev_issue_zeroout() advanced the zone write
+ * pointer. Undo this using a report zone to update the zone write
+ * pointer to the correct current value.
+ */
+ ret = disk_zone_sync_wp_offset(bdev->bd_disk, sector);
+ if (ret != 1)
+ return ret < 0 ? ret : -EIO;
+
+ /*
+ * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a
+ * regular write with zero-pages.
+ */
+ return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0);
+}
+EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout);
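A hedged usage sketch of the exported helper above. The caller, its name, and the sector math are illustrative only and not taken from an in-tree driver:

/*
 * Illustrative caller only: zero-fill from the current zone write pointer up
 * to a target offset before issuing a real write. prep_zone_tail(),
 * zone_wp_sector and target_sector are hypothetical names.
 */
static int prep_zone_tail(struct block_device *bdev, sector_t zone_wp_sector,
			  sector_t target_sector)
{
	sector_t nr_sects = target_sector - zone_wp_sector;

	if (!nr_sects)
		return 0;

	/* @sector must equal the zone write pointer, per the kerneldoc above. */
	return blk_zone_issue_zeroout(bdev, zone_wp_sector, nr_sects, GFP_NOIO);
}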
+
+#ifdef CONFIG_BLK_DEBUG_FS
+static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
+ struct seq_file *m)
+{
+ unsigned int zwp_wp_offset, zwp_flags;
+ unsigned int zwp_zone_no, zwp_ref;
+ unsigned int zwp_bio_list_size;
+ unsigned long flags;
+
+ spin_lock_irqsave(&zwplug->lock, flags);
+ zwp_zone_no = zwplug->zone_no;
+ zwp_flags = zwplug->flags;
+ zwp_ref = refcount_read(&zwplug->ref);
+ zwp_wp_offset = zwplug->wp_offset;
+ zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
+ spin_unlock_irqrestore(&zwplug->lock, flags);
+
+ seq_printf(m, "%u 0x%x %u %u %u\n", zwp_zone_no, zwp_flags, zwp_ref,
+ zwp_wp_offset, zwp_bio_list_size);
+}
+
+int queue_zone_wplugs_show(void *data, struct seq_file *m)
+{
+ struct request_queue *q = data;
+ struct gendisk *disk = q->disk;
+ struct blk_zone_wplug *zwplug;
+ unsigned int i;
+
+ if (!disk->zone_wplugs_hash)
+ return 0;
+
+ rcu_read_lock();
+ for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
+ hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i],
+ node)
+ queue_zone_wplug_show(zwplug, m);
+ rcu_read_unlock();
+
+ return 0;
+}
+
+#endif
diff --git a/block/blk.h b/block/blk.h
index d9f584984bc4..37ec459fe656 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -2,7 +2,9 @@
#ifndef BLK_INTERNAL_H
#define BLK_INTERNAL_H
+#include <linux/bio-integrity.h>
#include <linux/blk-crypto.h>
+#include <linux/lockdep.h>
#include <linux/memblock.h> /* for max_pfn/max_low_pfn */
#include <linux/sched/sysctl.h>
#include <linux/timekeeping.h>
@@ -11,6 +13,9 @@
struct elevator_type;
+#define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9)
+#define BLK_MIN_SEGMENT_SIZE 4096
+
/* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT (5 * HZ)
@@ -33,11 +38,13 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
gfp_t flags);
void blk_free_flush_queue(struct blk_flush_queue *q);
-void blk_freeze_queue(struct request_queue *q);
-void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
-void blk_queue_start_drain(struct request_queue *q);
+bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
+bool blk_queue_start_drain(struct request_queue *q);
+bool __blk_freeze_queue_start(struct request_queue *q,
+ struct task_struct *owner);
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
void submit_bio_noacct_nocheck(struct bio *bio);
+void bio_await_chain(struct bio *bio);
static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
{
@@ -67,8 +74,11 @@ static inline int bio_queue_enter(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- if (blk_try_enter_queue(q, false))
+ if (blk_try_enter_queue(q, false)) {
+ rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
+ rwsem_release(&q->io_lockdep_map, _RET_IP_);
return 0;
+ }
return __bio_queue_enter(q, bio);
}
@@ -84,21 +94,23 @@ static inline void blk_wait_io(struct completion *done)
wait_for_completion_io(done);
}
+struct block_device *blkdev_get_no_open(dev_t dev, bool autoload);
+void blkdev_put_no_open(struct block_device *bdev);
+
#define BIO_INLINE_VECS 4
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
gfp_t gfp_mask);
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs);
bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
- struct page *page, unsigned len, unsigned offset,
- bool *same_page);
+ struct page *page, unsigned len, unsigned offset);
static inline bool biovec_phys_mergeable(struct request_queue *q,
struct bio_vec *vec1, struct bio_vec *vec2)
{
unsigned long mask = queue_segment_boundary(q);
- phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset;
- phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset;
+ phys_addr_t addr1 = bvec_phys(vec1);
+ phys_addr_t addr2 = bvec_phys(vec2);
/*
* Merging adjacent physical pages may not work correctly under KMSAN
@@ -180,9 +192,11 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq)
return queue_max_segments(rq->q);
}
-static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
- enum req_op op)
+static inline unsigned int blk_queue_get_max_sectors(struct request *rq)
{
+ struct request_queue *q = rq->q;
+ enum req_op op = req_op(rq);
+
if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
return min(q->limits.max_discard_sectors,
UINT_MAX >> SECTOR_SHIFT);
@@ -190,16 +204,28 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
if (unlikely(op == REQ_OP_WRITE_ZEROES))
return q->limits.max_write_zeroes_sectors;
+ if (rq->cmd_flags & REQ_ATOMIC)
+ return q->limits.atomic_write_max_sectors;
+
return q->limits.max_sectors;
}
#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
-bool __bio_integrity_endio(struct bio *);
void bio_integrity_free(struct bio *bio);
+
+/*
+ * Integrity payloads can either be owned by the submitter, in which case
+ * bio_uninit will free them, or owned and generated by the block layer,
+ * in which case we'll verify them here (for reads) and free them before
+ * the bio is handed back to the submitter.
+ */
+bool __bio_integrity_endio(struct bio *bio);
static inline bool bio_integrity_endio(struct bio *bio)
{
- if (bio_integrity(bio))
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+
+ if (bip && (bip->bip_flags & BIP_BLOCK_INTEGRITY))
return __bio_integrity_endio(bio);
return true;
}
@@ -269,6 +295,14 @@ static inline void bio_integrity_free(struct bio *bio)
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);
+enum bio_merge_status {
+ BIO_MERGE_OK,
+ BIO_MERGE_NONE,
+ BIO_MERGE_FAILED,
+};
+
+enum bio_merge_status bio_attempt_back_merge(struct request *req,
+ struct bio *bio, unsigned int nr_segs);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs);
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
@@ -287,11 +321,9 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
bool blk_insert_flush(struct request *rq);
-int elevator_switch(struct request_queue *q, struct elevator_type *new_e);
-void elevator_disable(struct request_queue *q);
-void elevator_exit(struct request_queue *q);
-int elv_register_queue(struct request_queue *q, bool uevent);
-void elv_unregister_queue(struct request_queue *q);
+void elv_update_nr_hw_queues(struct request_queue *q);
+void elevator_set_default(struct request_queue *q);
+void elevator_set_none(struct request_queue *q);
ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
char *buf);
@@ -307,33 +339,92 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *,
const char *, size_t);
-static inline bool bio_may_exceed_limits(struct bio *bio,
- const struct queue_limits *lim)
+struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
+ unsigned *nsegs);
+struct bio *bio_split_write_zeroes(struct bio *bio,
+ const struct queue_limits *lim, unsigned *nsegs);
+struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
+ unsigned *nr_segs);
+struct bio *bio_split_zone_append(struct bio *bio,
+ const struct queue_limits *lim, unsigned *nr_segs);
+
+/*
+ * All drivers must accept single-segment bios that are smaller than PAGE_SIZE.
+ *
+ * This is a quick and dirty check that relies on the fact that bi_io_vec[0] is
+ * always valid if a bio has data. The check might lead to occasional false
+ * positives when bios are cloned, but compared to the performance impact of
+ * cloned bios themselves the loop below doesn't matter anyway.
+ */
+static inline bool bio_may_need_split(struct bio *bio,
+ const struct queue_limits *lim)
+{
+ if (lim->chunk_sectors)
+ return true;
+ if (bio->bi_vcnt != 1)
+ return true;
+ return bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset >
+ lim->min_segment_size;
+}
+
+/**
+ * __bio_split_to_limits - split a bio to fit the queue limits
+ * @bio: bio to be split
+ * @lim: queue limits to split based on
+ * @nr_segs: returns the number of segments in the returned bio
+ *
+ * Check if @bio needs splitting based on the queue limits, and if so split off
+ * a bio fitting the limits from the beginning of @bio and return it. @bio is
+ * shortened to the remainder and re-submitted.
+ *
+ * The split bio is allocated from @q->bio_split, which is provided by the
+ * block layer.
+ */
+static inline struct bio *__bio_split_to_limits(struct bio *bio,
+ const struct queue_limits *lim, unsigned int *nr_segs)
{
switch (bio_op(bio)) {
+ case REQ_OP_READ:
+ case REQ_OP_WRITE:
+ if (bio_may_need_split(bio, lim))
+ return bio_split_rw(bio, lim, nr_segs);
+ *nr_segs = 1;
+ return bio;
+ case REQ_OP_ZONE_APPEND:
+ return bio_split_zone_append(bio, lim, nr_segs);
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
+ return bio_split_discard(bio, lim, nr_segs);
case REQ_OP_WRITE_ZEROES:
- return true; /* non-trivial splitting decisions */
+ return bio_split_write_zeroes(bio, lim, nr_segs);
default:
- break;
+ /* other operations can't be split */
+ *nr_segs = 0;
+ return bio;
}
+}
+/**
+ * get_max_segment_size() - maximum number of bytes to add as a single segment
+ * @lim: Request queue limits.
+ * @paddr: address of the range to add
+ * @len: maximum length available to add at @paddr
+ *
+ * Returns the maximum number of bytes of the range starting at @paddr that can
+ * be added to a single segment.
+ */
+static inline unsigned get_max_segment_size(const struct queue_limits *lim,
+ phys_addr_t paddr, unsigned int len)
+{
/*
- * All drivers must accept single-segments bios that are <= PAGE_SIZE.
- * This is a quick and dirty check that relies on the fact that
- * bi_io_vec[0] is always valid if a bio has data. The check might
- * lead to occasional false negatives when bios are cloned, but compared
- * to the performance impact of cloned bios themselves the loop below
- * doesn't matter anyway.
+ * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
+ * after having calculated the minimum.
*/
- return lim->chunk_sectors || bio->bi_vcnt != 1 ||
- bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
+ return min_t(unsigned long, len,
+ min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr),
+ (unsigned long)lim->max_segment_size - 1) + 1);
}
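A worked example of the overflow-safe clamp in get_max_segment_size(), using made-up values (standalone user-space C, not kernel code):

/*
 * Example: a 64 KiB segment boundary, a 64 KiB max segment size, and a range
 * that starts 16 bytes before a boundary. Reducing both limits by one before
 * taking the minimum avoids overflow when seg_boundary_mask is ULONG_MAX.
 */
#include <stdio.h>

int main(void)
{
	unsigned long seg_boundary_mask = 0xffffUL;	/* 64 KiB boundary */
	unsigned long max_segment_size = 65536UL;
	unsigned long paddr = 0x1000fff0UL;
	unsigned long len = 4096UL;
	unsigned long to_boundary = seg_boundary_mask - (seg_boundary_mask & paddr);
	unsigned long limit = (to_boundary < max_segment_size - 1 ?
			       to_boundary : max_segment_size - 1) + 1;

	printf("max bytes for this segment: %lu\n", len < limit ? len : limit);
	return 0;	/* prints 16 */
}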
-struct bio *__bio_split_to_limits(struct bio *bio,
- const struct queue_limits *lim,
- unsigned int *nr_segs);
int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@@ -343,19 +434,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
enum elv_merge blk_try_merge(struct request *rq, struct bio *bio);
int blk_set_default_limits(struct queue_limits *lim);
+void blk_apply_bdi_limits(struct backing_dev_info *bdi,
+ struct queue_limits *lim);
int blk_dev_init(void);
-/*
- * Contribute to IO statistics IFF:
- *
- * a) it's attached to a gendisk, and
- * b) the queue had IO stats enabled when this request was started
- */
-static inline bool blk_do_io_stat(struct request *rq)
-{
- return (rq->rq_flags & RQF_IO_STAT) && !blk_rq_is_passthrough(rq);
-}
-
void update_io_ticks(struct block_device *part, unsigned long now, bool end);
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
@@ -378,42 +460,78 @@ static inline void ioc_clear_queue(struct request_queue *q)
}
#endif /* CONFIG_BLK_ICQ */
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
-extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
- const char *page, size_t count);
-extern void blk_throtl_bio_endio(struct bio *bio);
-extern void blk_throtl_stat_add(struct request *rq, u64 time);
-#else
-static inline void blk_throtl_bio_endio(struct bio *bio) { }
-static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
-#endif
-
-struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q);
-
-static inline bool blk_queue_may_bounce(struct request_queue *q)
+#ifdef CONFIG_BLK_DEV_ZONED
+void disk_init_zone_resources(struct gendisk *disk);
+void disk_free_zone_resources(struct gendisk *disk);
+static inline bool bio_zone_write_plugging(struct bio *bio)
{
- return IS_ENABLED(CONFIG_BOUNCE) &&
- q->limits.bounce == BLK_BOUNCE_HIGH &&
- max_low_pfn >= max_pfn;
+ return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
}
-
-static inline struct bio *blk_queue_bounce(struct bio *bio,
- struct request_queue *q)
+void blk_zone_write_plug_bio_merged(struct bio *bio);
+void blk_zone_write_plug_init_request(struct request *rq);
+static inline void blk_zone_update_request_bio(struct request *rq,
+ struct bio *bio)
{
- if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio)))
- return __blk_queue_bounce(bio, q);
- return bio;
+ /*
+ * For zone append requests, the request sector indicates the location
+ * at which the BIO data was written. Return this value to the BIO
+ * issuer through the BIO iter sector.
+ * For plugged zone writes, which include emulated zone append, we need
+ * the original BIO sector so that blk_zone_write_plug_bio_endio() can
+ * lookup the zone write plug.
+ */
+ if (req_op(rq) == REQ_OP_ZONE_APPEND ||
+ bio_flagged(bio, BIO_EMULATES_ZONE_APPEND))
+ bio->bi_iter.bi_sector = rq->__sector;
+}
+void blk_zone_write_plug_bio_endio(struct bio *bio);
+static inline void blk_zone_bio_endio(struct bio *bio)
+{
+ /*
+ * For write BIOs to zoned devices, signal the completion of the BIO so
+ * that the next write BIO can be submitted by zone write plugging.
+ */
+ if (bio_zone_write_plugging(bio))
+ blk_zone_write_plug_bio_endio(bio);
}
-#ifdef CONFIG_BLK_DEV_ZONED
-void disk_free_zone_bitmaps(struct gendisk *disk);
+void blk_zone_write_plug_finish_request(struct request *rq);
+static inline void blk_zone_finish_request(struct request *rq)
+{
+ if (rq->rq_flags & RQF_ZONE_WRITE_PLUGGING)
+ blk_zone_write_plug_finish_request(rq);
+}
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
unsigned long arg);
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg);
#else /* CONFIG_BLK_DEV_ZONED */
-static inline void disk_free_zone_bitmaps(struct gendisk *disk) {}
+static inline void disk_init_zone_resources(struct gendisk *disk)
+{
+}
+static inline void disk_free_zone_resources(struct gendisk *disk)
+{
+}
+static inline bool bio_zone_write_plugging(struct bio *bio)
+{
+ return false;
+}
+static inline void blk_zone_write_plug_bio_merged(struct bio *bio)
+{
+}
+static inline void blk_zone_write_plug_init_request(struct request *rq)
+{
+}
+static inline void blk_zone_update_request_bio(struct request *rq,
+ struct bio *bio)
+{
+}
+static inline void blk_zone_bio_endio(struct bio *bio)
+{
+}
+static inline void blk_zone_finish_request(struct request *rq)
+{
+}
static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
unsigned int cmd, unsigned long arg)
{
@@ -428,12 +546,15 @@ static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev,
struct block_device *bdev_alloc(struct gendisk *disk, u8 partno);
void bdev_add(struct block_device *bdev, dev_t dev);
+void bdev_unhash(struct block_device *bdev);
+void bdev_drop(struct block_device *bdev);
int blk_alloc_ext_minor(void);
void blk_free_ext_minor(unsigned int minor);
#define ADDPART_FLAG_NONE 0
#define ADDPART_FLAG_RAID 1
#define ADDPART_FLAG_WHOLEDISK 2
+#define ADDPART_FLAG_READONLY 4
int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
sector_t length);
int bdev_del_partition(struct gendisk *disk, int partno);
@@ -446,10 +567,6 @@ void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors);
struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
struct lock_class_key *lkclass);
-int bio_add_hw_page(struct request_queue *q, struct bio *bio,
- struct page *page, unsigned int len, unsigned int offset,
- unsigned int max_sectors, bool *same_page);
-
/*
* Clean up a page appropriately, where the page may be pinned, may have a
* ref taken on it or neither.
@@ -481,6 +598,7 @@ blk_mode_t file_to_blk_mode(struct file *file);
int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
loff_t lstart, loff_t lend);
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
+int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
extern const struct address_space_operations def_blk_aops;
@@ -600,4 +718,34 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
const struct blk_holder_ops *hops, struct file *bdev_file);
int bdev_permission(dev_t dev, blk_mode_t mode, void *holder);
+void blk_integrity_generate(struct bio *bio);
+void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter);
+void blk_integrity_prepare(struct request *rq);
+void blk_integrity_complete(struct request *rq, unsigned int nr_bytes);
+
+#ifdef CONFIG_LOCKDEP
+static inline void blk_freeze_acquire_lock(struct request_queue *q)
+{
+ if (!q->mq_freeze_disk_dead)
+ rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_);
+ if (!q->mq_freeze_queue_dying)
+ rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_);
+}
+
+static inline void blk_unfreeze_release_lock(struct request_queue *q)
+{
+ if (!q->mq_freeze_queue_dying)
+ rwsem_release(&q->q_lockdep_map, _RET_IP_);
+ if (!q->mq_freeze_disk_dead)
+ rwsem_release(&q->io_lockdep_map, _RET_IP_);
+}
+#else
+static inline void blk_freeze_acquire_lock(struct request_queue *q)
+{
+}
+static inline void blk_unfreeze_release_lock(struct request_queue *q)
+{
+}
+#endif
+
#endif /* BLK_INTERNAL_H */
diff --git a/block/bounce.c b/block/bounce.c
deleted file mode 100644
index d6a5219f29dd..000000000000
--- a/block/bounce.c
+++ /dev/null
@@ -1,269 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* bounce buffer handling for block devices
- *
- * - Split from highmem.c
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/mm.h>
-#include <linux/export.h>
-#include <linux/swap.h>
-#include <linux/gfp.h>
-#include <linux/bio.h>
-#include <linux/pagemap.h>
-#include <linux/mempool.h>
-#include <linux/blkdev.h>
-#include <linux/backing-dev.h>
-#include <linux/init.h>
-#include <linux/hash.h>
-#include <linux/highmem.h>
-#include <linux/printk.h>
-#include <asm/tlbflush.h>
-
-#include <trace/events/block.h>
-#include "blk.h"
-#include "blk-cgroup.h"
-
-#define POOL_SIZE 64
-#define ISA_POOL_SIZE 16
-
-static struct bio_set bounce_bio_set, bounce_bio_split;
-static mempool_t page_pool;
-
-static void init_bounce_bioset(void)
-{
- static bool bounce_bs_setup;
- int ret;
-
- if (bounce_bs_setup)
- return;
-
- ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- BUG_ON(ret);
- if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE))
- BUG_ON(1);
-
- ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0);
- BUG_ON(ret);
- bounce_bs_setup = true;
-}
-
-static __init int init_emergency_pool(void)
-{
- int ret;
-
-#ifndef CONFIG_MEMORY_HOTPLUG
- if (max_pfn <= max_low_pfn)
- return 0;
-#endif
-
- ret = mempool_init_page_pool(&page_pool, POOL_SIZE, 0);
- BUG_ON(ret);
- pr_info("pool size: %d pages\n", POOL_SIZE);
-
- init_bounce_bioset();
- return 0;
-}
-
-__initcall(init_emergency_pool);
-
-/*
- * Simple bounce buffer support for highmem pages. Depending on the
- * queue gfp mask set, *to may or may not be a highmem page. kmap it
- * always, it will do the Right Thing
- */
-static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
-{
- struct bio_vec tovec, fromvec;
- struct bvec_iter iter;
- /*
- * The bio of @from is created by bounce, so we can iterate
- * its bvec from start to end, but the @from->bi_iter can't be
- * trusted because it might be changed by splitting.
- */
- struct bvec_iter from_iter = BVEC_ITER_ALL_INIT;
-
- bio_for_each_segment(tovec, to, iter) {
- fromvec = bio_iter_iovec(from, from_iter);
- if (tovec.bv_page != fromvec.bv_page) {
- /*
- * fromvec->bv_offset and fromvec->bv_len might have
- * been modified by the block layer, so use the original
- * copy, bounce_copy_vec already uses tovec->bv_len
- */
- memcpy_to_bvec(&tovec, page_address(fromvec.bv_page) +
- tovec.bv_offset);
- }
- bio_advance_iter(from, &from_iter, tovec.bv_len);
- }
-}
-
-static void bounce_end_io(struct bio *bio)
-{
- struct bio *bio_orig = bio->bi_private;
- struct bio_vec *bvec, orig_vec;
- struct bvec_iter orig_iter = bio_orig->bi_iter;
- struct bvec_iter_all iter_all;
-
- /*
- * free up bounce indirect pages used
- */
- bio_for_each_segment_all(bvec, bio, iter_all) {
- orig_vec = bio_iter_iovec(bio_orig, orig_iter);
- if (bvec->bv_page != orig_vec.bv_page) {
- dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
- mempool_free(bvec->bv_page, &page_pool);
- }
- bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
- }
-
- bio_orig->bi_status = bio->bi_status;
- bio_endio(bio_orig);
- bio_put(bio);
-}
-
-static void bounce_end_io_write(struct bio *bio)
-{
- bounce_end_io(bio);
-}
-
-static void bounce_end_io_read(struct bio *bio)
-{
- struct bio *bio_orig = bio->bi_private;
-
- if (!bio->bi_status)
- copy_to_high_bio_irq(bio_orig, bio);
-
- bounce_end_io(bio);
-}
-
-static struct bio *bounce_clone_bio(struct bio *bio_src)
-{
- struct bvec_iter iter;
- struct bio_vec bv;
- struct bio *bio;
-
- /*
- * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
- * bio_src->bi_io_vec to bio->bi_io_vec.
- *
- * We can't do that anymore, because:
- *
- * - The point of cloning the biovec is to produce a bio with a biovec
- * the caller can modify: bi_idx and bi_bvec_done should be 0.
- *
- * - The original bio could've had more than BIO_MAX_VECS biovecs; if
- * we tried to clone the whole thing bio_alloc_bioset() would fail.
- * But the clone should succeed as long as the number of biovecs we
- * actually need to allocate is fewer than BIO_MAX_VECS.
- *
- * - Lastly, bi_vcnt should not be looked at or relied upon by code
- * that does not own the bio - reason being drivers don't use it for
- * iterating over the biovec anymore, so expecting it to be kept up
- * to date (i.e. for clones that share the parent biovec) is just
- * asking for trouble and would force extra work.
- */
- bio = bio_alloc_bioset(bio_src->bi_bdev, bio_segments(bio_src),
- bio_src->bi_opf, GFP_NOIO, &bounce_bio_set);
- if (bio_flagged(bio_src, BIO_REMAPPED))
- bio_set_flag(bio, BIO_REMAPPED);
- bio->bi_ioprio = bio_src->bi_ioprio;
- bio->bi_write_hint = bio_src->bi_write_hint;
- bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
- bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
-
- switch (bio_op(bio)) {
- case REQ_OP_DISCARD:
- case REQ_OP_SECURE_ERASE:
- case REQ_OP_WRITE_ZEROES:
- break;
- default:
- bio_for_each_segment(bv, bio_src, iter)
- bio->bi_io_vec[bio->bi_vcnt++] = bv;
- break;
- }
-
- if (bio_crypt_clone(bio, bio_src, GFP_NOIO) < 0)
- goto err_put;
-
- if (bio_integrity(bio_src) &&
- bio_integrity_clone(bio, bio_src, GFP_NOIO) < 0)
- goto err_put;
-
- bio_clone_blkg_association(bio, bio_src);
-
- return bio;
-
-err_put:
- bio_put(bio);
- return NULL;
-}
-
-struct bio *__blk_queue_bounce(struct bio *bio_orig, struct request_queue *q)
-{
- struct bio *bio;
- int rw = bio_data_dir(bio_orig);
- struct bio_vec *to, from;
- struct bvec_iter iter;
- unsigned i = 0, bytes = 0;
- bool bounce = false;
- int sectors;
-
- bio_for_each_segment(from, bio_orig, iter) {
- if (i++ < BIO_MAX_VECS)
- bytes += from.bv_len;
- if (PageHighMem(from.bv_page))
- bounce = true;
- }
- if (!bounce)
- return bio_orig;
-
- /*
- * Individual bvecs might not be logical block aligned. Round down
- * the split size so that each bio is properly block size aligned,
- * even if we do not use the full hardware limits.
- */
- sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >>
- SECTOR_SHIFT;
- if (sectors < bio_sectors(bio_orig)) {
- bio = bio_split(bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
- bio_chain(bio, bio_orig);
- submit_bio_noacct(bio_orig);
- bio_orig = bio;
- }
- bio = bounce_clone_bio(bio_orig);
-
- /*
- * Bvec table can't be updated by bio_for_each_segment_all(),
- * so retrieve bvec from the table directly. This way is safe
- * because the 'bio' is single-page bvec.
- */
- for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) {
- struct page *bounce_page;
-
- if (!PageHighMem(to->bv_page))
- continue;
-
- bounce_page = mempool_alloc(&page_pool, GFP_NOIO);
- inc_zone_page_state(bounce_page, NR_BOUNCE);
-
- if (rw == WRITE) {
- flush_dcache_page(to->bv_page);
- memcpy_from_bvec(page_address(bounce_page), to);
- }
- to->bv_page = bounce_page;
- }
-
- trace_block_bio_bounce(bio_orig);
-
- bio->bi_flags |= (1 << BIO_BOUNCED);
-
- if (rw == READ)
- bio->bi_end_io = bounce_end_io_read;
- else
- bio->bi_end_io = bounce_end_io_write;
-
- bio->bi_private = bio_orig;
- return bio;
-}
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index bcc7dee6abce..9ceb5d0832f5 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -219,7 +219,7 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req)
if (!buf->sg_list)
return -ENOMEM;
sg_init_table(buf->sg_list, req->nr_phys_segments);
- buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list);
+ buf->sg_cnt = blk_rq_map_sg(req, buf->sg_list);
buf->payload_len = blk_rq_bytes(req);
return 0;
}
@@ -354,12 +354,14 @@ static const struct blk_mq_ops bsg_mq_ops = {
* bsg_setup_queue - Create and add the bsg hooks so we can receive requests
* @dev: device to attach bsg device to
* @name: device to give bsg device
+ * @lim: queue limits for the bsg queue
* @job_fn: bsg job handler
* @timeout: timeout handler function pointer
* @dd_job_size: size of LLD data needed for each job
*/
struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
- bsg_job_fn *job_fn, bsg_timeout_fn *timeout, int dd_job_size)
+ struct queue_limits *lim, bsg_job_fn *job_fn,
+ bsg_timeout_fn *timeout, int dd_job_size)
{
struct bsg_set *bset;
struct blk_mq_tag_set *set;
@@ -379,17 +381,16 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
set->queue_depth = 128;
set->numa_node = NUMA_NO_NODE;
set->cmd_size = sizeof(struct bsg_job) + dd_job_size;
- set->flags = BLK_MQ_F_NO_SCHED | BLK_MQ_F_BLOCKING;
+ set->flags = BLK_MQ_F_BLOCKING;
if (blk_mq_alloc_tag_set(set))
goto out_tag_set;
- q = blk_mq_alloc_queue(set, NULL, NULL);
+ q = blk_mq_alloc_queue(set, lim, dev);
if (IS_ERR(q)) {
ret = PTR_ERR(q);
goto out_queue;
}
- q->queuedata = dev;
blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
bset->bd = bsg_register_queue(q, dev, name, bsg_transport_sg_io_fn);
diff --git a/block/early-lookup.c b/block/early-lookup.c
index 3effbd0d35e9..3fb57f7d2b12 100644
--- a/block/early-lookup.c
+++ b/block/early-lookup.c
@@ -78,7 +78,7 @@ static int __init devt_from_partuuid(const char *uuid_str, dev_t *devt)
* to the partition number found by UUID.
*/
*devt = part_devt(dev_to_disk(dev),
- dev_to_bdev(dev)->bd_partno + offset);
+ bdev_partno(dev_to_bdev(dev)) + offset);
} else {
*devt = dev->devt;
}
diff --git a/block/elevator.c b/block/elevator.c
index 5ff093cb3cf8..ab22542e6cf0 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -45,6 +45,17 @@
#include "blk-wbt.h"
#include "blk-cgroup.h"
+/* Holding context data for changing elevator */
+struct elv_change_ctx {
+ const char *name;
+ bool no_uevent;
+
+ /* for unregistering old elevator */
+ struct elevator_queue *old;
+ /* for registering new elevator */
+ struct elevator_queue *new;
+};
+
static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);
@@ -83,13 +94,6 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
}
EXPORT_SYMBOL(elv_bio_merge_ok);
-static inline bool elv_support_features(struct request_queue *q,
- const struct elevator_type *e)
-{
- return (q->required_elevator_features & e->elevator_features) ==
- q->required_elevator_features;
-}
-
/**
* elevator_match - Check whether @e's name or alias matches @name
* @e: Scheduler to test
@@ -113,14 +117,13 @@ static struct elevator_type *__elevator_find(const char *name)
return NULL;
}
-static struct elevator_type *elevator_find_get(struct request_queue *q,
- const char *name)
+static struct elevator_type *elevator_find_get(const char *name)
{
struct elevator_type *e;
spin_lock(&elv_list_lock);
e = __elevator_find(name);
- if (e && (!elv_support_features(q, e) || !elevator_tryget(e)))
+ if (e && (!elevator_tryget(e)))
e = NULL;
spin_unlock(&elv_list_lock);
return e;
@@ -156,18 +159,18 @@ static void elevator_release(struct kobject *kobj)
kfree(e);
}
-void elevator_exit(struct request_queue *q)
+static void elevator_exit(struct request_queue *q)
{
struct elevator_queue *e = q->elevator;
+ lockdep_assert_held(&q->elevator_lock);
+
ioc_clear_queue(q);
blk_mq_sched_free_rqs(q);
mutex_lock(&e->sysfs_lock);
blk_mq_exit_sched(q, e);
mutex_unlock(&e->sysfs_lock);
-
- kobject_put(&e->kobj);
}
static inline void __elv_rqhash_del(struct request *rq)
@@ -413,21 +416,22 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
return NULL;
}
-#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
+#define to_elv(atr) container_of_const((atr), struct elv_fs_entry, attr)
static ssize_t
elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
- struct elv_fs_entry *entry = to_elv(attr);
+ const struct elv_fs_entry *entry = to_elv(attr);
struct elevator_queue *e;
- ssize_t error;
+ ssize_t error = -ENODEV;
if (!entry->show)
return -EIO;
e = container_of(kobj, struct elevator_queue, kobj);
mutex_lock(&e->sysfs_lock);
- error = e->type ? entry->show(e, page) : -ENOENT;
+ if (!test_bit(ELEVATOR_FLAG_DYING, &e->flags))
+ error = entry->show(e, page);
mutex_unlock(&e->sysfs_lock);
return error;
}
@@ -436,16 +440,17 @@ static ssize_t
elv_attr_store(struct kobject *kobj, struct attribute *attr,
const char *page, size_t length)
{
- struct elv_fs_entry *entry = to_elv(attr);
+ const struct elv_fs_entry *entry = to_elv(attr);
struct elevator_queue *e;
- ssize_t error;
+ ssize_t error = -ENODEV;
if (!entry->store)
return -EIO;
e = container_of(kobj, struct elevator_queue, kobj);
mutex_lock(&e->sysfs_lock);
- error = e->type ? entry->store(e, page, length) : -ENOENT;
+ if (!test_bit(ELEVATOR_FLAG_DYING, &e->flags))
+ error = entry->store(e, page, length);
mutex_unlock(&e->sysfs_lock);
return error;
}
@@ -460,16 +465,15 @@ static const struct kobj_type elv_ktype = {
.release = elevator_release,
};
-int elv_register_queue(struct request_queue *q, bool uevent)
+static int elv_register_queue(struct request_queue *q,
+ struct elevator_queue *e,
+ bool uevent)
{
- struct elevator_queue *e = q->elevator;
int error;
- lockdep_assert_held(&q->sysfs_lock);
-
error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched");
if (!error) {
- struct elv_fs_entry *attr = e->type->elevator_attrs;
+ const struct elv_fs_entry *attr = e->type->elevator_attrs;
if (attr) {
while (attr->attr.name) {
if (sysfs_create_file(&e->kobj, &attr->attr))
@@ -480,20 +484,25 @@ int elv_register_queue(struct request_queue *q, bool uevent)
if (uevent)
kobject_uevent(&e->kobj, KOBJ_ADD);
+ /*
+	 * Sched is initialized, so it is now ready to be exported via
+	 * debugfs
+ */
+ blk_mq_sched_reg_debugfs(q);
set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags);
}
return error;
}
-void elv_unregister_queue(struct request_queue *q)
+static void elv_unregister_queue(struct request_queue *q,
+ struct elevator_queue *e)
{
- struct elevator_queue *e = q->elevator;
-
- lockdep_assert_held(&q->sysfs_lock);
-
if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) {
kobject_uevent(&e->kobj, KOBJ_REMOVE);
kobject_del(&e->kobj);
+
+ /* unexport via debugfs before exiting sched */
+ blk_mq_sched_unreg_debugfs(q);
}
}
@@ -555,238 +564,267 @@ void elv_unregister(struct elevator_type *e)
}
EXPORT_SYMBOL_GPL(elv_unregister);
-static inline bool elv_support_iosched(struct request_queue *q)
-{
- if (!queue_is_mq(q) ||
- (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED)))
- return false;
- return true;
-}
-
/*
- * For single queue devices, default to using mq-deadline. If we have multiple
- * queues or mq-deadline is not available, default to "none".
+ * Switch to new_e io scheduler.
+ *
+ * If switching fails, we are most likely running out of memory and not able
+ * to restore the old io scheduler, so the io scheduler is left set to none.
*/
-static struct elevator_type *elevator_get_default(struct request_queue *q)
+static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
{
- if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
- return NULL;
+ struct elevator_type *new_e = NULL;
+ int ret = 0;
- if (q->nr_hw_queues != 1 &&
- !blk_mq_is_shared_tags(q->tag_set->flags))
- return NULL;
+ WARN_ON_ONCE(q->mq_freeze_depth == 0);
+ lockdep_assert_held(&q->elevator_lock);
- return elevator_find_get(q, "mq-deadline");
-}
+ if (strncmp(ctx->name, "none", 4)) {
+ new_e = elevator_find_get(ctx->name);
+ if (!new_e)
+ return -EINVAL;
+ }
-/*
- * Get the first elevator providing the features required by the request queue.
- * Default to "none" if no matching elevator is found.
- */
-static struct elevator_type *elevator_get_by_features(struct request_queue *q)
-{
- struct elevator_type *e, *found = NULL;
+ blk_mq_quiesce_queue(q);
- spin_lock(&elv_list_lock);
+ if (q->elevator) {
+ ctx->old = q->elevator;
+ elevator_exit(q);
+ }
- list_for_each_entry(e, &elv_list, list) {
- if (elv_support_features(q, e)) {
- found = e;
- break;
- }
+ if (new_e) {
+ ret = blk_mq_init_sched(q, new_e);
+ if (ret)
+ goto out_unfreeze;
+ ctx->new = q->elevator;
+ } else {
+ blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
+ q->elevator = NULL;
+ q->nr_requests = q->tag_set->queue_depth;
}
+ blk_add_trace_msg(q, "elv switch: %s", ctx->name);
- if (found && !elevator_tryget(found))
- found = NULL;
+out_unfreeze:
+ blk_mq_unquiesce_queue(q);
- spin_unlock(&elv_list_lock);
- return found;
+ if (ret) {
+ pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n",
+ new_e->elevator_name);
+ }
+
+ if (new_e)
+ elevator_put(new_e);
+ return ret;
}
-/*
- * For a device queue that has no required features, use the default elevator
- * settings. Otherwise, use the first elevator available matching the required
- * features. If no suitable elevator is find or if the chosen elevator
- * initialization fails, fall back to the "none" elevator (no elevator).
- */
-void elevator_init_mq(struct request_queue *q)
+static void elv_exit_and_release(struct request_queue *q)
{
- struct elevator_type *e;
- int err;
+ struct elevator_queue *e;
+ unsigned memflags;
- if (!elv_support_iosched(q))
- return;
+ memflags = blk_mq_freeze_queue(q);
+ mutex_lock(&q->elevator_lock);
+ e = q->elevator;
+ elevator_exit(q);
+ mutex_unlock(&q->elevator_lock);
+ blk_mq_unfreeze_queue(q, memflags);
+ if (e)
+ kobject_put(&e->kobj);
+}
- WARN_ON_ONCE(blk_queue_registered(q));
+static int elevator_change_done(struct request_queue *q,
+ struct elv_change_ctx *ctx)
+{
+ int ret = 0;
- if (unlikely(q->elevator))
- return;
+ if (ctx->old) {
+ bool enable_wbt = test_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT,
+ &ctx->old->flags);
- if (!q->required_elevator_features)
- e = elevator_get_default(q);
- else
- e = elevator_get_by_features(q);
- if (!e)
- return;
+ elv_unregister_queue(q, ctx->old);
+ kobject_put(&ctx->old->kobj);
+ if (enable_wbt)
+ wbt_enable_default(q->disk);
+ }
+ if (ctx->new) {
+ ret = elv_register_queue(q, ctx->new, !ctx->no_uevent);
+ if (ret)
+ elv_exit_and_release(q);
+ }
+ return ret;
+}
+
+/*
+ * Switch this queue to the given IO scheduler.
+ */
+static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
+{
+ unsigned int memflags;
+ int ret = 0;
+
+ lockdep_assert_held(&q->tag_set->update_nr_hwq_lock);
+ memflags = blk_mq_freeze_queue(q);
/*
- * We are called before adding disk, when there isn't any FS I/O,
+ * May be called before adding disk, when there isn't any FS I/O,
* so freezing queue plus canceling dispatch work is enough to
* drain any dispatch activities originated from passthrough
* requests, then no need to quiesce queue which may add long boot
* latency, especially when lots of disks are involved.
+ *
+	 * Disk isn't added yet, so the queue lock can only be verified manually.
*/
- blk_mq_freeze_queue(q);
blk_mq_cancel_work_sync(q);
+ mutex_lock(&q->elevator_lock);
+ if (!(q->elevator && elevator_match(q->elevator->type, ctx->name)))
+ ret = elevator_switch(q, ctx);
+ mutex_unlock(&q->elevator_lock);
+ blk_mq_unfreeze_queue(q, memflags);
+ if (!ret)
+ ret = elevator_change_done(q, ctx);
- err = blk_mq_init_sched(q, e);
-
- blk_mq_unfreeze_queue(q);
-
- if (err) {
- pr_warn("\"%s\" elevator initialization failed, "
- "falling back to \"none\"\n", e->elevator_name);
- }
-
- elevator_put(e);
+ return ret;
}
/*
- * Switch to new_e io scheduler.
- *
- * If switching fails, we are most likely running out of memory and not able
- * to restore the old io scheduler, so leaving the io scheduler being none.
+ * The I/O scheduler depends on the number of hardware queues, this forces a
+ * reattachment when nr_hw_queues changes.
*/
-int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
+void elv_update_nr_hw_queues(struct request_queue *q)
{
- int ret;
-
- lockdep_assert_held(&q->sysfs_lock);
-
- blk_mq_freeze_queue(q);
- blk_mq_quiesce_queue(q);
-
- if (q->elevator) {
- elv_unregister_queue(q);
- elevator_exit(q);
- }
+ struct elv_change_ctx ctx = {};
+ int ret = -ENODEV;
- ret = blk_mq_init_sched(q, new_e);
- if (ret)
- goto out_unfreeze;
+ WARN_ON_ONCE(q->mq_freeze_depth == 0);
- ret = elv_register_queue(q, true);
- if (ret) {
- elevator_exit(q);
- goto out_unfreeze;
- }
- blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+ mutex_lock(&q->elevator_lock);
+ if (q->elevator && !blk_queue_dying(q) && blk_queue_registered(q)) {
+ ctx.name = q->elevator->type->elevator_name;
-out_unfreeze:
- blk_mq_unquiesce_queue(q);
- blk_mq_unfreeze_queue(q);
-
- if (ret) {
- pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n",
- new_e->elevator_name);
+ /* force to reattach elevator after nr_hw_queue is updated */
+ ret = elevator_switch(q, &ctx);
}
-
- return ret;
+ mutex_unlock(&q->elevator_lock);
+ blk_mq_unfreeze_queue_nomemrestore(q);
+ if (!ret)
+ WARN_ON_ONCE(elevator_change_done(q, &ctx));
}
-void elevator_disable(struct request_queue *q)
+/*
+ * Use the default elevator settings. If the chosen elevator initialization
+ * fails, fall back to the "none" elevator (no elevator).
+ */
+void elevator_set_default(struct request_queue *q)
{
- lockdep_assert_held(&q->sysfs_lock);
+ struct elv_change_ctx ctx = {
+ .name = "mq-deadline",
+ .no_uevent = true,
+ };
+ int err = 0;
- blk_mq_freeze_queue(q);
- blk_mq_quiesce_queue(q);
+ /* now we allow to switch elevator */
+ blk_queue_flag_clear(QUEUE_FLAG_NO_ELV_SWITCH, q);
- elv_unregister_queue(q);
- elevator_exit(q);
- blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
- q->elevator = NULL;
- q->nr_requests = q->tag_set->queue_depth;
- blk_add_trace_msg(q, "elv switch: none");
+ if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
+ return;
- blk_mq_unquiesce_queue(q);
- blk_mq_unfreeze_queue(q);
+ /*
+ * For single queue devices, default to using mq-deadline. If we
+ * have multiple queues or mq-deadline is not available, default
+ * to "none".
+ */
+ if (elevator_find_get(ctx.name) && (q->nr_hw_queues == 1 ||
+ blk_mq_is_shared_tags(q->tag_set->flags)))
+ err = elevator_change(q, &ctx);
+ if (err < 0)
+ pr_warn("\"%s\" elevator initialization, failed %d, "
+ "falling back to \"none\"\n", ctx.name, err);
}
-/*
- * Switch this queue to the given IO scheduler.
- */
-static int elevator_change(struct request_queue *q, const char *elevator_name)
+void elevator_set_none(struct request_queue *q)
{
- struct elevator_type *e;
- int ret;
+ struct elv_change_ctx ctx = {
+ .name = "none",
+ };
+ int err;
- /* Make sure queue is not in the middle of being removed */
- if (!blk_queue_registered(q))
- return -ENOENT;
+ err = elevator_change(q, &ctx);
+ if (err < 0)
+ pr_warn("%s: set none elevator failed %d\n", __func__, err);
+}
- if (!strncmp(elevator_name, "none", 4)) {
- if (q->elevator)
- elevator_disable(q);
- return 0;
- }
+static void elv_iosched_load_module(const char *elevator_name)
+{
+ struct elevator_type *found;
- if (q->elevator && elevator_match(q->elevator->type, elevator_name))
- return 0;
+ spin_lock(&elv_list_lock);
+ found = __elevator_find(elevator_name);
+ spin_unlock(&elv_list_lock);
- e = elevator_find_get(q, elevator_name);
- if (!e) {
+ if (!found)
request_module("%s-iosched", elevator_name);
- e = elevator_find_get(q, elevator_name);
- if (!e)
- return -EINVAL;
- }
- ret = elevator_switch(q, e);
- elevator_put(e);
- return ret;
}
-ssize_t elv_iosched_store(struct request_queue *q, const char *buf,
+ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
size_t count)
{
char elevator_name[ELV_NAME_MAX];
+ struct elv_change_ctx ctx = {};
int ret;
+ struct request_queue *q = disk->queue;
+ struct blk_mq_tag_set *set = q->tag_set;
- if (!elv_support_iosched(q))
- return count;
+ /* Make sure queue is not in the middle of being removed */
+ if (!blk_queue_registered(q))
+ return -ENOENT;
+ /*
+ * If the attribute needs to load a module, do it before freezing the
+ * queue to ensure that the module file can be read when the request
+ * queue is the one for the device storing the module file.
+ */
strscpy(elevator_name, buf, sizeof(elevator_name));
- ret = elevator_change(q, strstrip(elevator_name));
- if (!ret)
- return count;
+ ctx.name = strstrip(elevator_name);
+
+ elv_iosched_load_module(ctx.name);
+
+ down_read(&set->update_nr_hwq_lock);
+ if (!blk_queue_no_elv_switch(q)) {
+ ret = elevator_change(q, &ctx);
+ if (!ret)
+ ret = count;
+ } else {
+ ret = -ENOENT;
+ }
+ up_read(&set->update_nr_hwq_lock);
return ret;
}
-ssize_t elv_iosched_show(struct request_queue *q, char *name)
+ssize_t elv_iosched_show(struct gendisk *disk, char *name)
{
- struct elevator_queue *eq = q->elevator;
+ struct request_queue *q = disk->queue;
struct elevator_type *cur = NULL, *e;
int len = 0;
- if (!elv_support_iosched(q))
- return sprintf(name, "none\n");
-
+ mutex_lock(&q->elevator_lock);
if (!q->elevator) {
len += sprintf(name+len, "[none] ");
} else {
len += sprintf(name+len, "none ");
- cur = eq->type;
+ cur = q->elevator->type;
}
spin_lock(&elv_list_lock);
list_for_each_entry(e, &elv_list, list) {
if (e == cur)
len += sprintf(name+len, "[%s] ", e->elevator_name);
- else if (elv_support_features(q, e))
+ else
len += sprintf(name+len, "%s ", e->elevator_name);
}
spin_unlock(&elv_list_lock);
len += sprintf(name+len, "\n");
+ mutex_unlock(&q->elevator_lock);
+
return len;
}
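
For reference, a minimal userspace sketch (not part of the patch) of how this attribute pair is exercised: elv_iosched_show() lists the registered schedulers with the active one in brackets (for example "none [mq-deadline] kyber bfq"), and elv_iosched_store() accepts a scheduler name written to the same file. The disk name "sda" below is only an example; the path follows the usual /sys/block/<disk>/queue/scheduler layout.

#include <stdio.h>

/* Switch the I/O scheduler by writing its name to the sysfs attribute
 * backed by elv_iosched_store(). Illustrative only. */
static int set_scheduler(const char *disk, const char *sched)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/block/%s/queue/scheduler", disk);
	f = fopen(path, "w");
	if (!f)
		return -1;
	/* The kernel strips trailing whitespace, so a bare name is enough. */
	if (fputs(sched, f) < 0) {
		fclose(f);
		return -1;
	}
	return fclose(f);
}

int main(void)
{
	return set_scheduler("sda", "mq-deadline") ? 1 : 0;
}
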
diff --git a/block/elevator.h b/block/elevator.h
index 7ca3d7b6ed82..a07ce773a38f 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -71,10 +71,9 @@ struct elevator_type
size_t icq_size; /* see iocontext.h */
size_t icq_align; /* ditto */
- struct elv_fs_entry *elevator_attrs;
+ const struct elv_fs_entry *elevator_attrs;
const char *elevator_name;
const char *elevator_alias;
- const unsigned int elevator_features;
struct module *elevator_owner;
#ifdef CONFIG_BLK_DEBUG_FS
const struct blk_mq_debugfs_attr *queue_debugfs_attrs;
@@ -122,7 +121,8 @@ struct elevator_queue
};
#define ELEVATOR_FLAG_REGISTERED 0
-#define ELEVATOR_FLAG_DISABLE_WBT 1
+#define ELEVATOR_FLAG_DYING 1
+#define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT 2
/*
* block elevator interface
@@ -148,8 +148,8 @@ extern void elv_unregister(struct elevator_type *);
/*
* io scheduler sysfs switching
*/
-extern ssize_t elv_iosched_show(struct request_queue *, char *);
-extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t);
+ssize_t elv_iosched_show(struct gendisk *disk, char *page);
+ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count);
extern bool elv_bio_merge_ok(struct request *, struct bio *);
extern struct elevator_queue *elevator_alloc(struct request_queue *,
@@ -183,4 +183,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
#define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist)
#define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist)
+void blk_mq_sched_reg_debugfs(struct request_queue *q);
+void blk_mq_sched_unreg_debugfs(struct request_queue *q);
+
#endif /* _ELEVATOR_H */
diff --git a/block/fops.c b/block/fops.c
index 679d9b752fe8..1309861d4c2c 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -17,6 +17,7 @@
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/module.h>
+#include <linux/io_uring/cmd.h>
#include "blk.h"
static inline struct inode *bdev_file_inode(struct file *file)
@@ -34,28 +35,26 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
return opf;
}
-static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
- struct iov_iter *iter)
+static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb,
+ struct iov_iter *iter)
{
- return pos & (bdev_logical_block_size(bdev) - 1) ||
+ return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) ||
!bdev_iter_is_aligned(bdev, iter);
}
#define DIO_INLINE_BIO_VECS 4
static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
- struct iov_iter *iter, unsigned int nr_pages)
+ struct iov_iter *iter, struct block_device *bdev,
+ unsigned int nr_pages)
{
- struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
loff_t pos = iocb->ki_pos;
bool should_dirty = false;
struct bio bio;
ssize_t ret;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
+ WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA);
if (nr_pages <= DIO_INLINE_BIO_VECS)
vecs = inline_vecs;
else {
@@ -74,7 +73,10 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
}
bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio.bi_write_stream = iocb->ki_write_stream;
bio.bi_ioprio = iocb->ki_ioprio;
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ bio.bi_opf |= REQ_ATOMIC;
ret = bio_iov_iter_get_pages(&bio, iter);
if (unlikely(ret))
@@ -124,12 +126,16 @@ static void blkdev_bio_end_io(struct bio *bio)
{
struct blkdev_dio *dio = bio->bi_private;
bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;
+ bool is_sync = dio->flags & DIO_IS_SYNC;
if (bio->bi_status && !dio->bio.bi_status)
dio->bio.bi_status = bio->bi_status;
+ if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA))
+ bio_integrity_unmap_user(bio);
+
if (atomic_dec_and_test(&dio->ref)) {
- if (!(dio->flags & DIO_IS_SYNC)) {
+ if (!is_sync) {
struct kiocb *iocb = dio->iocb;
ssize_t ret;
@@ -161,9 +167,8 @@ static void blkdev_bio_end_io(struct bio *bio)
}
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- unsigned int nr_pages)
+ struct block_device *bdev, unsigned int nr_pages)
{
- struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
struct blk_plug plug;
struct blkdev_dio *dio;
struct bio *bio;
@@ -172,9 +177,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
loff_t pos = iocb->ki_pos;
int ret = 0;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@ -205,6 +207,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
for (;;) {
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio->bi_write_stream = iocb->ki_write_stream;
bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio;
@@ -225,14 +228,16 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* a retry of this from blocking context.
*/
if (unlikely(iov_iter_count(iter))) {
- bio_release_pages(bio, false);
- bio_clear_flag(bio, BIO_REFFED);
- bio_put(bio);
- blk_finish_plug(&plug);
- return -EAGAIN;
+ ret = -EAGAIN;
+ goto fail;
}
bio->bi_opf |= REQ_NOWAIT;
}
+ if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) {
+ ret = bio_integrity_map_iter(bio, iocb->private);
+ if (unlikely(ret))
+ goto fail;
+ }
if (is_read) {
if (dio->flags & DIO_SHOULD_DIRTY)
@@ -273,6 +278,12 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
bio_put(&dio->bio);
return ret;
+fail:
+ bio_release_pages(bio, false);
+ bio_clear_flag(bio, BIO_REFFED);
+ bio_put(bio);
+ blk_finish_plug(&plug);
+ return ret;
}
static void blkdev_bio_end_io_async(struct bio *bio)
@@ -290,6 +301,9 @@ static void blkdev_bio_end_io_async(struct bio *bio)
ret = blk_status_to_errno(bio->bi_status);
}
+ if (iocb->ki_flags & IOCB_HAS_METADATA)
+ bio_integrity_unmap_user(bio);
+
iocb->ki_complete(iocb, ret);
if (dio->flags & DIO_SHOULD_DIRTY) {
@@ -302,9 +316,9 @@ static void blkdev_bio_end_io_async(struct bio *bio)
static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
struct iov_iter *iter,
+ struct block_device *bdev,
unsigned int nr_pages)
{
- struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
bool is_read = iov_iter_rw(iter) == READ;
blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
struct blkdev_dio *dio;
@@ -312,9 +326,6 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
loff_t pos = iocb->ki_pos;
int ret = 0;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@ -324,6 +335,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
dio->iocb = iocb;
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio->bi_write_stream = iocb->ki_write_stream;
bio->bi_end_io = blkdev_bio_end_io_async;
bio->bi_ioprio = iocb->ki_ioprio;
@@ -337,10 +349,8 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
bio_iov_bvec_set(bio, iter);
} else {
ret = bio_iov_iter_get_pages(bio, iter);
- if (unlikely(ret)) {
- bio_put(bio);
- return ret;
- }
+ if (unlikely(ret))
+ goto out_bio_put;
}
dio->size = bio->bi_iter.bi_size;
@@ -353,6 +363,16 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
task_io_account_write(bio->bi_iter.bi_size);
}
+ if (iocb->ki_flags & IOCB_HAS_METADATA) {
+ ret = bio_integrity_map_iter(bio, iocb->private);
+ WRITE_ONCE(iocb->private, NULL);
+ if (unlikely(ret))
+ goto out_bio_put;
+ }
+
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ bio->bi_opf |= REQ_ATOMIC;
+
if (iocb->ki_flags & IOCB_NOWAIT)
bio->bi_opf |= REQ_NOWAIT;
@@ -364,22 +384,53 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
submit_bio(bio);
}
return -EIOCBQUEUED;
+
+out_bio_put:
+ bio_put(bio);
+ return ret;
}
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
+ struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
unsigned int nr_pages;
if (!iov_iter_count(iter))
return 0;
+ if (blkdev_dio_invalid(bdev, iocb, iter))
+ return -EINVAL;
+
+ if (iov_iter_rw(iter) == WRITE) {
+ u16 max_write_streams = bdev_max_write_streams(bdev);
+
+ if (iocb->ki_write_stream) {
+ if (iocb->ki_write_stream > max_write_streams)
+ return -EINVAL;
+ } else if (max_write_streams) {
+ enum rw_hint write_hint =
+ file_inode(iocb->ki_filp)->i_write_hint;
+
+ /*
+ * Just use the write hint as the write stream for block
+ * device writes. This assumes no file system is mounted
+ * that would use the streams differently.
+ */
+ if (write_hint <= max_write_streams)
+ iocb->ki_write_stream = write_hint;
+ }
+ }
+
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
if (likely(nr_pages <= BIO_MAX_VECS)) {
if (is_sync_kiocb(iocb))
- return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
- return __blkdev_direct_IO_async(iocb, iter, nr_pages);
+ return __blkdev_direct_IO_simple(iocb, iter, bdev,
+ nr_pages);
+ return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
+ } else if (iocb->ki_flags & IOCB_ATOMIC) {
+ return -EINVAL;
}
- return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
+ return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
}
static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
@@ -388,10 +439,11 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
struct block_device *bdev = I_BDEV(inode);
loff_t isize = i_size_read(inode);
+ if (offset >= isize)
+ return -EIO;
+
iomap->bdev = bdev;
iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
- if (iomap->offset >= isize)
- return -EIO;
iomap->type = IOMAP_MAPPED;
iomap->addr = iomap->offset;
iomap->length = isize - iomap->offset;
@@ -422,12 +474,13 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock,
static int blkdev_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct folio *folio = NULL;
struct blk_plug plug;
int err;
blk_start_plug(&plug);
- err = write_cache_pages(mapping, wbc, block_write_full_folio,
- blkdev_get_block);
+ while ((folio = writeback_iter(mapping, wbc, folio, &err)))
+ err = block_write_full_folio(folio, wbc, blkdev_get_block);
blk_finish_plug(&plug);
return err;
@@ -444,20 +497,20 @@ static void blkdev_readahead(struct readahead_control *rac)
}
static int blkdev_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
{
- return block_write_begin(mapping, pos, len, pagep, blkdev_get_block);
+ return block_write_begin(mapping, pos, len, foliop, blkdev_get_block);
}
static int blkdev_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied, struct page *page,
+ loff_t pos, unsigned len, unsigned copied, struct folio *folio,
void *fsdata)
{
int ret;
- ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+ ret = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return ret;
}
@@ -613,10 +666,13 @@ static int blkdev_open(struct inode *inode, struct file *filp)
if (ret)
return ret;
- bdev = blkdev_get_no_open(inode->i_rdev);
+ bdev = blkdev_get_no_open(inode->i_rdev, true);
if (!bdev)
return -ENXIO;
+ if (bdev_can_atomic_write(bdev))
+ filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
+
ret = bdev_open(bdev, mode, filp->private_data, NULL, filp);
if (ret)
blkdev_put_no_open(bdev);
@@ -655,7 +711,7 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
{
- return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops);
+ return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL);
}
/*
@@ -668,8 +724,9 @@ static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct block_device *bdev = I_BDEV(file->f_mapping->host);
- struct inode *bd_inode = bdev->bd_inode;
+ struct inode *bd_inode = bdev_file_inode(file);
+ struct block_device *bdev = I_BDEV(bd_inode);
+ bool atomic = iocb->ki_flags & IOCB_ATOMIC;
loff_t size = bdev_nr_bytes(bdev);
size_t shorted = 0;
ssize_t ret;
@@ -689,8 +746,16 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
return -EOPNOTSUPP;
+ if (atomic) {
+ ret = generic_atomic_write_valid(iocb, from);
+ if (ret)
+ return ret;
+ }
+
size -= iocb->ki_pos;
if (iov_iter_count(from) > size) {
+ if (atomic)
+ return -EINVAL;
shorted = iov_iter_count(from) - size;
iov_iter_truncate(from, size);
}
@@ -705,7 +770,14 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = direct_write_fallback(iocb, from, ret,
blkdev_buffered_write(iocb, from));
} else {
+ /*
+ * Take i_rwsem and invalidate_lock to avoid racing with
+ * set_blocksize changing i_blkbits/folio order and punching
+ * out the pagecache.
+ */
+ inode_lock_shared(bd_inode);
ret = blkdev_buffered_write(iocb, from);
+ inode_unlock_shared(bd_inode);
}
if (ret > 0)
@@ -716,6 +788,7 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
+ struct inode *bd_inode = bdev_file_inode(iocb->ki_filp);
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
loff_t size = bdev_nr_bytes(bdev);
loff_t pos = iocb->ki_pos;
@@ -742,16 +815,23 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
file_accessed(iocb->ki_filp);
ret = blkdev_direct_IO(iocb, to);
- if (ret >= 0) {
+ if (ret > 0) {
iocb->ki_pos += ret;
count -= ret;
}
- iov_iter_revert(to, count - iov_iter_count(to));
+ if (ret != -EIOCBQUEUED)
+ iov_iter_revert(to, count - iov_iter_count(to));
if (ret < 0 || !count)
goto reexpand;
}
+ /*
+ * Take i_rwsem and invalidate_lock to avoid racing with set_blocksize
+ * changing i_blkbits/folio order and punching out the pagecache.
+ */
+ inode_lock_shared(bd_inode);
ret = filemap_read(iocb, to, ret);
+ inode_unlock_shared(bd_inode);
reexpand:
if (unlikely(shorted))
@@ -761,7 +841,7 @@ reexpand:
#define BLKDEV_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
- FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
+ FALLOC_FL_ZERO_RANGE)
static long blkdev_fallocate(struct file *file, int mode, loff_t start,
loff_t len)
@@ -794,6 +874,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
if ((start | len) & (bdev_logical_block_size(bdev) - 1))
return -EINVAL;
+ inode_lock(inode);
filemap_invalidate_lock(inode->i_mapping);
/*
@@ -820,20 +901,13 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
len >> SECTOR_SHIFT, GFP_KERNEL,
BLKDEV_ZERO_NOFALLBACK);
break;
- case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
- error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
- if (error)
- goto fail;
-
- error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
- len >> SECTOR_SHIFT, GFP_KERNEL);
- break;
default:
error = -EOPNOTSUPP;
}
fail:
filemap_invalidate_unlock(inode->i_mapping);
+ inode_unlock(inode);
return error;
}
@@ -863,6 +937,8 @@ const struct file_operations def_blk_fops = {
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = blkdev_fallocate,
+ .uring_cmd = blkdev_uring_cmd,
+ .fop_flags = FOP_BUFFER_RASYNC,
};
static __init int blkdev_init(void)
diff --git a/block/genhd.c b/block/genhd.c
index bb29a68e1d67..8171a6bc3210 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -58,6 +58,13 @@ static DEFINE_IDA(ext_devt_ida);
void set_capacity(struct gendisk *disk, sector_t sectors)
{
+ if (sectors > BLK_DEV_MAX_SECTORS) {
+ pr_warn_once("%s: truncate capacity from %lld to %lld\n",
+ disk->disk_name, sectors,
+ BLK_DEV_MAX_SECTORS);
+ sectors = BLK_DEV_MAX_SECTORS;
+ }
+
bdev_set_nr_sectors(disk->part0, sectors);
}
EXPORT_SYMBOL(set_capacity);
@@ -118,37 +125,46 @@ static void part_stat_read_all(struct block_device *part,
}
}
-static unsigned int part_in_flight(struct block_device *part)
+static void bdev_count_inflight_rw(struct block_device *part,
+ unsigned int inflight[2], bool mq_driver)
{
- unsigned int inflight = 0;
int cpu;
- for_each_possible_cpu(cpu) {
- inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
- part_stat_local_read_cpu(part, in_flight[1], cpu);
+ if (mq_driver) {
+ blk_mq_in_driver_rw(part, inflight);
+ } else {
+ for_each_possible_cpu(cpu) {
+ inflight[READ] += part_stat_local_read_cpu(
+ part, in_flight[READ], cpu);
+ inflight[WRITE] += part_stat_local_read_cpu(
+ part, in_flight[WRITE], cpu);
+ }
}
- if ((int)inflight < 0)
- inflight = 0;
- return inflight;
+ if (WARN_ON_ONCE((int)inflight[READ] < 0))
+ inflight[READ] = 0;
+ if (WARN_ON_ONCE((int)inflight[WRITE] < 0))
+ inflight[WRITE] = 0;
}
-static void part_in_flight_rw(struct block_device *part,
- unsigned int inflight[2])
+/**
+ * bdev_count_inflight - get the number of inflight IOs for a block device.
+ *
+ * @part: the block device.
+ *
+ * Inflight here means I/O for which accounting has started: from
+ * bdev_start_io_acct() for bio-based block devices, and from
+ * blk_account_io_start() for rq-based block devices.
+ */
+unsigned int bdev_count_inflight(struct block_device *part)
{
- int cpu;
+ unsigned int inflight[2] = {0};
- inflight[0] = 0;
- inflight[1] = 0;
- for_each_possible_cpu(cpu) {
- inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
- inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
- }
- if ((int)inflight[0] < 0)
- inflight[0] = 0;
- if ((int)inflight[1] < 0)
- inflight[1] = 0;
+ bdev_count_inflight_rw(part, inflight, false);
+
+ return inflight[READ] + inflight[WRITE];
}
+EXPORT_SYMBOL_GPL(bdev_count_inflight);
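
Since bdev_count_inflight() is now exported, stacking drivers can poll it directly. A hypothetical sketch of such a caller follows; the helper name and polling interval are illustrative and not taken from this patch.

#include <linux/blkdev.h>	/* declaration location follows this patch */
#include <linux/delay.h>

/* Wait until all started I/O on the underlying device has completed. */
static void example_wait_for_idle(struct block_device *bdev)
{
	/* bdev_count_inflight() sums the READ and WRITE inflight counters. */
	while (bdev_count_inflight(bdev))
		msleep(10);
}
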
/*
* Can be deleted altogether. Later.
@@ -345,9 +361,7 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
struct file *file;
int ret = 0;
- if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN))
- return -EINVAL;
- if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state))
+ if (!disk_has_partscan(disk))
return -EINVAL;
if (disk->open_partitions)
return -EBUSY;
@@ -384,36 +398,54 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
return ret;
}
-/**
- * device_add_disk - add disk information to kernel list
- * @parent: parent device for the disk
- * @disk: per-device partitioning information
- * @groups: Additional per-device sysfs groups
- *
- * This function registers the partitioning information in @disk
- * with the kernel.
- */
-int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
- const struct attribute_group **groups)
+static void add_disk_final(struct gendisk *disk)
+{
+ struct device *ddev = disk_to_dev(disk);
+
+ if (!(disk->flags & GENHD_FL_HIDDEN)) {
+ /* Make sure the first partition scan will proceed */
+ if (get_capacity(disk) && disk_has_partscan(disk))
+ set_bit(GD_NEED_PART_SCAN, &disk->state);
+
+ bdev_add(disk->part0, ddev->devt);
+ if (get_capacity(disk))
+ disk_scan_partitions(disk, BLK_OPEN_READ);
+
+ /*
+ * Announce the disk and partitions after all partitions are
+ * created. (for hidden disks uevents remain suppressed forever)
+ */
+ dev_set_uevent_suppress(ddev, 0);
+ disk_uevent(disk, KOBJ_ADD);
+ }
+
+ blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
+ disk_add_events(disk);
+ set_bit(GD_ADDED, &disk->state);
+}
+
+static int __add_disk(struct device *parent, struct gendisk *disk,
+ const struct attribute_group **groups,
+ struct fwnode_handle *fwnode)
{
struct device *ddev = disk_to_dev(disk);
int ret;
- /* Only makes sense for bio-based to set ->poll_bio */
- if (queue_is_mq(disk->queue) && disk->fops->poll_bio)
+ if (WARN_ON_ONCE(bdev_nr_sectors(disk->part0) > BLK_DEV_MAX_SECTORS))
return -EINVAL;
- /*
- * The disk queue should now be all set with enough information about
- * the device for the elevator code to pick an adequate default
- * elevator if one is needed, that is, for devices requesting queue
- * registration.
- */
- elevator_init_mq(disk->queue);
-
- /* Mark bdev as having a submit_bio, if needed */
- disk->part0->bd_has_submit_bio = disk->fops->submit_bio != NULL;
+ if (queue_is_mq(disk->queue)) {
+ /*
+ * ->submit_bio and ->poll_bio are bypassed for blk-mq drivers.
+ */
+ if (disk->fops->submit_bio || disk->fops->poll_bio)
+ return -EINVAL;
+ } else {
+ if (!disk->fops->submit_bio)
+ return -EINVAL;
+ bdev_set_flag(disk->part0, BD_HAS_SUBMIT_BIO);
+ }
/*
* If the driver provides an explicit major number it also must provide
@@ -425,7 +457,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
ret = -EINVAL;
if (disk->major) {
if (WARN_ON(!disk->minors))
- goto out_exit_elevator;
+ goto out;
if (disk->minors > DISK_MAX_PARTS) {
pr_err("block: can't allocate more than %d partitions\n",
@@ -435,14 +467,14 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
if (disk->first_minor > MINORMASK ||
disk->minors > MINORMASK + 1 ||
disk->first_minor + disk->minors > MINORMASK + 1)
- goto out_exit_elevator;
+ goto out;
} else {
if (WARN_ON(disk->minors))
- goto out_exit_elevator;
+ goto out;
ret = blk_alloc_ext_minor();
if (ret < 0)
- goto out_exit_elevator;
+ goto out;
disk->major = BLOCK_EXT_MAJOR;
disk->first_minor = ret;
}
@@ -453,6 +485,8 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
ddev->parent = parent;
ddev->groups = groups;
dev_set_name(ddev, "%s", disk->disk_name);
+ if (fwnode)
+ device_set_node(ddev, fwnode);
if (!(disk->flags & GENHD_FL_HIDDEN))
ddev->devt = MKDEV(disk->major, disk->first_minor);
ret = device_add(ddev);
@@ -501,22 +535,6 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
&disk->bdi->dev->kobj, "bdi");
if (ret)
goto out_unregister_bdi;
-
- /* Make sure the first partition scan will be proceed */
- if (get_capacity(disk) && !(disk->flags & GENHD_FL_NO_PART) &&
- !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state))
- set_bit(GD_NEED_PART_SCAN, &disk->state);
-
- bdev_add(disk->part0, ddev->devt);
- if (get_capacity(disk))
- disk_scan_partitions(disk, BLK_OPEN_READ);
-
- /*
- * Announce the disk and partitions after all partitions are
- * created. (for hidden disks uevents remain suppressed forever)
- */
- dev_set_uevent_suppress(ddev, 0);
- disk_uevent(disk, KOBJ_ADD);
} else {
/*
* Even if the block_device for a hidden gendisk is not
@@ -525,10 +543,6 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
*/
disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor);
}
-
- disk_update_readahead(disk);
- disk_add_events(disk);
- set_bit(GD_ADDED, &disk->state);
return 0;
out_unregister_bdi:
@@ -550,11 +564,64 @@ out_device_del:
out_free_ext_minor:
if (disk->major == BLOCK_EXT_MAJOR)
blk_free_ext_minor(disk->first_minor);
-out_exit_elevator:
- if (disk->queue->elevator)
- elevator_exit(disk->queue);
+out:
+ return ret;
+}
+
+/**
+ * add_disk_fwnode - add disk information to kernel list with fwnode
+ * @parent: parent device for the disk
+ * @disk: per-device partitioning information
+ * @groups: Additional per-device sysfs groups
+ * @fwnode: attached disk fwnode
+ *
+ * This function registers the partitioning information in @disk
+ * with the kernel. Also attach a fwnode to the disk device.
+ */
+int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
+ const struct attribute_group **groups,
+ struct fwnode_handle *fwnode)
+{
+ struct blk_mq_tag_set *set;
+ unsigned int memflags;
+ int ret;
+
+ if (queue_is_mq(disk->queue)) {
+ set = disk->queue->tag_set;
+ memflags = memalloc_noio_save();
+ down_read(&set->update_nr_hwq_lock);
+ ret = __add_disk(parent, disk, groups, fwnode);
+ up_read(&set->update_nr_hwq_lock);
+ memalloc_noio_restore(memflags);
+ } else {
+ ret = __add_disk(parent, disk, groups, fwnode);
+ }
+
+ /*
+ * add_disk_final() doesn't need to read `nr_hw_queues`, so move it out
+ * of the read lock `set->update_nr_hwq_lock` to avoid an unnecessary
+ * lock dependency on `disk->open_mutex` from partition scanning.
+ */
+ if (!ret)
+ add_disk_final(disk);
return ret;
}
+EXPORT_SYMBOL_GPL(add_disk_fwnode);
+
+/**
+ * device_add_disk - add disk information to kernel list
+ * @parent: parent device for the disk
+ * @disk: per-device partitioning information
+ * @groups: Additional per-device sysfs groups
+ *
+ * This function registers the partitioning information in @disk
+ * with the kernel.
+ */
+int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
+ const struct attribute_group **groups)
+{
+ return add_disk_fwnode(parent, disk, groups, NULL);
+}
EXPORT_SYMBOL(device_add_disk);
static void blk_report_disk_dead(struct gendisk *disk, bool surprise)
@@ -583,13 +650,13 @@ static void blk_report_disk_dead(struct gendisk *disk, bool surprise)
rcu_read_unlock();
}
-static void __blk_mark_disk_dead(struct gendisk *disk)
+static bool __blk_mark_disk_dead(struct gendisk *disk)
{
/*
* Fail any new I/O.
*/
if (test_and_set_bit(GD_DEAD, &disk->state))
- return;
+ return false;
if (test_bit(GD_OWNS_QUEUE, &disk->state))
blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue);
@@ -602,7 +669,7 @@ static void __blk_mark_disk_dead(struct gendisk *disk)
/*
* Prevent new I/O from crossing bio_queue_enter().
*/
- blk_queue_start_drain(disk->queue);
+ return blk_queue_start_drain(disk->queue);
}
/**
@@ -619,30 +686,12 @@ void blk_mark_disk_dead(struct gendisk *disk)
}
EXPORT_SYMBOL_GPL(blk_mark_disk_dead);
-/**
- * del_gendisk - remove the gendisk
- * @disk: the struct gendisk to remove
- *
- * Removes the gendisk and all its associated resources. This deletes the
- * partitions associated with the gendisk, and unregisters the associated
- * request_queue.
- *
- * This is the counter to the respective __device_add_disk() call.
- *
- * The final removal of the struct gendisk happens when its refcount reaches 0
- * with put_disk(), which should be called after del_gendisk(), if
- * __device_add_disk() was used.
- *
- * Drivers exist which depend on the release of the gendisk to be synchronous,
- * it should not be deferred.
- *
- * Context: can sleep
- */
-void del_gendisk(struct gendisk *disk)
+static void __del_gendisk(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct block_device *part;
unsigned long idx;
+ bool start_drain;
might_sleep();
@@ -656,7 +705,7 @@ void del_gendisk(struct gendisk *disk)
*/
mutex_lock(&disk->open_mutex);
xa_for_each(&disk->part_tbl, idx, part)
- remove_inode_hash(part->bd_inode);
+ bdev_unhash(part);
mutex_unlock(&disk->open_mutex);
/*
@@ -665,12 +714,14 @@ void del_gendisk(struct gendisk *disk)
*/
if (!test_bit(GD_DEAD, &disk->state))
blk_report_disk_dead(disk, false);
- __blk_mark_disk_dead(disk);
/*
* Drop all partitions now that the disk is marked dead.
*/
mutex_lock(&disk->open_mutex);
+ start_drain = __blk_mark_disk_dead(disk);
+ if (start_drain)
+ blk_freeze_acquire_lock(q);
xa_for_each_start(&disk->part_tbl, idx, part, 1)
drop_partition(part);
mutex_unlock(&disk->open_mutex);
@@ -707,25 +758,67 @@ void del_gendisk(struct gendisk *disk)
if (queue_is_mq(q))
blk_mq_cancel_work_sync(q);
- blk_mq_quiesce_queue(q);
- if (q->elevator) {
- mutex_lock(&q->sysfs_lock);
- elevator_exit(q);
- mutex_unlock(&q->sysfs_lock);
- }
rq_qos_exit(q);
- blk_mq_unquiesce_queue(q);
/*
* If the disk does not own the queue, allow using passthrough requests
* again. Else leave the queue frozen to fail all I/O.
*/
- if (!test_bit(GD_OWNS_QUEUE, &disk->state)) {
- blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q);
+ if (!test_bit(GD_OWNS_QUEUE, &disk->state))
__blk_mq_unfreeze_queue(q, true);
+ else if (queue_is_mq(q))
+ blk_mq_exit_queue(q);
+
+ if (start_drain)
+ blk_unfreeze_release_lock(q);
+}
+
+static void disable_elv_switch(struct request_queue *q)
+{
+ struct blk_mq_tag_set *set = q->tag_set;
+ WARN_ON_ONCE(!queue_is_mq(q));
+
+ down_write(&set->update_nr_hwq_lock);
+ blk_queue_flag_set(QUEUE_FLAG_NO_ELV_SWITCH, q);
+ up_write(&set->update_nr_hwq_lock);
+}
+
+/**
+ * del_gendisk - remove the gendisk
+ * @disk: the struct gendisk to remove
+ *
+ * Removes the gendisk and all its associated resources. This deletes the
+ * partitions associated with the gendisk, and unregisters the associated
+ * request_queue.
+ *
+ * This is the counter to the respective __device_add_disk() call.
+ *
+ * The final removal of the struct gendisk happens when its refcount reaches 0
+ * with put_disk(), which should be called after del_gendisk(), if
+ * __device_add_disk() was used.
+ *
+ * Drivers exist which depend on the release of the gendisk to be synchronous,
+ * it should not be deferred.
+ *
+ * Context: can sleep
+ */
+void del_gendisk(struct gendisk *disk)
+{
+ struct blk_mq_tag_set *set;
+ unsigned int memflags;
+
+ if (!queue_is_mq(disk->queue)) {
+ __del_gendisk(disk);
} else {
- if (queue_is_mq(q))
- blk_mq_exit_queue(q);
+ set = disk->queue->tag_set;
+
+ disable_elv_switch(disk->queue);
+
+ memflags = memalloc_noio_save();
+ down_read(&set->update_nr_hwq_lock);
+ __del_gendisk(disk);
+ up_read(&set->update_nr_hwq_lock);
+ memalloc_noio_restore(memflags);
}
}
EXPORT_SYMBOL(del_gendisk);
@@ -745,7 +838,7 @@ void invalidate_disk(struct gendisk *disk)
struct block_device *bdev = disk->part0;
invalidate_bdev(bdev);
- bdev->bd_inode->i_mapping->wb_err = 0;
+ bdev->bd_mapping->wb_err = 0;
set_capacity(disk, 0);
}
EXPORT_SYMBOL(invalidate_disk);
@@ -758,7 +851,7 @@ static ssize_t disk_badblocks_show(struct device *dev,
struct gendisk *disk = dev_to_disk(dev);
if (!disk->bb)
- return sprintf(page, "\n");
+ return sysfs_emit(page, "\n");
return badblocks_show(disk->bb, page, 0);
}
@@ -776,7 +869,7 @@ static ssize_t disk_badblocks_store(struct device *dev,
}
#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
-void blk_request_module(dev_t devt)
+static bool blk_probe_dev(dev_t devt)
{
unsigned int major = MAJOR(devt);
struct blk_major_name **n;
@@ -786,14 +879,26 @@ void blk_request_module(dev_t devt)
if ((*n)->major == major && (*n)->probe) {
(*n)->probe(devt);
mutex_unlock(&major_names_lock);
- return;
+ return true;
}
}
mutex_unlock(&major_names_lock);
+ return false;
+}
+
+void blk_request_module(dev_t devt)
+{
+ int error;
- if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
- /* Make old-style 2.4 aliases work */
- request_module("block-major-%d", MAJOR(devt));
+ if (blk_probe_dev(devt))
+ return;
+
+ error = request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt));
+ /* Make old-style 2.4 aliases work */
+ if (error > 0)
+ error = request_module("block-major-%d", MAJOR(devt));
+ if (!error)
+ blk_probe_dev(devt);
}
#endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */
@@ -906,7 +1011,7 @@ static ssize_t disk_range_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%d\n", disk->minors);
+ return sysfs_emit(buf, "%d\n", disk->minors);
}
static ssize_t disk_ext_range_show(struct device *dev,
@@ -914,7 +1019,7 @@ static ssize_t disk_ext_range_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%d\n",
+ return sysfs_emit(buf, "%d\n",
(disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS);
}
@@ -923,7 +1028,7 @@ static ssize_t disk_removable_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%d\n",
+ return sysfs_emit(buf, "%d\n",
(disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
}
@@ -932,7 +1037,7 @@ static ssize_t disk_hidden_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%d\n",
+ return sysfs_emit(buf, "%d\n",
(disk->flags & GENHD_FL_HIDDEN ? 1 : 0));
}
@@ -941,35 +1046,30 @@ static ssize_t disk_ro_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0);
+ return sysfs_emit(buf, "%d\n", get_disk_ro(disk) ? 1 : 0);
}
ssize_t part_size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev)));
+ return sysfs_emit(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev)));
}
ssize_t part_stat_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct block_device *bdev = dev_to_bdev(dev);
- struct request_queue *q = bdev_get_queue(bdev);
struct disk_stats stat;
unsigned int inflight;
- if (queue_is_mq(q))
- inflight = blk_mq_in_flight(q, bdev);
- else
- inflight = part_in_flight(bdev);
-
+ inflight = bdev_count_inflight(bdev);
if (inflight) {
part_stat_lock();
update_io_ticks(bdev, jiffies, true);
part_stat_unlock();
}
part_stat_read_all(bdev, &stat);
- return sprintf(buf,
+ return sysfs_emit(buf,
"%8lu %8lu %8llu %8u "
"%8lu %8lu %8llu %8u "
"%8u %8u %8u "
@@ -999,26 +1099,28 @@ ssize_t part_stat_show(struct device *dev,
(unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
}
+/*
+ * Show the number of IOs issued to the driver.
+ * For bio-based devices, accounting starts from bdev_start_io_acct();
+ * for rq-based devices, it starts from blk_mq_start_request().
+ */
ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct block_device *bdev = dev_to_bdev(dev);
struct request_queue *q = bdev_get_queue(bdev);
- unsigned int inflight[2];
+ unsigned int inflight[2] = {0};
- if (queue_is_mq(q))
- blk_mq_in_flight_rw(q, bdev, inflight);
- else
- part_in_flight_rw(bdev, inflight);
+ bdev_count_inflight_rw(bdev, inflight, queue_is_mq(q));
- return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]);
+ return sysfs_emit(buf, "%8u %8u\n", inflight[READ], inflight[WRITE]);
}
static ssize_t disk_capability_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
dev_warn_once(dev, "the capability attribute has been deprecated.\n");
- return sprintf(buf, "0\n");
+ return sysfs_emit(buf, "0\n");
}
static ssize_t disk_alignment_offset_show(struct device *dev,
@@ -1027,7 +1129,7 @@ static ssize_t disk_alignment_offset_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0));
+ return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0));
}
static ssize_t disk_discard_alignment_show(struct device *dev,
@@ -1036,7 +1138,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0));
+ return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0));
}
static ssize_t diskseq_show(struct device *dev,
@@ -1044,7 +1146,13 @@ static ssize_t diskseq_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%llu\n", disk->diskseq);
+ return sysfs_emit(buf, "%llu\n", disk->diskseq);
+}
+
+static ssize_t partscan_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%u\n", disk_has_partscan(dev_to_disk(dev)));
}
static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
@@ -1060,12 +1168,14 @@ static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
+static DEVICE_ATTR(partscan, 0444, partscan_show, NULL);
#ifdef CONFIG_FAIL_MAKE_REQUEST
ssize_t part_fail_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_make_it_fail);
+ return sysfs_emit(buf, "%d\n",
+ bdev_test_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL));
}
ssize_t part_fail_store(struct device *dev,
@@ -1074,9 +1184,12 @@ ssize_t part_fail_store(struct device *dev,
{
int i;
- if (count > 0 && sscanf(buf, "%d", &i) > 0)
- dev_to_bdev(dev)->bd_make_it_fail = i;
-
+ if (count > 0 && sscanf(buf, "%d", &i) > 0) {
+ if (i)
+ bdev_set_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL);
+ else
+ bdev_clear_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL);
+ }
return count;
}
@@ -1106,6 +1219,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_events_async.attr,
&dev_attr_events_poll_msecs.attr,
&dev_attr_diskseq.attr,
+ &dev_attr_partscan.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
@@ -1182,7 +1296,7 @@ static void disk_release(struct device *dev)
disk_release_events(disk);
kfree(disk->random);
- disk_free_zone_bitmaps(disk);
+ disk_free_zone_resources(disk);
xa_destroy(&disk->part_tbl);
disk->queue->disk = NULL;
@@ -1191,7 +1305,7 @@ static void disk_release(struct device *dev)
if (test_bit(GD_ADDED, &disk->state) && disk->fops->free_disk)
disk->fops->free_disk(disk);
- iput(disk->part0->bd_inode); /* frees the disk */
+ bdev_drop(disk->part0); /* frees the disk */
}
static int block_uevent(const struct device *dev, struct kobj_uevent_env *env)
@@ -1251,51 +1365,43 @@ static int diskstats_show(struct seq_file *seqf, void *v)
xa_for_each(&gp->part_tbl, idx, hd) {
if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
continue;
- if (queue_is_mq(gp->queue))
- inflight = blk_mq_in_flight(gp->queue, hd);
- else
- inflight = part_in_flight(hd);
+ inflight = bdev_count_inflight(hd);
if (inflight) {
part_stat_lock();
update_io_ticks(hd, jiffies, true);
part_stat_unlock();
}
part_stat_read_all(hd, &stat);
- seq_printf(seqf, "%4d %7d %pg "
- "%lu %lu %lu %u "
- "%lu %lu %lu %u "
- "%u %u %u "
- "%lu %lu %lu %u "
- "%lu %u"
- "\n",
- MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd,
- stat.ios[STAT_READ],
- stat.merges[STAT_READ],
- stat.sectors[STAT_READ],
- (unsigned int)div_u64(stat.nsecs[STAT_READ],
- NSEC_PER_MSEC),
- stat.ios[STAT_WRITE],
- stat.merges[STAT_WRITE],
- stat.sectors[STAT_WRITE],
- (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
- NSEC_PER_MSEC),
- inflight,
- jiffies_to_msecs(stat.io_ticks),
- (unsigned int)div_u64(stat.nsecs[STAT_READ] +
- stat.nsecs[STAT_WRITE] +
- stat.nsecs[STAT_DISCARD] +
- stat.nsecs[STAT_FLUSH],
- NSEC_PER_MSEC),
- stat.ios[STAT_DISCARD],
- stat.merges[STAT_DISCARD],
- stat.sectors[STAT_DISCARD],
- (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
- NSEC_PER_MSEC),
- stat.ios[STAT_FLUSH],
- (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
- NSEC_PER_MSEC)
- );
+ seq_put_decimal_ull_width(seqf, "", MAJOR(hd->bd_dev), 4);
+ seq_put_decimal_ull_width(seqf, " ", MINOR(hd->bd_dev), 7);
+ seq_printf(seqf, " %pg", hd);
+ seq_put_decimal_ull(seqf, " ", stat.ios[STAT_READ]);
+ seq_put_decimal_ull(seqf, " ", stat.merges[STAT_READ]);
+ seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_READ]);
+ seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ],
+ NSEC_PER_MSEC));
+ seq_put_decimal_ull(seqf, " ", stat.ios[STAT_WRITE]);
+ seq_put_decimal_ull(seqf, " ", stat.merges[STAT_WRITE]);
+ seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_WRITE]);
+ seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
+ NSEC_PER_MSEC));
+ seq_put_decimal_ull(seqf, " ", inflight);
+ seq_put_decimal_ull(seqf, " ", jiffies_to_msecs(stat.io_ticks));
+ seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ] +
+ stat.nsecs[STAT_WRITE] +
+ stat.nsecs[STAT_DISCARD] +
+ stat.nsecs[STAT_FLUSH],
+ NSEC_PER_MSEC));
+ seq_put_decimal_ull(seqf, " ", stat.ios[STAT_DISCARD]);
+ seq_put_decimal_ull(seqf, " ", stat.merges[STAT_DISCARD]);
+ seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_DISCARD]);
+ seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
+ NSEC_PER_MSEC));
+ seq_put_decimal_ull(seqf, " ", stat.ios[STAT_FLUSH]);
+ seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
+ NSEC_PER_MSEC));
+ seq_putc(seqf, '\n');
}
rcu_read_unlock();
@@ -1364,6 +1470,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
if (blkcg_init_disk(disk))
goto out_erase_part0;
+ disk_init_zone_resources(disk);
rand_initialize_disk(disk);
disk_to_dev(disk)->class = &block_class;
disk_to_dev(disk)->type = &disk_type;
@@ -1374,6 +1481,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
INIT_LIST_HEAD(&disk->slave_bdevs);
#endif
+ mutex_init(&disk->rqos_state_mutex);
return disk;
out_erase_part0:
@@ -1381,7 +1489,7 @@ out_erase_part0:
out_destroy_part_tbl:
xa_destroy(&disk->part_tbl);
disk->part0->bd_disk = NULL;
- iput(disk->part0->bd_inode);
+ bdev_drop(disk->part0);
out_free_bdi:
bdi_put(disk->bdi);
out_free_bioset:
diff --git a/block/ioctl.c b/block/ioctl.c
index f505f9c341eb..e472cc1030c6 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -11,7 +11,11 @@
#include <linux/blktrace_api.h>
#include <linux/pr.h>
#include <linux/uaccess.h>
+#include <linux/pagemap.h>
+#include <linux/io_uring/cmd.h>
+#include <uapi/linux/blkdev.h>
#include "blk.h"
+#include "blk-crypto-internal.h"
static int blkpg_do_ioctl(struct block_device *bdev,
struct blkpg_partition __user *upart, int op)
@@ -33,7 +37,7 @@ static int blkpg_do_ioctl(struct block_device *bdev,
if (op == BLKPG_DEL_PARTITION)
return bdev_del_partition(disk, p.pno);
- if (p.start < 0 || p.length <= 0 || p.start + p.length < 0)
+ if (p.start < 0 || p.length <= 0 || LLONG_MAX - p.length < p.start)
return -EINVAL;
/* Check that the partition is aligned to the block size */
if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev)))
@@ -92,49 +96,93 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
}
#endif
+/*
+ * Check that [start, start + len) is a valid range from the block device's
+ * perspective, including verifying that it can be correctly translated into
+ * logical block addresses.
+ */
+static int blk_validate_byte_range(struct block_device *bdev,
+ uint64_t start, uint64_t len)
+{
+ unsigned int bs_mask = bdev_logical_block_size(bdev) - 1;
+ uint64_t end;
+
+ if ((start | len) & bs_mask)
+ return -EINVAL;
+ if (!len)
+ return -EINVAL;
+ if (check_add_overflow(start, len, &end) || end > bdev_nr_bytes(bdev))
+ return -EINVAL;
+
+ return 0;
+}
+
static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
unsigned long arg)
{
- uint64_t range[2];
- uint64_t start, len, end;
- struct inode *inode = bdev->bd_inode;
+ uint64_t range[2], start, len;
+ struct bio *prev = NULL, *bio;
+ sector_t sector, nr_sects;
+ struct blk_plug plug;
int err;
- if (!(mode & BLK_OPEN_WRITE))
- return -EBADF;
-
- if (!bdev_max_discard_sectors(bdev))
- return -EOPNOTSUPP;
-
if (copy_from_user(range, (void __user *)arg, sizeof(range)))
return -EFAULT;
-
start = range[0];
len = range[1];
- if (start & 511)
- return -EINVAL;
- if (len & 511)
- return -EINVAL;
+ if (!bdev_max_discard_sectors(bdev))
+ return -EOPNOTSUPP;
- if (check_add_overflow(start, len, &end) ||
- end > bdev_nr_bytes(bdev))
- return -EINVAL;
+ if (!(mode & BLK_OPEN_WRITE))
+ return -EBADF;
+ if (bdev_read_only(bdev))
+ return -EPERM;
+ err = blk_validate_byte_range(bdev, start, len);
+ if (err)
+ return err;
- filemap_invalidate_lock(inode->i_mapping);
+ inode_lock(bdev->bd_mapping->host);
+ filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, start + len - 1);
if (err)
goto fail;
- err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+
+ sector = start >> SECTOR_SHIFT;
+ nr_sects = len >> SECTOR_SHIFT;
+
+ blk_start_plug(&plug);
+ while (1) {
+ if (fatal_signal_pending(current)) {
+ if (prev)
+ bio_await_chain(prev);
+ err = -EINTR;
+ goto out_unplug;
+ }
+ bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects,
+ GFP_KERNEL);
+ if (!bio)
+ break;
+ prev = bio_chain_and_submit(prev, bio);
+ }
+ if (prev) {
+ err = submit_bio_wait(prev);
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ bio_put(prev);
+ }
+out_unplug:
+ blk_finish_plug(&plug);
fail:
- filemap_invalidate_unlock(inode->i_mapping);
+ filemap_invalidate_unlock(bdev->bd_mapping);
+ inode_unlock(bdev->bd_mapping->host);
return err;
}
static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode,
void __user *argp)
{
- uint64_t start, len;
+ uint64_t start, len, end;
uint64_t range[2];
int err;
@@ -149,15 +197,18 @@ static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode,
len = range[1];
if ((start & 511) || (len & 511))
return -EINVAL;
- if (start + len > bdev_nr_bytes(bdev))
+ if (check_add_overflow(start, len, &end) ||
+ end > bdev_nr_bytes(bdev))
return -EINVAL;
- filemap_invalidate_lock(bdev->bd_inode->i_mapping);
- err = truncate_bdev_range(bdev, mode, start, start + len - 1);
+ inode_lock(bdev->bd_mapping->host);
+ filemap_invalidate_lock(bdev->bd_mapping);
+ err = truncate_bdev_range(bdev, mode, start, end - 1);
if (!err)
err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9,
GFP_KERNEL);
- filemap_invalidate_unlock(bdev->bd_inode->i_mapping);
+ filemap_invalidate_unlock(bdev->bd_mapping);
+ inode_unlock(bdev->bd_mapping->host);
return err;
}
@@ -167,7 +218,6 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode,
{
uint64_t range[2];
uint64_t start, end, len;
- struct inode *inode = bdev->bd_inode;
int err;
if (!(mode & BLK_OPEN_WRITE))
@@ -190,16 +240,18 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode,
return -EINVAL;
/* Invalidate the page cache, including dirty pages */
- filemap_invalidate_lock(inode->i_mapping);
+ inode_lock(bdev->bd_mapping->host);
+ filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, end);
if (err)
goto fail;
err = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
- BLKDEV_ZERO_NOUNMAP);
+ BLKDEV_ZERO_NOUNMAP | BLKDEV_ZERO_KILLABLE);
fail:
- filemap_invalidate_unlock(inode->i_mapping);
+ filemap_invalidate_unlock(bdev->bd_mapping);
+ inode_unlock(bdev->bd_mapping->host);
return err;
}
@@ -403,7 +455,10 @@ static int blkdev_roset(struct block_device *bdev, unsigned cmd,
if (ret)
return ret;
}
- bdev->bd_read_only = n;
+ if (n)
+ bdev_set_flag(bdev, BD_READ_ONLY);
+ else
+ bdev_clear_flag(bdev, BD_READ_ONLY);
return 0;
}
@@ -473,11 +528,14 @@ static int compat_hdio_getgeo(struct block_device *bdev,
#endif
/* set the logical block size */
-static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode,
+static int blkdev_bszset(struct file *file, blk_mode_t mode,
int __user *argp)
{
+ // this one might be file_inode(file)->i_rdev - a rare valid
+ // use of file_inode() for those.
+ dev_t dev = I_BDEV(file->f_mapping->host)->bd_dev;
+ struct file *excl_file;
int ret, n;
- struct file *file;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
@@ -487,13 +545,13 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode,
return -EFAULT;
if (mode & BLK_OPEN_EXCL)
- return set_blocksize(bdev, n);
+ return set_blocksize(file, n);
- file = bdev_file_open_by_dev(bdev->bd_dev, mode, &bdev, NULL);
- if (IS_ERR(file))
+ excl_file = bdev_file_open_by_dev(dev, mode, &dev, NULL);
+ if (IS_ERR(excl_file))
return -EBUSY;
- ret = set_blocksize(bdev, n);
- fput(file);
+ ret = set_blocksize(excl_file, n);
+ fput(excl_file);
return ret;
}
@@ -569,6 +627,10 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode,
case BLKTRACESTOP:
case BLKTRACETEARDOWN:
return blk_trace_ioctl(bdev, cmd, argp);
+ case BLKCRYPTOIMPORTKEY:
+ case BLKCRYPTOGENERATEKEY:
+ case BLKCRYPTOPREPAREKEY:
+ return blk_crypto_ioctl(bdev, cmd, argp);
case IOC_PR_REGISTER:
return blkdev_pr_register(bdev, mode, argp);
case IOC_PR_RESERVE:
@@ -622,7 +684,7 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */
return put_int(argp, block_size(bdev));
case BLKBSZSET:
- return blkdev_bszset(bdev, mode, argp);
+ return blkdev_bszset(file, mode, argp);
case BLKGETSIZE64:
return put_u64(argp, bdev_nr_bytes(bdev));
@@ -682,7 +744,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
return put_int(argp, bdev_logical_block_size(bdev));
case BLKBSZSET_32:
- return blkdev_bszset(bdev, mode, argp);
+ return blkdev_bszset(file, mode, argp);
case BLKGETSIZE64_32:
return put_u64(argp, bdev_nr_bytes(bdev));
@@ -700,3 +762,112 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return ret;
}
#endif
+
+struct blk_iou_cmd {
+ int res;
+ bool nowait;
+};
+
+static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
+
+ if (bic->res == -EAGAIN && bic->nowait)
+ io_uring_cmd_issue_blocking(cmd);
+ else
+ io_uring_cmd_done(cmd, bic->res, 0, issue_flags);
+}
+
+static void bio_cmd_bio_end_io(struct bio *bio)
+{
+ struct io_uring_cmd *cmd = bio->bi_private;
+ struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
+
+ if (unlikely(bio->bi_status) && !bic->res)
+ bic->res = blk_status_to_errno(bio->bi_status);
+
+ io_uring_cmd_do_in_task_lazy(cmd, blk_cmd_complete);
+ bio_put(bio);
+}
+
+static int blkdev_cmd_discard(struct io_uring_cmd *cmd,
+ struct block_device *bdev,
+ uint64_t start, uint64_t len, bool nowait)
+{
+ struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
+ gfp_t gfp = nowait ? GFP_NOWAIT : GFP_KERNEL;
+ sector_t sector = start >> SECTOR_SHIFT;
+ sector_t nr_sects = len >> SECTOR_SHIFT;
+ struct bio *prev = NULL, *bio;
+ int err;
+
+ if (!bdev_max_discard_sectors(bdev))
+ return -EOPNOTSUPP;
+ if (!(file_to_blk_mode(cmd->file) & BLK_OPEN_WRITE))
+ return -EBADF;
+ if (bdev_read_only(bdev))
+ return -EPERM;
+ err = blk_validate_byte_range(bdev, start, len);
+ if (err)
+ return err;
+
+ err = filemap_invalidate_pages(bdev->bd_mapping, start,
+ start + len - 1, nowait);
+ if (err)
+ return err;
+
+ while (true) {
+ bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, gfp);
+ if (!bio)
+ break;
+ if (nowait) {
+ /*
+ * Don't allow multi-bio non-blocking submissions as
+ * subsequent bios may fail but we won't get a direct
+ * indication of that. Normally, the caller should
+ * retry from a blocking context.
+ */
+ if (unlikely(nr_sects)) {
+ bio_put(bio);
+ return -EAGAIN;
+ }
+ bio->bi_opf |= REQ_NOWAIT;
+ }
+
+ prev = bio_chain_and_submit(prev, bio);
+ }
+ if (unlikely(!prev))
+ return -EAGAIN;
+ if (unlikely(nr_sects))
+ bic->res = -EAGAIN;
+
+ prev->bi_private = cmd;
+ prev->bi_end_io = bio_cmd_bio_end_io;
+ submit_bio(prev);
+ return -EIOCBQUEUED;
+}
+
+int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ struct block_device *bdev = I_BDEV(cmd->file->f_mapping->host);
+ struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
+ const struct io_uring_sqe *sqe = cmd->sqe;
+ u32 cmd_op = cmd->cmd_op;
+ uint64_t start, len;
+
+ if (unlikely(sqe->ioprio || sqe->__pad1 || sqe->len ||
+ sqe->rw_flags || sqe->file_index))
+ return -EINVAL;
+
+ bic->res = 0;
+ bic->nowait = issue_flags & IO_URING_F_NONBLOCK;
+
+ start = READ_ONCE(sqe->addr);
+ len = READ_ONCE(sqe->addr3);
+
+ switch (cmd_op) {
+ case BLOCK_URING_CMD_DISCARD:
+ return blkdev_cmd_discard(cmd, bdev, start, len, bic->nowait);
+ }
+ return -EINVAL;
+}
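
A userspace sketch of driving the new uring command (assuming liburing; the helper below is illustrative and not part of the patch). blkdev_uring_cmd() reads the byte offset from sqe->addr and the byte length from sqe->addr3, and rejects SQEs with ioprio, len, rw_flags or file_index set, so the prep below leaves those at zero.

#include <liburing.h>
#include <linux/blkdev.h>	/* BLOCK_URING_CMD_DISCARD (uapi) */

/* Queue an async discard for [start, start + len) on an open bdev fd. */
static int queue_discard(struct io_uring *ring, int fd,
			 __u64 start, __u64 len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -1;
	io_uring_prep_rw(IORING_OP_URING_CMD, sqe, fd, NULL, 0, 0);
	sqe->cmd_op = BLOCK_URING_CMD_DISCARD;
	sqe->addr = start;	/* byte offset, logical-block aligned */
	sqe->addr3 = len;	/* byte length, logical-block aligned */
	return io_uring_submit(ring);
}
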
diff --git a/block/ioprio.c b/block/ioprio.c
index 73301a261429..f0ee2798539c 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -46,12 +46,8 @@ int ioprio_check_cap(int ioprio)
*/
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
return -EPERM;
- fallthrough;
- /* rt has prio field too */
- case IOPRIO_CLASS_BE:
- if (level >= IOPRIO_NR_LEVELS)
- return -EINVAL;
break;
+ case IOPRIO_CLASS_BE:
case IOPRIO_CLASS_IDLE:
break;
case IOPRIO_CLASS_NONE:
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 4155594aefc6..0f0f8452609a 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -568,7 +568,7 @@ static bool kyber_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(bio->bi_opf, ctx);
struct kyber_hctx_data *khd = hctx->sched_data;
struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]];
unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
@@ -889,7 +889,7 @@ KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
#undef KYBER_LAT_SHOW_STORE
#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
-static struct elv_fs_entry kyber_sched_attrs[] = {
+static const struct elv_fs_entry kyber_sched_attrs[] = {
KYBER_LAT_ATTR(read),
KYBER_LAT_ATTR(write),
__ATTR_NULL
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 02a916ba62ee..2edf1cac06d5 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -102,7 +102,6 @@ struct deadline_data {
int prio_aging_expire;
spinlock_t lock;
- spinlock_t zone_lock;
};
/* Maps an I/O priority class to a deadline scheduler priority. */
@@ -129,36 +128,7 @@ static u8 dd_rq_ioclass(struct request *rq)
}
/*
- * get the request before `rq' in sector-sorted order
- */
-static inline struct request *
-deadline_earlier_request(struct request *rq)
-{
- struct rb_node *node = rb_prev(&rq->rb_node);
-
- if (node)
- return rb_entry_rq(node);
-
- return NULL;
-}
-
-/*
- * get the request after `rq' in sector-sorted order
- */
-static inline struct request *
-deadline_latter_request(struct request *rq)
-{
- struct rb_node *node = rb_next(&rq->rb_node);
-
- if (node)
- return rb_entry_rq(node);
-
- return NULL;
-}
-
-/*
- * Return the first request for which blk_rq_pos() >= @pos. For zoned devices,
- * return the first request after the start of the zone containing @pos.
+ * Return the first request for which blk_rq_pos() >= @pos.
*/
static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
enum dd_data_dir data_dir, sector_t pos)
@@ -170,14 +140,6 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
return NULL;
rq = rb_entry_rq(node);
- /*
- * A zoned write may have been requeued with a starting position that
- * is below that of the most recently dispatched request. Hence, for
- * zoned writes, start searching from the start of a zone.
- */
- if (blk_rq_is_seq_zoned_write(rq))
- pos = round_down(pos, rq->q->limits.chunk_sectors);
-
while (node) {
rq = rb_entry_rq(node);
if (blk_rq_pos(rq) >= pos) {
@@ -309,36 +271,6 @@ static inline bool deadline_check_fifo(struct dd_per_prio *per_prio,
}
/*
- * Check if rq has a sequential request preceding it.
- */
-static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq)
-{
- struct request *prev = deadline_earlier_request(rq);
-
- if (!prev)
- return false;
-
- return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq);
-}
-
-/*
- * Skip all write requests that are sequential from @rq, even if we cross
- * a zone boundary.
- */
-static struct request *deadline_skip_seq_writes(struct deadline_data *dd,
- struct request *rq)
-{
- sector_t pos = blk_rq_pos(rq);
-
- do {
- pos += blk_rq_sectors(rq);
- rq = deadline_latter_request(rq);
- } while (rq && blk_rq_pos(rq) == pos);
-
- return rq;
-}
-
-/*
* For the specified data direction, return the next request to
* dispatch using arrival ordered lists.
*/
@@ -346,40 +278,10 @@ static struct request *
deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
enum dd_data_dir data_dir)
{
- struct request *rq, *rb_rq, *next;
- unsigned long flags;
-
if (list_empty(&per_prio->fifo_list[data_dir]))
return NULL;
- rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
- if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
- return rq;
-
- /*
- * Look for a write request that can be dispatched, that is one with
- * an unlocked target zone. For some HDDs, breaking a sequential
- * write stream can lead to lower throughput, so make sure to preserve
- * sequential write streams, even if that stream crosses into the next
- * zones and these zones are unlocked.
- */
- spin_lock_irqsave(&dd->zone_lock, flags);
- list_for_each_entry_safe(rq, next, &per_prio->fifo_list[DD_WRITE],
- queuelist) {
- /* Check whether a prior request exists for the same zone. */
- rb_rq = deadline_from_pos(per_prio, data_dir, blk_rq_pos(rq));
- if (rb_rq && blk_rq_pos(rb_rq) < blk_rq_pos(rq))
- rq = rb_rq;
- if (blk_req_can_dispatch_to_zone(rq) &&
- (blk_queue_nonrot(rq->q) ||
- !deadline_is_seq_write(dd, rq)))
- goto out;
- }
- rq = NULL;
-out:
- spin_unlock_irqrestore(&dd->zone_lock, flags);
-
- return rq;
+ return rq_entry_fifo(per_prio->fifo_list[data_dir].next);
}
/*
@@ -390,36 +292,8 @@ static struct request *
deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
enum dd_data_dir data_dir)
{
- struct request *rq;
- unsigned long flags;
-
- rq = deadline_from_pos(per_prio, data_dir,
- per_prio->latest_pos[data_dir]);
- if (!rq)
- return NULL;
-
- if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
- return rq;
-
- /*
- * Look for a write request that can be dispatched, that is one with
- * an unlocked target zone. For some HDDs, breaking a sequential
- * write stream can lead to lower throughput, so make sure to preserve
- * sequential write streams, even if that stream crosses into the next
- * zones and these zones are unlocked.
- */
- spin_lock_irqsave(&dd->zone_lock, flags);
- while (rq) {
- if (blk_req_can_dispatch_to_zone(rq))
- break;
- if (blk_queue_nonrot(rq->q))
- rq = deadline_latter_request(rq);
- else
- rq = deadline_skip_seq_writes(dd, rq);
- }
- spin_unlock_irqrestore(&dd->zone_lock, flags);
-
- return rq;
+ return deadline_from_pos(per_prio, data_dir,
+ per_prio->latest_pos[data_dir]);
}
/*
@@ -525,10 +399,6 @@ dispatch_find_request:
rq = next_rq;
}
- /*
- * For a zoned block device, if we only have writes queued and none of
- * them can be dispatched, rq will be NULL.
- */
if (!rq)
return NULL;
@@ -549,10 +419,6 @@ done:
prio = ioprio_class_to_prio[ioprio_class];
dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
dd->per_prio[prio].stats.dispatched++;
- /*
- * If the request needs its target zone locked, do it.
- */
- blk_req_zone_write_lock(rq);
rq->rq_flags |= RQF_STARTED;
return rq;
}
@@ -622,6 +488,20 @@ unlock:
}
/*
+ * 'depth' is a number in the range 1..INT_MAX representing a number of
+ * requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since
+ * 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow().
+ * Values larger than q->nr_requests have the same effect as q->nr_requests.
+ */
+static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth)
+{
+ struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags;
+ const unsigned int nrr = hctx->queue->nr_requests;
+
+ return ((qdepth << bt->sb.shift) + nrr - 1) / nrr;
+}
+
+/*
* Called by __blk_mq_alloc_request(). The shallow_depth value set by this
* function is used by __blk_mq_get_tag().
*/
@@ -637,7 +517,7 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
* Throttle asynchronous requests and writes such that these requests
* do not block the allocation of synchronous requests.
*/
- data->shallow_depth = dd->async_depth;
+ data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth);
}
/* Called by blk_mq_update_nr_requests(). */
@@ -647,9 +527,9 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
struct deadline_data *dd = q->elevator->elevator_data;
struct blk_mq_tags *tags = hctx->sched_tags;
- dd->async_depth = max(1UL, 3 * q->nr_requests / 4);
+ dd->async_depth = q->nr_requests;
- sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
+ sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
}
/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
@@ -722,7 +602,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
dd->fifo_batch = fifo_batch;
dd->prio_aging_expire = prio_aging_expire;
spin_lock_init(&dd->lock);
- spin_lock_init(&dd->zone_lock);
/* We dispatch from request queue wide instead of hw queue */
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
@@ -804,18 +683,11 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
lockdep_assert_held(&dd->lock);
- /*
- * This may be a requeue of a write request that has locked its
- * target zone. If it is the case, this releases the zone lock.
- */
- blk_req_zone_write_unlock(rq);
-
prio = ioprio_class_to_prio[ioprio_class];
per_prio = &dd->per_prio[prio];
- if (!rq->elv.priv[0]) {
+ if (!rq->elv.priv[0])
per_prio->stats.inserted++;
- rq->elv.priv[0] = (void *)(uintptr_t)1;
- }
+ rq->elv.priv[0] = per_prio;
if (blk_mq_sched_try_insert_merge(q, rq, free))
return;
@@ -826,8 +698,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
list_add(&rq->queuelist, &per_prio->dispatch);
rq->fifo_time = jiffies;
} else {
- struct list_head *insert_before;
-
deadline_add_rq_rb(per_prio, rq);
if (rq_mergeable(rq)) {
@@ -840,25 +710,12 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
* set expire time and add to fifo list
*/
rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
- insert_before = &per_prio->fifo_list[data_dir];
-#ifdef CONFIG_BLK_DEV_ZONED
- /*
- * Insert zoned writes such that requests are sorted by
- * position per zone.
- */
- if (blk_rq_is_seq_zoned_write(rq)) {
- struct request *rq2 = deadline_latter_request(rq);
-
- if (rq2 && blk_rq_zone_no(rq2) == blk_rq_zone_no(rq))
- insert_before = &rq2->queuelist;
- }
-#endif
- list_add_tail(&rq->queuelist, insert_before);
+ list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]);
}
}
/*
- * Called from blk_mq_insert_request() or blk_mq_dispatch_plug_list().
+ * Called from blk_mq_insert_request() or blk_mq_dispatch_list().
*/
static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
struct list_head *list,
@@ -887,62 +744,20 @@ static void dd_prepare_request(struct request *rq)
rq->elv.priv[0] = NULL;
}
-static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx)
-{
- struct deadline_data *dd = hctx->queue->elevator->elevator_data;
- enum dd_prio p;
-
- for (p = 0; p <= DD_PRIO_MAX; p++)
- if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE]))
- return true;
-
- return false;
-}
-
/*
* Callback from inside blk_mq_free_request().
- *
- * For zoned block devices, write unlock the target zone of
- * completed write requests. Do this while holding the zone lock
- * spinlock so that the zone is never unlocked while deadline_fifo_request()
- * or deadline_next_request() are executing. This function is called for
- * all requests, whether or not these requests complete successfully.
- *
- * For a zoned block device, __dd_dispatch_request() may have stopped
- * dispatching requests if all the queued requests are write requests directed
- * at zones that are already locked due to on-going write requests. To ensure
- * write request dispatch progress in this case, mark the queue as needing a
- * restart to ensure that the queue is run again after completion of the
- * request and zones being unlocked.
*/
static void dd_finish_request(struct request *rq)
{
- struct request_queue *q = rq->q;
- struct deadline_data *dd = q->elevator->elevator_data;
- const u8 ioprio_class = dd_rq_ioclass(rq);
- const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
- struct dd_per_prio *per_prio = &dd->per_prio[prio];
+ struct dd_per_prio *per_prio = rq->elv.priv[0];
/*
* The block layer core may call dd_finish_request() without having
* called dd_insert_requests(). Skip requests that bypassed I/O
* scheduling. See also blk_mq_request_bypass_insert().
*/
- if (!rq->elv.priv[0])
- return;
-
- atomic_inc(&per_prio->stats.completed);
-
- if (blk_queue_is_zoned(q)) {
- unsigned long flags;
-
- spin_lock_irqsave(&dd->zone_lock, flags);
- blk_req_zone_write_unlock(rq);
- spin_unlock_irqrestore(&dd->zone_lock, flags);
-
- if (dd_has_write_work(rq->mq_hctx))
- blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
- }
+ if (per_prio)
+ atomic_inc(&per_prio->stats.completed);
}
static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
@@ -1019,7 +834,7 @@ STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX);
#define DD_ATTR(name) \
__ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
-static struct elv_fs_entry deadline_attrs[] = {
+static const struct elv_fs_entry deadline_attrs[] = {
DD_ATTR(read_expire),
DD_ATTR(write_expire),
DD_ATTR(writes_starved),
@@ -1266,7 +1081,6 @@ static struct elevator_type mq_deadline = {
.elevator_attrs = deadline_attrs,
.elevator_name = "mq-deadline",
.elevator_alias = "deadline",
- .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE,
.elevator_owner = THIS_MODULE,
};
MODULE_ALIAS("mq-deadline-iosched");
diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig
index 7aff4eb81c60..ce17e41451af 100644
--- a/block/partitions/Kconfig
+++ b/block/partitions/Kconfig
@@ -270,4 +270,13 @@ config CMDLINE_PARTITION
Say Y here if you want to read the partition table from bootargs.
The format for the command line is just like mtdparts.
+config OF_PARTITION
+ bool "Device Tree partition support" if PARTITION_ADVANCED
+ depends on OF
+ help
+ Say Y here if you want to enable support for partition tables
+ defined in the Device Tree (mainly for eMMC).
+ The format of the device tree node follows the MTD fixed-partitions
+ schema.
+
endmenu
diff --git a/block/partitions/Makefile b/block/partitions/Makefile
index a7f05cdb02a8..25d424922c6e 100644
--- a/block/partitions/Makefile
+++ b/block/partitions/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CMDLINE_PARTITION) += cmdline.o
obj-$(CONFIG_MAC_PARTITION) += mac.o
obj-$(CONFIG_LDM_PARTITION) += ldm.o
obj-$(CONFIG_MSDOS_PARTITION) += msdos.o
+obj-$(CONFIG_OF_PARTITION) += of.o
obj-$(CONFIG_OSF_PARTITION) += osf.o
obj-$(CONFIG_SGI_PARTITION) += sgi.o
obj-$(CONFIG_SUN_PARTITION) += sun.o
diff --git a/block/partitions/check.h b/block/partitions/check.h
index 8d70a880c372..e5c1c61eb353 100644
--- a/block/partitions/check.h
+++ b/block/partitions/check.h
@@ -62,6 +62,7 @@ int karma_partition(struct parsed_partitions *state);
int ldm_partition(struct parsed_partitions *state);
int mac_partition(struct parsed_partitions *state);
int msdos_partition(struct parsed_partitions *state);
+int of_partition(struct parsed_partitions *state);
int osf_partition(struct parsed_partitions *state);
int sgi_partition(struct parsed_partitions *state);
int sun_partition(struct parsed_partitions *state);
diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c
index c03bc105e575..da3e719d8e51 100644
--- a/block/partitions/cmdline.c
+++ b/block/partitions/cmdline.c
@@ -70,8 +70,8 @@ static int parse_subpart(struct cmdline_subpart **subpart, char *partdef)
}
if (*partdef == '(') {
- int length;
- char *next = strchr(++partdef, ')');
+ partdef++;
+ char *next = strsep(&partdef, ")");
if (!next) {
pr_warn("cmdline partition format is invalid.");
@@ -79,11 +79,7 @@ static int parse_subpart(struct cmdline_subpart **subpart, char *partdef)
goto fail;
}
- length = min_t(int, next - partdef,
- sizeof(new_subpart->name) - 1);
- strscpy(new_subpart->name, partdef, length);
-
- partdef = ++next;
+ strscpy(new_subpart->name, next, sizeof(new_subpart->name));
} else
new_subpart->name[0] = '\0';
@@ -117,14 +113,12 @@ static void free_subpart(struct cmdline_parts *parts)
}
}
-static int parse_parts(struct cmdline_parts **parts, const char *bdevdef)
+static int parse_parts(struct cmdline_parts **parts, char *bdevdef)
{
int ret = -EINVAL;
char *next;
- int length;
struct cmdline_subpart **next_subpart;
struct cmdline_parts *newparts;
- char buf[BDEVNAME_SIZE + 32 + 4];
*parts = NULL;
@@ -132,28 +126,19 @@ static int parse_parts(struct cmdline_parts **parts, const char *bdevdef)
if (!newparts)
return -ENOMEM;
- next = strchr(bdevdef, ':');
+ next = strsep(&bdevdef, ":");
if (!next) {
pr_warn("cmdline partition has no block device.");
goto fail;
}
- length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1);
- strscpy(newparts->name, bdevdef, length);
+ strscpy(newparts->name, next, sizeof(newparts->name));
newparts->nr_subparts = 0;
next_subpart = &newparts->subpart;
- while (next && *(++next)) {
- bdevdef = next;
- next = strchr(bdevdef, ',');
-
- length = (!next) ? (sizeof(buf) - 1) :
- min_t(int, next - bdevdef, sizeof(buf) - 1);
-
- strscpy(buf, bdevdef, length);
-
- ret = parse_subpart(next_subpart, buf);
+ while ((next = strsep(&bdevdef, ","))) {
+ ret = parse_subpart(next_subpart, next);
if (ret)
goto fail;
@@ -199,24 +184,17 @@ static int cmdline_parts_parse(struct cmdline_parts **parts,
*parts = NULL;
- next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL);
+ pbuf = buf = kstrdup(cmdline, GFP_KERNEL);
if (!buf)
return -ENOMEM;
next_parts = parts;
- while (next && *pbuf) {
- next = strchr(pbuf, ';');
- if (next)
- *next = '\0';
-
- ret = parse_parts(next_parts, pbuf);
+ while ((next = strsep(&pbuf, ";"))) {
+ ret = parse_parts(next_parts, next);
if (ret)
goto fail;
- if (next)
- pbuf = ++next;
-
next_parts = &(*next_parts)->next_parts;
}
@@ -250,7 +228,6 @@ static struct cmdline_parts *bdev_parts;
static int add_part(int slot, struct cmdline_subpart *subpart,
struct parsed_partitions *state)
{
- int label_min;
struct partition_meta_info *info;
char tmp[sizeof(info->volname) + 4];
@@ -260,11 +237,12 @@ static int add_part(int slot, struct cmdline_subpart *subpart,
put_partition(state, slot, subpart->from >> 9,
subpart->size >> 9);
+ if (subpart->flags & PF_RDONLY)
+ state->parts[slot].flags |= ADDPART_FLAG_READONLY;
+
info = &state->parts[slot].info;
- label_min = min_t(int, sizeof(info->volname) - 1,
- sizeof(subpart->name));
- strscpy(info->volname, subpart->name, label_min);
+ strscpy(info->volname, subpart->name, sizeof(info->volname));
snprintf(tmp, sizeof(tmp), "(%s)", info->volname);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
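A minimal, self-contained sketch of the strsep() pattern the rewritten cmdline parsers rely on, shown as plain userspace C (the sample string and all names are invented for illustration): each strsep() call returns the next token and advances the cursor in place, which is what lets the manual strchr()/length bookkeeping above be dropped.

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	/* blkdevparts-style string: device:part,part;device:part */
	char *cmdline = strdup("mmcblk0:1M(boot),-(rootfs);mmcblk1:4M(data)");
	char *pbuf = cmdline, *bdevdef;

	while ((bdevdef = strsep(&pbuf, ";"))) {	/* one block device per pass */
		char *name = strsep(&bdevdef, ":");	/* device name before ':' */
		char *part;

		printf("device %s\n", name);
		while ((part = strsep(&bdevdef, ",")))	/* its partition definitions */
			printf("  partition: %s\n", part);
	}
	free(cmdline);
	return 0;
}

Unlike strchr(), strsep() also NUL-terminates each token by overwriting the delimiter, so strscpy() can copy the token directly without computing a length first.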
diff --git a/block/partitions/core.c b/block/partitions/core.c
index b11e88c82c8c..815ed33caa1b 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -43,6 +43,9 @@ static int (*const check_part[])(struct parsed_partitions *) = {
#ifdef CONFIG_CMDLINE_PARTITION
cmdline_partition,
#endif
+#ifdef CONFIG_OF_PARTITION
+ of_partition, /* cmdline has priority over OF */
+#endif
#ifdef CONFIG_EFI_PARTITION
efi_partition, /* this must come before msdos */
#endif
@@ -173,7 +176,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
static ssize_t part_partition_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_partno);
+ return sprintf(buf, "%d\n", bdev_partno(dev_to_bdev(dev)));
}
static ssize_t part_start_show(struct device *dev,
@@ -243,16 +246,18 @@ static const struct attribute_group *part_attr_groups[] = {
static void part_release(struct device *dev)
{
put_disk(dev_to_bdev(dev)->bd_disk);
- iput(dev_to_bdev(dev)->bd_inode);
+ bdev_drop(dev_to_bdev(dev));
}
static int part_uevent(const struct device *dev, struct kobj_uevent_env *env)
{
const struct block_device *part = dev_to_bdev(dev);
- add_uevent_var(env, "PARTN=%u", part->bd_partno);
+ add_uevent_var(env, "PARTN=%u", bdev_partno(part));
if (part->bd_meta_info && part->bd_meta_info->volname[0])
add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname);
+ if (part->bd_meta_info && part->bd_meta_info->uuid[0])
+ add_uevent_var(env, "PARTUUID=%s", part->bd_meta_info->uuid);
return 0;
}
@@ -267,7 +272,7 @@ void drop_partition(struct block_device *part)
{
lockdep_assert_held(&part->bd_disk->open_mutex);
- xa_erase(&part->bd_disk->part_tbl, part->bd_partno);
+ xa_erase(&part->bd_disk->part_tbl, bdev_partno(part));
kobject_put(part->bd_holder_dir);
device_del(&part->bd_device);
@@ -338,8 +343,8 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
pdev->parent = ddev;
/* in consecutive minor range? */
- if (bdev->bd_partno < disk->minors) {
- devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno);
+ if (bdev_partno(bdev) < disk->minors) {
+ devt = MKDEV(disk->major, disk->first_minor + bdev_partno(bdev));
} else {
err = blk_alloc_ext_minor();
if (err < 0)
@@ -373,6 +378,9 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
goto out_del;
}
+ if (flags & ADDPART_FLAG_READONLY)
+ bdev_set_flag(bdev, BD_READ_ONLY);
+
/* everything is up and running, commence */
err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL);
if (err)
@@ -404,7 +412,7 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start,
rcu_read_lock();
xa_for_each_start(&disk->part_tbl, idx, part, 1) {
- if (part->bd_partno != skip_partno &&
+ if (bdev_partno(part) != skip_partno &&
start < part->bd_start_sect + bdev_nr_sectors(part) &&
start + length > part->bd_start_sect) {
overlap = true;
@@ -469,7 +477,7 @@ int bdev_del_partition(struct gendisk *disk, int partno)
* Just delete the partition and invalidate it.
*/
- remove_inode_hash(part->bd_inode);
+ bdev_unhash(part);
invalidate_bdev(part);
drop_partition(part);
ret = 0;
@@ -555,9 +563,11 @@ static bool blk_add_partition(struct gendisk *disk,
part = add_partition(disk, p, from, size, state->parts[p].flags,
&state->parts[p].info);
- if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) {
- printk(KERN_ERR " %s: p%d could not be added: %pe\n",
- disk->disk_name, p, part);
+ if (IS_ERR(part)) {
+ if (PTR_ERR(part) != -ENXIO) {
+ printk(KERN_ERR " %s: p%d could not be added: %pe\n",
+ disk->disk_name, p, part);
+ }
return true;
}
@@ -573,10 +583,7 @@ static int blk_add_partitions(struct gendisk *disk)
struct parsed_partitions *state;
int ret = -EAGAIN, p;
- if (disk->flags & GENHD_FL_NO_PART)
- return 0;
-
- if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state))
+ if (!disk_has_partscan(disk))
return 0;
state = check_partition(disk);
@@ -655,7 +662,7 @@ rescan:
* it cannot be looked up any more even when openers
* still hold references.
*/
- remove_inode_hash(part->bd_inode);
+ bdev_unhash(part);
/*
* If @disk->open_partitions isn't elevated but there's
@@ -704,7 +711,7 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed);
void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p)
{
- struct address_space *mapping = state->disk->part0->bd_inode->i_mapping;
+ struct address_space *mapping = state->disk->part0->bd_mapping;
struct folio *folio;
if (n >= get_capacity(state->disk)) {
diff --git a/block/partitions/efi.c b/block/partitions/efi.c
index 5e9be13a56a8..7acba66eed48 100644
--- a/block/partitions/efi.c
+++ b/block/partitions/efi.c
@@ -682,7 +682,7 @@ static void utf16_le_to_7bit(const __le16 *in, unsigned int size, u8 *out)
out[size] = 0;
while (i < size) {
- u8 c = le16_to_cpu(in[i]) & 0xff;
+ u8 c = le16_to_cpu(in[i]) & 0x7f;
if (c && !isprint(c))
c = '!';
diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c
index 38e58960ae03..2bd42fedb907 100644
--- a/block/partitions/ldm.c
+++ b/block/partitions/ldm.c
@@ -131,8 +131,7 @@ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc)
ldm_crit ("Cannot find TOCBLOCK, database may be corrupt.");
return false;
}
- strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name));
- toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0;
+ strscpy_pad(toc->bitmap1_name, data + 0x24, sizeof(toc->bitmap1_name));
toc->bitmap1_start = get_unaligned_be64(data + 0x2E);
toc->bitmap1_size = get_unaligned_be64(data + 0x36);
@@ -142,8 +141,7 @@ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc)
TOC_BITMAP1, toc->bitmap1_name);
return false;
}
- strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name));
- toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0;
+ strscpy_pad(toc->bitmap2_name, data + 0x46, sizeof(toc->bitmap2_name));
toc->bitmap2_start = get_unaligned_be64(data + 0x50);
toc->bitmap2_size = get_unaligned_be64(data + 0x58);
if (strncmp (toc->bitmap2_name, TOC_BITMAP2,
diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h
index 0a747a0c782d..aa3bd050d8cd 100644
--- a/block/partitions/ldm.h
+++ b/block/partitions/ldm.h
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
* ldm - Part of the Linux-NTFS project.
*
* Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
@@ -15,7 +15,7 @@
#include <linux/types.h>
#include <linux/list.h>
#include <linux/fs.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <asm/byteorder.h>
struct parsed_partitions;
diff --git a/block/partitions/mac.c b/block/partitions/mac.c
index c80183156d68..b02530d98629 100644
--- a/block/partitions/mac.c
+++ b/block/partitions/mac.c
@@ -53,13 +53,25 @@ int mac_partition(struct parsed_partitions *state)
}
secsize = be16_to_cpu(md->block_size);
put_dev_sector(sect);
+
+ /*
+ * If the "block size" is not a power of 2, things get weird - we might
+ * end up with a partition straddling a sector boundary, so we wouldn't
+ * be able to read a partition entry with read_part_sector().
+ * Real block sizes are probably (?) powers of two, so just require
+ * that.
+ */
+ if (!is_power_of_2(secsize))
+ return -1;
datasize = round_down(secsize, 512);
data = read_part_sector(state, datasize / 512, &sect);
if (!data)
return -1;
partoffset = secsize % 512;
- if (partoffset + sizeof(*part) > datasize)
+ if (partoffset + sizeof(*part) > datasize) {
+ put_dev_sector(sect);
return -1;
+ }
part = (struct mac_partition *) (data + partoffset);
if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) {
put_dev_sector(sect);
@@ -112,8 +124,8 @@ int mac_partition(struct parsed_partitions *state)
int i, l;
goodness++;
- l = strlen(part->name);
- if (strcmp(part->name, "/") == 0)
+ l = strnlen(part->name, sizeof(part->name));
+ if (strncmp(part->name, "/", sizeof(part->name)) == 0)
goodness++;
for (i = 0; i <= l - 4; ++i) {
if (strncasecmp(part->name + i, "root",
diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c
index b5d5c229cc3b..073be78ba0b0 100644
--- a/block/partitions/msdos.c
+++ b/block/partitions/msdos.c
@@ -36,7 +36,7 @@
* the nr_sects and start_sect partition table entries are
* at a 2 (mod 4) address.
*/
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
static inline sector_t nr_sects(struct msdos_partition *p)
{
diff --git a/block/partitions/of.c b/block/partitions/of.c
new file mode 100644
index 000000000000..4e760fdffb3f
--- /dev/null
+++ b/block/partitions/of.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/blkdev.h>
+#include <linux/major.h>
+#include <linux/of.h>
+#include <linux/string.h>
+#include "check.h"
+
+static int validate_of_partition(struct device_node *np, int slot)
+{
+ u64 offset, size;
+ int len;
+
+ const __be32 *reg = of_get_property(np, "reg", &len);
+ int a_cells = of_n_addr_cells(np);
+ int s_cells = of_n_size_cells(np);
+
+ /* Make sure the reg length matches the expected addr and size cells */
+ if (len / sizeof(*reg) != a_cells + s_cells)
+ return -EINVAL;
+
+ /* Validate offset conversion from bytes to sectors */
+ offset = of_read_number(reg, a_cells);
+ if (offset % SECTOR_SIZE)
+ return -EINVAL;
+
+ /* Validate size conversion from bytes to sectors */
+ size = of_read_number(reg + a_cells, s_cells);
+ if (!size || size % SECTOR_SIZE)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void add_of_partition(struct parsed_partitions *state, int slot,
+ struct device_node *np)
+{
+ struct partition_meta_info *info;
+ char tmp[sizeof(info->volname) + 4];
+ const char *partname;
+ int len;
+
+ const __be32 *reg = of_get_property(np, "reg", &len);
+ int a_cells = of_n_addr_cells(np);
+ int s_cells = of_n_size_cells(np);
+
+ /* Convert byte offset and size to sectors */
+ u64 offset = of_read_number(reg, a_cells) / SECTOR_SIZE;
+ u64 size = of_read_number(reg + a_cells, s_cells) / SECTOR_SIZE;
+
+ put_partition(state, slot, offset, size);
+
+ if (of_property_read_bool(np, "read-only"))
+ state->parts[slot].flags |= ADDPART_FLAG_READONLY;
+
+ /*
+ * Follow MTD label logic: search for a label property and
+ * fall back to the node name if not found.
+ */
+ info = &state->parts[slot].info;
+ partname = of_get_property(np, "label", &len);
+ if (!partname)
+ partname = of_get_property(np, "name", &len);
+ strscpy(info->volname, partname, sizeof(info->volname));
+
+ snprintf(tmp, sizeof(tmp), "(%s)", info->volname);
+ strlcat(state->pp_buf, tmp, PAGE_SIZE);
+}
+
+int of_partition(struct parsed_partitions *state)
+{
+ struct device *ddev = disk_to_dev(state->disk);
+ struct device_node *np;
+ int slot;
+
+ struct device_node *partitions_np = of_node_get(ddev->of_node);
+
+ if (!partitions_np ||
+ !of_device_is_compatible(partitions_np, "fixed-partitions"))
+ return 0;
+
+ slot = 1;
+ /* Validate partition offset and size */
+ for_each_child_of_node(partitions_np, np) {
+ if (validate_of_partition(np, slot)) {
+ of_node_put(np);
+ of_node_put(partitions_np);
+
+ return -1;
+ }
+
+ slot++;
+ }
+
+ slot = 1;
+ for_each_child_of_node(partitions_np, np) {
+ if (slot >= state->limit) {
+ of_node_put(np);
+ break;
+ }
+
+ add_of_partition(state, slot, np);
+
+ slot++;
+ }
+
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+
+ return 1;
+}
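Worked illustration of the reg handling above (hypothetical node values): with #address-cells = <1> and #size-cells = <1>, a child node carrying reg = <0x100000 0x400000> describes a 1 MiB byte offset and a 4 MiB byte size; both pass the % SECTOR_SIZE checks in validate_of_partition(), and add_of_partition() hands put_partition() 0x100000 / 512 = 2048 and 0x400000 / 512 = 8192 sectors.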
diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c
index 9cc6b8c1eea4..b5ecddd5181a 100644
--- a/block/partitions/sgi.c
+++ b/block/partitions/sgi.c
@@ -50,8 +50,6 @@ int sgi_partition(struct parsed_partitions *state)
p = &label->partitions[0];
magic = label->magic_mushroom;
if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) {
- /*printk("Dev %s SGI disklabel: bad magic %08x\n",
- state->disk->disk_name, be32_to_cpu(magic));*/
put_dev_sector(sect);
return 0;
}
diff --git a/block/partitions/sun.c b/block/partitions/sun.c
index ddf9e6def4b2..2419af76120f 100644
--- a/block/partitions/sun.c
+++ b/block/partitions/sun.c
@@ -74,8 +74,6 @@ int sun_partition(struct parsed_partitions *state)
p = label->partitions;
if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) {
-/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n",
- state->disk->disk_name, be16_to_cpu(label->magic)); */
put_dev_sector(sect);
return 0;
}
diff --git a/block/sed-opal.c b/block/sed-opal.c
index 14fe0fef811c..5a28f23f7f22 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -314,7 +314,7 @@ static int read_sed_opal_key(const char *key_name, u_char *buffer, int buflen)
&key_type_user, key_name, true);
if (IS_ERR(kref))
- ret = PTR_ERR(kref);
+ return PTR_ERR(kref);
key = key_ref_to_ptr(kref);
down_read(&key->sem);
@@ -3037,6 +3037,29 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
return ret;
}
+static int opal_set_new_sid_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
+{
+ int ret;
+ struct opal_key *newkey = &opal_pw->new_user_pw.opal_key;
+ struct opal_key *oldkey = &opal_pw->session.opal_key;
+
+ const struct opal_step pw_steps[] = {
+ { start_SIDASP_opal_session, oldkey },
+ { set_sid_cpin_pin, newkey },
+ { end_opal_session, }
+ };
+
+ if (!dev)
+ return -ENODEV;
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev);
+ ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps));
+ mutex_unlock(&dev->dev_lock);
+
+ return ret;
+}
+
static int opal_activate_user(struct opal_dev *dev,
struct opal_session_info *opal_session)
{
@@ -3286,6 +3309,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
case IOC_OPAL_DISCOVERY:
ret = opal_get_discv(dev, p);
break;
+ case IOC_OPAL_SET_SID_PW:
+ ret = opal_set_new_sid_pw(dev, p);
+ break;
default:
break;
diff --git a/block/t10-pi.c b/block/t10-pi.c
index d90892fd6f2a..851db518ee5e 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -8,20 +8,25 @@
#include <linux/blk-integrity.h>
#include <linux/crc-t10dif.h>
#include <linux/crc64.h>
-#include <linux/module.h>
#include <net/checksum.h>
-#include <asm/unaligned.h>
-
-typedef __be16 (csum_fn) (__be16, void *, unsigned int);
-
-static __be16 t10_pi_crc_fn(__be16 crc, void *data, unsigned int len)
-{
- return cpu_to_be16(crc_t10dif_update(be16_to_cpu(crc), data, len));
-}
+#include <linux/unaligned.h>
+#include "blk.h"
+
+struct blk_integrity_iter {
+ void *prot_buf;
+ void *data_buf;
+ sector_t seed;
+ unsigned int data_size;
+ unsigned short interval;
+ const char *disk_name;
+};
-static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len)
+static __be16 t10_pi_csum(__be16 csum, void *data, unsigned int len,
+ unsigned char csum_type)
{
- return (__force __be16)ip_compute_csum(data, len);
+ if (csum_type == BLK_INTEGRITY_CSUM_IP)
+ return (__force __be16)ip_compute_csum(data, len);
+ return cpu_to_be16(crc_t10dif_update(be16_to_cpu(csum), data, len));
}
/*
@@ -29,48 +34,44 @@ static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len)
* 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref
* tag.
*/
-static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter,
- csum_fn *fn, enum t10_dif_type type)
+static void t10_pi_generate(struct blk_integrity_iter *iter,
+ struct blk_integrity *bi)
{
- u8 offset = iter->pi_offset;
+ u8 offset = bi->pi_offset;
unsigned int i;
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct t10_pi_tuple *pi = iter->prot_buf + offset;
- pi->guard_tag = fn(0, iter->data_buf, iter->interval);
+ pi->guard_tag = t10_pi_csum(0, iter->data_buf, iter->interval,
+ bi->csum_type);
if (offset)
- pi->guard_tag = fn(pi->guard_tag, iter->prot_buf,
- offset);
+ pi->guard_tag = t10_pi_csum(pi->guard_tag,
+ iter->prot_buf, offset, bi->csum_type);
pi->app_tag = 0;
- if (type == T10_PI_TYPE1_PROTECTION)
+ if (bi->flags & BLK_INTEGRITY_REF_TAG)
pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed));
else
pi->ref_tag = 0;
iter->data_buf += iter->interval;
- iter->prot_buf += iter->tuple_size;
+ iter->prot_buf += bi->tuple_size;
iter->seed++;
}
-
- return BLK_STS_OK;
}
static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
- csum_fn *fn, enum t10_dif_type type)
+ struct blk_integrity *bi)
{
- u8 offset = iter->pi_offset;
+ u8 offset = bi->pi_offset;
unsigned int i;
- BUG_ON(type == T10_PI_TYPE0_PROTECTION);
-
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct t10_pi_tuple *pi = iter->prot_buf + offset;
__be16 csum;
- if (type == T10_PI_TYPE1_PROTECTION ||
- type == T10_PI_TYPE2_PROTECTION) {
+ if (bi->flags & BLK_INTEGRITY_REF_TAG) {
if (pi->app_tag == T10_PI_APP_ESCAPE)
goto next;
@@ -82,15 +83,17 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
iter->seed, be32_to_cpu(pi->ref_tag));
return BLK_STS_PROTECTION;
}
- } else if (type == T10_PI_TYPE3_PROTECTION) {
+ } else {
if (pi->app_tag == T10_PI_APP_ESCAPE &&
pi->ref_tag == T10_PI_REF_ESCAPE)
goto next;
}
- csum = fn(0, iter->data_buf, iter->interval);
+ csum = t10_pi_csum(0, iter->data_buf, iter->interval,
+ bi->csum_type);
if (offset)
- csum = fn(csum, iter->prot_buf, offset);
+ csum = t10_pi_csum(csum, iter->prot_buf, offset,
+ bi->csum_type);
if (pi->guard_tag != csum) {
pr_err("%s: guard tag error at sector %llu " \
@@ -102,33 +105,13 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
next:
iter->data_buf += iter->interval;
- iter->prot_buf += iter->tuple_size;
+ iter->prot_buf += bi->tuple_size;
iter->seed++;
}
return BLK_STS_OK;
}
-static blk_status_t t10_pi_type1_generate_crc(struct blk_integrity_iter *iter)
-{
- return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION);
-}
-
-static blk_status_t t10_pi_type1_generate_ip(struct blk_integrity_iter *iter)
-{
- return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION);
-}
-
-static blk_status_t t10_pi_type1_verify_crc(struct blk_integrity_iter *iter)
-{
- return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION);
-}
-
-static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter)
-{
- return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION);
-}
-
/**
* t10_pi_type1_prepare - prepare PI prior submitting request to device
* @rq: request with PI that should be prepared
@@ -141,7 +124,7 @@ static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter)
*/
static void t10_pi_type1_prepare(struct request *rq)
{
- struct blk_integrity *bi = &rq->q->integrity;
+ struct blk_integrity *bi = &rq->q->limits.integrity;
const int tuple_sz = bi->tuple_size;
u32 ref_tag = t10_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
@@ -192,7 +175,7 @@ static void t10_pi_type1_prepare(struct request *rq)
*/
static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
{
- struct blk_integrity *bi = &rq->q->integrity;
+ struct blk_integrity *bi = &rq->q->limits.integrity;
unsigned intervals = nr_bytes >> bi->interval_exp;
const int tuple_sz = bi->tuple_size;
u32 ref_tag = t10_pi_ref_tag(rq);
@@ -225,81 +208,15 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
}
}
-static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter)
-{
- return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION);
-}
-
-static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter)
-{
- return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION);
-}
-
-static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter)
-{
- return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION);
-}
-
-static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
-{
- return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION);
-}
-
-/* Type 3 does not have a reference tag so no remapping is required. */
-static void t10_pi_type3_prepare(struct request *rq)
-{
-}
-
-/* Type 3 does not have a reference tag so no remapping is required. */
-static void t10_pi_type3_complete(struct request *rq, unsigned int nr_bytes)
-{
-}
-
-const struct blk_integrity_profile t10_pi_type1_crc = {
- .name = "T10-DIF-TYPE1-CRC",
- .generate_fn = t10_pi_type1_generate_crc,
- .verify_fn = t10_pi_type1_verify_crc,
- .prepare_fn = t10_pi_type1_prepare,
- .complete_fn = t10_pi_type1_complete,
-};
-EXPORT_SYMBOL(t10_pi_type1_crc);
-
-const struct blk_integrity_profile t10_pi_type1_ip = {
- .name = "T10-DIF-TYPE1-IP",
- .generate_fn = t10_pi_type1_generate_ip,
- .verify_fn = t10_pi_type1_verify_ip,
- .prepare_fn = t10_pi_type1_prepare,
- .complete_fn = t10_pi_type1_complete,
-};
-EXPORT_SYMBOL(t10_pi_type1_ip);
-
-const struct blk_integrity_profile t10_pi_type3_crc = {
- .name = "T10-DIF-TYPE3-CRC",
- .generate_fn = t10_pi_type3_generate_crc,
- .verify_fn = t10_pi_type3_verify_crc,
- .prepare_fn = t10_pi_type3_prepare,
- .complete_fn = t10_pi_type3_complete,
-};
-EXPORT_SYMBOL(t10_pi_type3_crc);
-
-const struct blk_integrity_profile t10_pi_type3_ip = {
- .name = "T10-DIF-TYPE3-IP",
- .generate_fn = t10_pi_type3_generate_ip,
- .verify_fn = t10_pi_type3_verify_ip,
- .prepare_fn = t10_pi_type3_prepare,
- .complete_fn = t10_pi_type3_complete,
-};
-EXPORT_SYMBOL(t10_pi_type3_ip);
-
static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len)
{
- return cpu_to_be64(crc64_rocksoft_update(crc, data, len));
+ return cpu_to_be64(crc64_nvme(crc, data, len));
}
-static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter,
- enum t10_dif_type type)
+static void ext_pi_crc64_generate(struct blk_integrity_iter *iter,
+ struct blk_integrity *bi)
{
- u8 offset = iter->pi_offset;
+ u8 offset = bi->pi_offset;
unsigned int i;
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
@@ -311,30 +228,28 @@ static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter,
iter->prot_buf, offset);
pi->app_tag = 0;
- if (type == T10_PI_TYPE1_PROTECTION)
+ if (bi->flags & BLK_INTEGRITY_REF_TAG)
put_unaligned_be48(iter->seed, pi->ref_tag);
else
put_unaligned_be48(0ULL, pi->ref_tag);
iter->data_buf += iter->interval;
- iter->prot_buf += iter->tuple_size;
+ iter->prot_buf += bi->tuple_size;
iter->seed++;
}
-
- return BLK_STS_OK;
}
-static bool ext_pi_ref_escape(u8 *ref_tag)
+static bool ext_pi_ref_escape(const u8 ref_tag[6])
{
- static u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+ static const u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
return memcmp(ref_tag, ref_escape, sizeof(ref_escape)) == 0;
}
static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
- enum t10_dif_type type)
+ struct blk_integrity *bi)
{
- u8 offset = iter->pi_offset;
+ u8 offset = bi->pi_offset;
unsigned int i;
for (i = 0; i < iter->data_size; i += iter->interval) {
@@ -342,7 +257,7 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
u64 ref, seed;
__be64 csum;
- if (type == T10_PI_TYPE1_PROTECTION) {
+ if (bi->flags & BLK_INTEGRITY_REF_TAG) {
if (pi->app_tag == T10_PI_APP_ESCAPE)
goto next;
@@ -353,7 +268,7 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
iter->disk_name, seed, ref);
return BLK_STS_PROTECTION;
}
- } else if (type == T10_PI_TYPE3_PROTECTION) {
+ } else {
if (pi->app_tag == T10_PI_APP_ESCAPE &&
ext_pi_ref_escape(pi->ref_tag))
goto next;
@@ -374,26 +289,16 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
next:
iter->data_buf += iter->interval;
- iter->prot_buf += iter->tuple_size;
+ iter->prot_buf += bi->tuple_size;
iter->seed++;
}
return BLK_STS_OK;
}
-static blk_status_t ext_pi_type1_verify_crc64(struct blk_integrity_iter *iter)
-{
- return ext_pi_crc64_verify(iter, T10_PI_TYPE1_PROTECTION);
-}
-
-static blk_status_t ext_pi_type1_generate_crc64(struct blk_integrity_iter *iter)
-{
- return ext_pi_crc64_generate(iter, T10_PI_TYPE1_PROTECTION);
-}
-
static void ext_pi_type1_prepare(struct request *rq)
{
- struct blk_integrity *bi = &rq->q->integrity;
+ struct blk_integrity *bi = &rq->q->limits.integrity;
const int tuple_sz = bi->tuple_size;
u64 ref_tag = ext_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
@@ -433,7 +338,7 @@ static void ext_pi_type1_prepare(struct request *rq)
static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
{
- struct blk_integrity *bi = &rq->q->integrity;
+ struct blk_integrity *bi = &rq->q->limits.integrity;
unsigned intervals = nr_bytes >> bi->interval_exp;
const int tuple_sz = bi->tuple_size;
u64 ref_tag = ext_pi_ref_tag(rq);
@@ -467,33 +372,102 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
}
}
-static blk_status_t ext_pi_type3_verify_crc64(struct blk_integrity_iter *iter)
+void blk_integrity_generate(struct bio *bio)
+{
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+ struct blk_integrity_iter iter;
+ struct bvec_iter bviter;
+ struct bio_vec bv;
+
+ iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
+ iter.interval = 1 << bi->interval_exp;
+ iter.seed = bio->bi_iter.bi_sector;
+ iter.prot_buf = bvec_virt(bip->bip_vec);
+ bio_for_each_segment(bv, bio, bviter) {
+ void *kaddr = bvec_kmap_local(&bv);
+
+ iter.data_buf = kaddr;
+ iter.data_size = bv.bv_len;
+ switch (bi->csum_type) {
+ case BLK_INTEGRITY_CSUM_CRC64:
+ ext_pi_crc64_generate(&iter, bi);
+ break;
+ case BLK_INTEGRITY_CSUM_CRC:
+ case BLK_INTEGRITY_CSUM_IP:
+ t10_pi_generate(&iter, bi);
+ break;
+ default:
+ break;
+ }
+ kunmap_local(kaddr);
+ }
+}
+
+void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter)
{
- return ext_pi_crc64_verify(iter, T10_PI_TYPE3_PROTECTION);
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+ struct blk_integrity_iter iter;
+ struct bvec_iter bviter;
+ struct bio_vec bv;
+
+ /*
+ * At the moment verify is called, bi_iter has been advanced during split
+ * and completion, so use the copy created during submission here.
+ */
+ iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
+ iter.interval = 1 << bi->interval_exp;
+ iter.seed = saved_iter->bi_sector;
+ iter.prot_buf = bvec_virt(bip->bip_vec);
+ __bio_for_each_segment(bv, bio, bviter, *saved_iter) {
+ void *kaddr = bvec_kmap_local(&bv);
+ blk_status_t ret = BLK_STS_OK;
+
+ iter.data_buf = kaddr;
+ iter.data_size = bv.bv_len;
+ switch (bi->csum_type) {
+ case BLK_INTEGRITY_CSUM_CRC64:
+ ret = ext_pi_crc64_verify(&iter, bi);
+ break;
+ case BLK_INTEGRITY_CSUM_CRC:
+ case BLK_INTEGRITY_CSUM_IP:
+ ret = t10_pi_verify(&iter, bi);
+ break;
+ default:
+ break;
+ }
+ kunmap_local(kaddr);
+
+ if (ret) {
+ bio->bi_status = ret;
+ return;
+ }
+ }
}
-static blk_status_t ext_pi_type3_generate_crc64(struct blk_integrity_iter *iter)
+void blk_integrity_prepare(struct request *rq)
{
- return ext_pi_crc64_generate(iter, T10_PI_TYPE3_PROTECTION);
+ struct blk_integrity *bi = &rq->q->limits.integrity;
+
+ if (!(bi->flags & BLK_INTEGRITY_REF_TAG))
+ return;
+
+ if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64)
+ ext_pi_type1_prepare(rq);
+ else
+ t10_pi_type1_prepare(rq);
}
-const struct blk_integrity_profile ext_pi_type1_crc64 = {
- .name = "EXT-DIF-TYPE1-CRC64",
- .generate_fn = ext_pi_type1_generate_crc64,
- .verify_fn = ext_pi_type1_verify_crc64,
- .prepare_fn = ext_pi_type1_prepare,
- .complete_fn = ext_pi_type1_complete,
-};
-EXPORT_SYMBOL_GPL(ext_pi_type1_crc64);
-
-const struct blk_integrity_profile ext_pi_type3_crc64 = {
- .name = "EXT-DIF-TYPE3-CRC64",
- .generate_fn = ext_pi_type3_generate_crc64,
- .verify_fn = ext_pi_type3_verify_crc64,
- .prepare_fn = t10_pi_type3_prepare,
- .complete_fn = t10_pi_type3_complete,
-};
-EXPORT_SYMBOL_GPL(ext_pi_type3_crc64);
+void blk_integrity_complete(struct request *rq, unsigned int nr_bytes)
+{
+ struct blk_integrity *bi = &rq->q->limits.integrity;
-MODULE_LICENSE("GPL");
+ if (!(bi->flags & BLK_INTEGRITY_REF_TAG))
+ return;
+
+ if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64)
+ ext_pi_type1_complete(rq, nr_bytes);
+ else
+ t10_pi_type1_complete(rq, nr_bytes);
+}
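Rough illustration of the per-interval loops above (assumed values): for a bio_vec with bv_len = 4096 and bi->interval_exp = 9 (a 512-byte protection interval), blk_integrity_generate() and blk_integrity_verify_iter() step through 4096 / 512 = 8 intervals per segment, computing or checking one guard tag each time while advancing prot_buf by bi->tuple_size and incrementing the reference-tag seed by one per interval.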