summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig14
-rw-r--r--block/Makefile3
-rw-r--r--block/bdev.c13
-rw-r--r--block/bfq-cgroup.c16
-rw-r--r--block/bfq-iosched.c51
-rw-r--r--block/bfq-iosched.h2
-rw-r--r--block/bfq-wf2q.c17
-rw-r--r--block/bio-integrity.c2
-rw-r--r--block/bio.c194
-rw-r--r--block/blk-cgroup-rwstat.h2
-rw-r--r--block/blk-cgroup.c52
-rw-r--r--block/blk-cgroup.h494
-rw-r--r--block/blk-core.c322
-rw-r--r--block/blk-crypto-fallback.c3
-rw-r--r--block/blk-crypto-internal.h12
-rw-r--r--block/blk-crypto-sysfs.c172
-rw-r--r--block/blk-crypto.c4
-rw-r--r--block/blk-flush.c4
-rw-r--r--block/blk-ioc.c3
-rw-r--r--block/blk-iocost.c14
-rw-r--r--block/blk-iolatency.c4
-rw-r--r--block/blk-ioprio.c2
-rw-r--r--block/blk-lib.c132
-rw-r--r--block/blk-merge.c87
-rw-r--r--block/blk-mq-debugfs.c30
-rw-r--r--block/blk-mq-debugfs.h2
-rw-r--r--block/blk-mq-sched.c18
-rw-r--r--block/blk-mq-sysfs.c16
-rw-r--r--block/blk-mq-tag.c6
-rw-r--r--block/blk-mq.c339
-rw-r--r--block/blk-mq.h2
-rw-r--r--block/blk-rq-qos.h20
-rw-r--r--block/blk-settings.c16
-rw-r--r--block/blk-sysfs.c47
-rw-r--r--block/blk-throttle.c110
-rw-r--r--block/blk-throttle.h19
-rw-r--r--block/blk-wbt.h3
-rw-r--r--block/blk-zoned.c15
-rw-r--r--block/blk.h11
-rw-r--r--block/bounce.c15
-rw-r--r--block/disk-events.c2
-rw-r--r--block/elevator.c16
-rw-r--r--block/fops.c41
-rw-r--r--block/genhd.c73
-rw-r--r--block/holder.c2
-rw-r--r--block/ioctl.c2
-rw-r--r--block/mq-deadline.c1
-rw-r--r--block/partitions/check.h1
-rw-r--r--block/partitions/core.c1
-rw-r--r--block/partitions/efi.h1
-rw-r--r--block/partitions/ldm.h1
-rw-r--r--block/sed-opal.c2
-rw-r--r--block/t10-pi.c198
53 files changed, 1710 insertions, 919 deletions
diff --git a/block/Kconfig b/block/Kconfig
index d5d4197b7ed2..50b17e260fa2 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -26,6 +26,16 @@ menuconfig BLOCK
if BLOCK
+config BLOCK_LEGACY_AUTOLOAD
+ bool "Legacy autoloading support"
+ default y
+ help
+ Enable loading modules and creating block device instances based on
+ accesses through their device special file. This is a historic Linux
+ feature and makes no sense in a udev world where device files are
+ created on demand, but scripts that manually create device nodes and
+ then call losetup might rely on this behavior.
+
config BLK_RQ_ALLOC_TIME
bool
@@ -63,6 +73,7 @@ config BLK_DEV_INTEGRITY_T10
tristate
depends on BLK_DEV_INTEGRITY
select CRC_T10DIF
+ select CRC64_ROCKSOFT
config BLK_DEV_ZONED
bool "Zoned block device support"
@@ -218,6 +229,9 @@ config BLK_PM
config BLOCK_HOLDER_DEPRECATED
bool
+config BLK_MQ_STACKING
+ bool
+
source "block/Kconfig.iosched"
endif # BLOCK
diff --git a/block/Makefile b/block/Makefile
index f38eaa612929..3950ecbc5c26 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
-obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o
+obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \
+ blk-crypto-sysfs.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o
obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o
diff --git a/block/bdev.c b/block/bdev.c
index 102837a37051..13de871fa816 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -385,7 +385,7 @@ static struct kmem_cache * bdev_cachep __read_mostly;
static struct inode *bdev_alloc_inode(struct super_block *sb)
{
- struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
+ struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL);
if (!ei)
return NULL;
@@ -678,7 +678,7 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
if (test_bit(GD_NEED_PART_SCAN, &disk->state))
bdev_disk_changed(disk, false);
bdev->bd_openers++;
- return 0;;
+ return 0;
}
static void blkdev_put_whole(struct block_device *bdev, fmode_t mode)
@@ -733,12 +733,15 @@ struct block_device *blkdev_get_no_open(dev_t dev)
struct inode *inode;
inode = ilookup(blockdev_superblock, dev);
- if (!inode) {
+ if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
blk_request_module(dev);
inode = ilookup(blockdev_superblock, dev);
- if (!inode)
- return NULL;
+ if (inode)
+ pr_warn_ratelimited(
+"block device autoloading is deprecated and will be removed.\n");
}
+ if (!inode)
+ return NULL;
/* switch from the inode reference to a device mode one: */
bdev = &BDEV_I(inode)->bdev;
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 24a5c5329bcd..420eda2589c0 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -645,8 +645,22 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_group *bfqg)
{
struct bfq_entity *entity = &bfqq->entity;
+ struct bfq_group *old_parent = bfqq_group(bfqq);
/*
+ * No point to move bfqq to the same group, which can happen when
+ * root group is offlined
+ */
+ if (old_parent == bfqg)
+ return;
+
+ /*
+ * oom_bfqq is not allowed to move, oom_bfqq will hold ref to root_group
+ * until elevator exit.
+ */
+ if (bfqq == &bfqd->oom_bfqq)
+ return;
+ /*
* Get extra reference to prevent bfqq from being freed in
* next possible expire or deactivate.
*/
@@ -666,7 +680,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_deactivate_bfqq(bfqd, bfqq, false, false);
else if (entity->on_st_or_in_serv)
bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
- bfqg_and_blkg_put(bfqq_group(bfqq));
+ bfqg_and_blkg_put(old_parent);
if (entity->parent &&
entity->parent->last_bfqq_created == bfqq)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 36a66e97e3c2..1f62dbdc521f 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -569,7 +569,7 @@ static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
struct bfq_entity *entity = &bfqq->entity;
struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH];
struct bfq_entity **entities = inline_entities;
- int depth, level;
+ int depth, level, alloc_depth = BFQ_LIMIT_INLINE_DEPTH;
int class_idx = bfqq->ioprio_class - 1;
struct bfq_sched_data *sched_data;
unsigned long wsum;
@@ -578,15 +578,21 @@ static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
if (!entity->on_st_or_in_serv)
return false;
+retry:
+ spin_lock_irq(&bfqd->lock);
/* +1 for bfqq entity, root cgroup not included */
depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1;
- if (depth > BFQ_LIMIT_INLINE_DEPTH) {
+ if (depth > alloc_depth) {
+ spin_unlock_irq(&bfqd->lock);
+ if (entities != inline_entities)
+ kfree(entities);
entities = kmalloc_array(depth, sizeof(*entities), GFP_NOIO);
if (!entities)
return false;
+ alloc_depth = depth;
+ goto retry;
}
- spin_lock_irq(&bfqd->lock);
sched_data = entity->sched_data;
/* Gather our ancestors as we need to traverse them in reverse order */
level = 0;
@@ -774,7 +780,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
if (!bfqq->next_rq)
return;
- bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+ bfqq->pos_root = &bfqq_group(bfqq)->rq_pos_tree;
__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
blk_rq_pos(bfqq->next_rq), &parent, &p);
if (!__bfqq) {
@@ -2153,7 +2159,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqq->waker_detection_started = now_ns;
bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name,
MAX_BFQQ_NAME_LENGTH);
- bfq_log_bfqq(bfqd, bfqq, "set tenative waker %s", waker_name);
+ bfq_log_bfqq(bfqd, bfqq, "set tentative waker %s", waker_name);
} else /* Same tentative waker queue detected again */
bfqq->num_waker_detections++;
@@ -2669,7 +2675,7 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
sector_t sector)
{
- struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+ struct rb_root *root = &bfqq_group(bfqq)->rq_pos_tree;
struct rb_node *parent, *node;
struct bfq_queue *__bfqq;
@@ -2782,6 +2788,15 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
* are likely to increase the throughput.
*/
bfqq->new_bfqq = new_bfqq;
+ /*
+ * The above assignment schedules the following redirections:
+ * each time some I/O for bfqq arrives, the process that
+ * generated that I/O is disassociated from bfqq and
+ * associated with new_bfqq. Here we increases new_bfqq->ref
+ * in advance, adding the number of processes that are
+ * expected to be associated with new_bfqq as they happen to
+ * issue I/O.
+ */
new_bfqq->ref += process_refs;
return new_bfqq;
}
@@ -2844,6 +2859,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
{
struct bfq_queue *in_service_bfqq, *new_bfqq;
+ /* if a merge has already been setup, then proceed with that first */
+ if (bfqq->new_bfqq)
+ return bfqq->new_bfqq;
+
/*
* Check delayed stable merge for rotational or non-queueing
* devs. For this branch to be executed, bfqq must not be
@@ -2945,9 +2964,6 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (bfq_too_late_for_merging(bfqq))
return NULL;
- if (bfqq->new_bfqq)
- return bfqq->new_bfqq;
-
if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
return NULL;
@@ -5181,7 +5197,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
struct request *rq;
struct bfq_queue *in_serv_queue;
- bool waiting_rq, idle_timer_disabled;
+ bool waiting_rq, idle_timer_disabled = false;
spin_lock_irq(&bfqd->lock);
@@ -5189,14 +5205,15 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
rq = __bfq_dispatch_request(hctx);
-
- idle_timer_disabled =
- waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
+ if (in_serv_queue == bfqd->in_service_queue) {
+ idle_timer_disabled =
+ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
+ }
spin_unlock_irq(&bfqd->lock);
-
- bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue,
- idle_timer_disabled);
+ bfq_update_dispatch_stats(hctx->queue, rq,
+ idle_timer_disabled ? in_serv_queue : NULL,
+ idle_timer_disabled);
return rq;
}
@@ -5448,7 +5465,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
bfqq = bic_to_bfqq(bic, false);
if (bfqq) {
bfq_release_process_ref(bfqd, bfqq);
- bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, true);
+ bfqq = bfq_get_queue(bfqd, bio, false, bic, true);
bic_set_bfqq(bic, bfqq, false);
}
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 07288b9da389..3b83e3d1c2e5 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -8,7 +8,6 @@
#include <linux/blktrace_api.h>
#include <linux/hrtimer.h>
-#include <linux/blk-cgroup.h>
#include "blk-cgroup-rwstat.h"
@@ -1051,7 +1050,6 @@ extern struct blkcg_policy blkcg_policy_bfq;
for (parent = NULL; entity ; entity = parent)
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
-struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd);
struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index b74cc0da118e..f8eb340381cf 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -142,16 +142,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
#ifdef CONFIG_BFQ_GROUP_IOSCHED
-struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
- struct bfq_entity *group_entity = bfqq->entity.parent;
-
- if (!group_entity)
- group_entity = &bfqq->bfqd->root_group->entity;
-
- return container_of(group_entity, struct bfq_group, entity);
-}
-
/*
* Returns true if this budget changes may let next_in_service->parent
* become the next_in_service entity for its parent entity.
@@ -230,11 +220,6 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
#else /* CONFIG_BFQ_GROUP_IOSCHED */
-struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
- return bfqq->bfqd->root_group;
-}
-
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
return false;
@@ -519,7 +504,7 @@ unsigned short bfq_ioprio_to_weight(int ioprio)
static unsigned short bfq_weight_to_ioprio(int weight)
{
return max_t(int, 0,
- IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF - weight);
+ IOPRIO_NR_LEVELS - weight / BFQ_WEIGHT_CONVERSION_COEFF);
}
static void bfq_get_entity(struct bfq_entity *entity)
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 0827b19820c5..32929c89ba8a 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -165,6 +165,7 @@ static blk_status_t bio_integrity_process(struct bio *bio,
iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
iter.interval = 1 << bi->interval_exp;
+ iter.tuple_size = bi->tuple_size;
iter.seed = proc_iter->bi_sector;
iter.prot_buf = bvec_virt(bip->bip_vec);
@@ -420,7 +421,6 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
return 0;
}
-EXPORT_SYMBOL(bio_integrity_clone);
int bioset_integrity_create(struct bio_set *bs, int pool_size)
{
diff --git a/block/bio.c b/block/bio.c
index 4312a8085396..4259125e16ab 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -15,7 +15,6 @@
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
-#include <linux/blk-cgroup.h>
#include <linux/highmem.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>
@@ -24,6 +23,7 @@
#include <trace/events/block.h>
#include "blk.h"
#include "blk-rq-qos.h"
+#include "blk-cgroup.h"
struct bio_alloc_cache {
struct bio *free_list;
@@ -249,15 +249,14 @@ static void bio_free(struct bio *bio)
* they must remember to pair any call to bio_init() with bio_uninit()
* when IO has completed, or when the bio is released.
*/
-void bio_init(struct bio *bio, struct bio_vec *table,
- unsigned short max_vecs)
+void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
+ unsigned short max_vecs, unsigned int opf)
{
bio->bi_next = NULL;
- bio->bi_bdev = NULL;
- bio->bi_opf = 0;
+ bio->bi_bdev = bdev;
+ bio->bi_opf = opf;
bio->bi_flags = 0;
bio->bi_ioprio = 0;
- bio->bi_write_hint = 0;
bio->bi_status = 0;
bio->bi_iter.bi_sector = 0;
bio->bi_iter.bi_size = 0;
@@ -268,6 +267,8 @@ void bio_init(struct bio *bio, struct bio_vec *table,
#ifdef CONFIG_BLK_CGROUP
bio->bi_blkg = NULL;
bio->bi_issue.value = 0;
+ if (bdev)
+ bio_associate_blkg(bio);
#ifdef CONFIG_BLK_CGROUP_IOCOST
bio->bi_iocost_cost = 0;
#endif
@@ -293,6 +294,8 @@ EXPORT_SYMBOL(bio_init);
/**
* bio_reset - reinitialize a bio
* @bio: bio to reset
+ * @bdev: block device to use the bio for
+ * @opf: operation and flags for bio
*
* Description:
* After calling bio_reset(), @bio will be in the same state as a freshly
@@ -300,11 +303,15 @@ EXPORT_SYMBOL(bio_init);
* preserved are the ones that are initialized by bio_alloc_bioset(). See
* comment in struct bio.
*/
-void bio_reset(struct bio *bio)
+void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf)
{
bio_uninit(bio);
memset(bio, 0, BIO_RESET_BYTES);
atomic_set(&bio->__bi_remaining, 1);
+ bio->bi_bdev = bdev;
+ if (bio->bi_bdev)
+ bio_associate_blkg(bio);
+ bio->bi_opf = opf;
}
EXPORT_SYMBOL(bio_reset);
@@ -344,6 +351,20 @@ void bio_chain(struct bio *bio, struct bio *parent)
}
EXPORT_SYMBOL(bio_chain);
+struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
+ unsigned int nr_pages, unsigned int opf, gfp_t gfp)
+{
+ struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp);
+
+ if (bio) {
+ bio_chain(bio, new);
+ submit_bio(bio);
+ }
+
+ return new;
+}
+EXPORT_SYMBOL_GPL(blk_next_bio);
+
static void bio_alloc_rescue(struct work_struct *work)
{
struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
@@ -400,8 +421,10 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
/**
* bio_alloc_bioset - allocate a bio for I/O
+ * @bdev: block device to allocate the bio for (can be %NULL)
+ * @nr_vecs: number of bvecs to pre-allocate
+ * @opf: operation and flags for bio
* @gfp_mask: the GFP_* mask given to the slab allocator
- * @nr_iovecs: number of iovecs to pre-allocate
* @bs: the bio_set to allocate from.
*
* Allocate a bio from the mempools in @bs.
@@ -430,15 +453,16 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
*
* Returns: Pointer to new bio on success, NULL on failure.
*/
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs,
+struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
+ unsigned int opf, gfp_t gfp_mask,
struct bio_set *bs)
{
gfp_t saved_gfp = gfp_mask;
struct bio *bio;
void *p;
- /* should not use nobvec bioset for nr_iovecs > 0 */
- if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0))
+ /* should not use nobvec bioset for nr_vecs > 0 */
+ if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0))
return NULL;
/*
@@ -475,23 +499,23 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs,
return NULL;
bio = p + bs->front_pad;
- if (nr_iovecs > BIO_INLINE_VECS) {
+ if (nr_vecs > BIO_INLINE_VECS) {
struct bio_vec *bvl = NULL;
- bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
+ bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
if (!bvl && gfp_mask != saved_gfp) {
punt_bios_to_rescuer(bs);
gfp_mask = saved_gfp;
- bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
+ bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
}
if (unlikely(!bvl))
goto err_free;
- bio_init(bio, bvl, nr_iovecs);
- } else if (nr_iovecs) {
- bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS);
+ bio_init(bio, bdev, bvl, nr_vecs, opf);
+ } else if (nr_vecs) {
+ bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf);
} else {
- bio_init(bio, NULL, 0);
+ bio_init(bio, bdev, NULL, 0, opf);
}
bio->bi_pool = bs;
@@ -522,7 +546,8 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs)
bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask);
if (unlikely(!bio))
return NULL;
- bio_init(bio, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs);
+ bio_init(bio, NULL, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs,
+ 0);
bio->bi_pool = NULL;
return bio;
}
@@ -702,80 +727,83 @@ void bio_put(struct bio *bio)
}
EXPORT_SYMBOL(bio_put);
-/**
- * __bio_clone_fast - clone a bio that shares the original bio's biovec
- * @bio: destination bio
- * @bio_src: bio to clone
- *
- * Clone a &bio. Caller will own the returned bio, but not
- * the actual data it points to. Reference count of returned
- * bio will be one.
- *
- * Caller must ensure that @bio_src is not freed before @bio.
- */
-void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
+static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
{
- WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs);
-
- /*
- * most users will be overriding ->bi_bdev with a new target,
- * so we don't set nor calculate new physical/hw segment counts here
- */
- bio->bi_bdev = bio_src->bi_bdev;
bio_set_flag(bio, BIO_CLONED);
if (bio_flagged(bio_src, BIO_THROTTLED))
bio_set_flag(bio, BIO_THROTTLED);
- if (bio_flagged(bio_src, BIO_REMAPPED))
+ if (bio->bi_bdev == bio_src->bi_bdev &&
+ bio_flagged(bio_src, BIO_REMAPPED))
bio_set_flag(bio, BIO_REMAPPED);
- bio->bi_opf = bio_src->bi_opf;
bio->bi_ioprio = bio_src->bi_ioprio;
- bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter = bio_src->bi_iter;
- bio->bi_io_vec = bio_src->bi_io_vec;
bio_clone_blkg_association(bio, bio_src);
blkcg_bio_issue_init(bio);
+
+ if (bio_crypt_clone(bio, bio_src, gfp) < 0)
+ return -ENOMEM;
+ if (bio_integrity(bio_src) &&
+ bio_integrity_clone(bio, bio_src, gfp) < 0)
+ return -ENOMEM;
+ return 0;
}
-EXPORT_SYMBOL(__bio_clone_fast);
/**
- * bio_clone_fast - clone a bio that shares the original bio's biovec
- * @bio: bio to clone
- * @gfp_mask: allocation priority
- * @bs: bio_set to allocate from
+ * bio_alloc_clone - clone a bio that shares the original bio's biovec
+ * @bdev: block_device to clone onto
+ * @bio_src: bio to clone from
+ * @gfp: allocation priority
+ * @bs: bio_set to allocate from
+ *
+ * Allocate a new bio that is a clone of @bio_src. The caller owns the returned
+ * bio, but not the actual data it points to.
*
- * Like __bio_clone_fast, only also allocates the returned bio
+ * The caller must ensure that the return bio is not freed before @bio_src.
*/
-struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
+struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src,
+ gfp_t gfp, struct bio_set *bs)
{
- struct bio *b;
+ struct bio *bio;
- b = bio_alloc_bioset(gfp_mask, 0, bs);
- if (!b)
+ bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs);
+ if (!bio)
return NULL;
- __bio_clone_fast(b, bio);
-
- if (bio_crypt_clone(b, bio, gfp_mask) < 0)
- goto err_put;
-
- if (bio_integrity(bio) &&
- bio_integrity_clone(b, bio, gfp_mask) < 0)
- goto err_put;
-
- return b;
+ if (__bio_clone(bio, bio_src, gfp) < 0) {
+ bio_put(bio);
+ return NULL;
+ }
+ bio->bi_io_vec = bio_src->bi_io_vec;
-err_put:
- bio_put(b);
- return NULL;
+ return bio;
}
-EXPORT_SYMBOL(bio_clone_fast);
+EXPORT_SYMBOL(bio_alloc_clone);
-const char *bio_devname(struct bio *bio, char *buf)
+/**
+ * bio_init_clone - clone a bio that shares the original bio's biovec
+ * @bdev: block_device to clone onto
+ * @bio: bio to clone into
+ * @bio_src: bio to clone from
+ * @gfp: allocation priority
+ *
+ * Initialize a new bio in caller provided memory that is a clone of @bio_src.
+ * The caller owns the returned bio, but not the actual data it points to.
+ *
+ * The caller must ensure that @bio_src is not freed before @bio.
+ */
+int bio_init_clone(struct block_device *bdev, struct bio *bio,
+ struct bio *bio_src, gfp_t gfp)
{
- return bdevname(bio->bi_bdev, buf);
+ int ret;
+
+ bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf);
+ ret = __bio_clone(bio, bio_src, gfp);
+ if (ret)
+ bio_uninit(bio);
+ return ret;
}
-EXPORT_SYMBOL(bio_devname);
+EXPORT_SYMBOL(bio_init_clone);
/**
* bio_full - check if the bio is full
@@ -1054,7 +1082,7 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
size_t off)
{
if (len > UINT_MAX || off > UINT_MAX)
- return 0;
+ return false;
return bio_add_page(bio, &folio->page, len, off) > 0;
}
@@ -1486,8 +1514,7 @@ again:
if (!bio_integrity_endio(bio))
return;
- if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED))
- rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio);
+ rq_qos_done_bio(bio);
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio);
@@ -1541,7 +1568,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
return NULL;
- split = bio_clone_fast(bio, gfp, bs);
+ split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs);
if (!split)
return NULL;
@@ -1571,7 +1598,7 @@ EXPORT_SYMBOL(bio_split);
void bio_trim(struct bio *bio, sector_t offset, sector_t size)
{
if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS ||
- offset + size > bio->bi_iter.bi_size))
+ offset + size > bio_sectors(bio)))
return;
size <<= 9;
@@ -1636,9 +1663,9 @@ EXPORT_SYMBOL(bioset_exit);
* Note that the bio must be embedded at the END of that structure always,
* or things will break badly.
* If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated
- * for allocating iovecs. This pool is not needed e.g. for bio_clone_fast().
- * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used to
- * dispatch queued requests when the mempool runs out of space.
+ * for allocating iovecs. This pool is not needed e.g. for bio_init_clone().
+ * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used
+ * to dispatch queued requests when the mempool runs out of space.
*
*/
int bioset_init(struct bio_set *bs,
@@ -1708,7 +1735,9 @@ EXPORT_SYMBOL(bioset_init_from_src);
/**
* bio_alloc_kiocb - Allocate a bio from bio_set based on kiocb
* @kiocb: kiocb describing the IO
+ * @bdev: block device to allocate the bio for (can be %NULL)
* @nr_vecs: number of iovecs to pre-allocate
+ * @opf: operation and flags for bio
* @bs: bio_set to allocate from
*
* Description:
@@ -1719,14 +1748,14 @@ EXPORT_SYMBOL(bioset_init_from_src);
* MUST be done from process context, not hard/soft IRQ.
*
*/
-struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
- struct bio_set *bs)
+struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev,
+ unsigned short nr_vecs, unsigned int opf, struct bio_set *bs)
{
struct bio_alloc_cache *cache;
struct bio *bio;
if (!(kiocb->ki_flags & IOCB_ALLOC_CACHE) || nr_vecs > BIO_INLINE_VECS)
- return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
+ return bio_alloc_bioset(bdev, nr_vecs, opf, GFP_KERNEL, bs);
cache = per_cpu_ptr(bs->cache, get_cpu());
if (cache->free_list) {
@@ -1734,13 +1763,14 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
cache->free_list = bio->bi_next;
cache->nr--;
put_cpu();
- bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs);
+ bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL,
+ nr_vecs, opf);
bio->bi_pool = bs;
bio_set_flag(bio, BIO_PERCPU_CACHE);
return bio;
}
put_cpu();
- bio = bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
+ bio = bio_alloc_bioset(bdev, nr_vecs, opf, GFP_KERNEL, bs);
bio_set_flag(bio, BIO_PERCPU_CACHE);
return bio;
}
diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h
index ee746919c41f..9f2723b34b75 100644
--- a/block/blk-cgroup-rwstat.h
+++ b/block/blk-cgroup-rwstat.h
@@ -6,7 +6,7 @@
#ifndef _BLK_CGROUP_RWSTAT_H
#define _BLK_CGROUP_RWSTAT_H
-#include <linux/blk-cgroup.h>
+#include "blk-cgroup.h"
enum blkg_rwstat_type {
BLKG_RWSTAT_READ,
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 650f7e27989f..8dfe62786cd5 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -23,15 +23,14 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
-#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
-#include <linux/blk-cgroup.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/part_stat.h>
#include "blk.h"
+#include "blk-cgroup.h"
#include "blk-ioprio.h"
#include "blk-throttle.h"
@@ -66,6 +65,23 @@ static bool blkcg_policy_enabled(struct request_queue *q,
return pol && test_bit(pol->plid, q->blkcg_pols);
}
+static void blkg_free_workfn(struct work_struct *work)
+{
+ struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
+ free_work);
+ int i;
+
+ for (i = 0; i < BLKCG_MAX_POLS; i++)
+ if (blkg->pd[i])
+ blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
+
+ if (blkg->q)
+ blk_put_queue(blkg->q);
+ free_percpu(blkg->iostat_cpu);
+ percpu_ref_exit(&blkg->refcnt);
+ kfree(blkg);
+}
+
/**
* blkg_free - free a blkg
* @blkg: blkg to free
@@ -74,18 +90,15 @@ static bool blkcg_policy_enabled(struct request_queue *q,
*/
static void blkg_free(struct blkcg_gq *blkg)
{
- int i;
-
if (!blkg)
return;
- for (i = 0; i < BLKCG_MAX_POLS; i++)
- if (blkg->pd[i])
- blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
-
- free_percpu(blkg->iostat_cpu);
- percpu_ref_exit(&blkg->refcnt);
- kfree(blkg);
+ /*
+ * Both ->pd_free_fn() and request queue's release handler may
+ * sleep, so free us by scheduling one work func
+ */
+ INIT_WORK(&blkg->free_work, blkg_free_workfn);
+ schedule_work(&blkg->free_work);
}
static void __blkg_release(struct rcu_head *rcu)
@@ -168,6 +181,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
if (!blkg->iostat_cpu)
goto err_free;
+ if (!blk_get_queue(q))
+ goto err_free;
+
blkg->q = q;
INIT_LIST_HEAD(&blkg->q_node);
spin_lock_init(&blkg->async_bio_lock);
@@ -857,11 +873,11 @@ static void blkcg_fill_root_iostats(void)
blk_queue_root_blkg(bdev_get_queue(bdev));
struct blkg_iostat tmp;
int cpu;
+ unsigned long flags;
memset(&tmp, 0, sizeof(tmp));
for_each_possible_cpu(cpu) {
struct disk_stats *cpu_dkstats;
- unsigned long flags;
cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu);
tmp.ios[BLKG_IOSTAT_READ] +=
@@ -877,11 +893,11 @@ static void blkcg_fill_root_iostats(void)
cpu_dkstats->sectors[STAT_WRITE] << 9;
tmp.bytes[BLKG_IOSTAT_DISCARD] +=
cpu_dkstats->sectors[STAT_DISCARD] << 9;
-
- flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
- blkg_iostat_set(&blkg->iostat.cur, &tmp);
- u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
+
+ flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
+ blkg_iostat_set(&blkg->iostat.cur, &tmp);
+ u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
}
@@ -1176,6 +1192,8 @@ int blkcg_init_queue(struct request_queue *q)
bool preloaded;
int ret;
+ INIT_LIST_HEAD(&q->blkg_list);
+
new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
if (!new_blkg)
return -ENOMEM;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
new file mode 100644
index 000000000000..47e1e38390c9
--- /dev/null
+++ b/block/blk-cgroup.h
@@ -0,0 +1,494 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BLK_CGROUP_PRIVATE_H
+#define _BLK_CGROUP_PRIVATE_H
+/*
+ * block cgroup private header
+ *
+ * Based on ideas and code from CFQ, CFS and BFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ * Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
+ * Nauman Rafique <nauman@google.com>
+ */
+
+#include <linux/blk-cgroup.h>
+#include <linux/blk-mq.h>
+
+/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
+#define BLKG_STAT_CPU_BATCH (INT_MAX / 2)
+
+#ifdef CONFIG_BLK_CGROUP
+
+/*
+ * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
+ * request_queue (q). This is used by blkcg policies which need to track
+ * information per blkcg - q pair.
+ *
+ * There can be multiple active blkcg policies and each blkg:policy pair is
+ * represented by a blkg_policy_data which is allocated and freed by each
+ * policy's pd_alloc/free_fn() methods. A policy can allocate private data
+ * area by allocating larger data structure which embeds blkg_policy_data
+ * at the beginning.
+ */
+struct blkg_policy_data {
+ /* the blkg and policy id this per-policy data belongs to */
+ struct blkcg_gq *blkg;
+ int plid;
+};
+
+/*
+ * Policies that need to keep per-blkcg data which is independent from any
+ * request_queue associated to it should implement cpd_alloc/free_fn()
+ * methods. A policy can allocate private data area by allocating larger
+ * data structure which embeds blkcg_policy_data at the beginning.
+ * cpd_init() is invoked to let each policy handle per-blkcg data.
+ */
+struct blkcg_policy_data {
+ /* the blkcg and policy id this per-policy data belongs to */
+ struct blkcg *blkcg;
+ int plid;
+};
+
+typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
+typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp,
+ struct request_queue *q, struct blkcg *blkcg);
+typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
+typedef bool (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd,
+ struct seq_file *s);
+
+struct blkcg_policy {
+ int plid;
+ /* cgroup files for the policy */
+ struct cftype *dfl_cftypes;
+ struct cftype *legacy_cftypes;
+
+ /* operations */
+ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn;
+ blkcg_pol_init_cpd_fn *cpd_init_fn;
+ blkcg_pol_free_cpd_fn *cpd_free_fn;
+ blkcg_pol_bind_cpd_fn *cpd_bind_fn;
+
+ blkcg_pol_alloc_pd_fn *pd_alloc_fn;
+ blkcg_pol_init_pd_fn *pd_init_fn;
+ blkcg_pol_online_pd_fn *pd_online_fn;
+ blkcg_pol_offline_pd_fn *pd_offline_fn;
+ blkcg_pol_free_pd_fn *pd_free_fn;
+ blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
+ blkcg_pol_stat_pd_fn *pd_stat_fn;
+};
+
+extern struct blkcg blkcg_root;
+extern bool blkcg_debug_stats;
+
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+ struct request_queue *q, bool update_hint);
+int blkcg_init_queue(struct request_queue *q);
+void blkcg_exit_queue(struct request_queue *q);
+
+/* Blkio controller policy registration */
+int blkcg_policy_register(struct blkcg_policy *pol);
+void blkcg_policy_unregister(struct blkcg_policy *pol);
+int blkcg_activate_policy(struct request_queue *q,
+ const struct blkcg_policy *pol);
+void blkcg_deactivate_policy(struct request_queue *q,
+ const struct blkcg_policy *pol);
+
+const char *blkg_dev_name(struct blkcg_gq *blkg);
+void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
+ u64 (*prfill)(struct seq_file *,
+ struct blkg_policy_data *, int),
+ const struct blkcg_policy *pol, int data,
+ bool show_total);
+u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
+
+struct blkg_conf_ctx {
+ struct block_device *bdev;
+ struct blkcg_gq *blkg;
+ char *body;
+};
+
+struct block_device *blkcg_conf_open_bdev(char **inputp);
+int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
+ char *input, struct blkg_conf_ctx *ctx);
+void blkg_conf_finish(struct blkg_conf_ctx *ctx);
+
+/**
+ * blkcg_css - find the current css
+ *
+ * Find the css associated with either the kthread or the current task.
+ * This may return a dying css, so it is up to the caller to use tryget logic
+ * to confirm it is alive and well.
+ */
+static inline struct cgroup_subsys_state *blkcg_css(void)
+{
+ struct cgroup_subsys_state *css;
+
+ css = kthread_blkcg();
+ if (css)
+ return css;
+ return task_css(current, io_cgrp_id);
+}
+
+/**
+ * __bio_blkcg - internal, inconsistent version to get blkcg
+ *
+ * DO NOT USE.
+ * This function is inconsistent and consequently is dangerous to use. The
+ * first part of the function returns a blkcg where a reference is owned by the
+ * bio. This means it does not need to be rcu protected as it cannot go away
+ * with the bio owning a reference to it. However, the latter potentially gets
+ * it from task_css(). This can race against task migration and the cgroup
+ * dying. It is also semantically different as it must be called rcu protected
+ * and is susceptible to failure when trying to get a reference to it.
+ * Therefore, it is not ok to assume that *_get() will always succeed on the
+ * blkcg returned here.
+ */
+static inline struct blkcg *__bio_blkcg(struct bio *bio)
+{
+ if (bio && bio->bi_blkg)
+ return bio->bi_blkg->blkcg;
+ return css_to_blkcg(blkcg_css());
+}
+
+/**
+ * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
+ * @return: true if this bio needs to be submitted with the root blkg context.
+ *
+ * In order to avoid priority inversions we sometimes need to issue a bio as if
+ * it were attached to the root blkg, and then backcharge to the actual owning
+ * blkg. The idea is we do bio_blkcg() to look up the actual context for the
+ * bio and attach the appropriate blkg to the bio. Then we call this helper and
+ * if it is true run with the root blkg for that queue and then do any
+ * backcharging to the originating cgroup once the io is complete.
+ */
+static inline bool bio_issue_as_root_blkg(struct bio *bio)
+{
+ return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0;
+}
+
+/**
+ * __blkg_lookup - internal version of blkg_lookup()
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ * @update_hint: whether to update lookup hint with the result or not
+ *
+ * This is internal version and shouldn't be used by policy
+ * implementations. Looks up blkgs for the @blkcg - @q pair regardless of
+ * @q's bypass state. If @update_hint is %true, the caller should be
+ * holding @q->queue_lock and lookup hint is updated on success.
+ */
+static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
+ struct request_queue *q,
+ bool update_hint)
+{
+ struct blkcg_gq *blkg;
+
+ if (blkcg == &blkcg_root)
+ return q->root_blkg;
+
+ blkg = rcu_dereference(blkcg->blkg_hint);
+ if (blkg && blkg->q == q)
+ return blkg;
+
+ return blkg_lookup_slowpath(blkcg, q, update_hint);
+}
+
+/**
+ * blkg_lookup - lookup blkg for the specified blkcg - q pair
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for the @blkcg - @q pair. This function should be called
+ * under RCU read lock.
+ */
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
+ struct request_queue *q)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return __blkg_lookup(blkcg, q, false);
+}
+
+/**
+ * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for @q at the root level. See also blkg_lookup().
+ */
+static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
+{
+ return q->root_blkg;
+}
+
+/**
+ * blkg_to_pdata - get policy private data
+ * @blkg: blkg of interest
+ * @pol: policy of interest
+ *
+ * Return pointer to private data associated with the @blkg-@pol pair.
+ */
+static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+ struct blkcg_policy *pol)
+{
+ return blkg ? blkg->pd[pol->plid] : NULL;
+}
+
+static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
+ struct blkcg_policy *pol)
+{
+ return blkcg ? blkcg->cpd[pol->plid] : NULL;
+}
+
+/**
+ * pdata_to_blkg - get blkg associated with policy private data
+ * @pd: policy private data of interest
+ *
+ * @pd is policy private data. Determine the blkg it's associated with.
+ */
+static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
+{
+ return pd ? pd->blkg : NULL;
+}
+
+static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
+{
+ return cpd ? cpd->blkcg : NULL;
+}
+
+/**
+ * blkg_path - format cgroup path of blkg
+ * @blkg: blkg of interest
+ * @buf: target buffer
+ * @buflen: target buffer length
+ *
+ * Format the path of the cgroup of @blkg into @buf.
+ */
+static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
+{
+ return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
+}
+
+/**
+ * blkg_get - get a blkg reference
+ * @blkg: blkg to get
+ *
+ * The caller should be holding an existing reference.
+ */
+static inline void blkg_get(struct blkcg_gq *blkg)
+{
+ percpu_ref_get(&blkg->refcnt);
+}
+
+/**
+ * blkg_tryget - try and get a blkg reference
+ * @blkg: blkg to get
+ *
+ * This is for use when doing an RCU lookup of the blkg. We may be in the midst
+ * of freeing this blkg, so we can only use it if the refcnt is not zero.
+ */
+static inline bool blkg_tryget(struct blkcg_gq *blkg)
+{
+ return blkg && percpu_ref_tryget(&blkg->refcnt);
+}
+
+/**
+ * blkg_put - put a blkg reference
+ * @blkg: blkg to put
+ */
+static inline void blkg_put(struct blkcg_gq *blkg)
+{
+ percpu_ref_put(&blkg->refcnt);
+}
+
+/**
+ * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
+ * read locked. If called under either blkcg or queue lock, the iteration
+ * is guaranteed to include all and only online blkgs. The caller may
+ * update @pos_css by calling css_rightmost_descendant() to skip subtree.
+ * @p_blkg is included in the iteration and the first node to be visited.
+ */
+#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \
+ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \
+ if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
+ (p_blkg)->q, false)))
+
+/**
+ * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Similar to blkg_for_each_descendant_pre() but performs post-order
+ * traversal instead. Synchronization rules are the same. @p_blkg is
+ * included in the iteration and the last node to be visited.
+ */
+#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \
+ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \
+ if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
+ (p_blkg)->q, false)))
+
+bool __blkcg_punt_bio_submit(struct bio *bio);
+
+static inline bool blkcg_punt_bio_submit(struct bio *bio)
+{
+ if (bio->bi_opf & REQ_CGROUP_PUNT)
+ return __blkcg_punt_bio_submit(bio);
+ else
+ return false;
+}
+
+static inline void blkcg_bio_issue_init(struct bio *bio)
+{
+ bio_issue_init(&bio->bi_issue, bio_sectors(bio));
+}
+
+static inline void blkcg_use_delay(struct blkcg_gq *blkg)
+{
+ if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
+ return;
+ if (atomic_add_return(1, &blkg->use_delay) == 1)
+ atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+}
+
+static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
+{
+ int old = atomic_read(&blkg->use_delay);
+
+ if (WARN_ON_ONCE(old < 0))
+ return 0;
+ if (old == 0)
+ return 0;
+
+ /*
+ * We do this song and dance because we can race with somebody else
+ * adding or removing delay. If we just did an atomic_dec we'd end up
+ * negative and we'd already be in trouble. We need to subtract 1 and
+ * then check to see if we were the last delay so we can drop the
+ * congestion count on the cgroup.
+ */
+ while (old) {
+ int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1);
+ if (cur == old)
+ break;
+ old = cur;
+ }
+
+ if (old == 0)
+ return 0;
+ if (old == 1)
+ atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+ return 1;
+}
+
+/**
+ * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount
+ * @blkg: target blkg
+ * @delay: delay duration in nsecs
+ *
+ * When enabled with this function, the delay is not decayed and must be
+ * explicitly cleared with blkcg_clear_delay(). Must not be mixed with
+ * blkcg_[un]use_delay() and blkcg_add_delay() usages.
+ */
+static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay)
+{
+ int old = atomic_read(&blkg->use_delay);
+
+ /* We only want 1 person setting the congestion count for this blkg. */
+ if (!old && atomic_cmpxchg(&blkg->use_delay, old, -1) == old)
+ atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+
+ atomic64_set(&blkg->delay_nsec, delay);
+}
+
+/**
+ * blkcg_clear_delay - Disable allocator delay mechanism
+ * @blkg: target blkg
+ *
+ * Disable use_delay mechanism. See blkcg_set_delay().
+ */
+static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
+{
+ int old = atomic_read(&blkg->use_delay);
+
+ /* We only want 1 person clearing the congestion count for this blkg. */
+ if (old && atomic_cmpxchg(&blkg->use_delay, old, 0) == old)
+ atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+}
+
+/**
+ * blk_cgroup_mergeable - Determine whether to allow or disallow merges
+ * @rq: request to merge into
+ * @bio: bio to merge
+ *
+ * @bio and @rq should belong to the same cgroup and their issue_as_root should
+ * match. The latter is necessary as we don't want to throttle e.g. a metadata
+ * update because it happens to be next to a regular IO.
+ */
+static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio)
+{
+ return rq->bio->bi_blkg == bio->bi_blkg &&
+ bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio);
+}
+
+void blk_cgroup_bio_start(struct bio *bio);
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
+#else /* CONFIG_BLK_CGROUP */
+
+struct blkg_policy_data {
+};
+
+struct blkcg_policy_data {
+};
+
+struct blkcg_policy {
+};
+
+#ifdef CONFIG_BLOCK
+
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
+static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
+{ return NULL; }
+static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
+static inline void blkcg_exit_queue(struct request_queue *q) { }
+static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
+static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
+static inline int blkcg_activate_policy(struct request_queue *q,
+ const struct blkcg_policy *pol) { return 0; }
+static inline void blkcg_deactivate_policy(struct request_queue *q,
+ const struct blkcg_policy *pol) { }
+
+static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; }
+
+static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+ struct blkcg_policy *pol) { return NULL; }
+static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
+static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
+static inline void blkg_get(struct blkcg_gq *blkg) { }
+static inline void blkg_put(struct blkcg_gq *blkg) { }
+
+static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
+static inline void blkcg_bio_issue_init(struct bio *bio) { }
+static inline void blk_cgroup_bio_start(struct bio *bio) { }
+static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; }
+
+#define blk_queue_for_each_rl(rl, q) \
+ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
+
+#endif /* CONFIG_BLOCK */
+#endif /* CONFIG_BLK_CGROUP */
+
+#endif /* _BLK_CGROUP_PRIVATE_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index 779b4a1f66ac..bc0506772152 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -34,7 +34,6 @@
#include <linux/delay.h>
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
-#include <linux/blk-cgroup.h>
#include <linux/t10-pi.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>
@@ -49,8 +48,8 @@
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
+#include "blk-cgroup.h"
#include "blk-throttle.h"
-#include "blk-rq-qos.h"
struct dentry *blk_debugfs_root;
@@ -123,7 +122,6 @@ static const char *const blk_op_name[] = {
REQ_OP_NAME(ZONE_CLOSE),
REQ_OP_NAME(ZONE_FINISH),
REQ_OP_NAME(ZONE_APPEND),
- REQ_OP_NAME(WRITE_SAME),
REQ_OP_NAME(WRITE_ZEROES),
REQ_OP_NAME(DRV_IN),
REQ_OP_NAME(DRV_OUT),
@@ -165,6 +163,7 @@ static const struct {
[BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" },
[BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" },
[BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" },
+ [BLK_STS_OFFLINE] = { -ENODEV, "device offline" },
/* device mapper special case, should not leak out: */
[BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" },
@@ -315,9 +314,6 @@ void blk_cleanup_queue(struct request_queue *q)
*/
blk_freeze_queue(q);
- /* cleanup rq qos structures for queue without disk */
- rq_qos_exit(q);
-
blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
blk_sync_queue(q);
@@ -339,8 +335,6 @@ void blk_cleanup_queue(struct request_queue *q)
blk_mq_sched_free_rqs(q);
mutex_unlock(&q->sysfs_lock);
- percpu_ref_exit(&q->q_usage_counter);
-
/* @q is and will stay empty, shutdown and put */
blk_put_queue(q);
}
@@ -473,9 +467,6 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
INIT_WORK(&q->timeout_work, blk_timeout_work);
INIT_LIST_HEAD(&q->icq_list);
-#ifdef CONFIG_BLK_CGROUP
- INIT_LIST_HEAD(&q->blkg_list);
-#endif
kobject_init(&q->kobj, &blk_queue_ktype);
@@ -496,17 +487,12 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
goto fail_stats;
- if (blkcg_init_queue(q))
- goto fail_ref;
-
blk_queue_dma_alignment(q, 511);
blk_set_default_limits(&q->limits);
q->nr_requests = BLKDEV_DEFAULT_RQ;
return q;
-fail_ref:
- percpu_ref_exit(&q->q_usage_counter);
fail_stats:
blk_free_queue_stats(q->stats);
fail_split:
@@ -540,17 +526,6 @@ bool blk_get_queue(struct request_queue *q)
}
EXPORT_SYMBOL(blk_get_queue);
-static void handle_bad_sector(struct bio *bio, sector_t maxsector)
-{
- char b[BDEVNAME_SIZE];
-
- pr_info_ratelimited("%s: attempt to access beyond end of device\n"
- "%s: rw=%d, want=%llu, limit=%llu\n",
- current->comm,
- bio_devname(bio, b), bio->bi_opf,
- bio_end_sector(bio), maxsector);
-}
-
#ifdef CONFIG_FAIL_MAKE_REQUEST
static DECLARE_FAULT_ATTR(fail_make_request);
@@ -580,14 +555,10 @@ late_initcall(fail_make_request_debugfs);
static inline bool bio_check_ro(struct bio *bio)
{
if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) {
- char b[BDEVNAME_SIZE];
-
if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
return false;
-
- WARN_ONCE(1,
- "Trying to write to read-only block-device %s (partno %d)\n",
- bio_devname(bio, b), bio->bi_bdev->bd_partno);
+ pr_warn("Trying to write to read-only block-device %pg\n",
+ bio->bi_bdev);
/* Older lvm-tools actually trigger this */
return false;
}
@@ -616,7 +587,11 @@ static inline int bio_check_eod(struct bio *bio)
if (nr_sectors && maxsector &&
(nr_sectors > maxsector ||
bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
- handle_bad_sector(bio, maxsector);
+ pr_info_ratelimited("%s: attempt to access beyond end of device\n"
+ "%pg: rw=%d, want=%llu, limit=%llu\n",
+ current->comm,
+ bio->bi_bdev, bio->bi_opf,
+ bio_end_sector(bio), maxsector);
return -EIO;
}
return 0;
@@ -676,134 +651,19 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
return BLK_STS_OK;
}
-noinline_for_stack bool submit_bio_checks(struct bio *bio)
-{
- struct block_device *bdev = bio->bi_bdev;
- struct request_queue *q = bdev_get_queue(bdev);
- blk_status_t status = BLK_STS_IOERR;
- struct blk_plug *plug;
-
- might_sleep();
-
- plug = blk_mq_plug(q, bio);
- if (plug && plug->nowait)
- bio->bi_opf |= REQ_NOWAIT;
-
- /*
- * For a REQ_NOWAIT based request, return -EOPNOTSUPP
- * if queue does not support NOWAIT.
- */
- if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q))
- goto not_supported;
-
- if (should_fail_bio(bio))
- goto end_io;
- if (unlikely(bio_check_ro(bio)))
- goto end_io;
- if (!bio_flagged(bio, BIO_REMAPPED)) {
- if (unlikely(bio_check_eod(bio)))
- goto end_io;
- if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
- goto end_io;
- }
-
- /*
- * Filter flush bio's early so that bio based drivers without flush
- * support don't have to worry about them.
- */
- if (op_is_flush(bio->bi_opf) &&
- !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
- bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
- if (!bio_sectors(bio)) {
- status = BLK_STS_OK;
- goto end_io;
- }
- }
-
- if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
- bio_clear_polled(bio);
-
- switch (bio_op(bio)) {
- case REQ_OP_DISCARD:
- if (!blk_queue_discard(q))
- goto not_supported;
- break;
- case REQ_OP_SECURE_ERASE:
- if (!blk_queue_secure_erase(q))
- goto not_supported;
- break;
- case REQ_OP_WRITE_SAME:
- if (!q->limits.max_write_same_sectors)
- goto not_supported;
- break;
- case REQ_OP_ZONE_APPEND:
- status = blk_check_zone_append(q, bio);
- if (status != BLK_STS_OK)
- goto end_io;
- break;
- case REQ_OP_ZONE_RESET:
- case REQ_OP_ZONE_OPEN:
- case REQ_OP_ZONE_CLOSE:
- case REQ_OP_ZONE_FINISH:
- if (!blk_queue_is_zoned(q))
- goto not_supported;
- break;
- case REQ_OP_ZONE_RESET_ALL:
- if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q))
- goto not_supported;
- break;
- case REQ_OP_WRITE_ZEROES:
- if (!q->limits.max_write_zeroes_sectors)
- goto not_supported;
- break;
- default:
- break;
- }
-
- if (blk_throtl_bio(bio))
- return false;
-
- blk_cgroup_bio_start(bio);
- blkcg_bio_issue_init(bio);
-
- if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
- trace_block_bio_queue(bio);
- /* Now that enqueuing has been traced, we need to trace
- * completion as well.
- */
- bio_set_flag(bio, BIO_TRACE_COMPLETION);
- }
- return true;
-
-not_supported:
- status = BLK_STS_NOTSUPP;
-end_io:
- bio->bi_status = status;
- bio_endio(bio);
- return false;
-}
-
-static void __submit_bio_fops(struct gendisk *disk, struct bio *bio)
-{
- if (blk_crypto_bio_prep(&bio)) {
- if (likely(bio_queue_enter(bio) == 0)) {
- disk->fops->submit_bio(bio);
- blk_queue_exit(disk->queue);
- }
- }
-}
-
static void __submit_bio(struct bio *bio)
{
struct gendisk *disk = bio->bi_bdev->bd_disk;
- if (unlikely(!submit_bio_checks(bio)))
+ if (unlikely(!blk_crypto_bio_prep(&bio)))
return;
- if (!disk->fops->submit_bio)
+ if (!disk->fops->submit_bio) {
blk_mq_submit_bio(bio);
- else
- __submit_bio_fops(disk, bio);
+ } else if (likely(bio_queue_enter(bio) == 0)) {
+ disk->fops->submit_bio(bio);
+ blk_queue_exit(disk->queue);
+ }
}
/*
@@ -823,7 +683,7 @@ static void __submit_bio(struct bio *bio)
*
* bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
* bio_list_on_stack[1] contains bios that were submitted before the current
- * ->submit_bio_bio, but that haven't been processed yet.
+ * ->submit_bio, but that haven't been processed yet.
*/
static void __submit_bio_noacct(struct bio *bio)
{
@@ -882,16 +742,7 @@ static void __submit_bio_noacct_mq(struct bio *bio)
current->bio_list = NULL;
}
-/**
- * submit_bio_noacct - re-submit a bio to the block device layer for I/O
- * @bio: The bio describing the location in memory and on the device.
- *
- * This is a version of submit_bio() that shall only be used for I/O that is
- * resubmitted to lower level drivers by stacking block drivers. All file
- * systems and other upper level users of the block layer should use
- * submit_bio() instead.
- */
-void submit_bio_noacct(struct bio *bio)
+void submit_bio_noacct_nocheck(struct bio *bio)
{
/*
* We only want one ->submit_bio to be active at a time, else stack
@@ -906,6 +757,118 @@ void submit_bio_noacct(struct bio *bio)
else
__submit_bio_noacct(bio);
}
+
+/**
+ * submit_bio_noacct - re-submit a bio to the block device layer for I/O
+ * @bio: The bio describing the location in memory and on the device.
+ *
+ * This is a version of submit_bio() that shall only be used for I/O that is
+ * resubmitted to lower level drivers by stacking block drivers. All file
+ * systems and other upper level users of the block layer should use
+ * submit_bio() instead.
+ */
+void submit_bio_noacct(struct bio *bio)
+{
+ struct block_device *bdev = bio->bi_bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+ blk_status_t status = BLK_STS_IOERR;
+ struct blk_plug *plug;
+
+ might_sleep();
+
+ plug = blk_mq_plug(q, bio);
+ if (plug && plug->nowait)
+ bio->bi_opf |= REQ_NOWAIT;
+
+ /*
+ * For a REQ_NOWAIT based request, return -EOPNOTSUPP
+ * if queue does not support NOWAIT.
+ */
+ if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q))
+ goto not_supported;
+
+ if (should_fail_bio(bio))
+ goto end_io;
+ if (unlikely(bio_check_ro(bio)))
+ goto end_io;
+ if (!bio_flagged(bio, BIO_REMAPPED)) {
+ if (unlikely(bio_check_eod(bio)))
+ goto end_io;
+ if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
+ goto end_io;
+ }
+
+ /*
+ * Filter flush bio's early so that bio based drivers without flush
+ * support don't have to worry about them.
+ */
+ if (op_is_flush(bio->bi_opf) &&
+ !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
+ bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
+ if (!bio_sectors(bio)) {
+ status = BLK_STS_OK;
+ goto end_io;
+ }
+ }
+
+ if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+ bio_clear_polled(bio);
+
+ switch (bio_op(bio)) {
+ case REQ_OP_DISCARD:
+ if (!blk_queue_discard(q))
+ goto not_supported;
+ break;
+ case REQ_OP_SECURE_ERASE:
+ if (!blk_queue_secure_erase(q))
+ goto not_supported;
+ break;
+ case REQ_OP_ZONE_APPEND:
+ status = blk_check_zone_append(q, bio);
+ if (status != BLK_STS_OK)
+ goto end_io;
+ break;
+ case REQ_OP_ZONE_RESET:
+ case REQ_OP_ZONE_OPEN:
+ case REQ_OP_ZONE_CLOSE:
+ case REQ_OP_ZONE_FINISH:
+ if (!blk_queue_is_zoned(q))
+ goto not_supported;
+ break;
+ case REQ_OP_ZONE_RESET_ALL:
+ if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q))
+ goto not_supported;
+ break;
+ case REQ_OP_WRITE_ZEROES:
+ if (!q->limits.max_write_zeroes_sectors)
+ goto not_supported;
+ break;
+ default:
+ break;
+ }
+
+ if (blk_throtl_bio(bio))
+ return;
+
+ blk_cgroup_bio_start(bio);
+ blkcg_bio_issue_init(bio);
+
+ if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+ trace_block_bio_queue(bio);
+ /* Now that enqueuing has been traced, we need to trace
+ * completion as well.
+ */
+ bio_set_flag(bio, BIO_TRACE_COMPLETION);
+ }
+ submit_bio_noacct_nocheck(bio);
+ return;
+
+not_supported:
+ status = BLK_STS_NOTSUPP;
+end_io:
+ bio->bi_status = status;
+ bio_endio(bio);
+}
EXPORT_SYMBOL(submit_bio_noacct);
/**
@@ -931,13 +894,7 @@ void submit_bio(struct bio *bio)
* go through the normal accounting stuff before submission.
*/
if (bio_has_data(bio)) {
- unsigned int count;
-
- if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
- count = queue_logical_block_size(
- bdev_get_queue(bio->bi_bdev)) >> 9;
- else
- count = bio_sectors(bio);
+ unsigned int count = bio_sectors(bio);
if (op_is_write(bio_op(bio))) {
count_vm_events(PGPGOUT, count);
@@ -983,21 +940,24 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
blk_qc_t cookie = READ_ONCE(bio->bi_cookie);
- int ret;
+ int ret = 0;
if (cookie == BLK_QC_T_NONE ||
!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return 0;
- if (current->plug)
- blk_flush_plug(current->plug, false);
+ blk_flush_plug(current->plug, false);
if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT))
return 0;
- if (WARN_ON_ONCE(!queue_is_mq(q)))
- ret = 0; /* not yet implemented, should not happen */
- else
+ if (queue_is_mq(q)) {
ret = blk_mq_poll(q, cookie, iob, flags);
+ } else {
+ struct gendisk *disk = q->disk;
+
+ if (disk && disk->fops->poll_bio)
+ ret = disk->fops->poll_bio(bio, iob, flags);
+ }
blk_queue_exit(q);
return ret;
}
@@ -1272,7 +1232,7 @@ struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
}
EXPORT_SYMBOL(blk_check_plugged);
-void blk_flush_plug(struct blk_plug *plug, bool from_schedule)
+void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
{
if (!list_empty(&plug->cb_list))
flush_plug_callbacks(plug, from_schedule);
@@ -1301,7 +1261,7 @@ void blk_flush_plug(struct blk_plug *plug, bool from_schedule)
void blk_finish_plug(struct blk_plug *plug)
{
if (plug == current->plug) {
- blk_flush_plug(plug, false);
+ __blk_flush_plug(plug, false);
current->plug = NULL;
}
}
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index c87aba8584c6..7c854584b52b 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -10,7 +10,6 @@
#define pr_fmt(fmt) "blk-crypto-fallback: " fmt
#include <crypto/skcipher.h>
-#include <linux/blk-cgroup.h>
#include <linux/blk-crypto.h>
#include <linux/blk-crypto-profile.h>
#include <linux/blkdev.h>
@@ -20,6 +19,7 @@
#include <linux/random.h>
#include <linux/scatterlist.h>
+#include "blk-cgroup.h"
#include "blk-crypto-internal.h"
static unsigned int num_prealloc_bounce_pg = 32;
@@ -170,7 +170,6 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
bio_set_flag(bio, BIO_REMAPPED);
bio->bi_opf = bio_src->bi_opf;
bio->bi_ioprio = bio_src->bi_ioprio;
- bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h
index 2fb0d65a464c..e6818ffaddbf 100644
--- a/block/blk-crypto-internal.h
+++ b/block/blk-crypto-internal.h
@@ -11,6 +11,7 @@
/* Represents a crypto mode supported by blk-crypto */
struct blk_crypto_mode {
+ const char *name; /* name of this mode, shown in sysfs */
const char *cipher_str; /* crypto API name (for fallback case) */
unsigned int keysize; /* key size in bytes */
unsigned int ivsize; /* iv size in bytes */
@@ -20,6 +21,10 @@ extern const struct blk_crypto_mode blk_crypto_modes[];
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+int blk_crypto_sysfs_register(struct request_queue *q);
+
+void blk_crypto_sysfs_unregister(struct request_queue *q);
+
void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
unsigned int inc);
@@ -62,6 +67,13 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq)
#else /* CONFIG_BLK_INLINE_ENCRYPTION */
+static inline int blk_crypto_sysfs_register(struct request_queue *q)
+{
+ return 0;
+}
+
+static inline void blk_crypto_sysfs_unregister(struct request_queue *q) { }
+
static inline bool bio_crypt_rq_ctx_compatible(struct request *rq,
struct bio *bio)
{
diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c
new file mode 100644
index 000000000000..fd93bd2f33b7
--- /dev/null
+++ b/block/blk-crypto-sysfs.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2021 Google LLC
+ *
+ * sysfs support for blk-crypto. This file contains the code which exports the
+ * crypto capabilities of devices via /sys/block/$disk/queue/crypto/.
+ */
+
+#include <linux/blk-crypto-profile.h>
+
+#include "blk-crypto-internal.h"
+
+struct blk_crypto_kobj {
+ struct kobject kobj;
+ struct blk_crypto_profile *profile;
+};
+
+struct blk_crypto_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page);
+};
+
+static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj)
+{
+ return container_of(kobj, struct blk_crypto_kobj, kobj)->profile;
+}
+
+static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr)
+{
+ return container_of(attr, struct blk_crypto_attr, attr);
+}
+
+static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ return sysfs_emit(page, "%u\n", 8 * profile->max_dun_bytes_supported);
+}
+
+static ssize_t num_keyslots_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ return sysfs_emit(page, "%u\n", profile->num_slots);
+}
+
+#define BLK_CRYPTO_RO_ATTR(_name) \
+ static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name)
+
+BLK_CRYPTO_RO_ATTR(max_dun_bits);
+BLK_CRYPTO_RO_ATTR(num_keyslots);
+
+static struct attribute *blk_crypto_attrs[] = {
+ &max_dun_bits_attr.attr,
+ &num_keyslots_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group blk_crypto_attr_group = {
+ .attrs = blk_crypto_attrs,
+};
+
+/*
+ * The encryption mode attributes. To avoid hard-coding the list of encryption
+ * modes, these are initialized at boot time by blk_crypto_sysfs_init().
+ */
+static struct blk_crypto_attr __blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX];
+static struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1];
+
+static umode_t blk_crypto_mode_is_visible(struct kobject *kobj,
+ struct attribute *attr, int n)
+{
+ struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
+ struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
+ int mode_num = a - __blk_crypto_mode_attrs;
+
+ if (profile->modes_supported[mode_num])
+ return 0444;
+ return 0;
+}
+
+static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ int mode_num = attr - __blk_crypto_mode_attrs;
+
+ return sysfs_emit(page, "0x%x\n", profile->modes_supported[mode_num]);
+}
+
+static const struct attribute_group blk_crypto_modes_attr_group = {
+ .name = "modes",
+ .attrs = blk_crypto_mode_attrs,
+ .is_visible = blk_crypto_mode_is_visible,
+};
+
+static const struct attribute_group *blk_crypto_attr_groups[] = {
+ &blk_crypto_attr_group,
+ &blk_crypto_modes_attr_group,
+ NULL,
+};
+
+static ssize_t blk_crypto_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *page)
+{
+ struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
+ struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
+
+ return a->show(profile, a, page);
+}
+
+static const struct sysfs_ops blk_crypto_attr_ops = {
+ .show = blk_crypto_attr_show,
+};
+
+static void blk_crypto_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct blk_crypto_kobj, kobj));
+}
+
+static struct kobj_type blk_crypto_ktype = {
+ .default_groups = blk_crypto_attr_groups,
+ .sysfs_ops = &blk_crypto_attr_ops,
+ .release = blk_crypto_release,
+};
+
+/*
+ * If the request_queue has a blk_crypto_profile, create the "crypto"
+ * subdirectory in sysfs (/sys/block/$disk/queue/crypto/).
+ */
+int blk_crypto_sysfs_register(struct request_queue *q)
+{
+ struct blk_crypto_kobj *obj;
+ int err;
+
+ if (!q->crypto_profile)
+ return 0;
+
+ obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+ obj->profile = q->crypto_profile;
+
+ err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, &q->kobj,
+ "crypto");
+ if (err) {
+ kobject_put(&obj->kobj);
+ return err;
+ }
+ q->crypto_kobject = &obj->kobj;
+ return 0;
+}
+
+void blk_crypto_sysfs_unregister(struct request_queue *q)
+{
+ kobject_put(q->crypto_kobject);
+}
+
+static int __init blk_crypto_sysfs_init(void)
+{
+ int i;
+
+ BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0);
+ for (i = 1; i < BLK_ENCRYPTION_MODE_MAX; i++) {
+ struct blk_crypto_attr *attr = &__blk_crypto_mode_attrs[i];
+
+ attr->attr.name = blk_crypto_modes[i].name;
+ attr->attr.mode = 0444;
+ attr->show = blk_crypto_mode_show;
+ blk_crypto_mode_attrs[i - 1] = &attr->attr;
+ }
+ return 0;
+}
+subsys_initcall(blk_crypto_sysfs_init);
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index ec9efeeeca91..a496aaef85ba 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -19,16 +19,19 @@
const struct blk_crypto_mode blk_crypto_modes[] = {
[BLK_ENCRYPTION_MODE_AES_256_XTS] = {
+ .name = "AES-256-XTS",
.cipher_str = "xts(aes)",
.keysize = 64,
.ivsize = 16,
},
[BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = {
+ .name = "AES-128-CBC-ESSIV",
.cipher_str = "essiv(cbc(aes),sha256)",
.keysize = 16,
.ivsize = 16,
},
[BLK_ENCRYPTION_MODE_ADIANTUM] = {
+ .name = "Adiantum",
.cipher_str = "adiantum(xchacha12,aes)",
.keysize = 32,
.ivsize = 32,
@@ -111,7 +114,6 @@ int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask)
*dst->bi_crypt_context = *src->bi_crypt_context;
return 0;
}
-EXPORT_SYMBOL_GPL(__bio_crypt_clone);
/* Increments @dun by @inc, treating @dun as a multi-limb integer. */
void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
diff --git a/block/blk-flush.c b/block/blk-flush.c
index e4df894189ce..c68968724870 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -460,9 +460,7 @@ int blkdev_issue_flush(struct block_device *bdev)
{
struct bio bio;
- bio_init(&bio, NULL, 0);
- bio_set_dev(&bio, bdev);
- bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+ bio_init(&bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH);
return submit_bio_wait(&bio);
}
EXPORT_SYMBOL(blkdev_issue_flush);
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 11f49f78db32..df9cfe4ca532 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -280,7 +280,6 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
task_lock(task);
if (task->flags & PF_EXITING) {
- err = -ESRCH;
kmem_cache_free(iocontext_cachep, ioc);
goto out;
}
@@ -292,7 +291,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
task->io_context->ioprio = ioprio;
out:
task_unlock(task);
- return err;
+ return 0;
}
EXPORT_SYMBOL_GPL(set_task_ioprio);
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 769b64394298..9bd670999d0a 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -178,12 +178,12 @@
#include <linux/time64.h>
#include <linux/parser.h>
#include <linux/sched/signal.h>
-#include <linux/blk-cgroup.h>
#include <asm/local.h>
#include <asm/local64.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk-wbt.h"
+#include "blk-cgroup.h"
#ifdef CONFIG_TRACEPOINTS
@@ -2322,7 +2322,17 @@ static void ioc_timer_fn(struct timer_list *timer)
iocg->hweight_donating = hwa;
iocg->hweight_after_donation = new_hwi;
list_add(&iocg->surplus_list, &surpluses);
- } else {
+ } else if (!iocg->abs_vdebt) {
+ /*
+ * @iocg doesn't have enough to donate. Reset
+ * its inuse to active.
+ *
+ * Don't reset debtors as their inuse's are
+ * owned by debt handling. This shouldn't affect
+ * donation calculuation in any meaningful way
+ * as @iocg doesn't have a meaningful amount of
+ * share anyway.
+ */
TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
iocg->inuse, iocg->active,
iocg->hweight_inuse, new_hwi);
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 6593c7123b97..2f33932e72e3 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -74,9 +74,9 @@
#include <linux/sched/signal.h>
#include <trace/events/block.h>
#include <linux/blk-mq.h>
-#include <linux/blk-cgroup.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
+#include "blk-cgroup.h"
#include "blk.h"
#define DEFAULT_SCALE_COOKIE 1000000U
@@ -598,7 +598,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
int inflight = 0;
blkg = bio->bi_blkg;
- if (!blkg || !bio_flagged(bio, BIO_TRACKED))
+ if (!blkg || !bio_flagged(bio, BIO_QOS_THROTTLED))
return;
iolat = blkg_to_lat(bio->bi_blkg);
diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c
index 2e7f10e1c03f..79e797f5d194 100644
--- a/block/blk-ioprio.c
+++ b/block/blk-ioprio.c
@@ -12,11 +12,11 @@
* Documentation/admin-guide/cgroup-v2.rst.
*/
-#include <linux/blk-cgroup.h>
#include <linux/blk-mq.h>
#include <linux/blk_types.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include "blk-cgroup.h"
#include "blk-ioprio.h"
#include "blk-rq-qos.h"
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 9f09beadcbe3..237d60d8b585 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -10,19 +10,6 @@
#include "blk.h"
-struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp)
-{
- struct bio *new = bio_alloc(gfp, nr_pages);
-
- if (bio) {
- bio_chain(bio, new);
- submit_bio(bio);
- }
-
- return new;
-}
-EXPORT_SYMBOL_GPL(blk_next_bio);
-
int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, int flags,
struct bio **biop)
@@ -32,9 +19,6 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
unsigned int op;
sector_t bs_mask, part_offset = 0;
- if (!q)
- return -ENXIO;
-
if (bdev_read_only(bdev))
return -EPERM;
@@ -95,11 +79,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
WARN_ON_ONCE((req_sects << 9) > UINT_MAX);
- bio = blk_next_bio(bio, 0, gfp_mask);
+ bio = blk_next_bio(bio, bdev, 0, op, gfp_mask);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, bdev);
- bio_set_op_attrs(bio, op, 0);
-
bio->bi_iter.bi_size = req_sects << 9;
sector += req_sects;
nr_sects -= req_sects;
@@ -151,109 +132,12 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL(blkdev_issue_discard);
-/**
- * __blkdev_issue_write_same - generate number of bios with same page
- * @bdev: target blockdev
- * @sector: start sector
- * @nr_sects: number of sectors to write
- * @gfp_mask: memory allocation flags (for bio_alloc)
- * @page: page containing data to write
- * @biop: pointer to anchor bio
- *
- * Description:
- * Generate and issue number of bios(REQ_OP_WRITE_SAME) with same page.
- */
-static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, struct page *page,
- struct bio **biop)
-{
- struct request_queue *q = bdev_get_queue(bdev);
- unsigned int max_write_same_sectors;
- struct bio *bio = *biop;
- sector_t bs_mask;
-
- if (!q)
- return -ENXIO;
-
- if (bdev_read_only(bdev))
- return -EPERM;
-
- bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
- if ((sector | nr_sects) & bs_mask)
- return -EINVAL;
-
- if (!bdev_write_same(bdev))
- return -EOPNOTSUPP;
-
- /* Ensure that max_write_same_sectors doesn't overflow bi_size */
- max_write_same_sectors = bio_allowed_max_sectors(q);
-
- while (nr_sects) {
- bio = blk_next_bio(bio, 1, gfp_mask);
- bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, bdev);
- bio->bi_vcnt = 1;
- bio->bi_io_vec->bv_page = page;
- bio->bi_io_vec->bv_offset = 0;
- bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev);
- bio_set_op_attrs(bio, REQ_OP_WRITE_SAME, 0);
-
- if (nr_sects > max_write_same_sectors) {
- bio->bi_iter.bi_size = max_write_same_sectors << 9;
- nr_sects -= max_write_same_sectors;
- sector += max_write_same_sectors;
- } else {
- bio->bi_iter.bi_size = nr_sects << 9;
- nr_sects = 0;
- }
- cond_resched();
- }
-
- *biop = bio;
- return 0;
-}
-
-/**
- * blkdev_issue_write_same - queue a write same operation
- * @bdev: target blockdev
- * @sector: start sector
- * @nr_sects: number of sectors to write
- * @gfp_mask: memory allocation flags (for bio_alloc)
- * @page: page containing data
- *
- * Description:
- * Issue a write same request for the sectors in question.
- */
-int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask,
- struct page *page)
-{
- struct bio *bio = NULL;
- struct blk_plug plug;
- int ret;
-
- blk_start_plug(&plug);
- ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, page,
- &bio);
- if (ret == 0 && bio) {
- ret = submit_bio_wait(bio);
- bio_put(bio);
- }
- blk_finish_plug(&plug);
- return ret;
-}
-EXPORT_SYMBOL(blkdev_issue_write_same);
-
static int __blkdev_issue_write_zeroes(struct block_device *bdev,
sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
struct bio **biop, unsigned flags)
{
struct bio *bio = *biop;
unsigned int max_write_zeroes_sectors;
- struct request_queue *q = bdev_get_queue(bdev);
-
- if (!q)
- return -ENXIO;
if (bdev_read_only(bdev))
return -EPERM;
@@ -265,10 +149,8 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
return -EOPNOTSUPP;
while (nr_sects) {
- bio = blk_next_bio(bio, 0, gfp_mask);
+ bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, bdev);
- bio->bi_opf = REQ_OP_WRITE_ZEROES;
if (flags & BLKDEV_ZERO_NOUNMAP)
bio->bi_opf |= REQ_NOUNMAP;
@@ -304,23 +186,17 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
struct bio **biop)
{
- struct request_queue *q = bdev_get_queue(bdev);
struct bio *bio = *biop;
int bi_size = 0;
unsigned int sz;
- if (!q)
- return -ENXIO;
-
if (bdev_read_only(bdev))
return -EPERM;
while (nr_sects != 0) {
- bio = blk_next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
- gfp_mask);
+ bio = blk_next_bio(bio, bdev, __blkdev_sectors_to_bio_pages(nr_sects),
+ REQ_OP_WRITE, gfp_mask);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, bdev);
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
while (nr_sects != 0) {
sz = min((sector_t) PAGE_SIZE, nr_sects << 9);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4de34a332c9f..7771dacc99cb 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -9,6 +9,7 @@
#include <linux/blk-integrity.h>
#include <linux/scatterlist.h>
#include <linux/part_stat.h>
+#include <linux/blk-cgroup.h>
#include <trace/events/block.h>
@@ -152,22 +153,6 @@ static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
}
-static struct bio *blk_bio_write_same_split(struct request_queue *q,
- struct bio *bio,
- struct bio_set *bs,
- unsigned *nsegs)
-{
- *nsegs = 1;
-
- if (!q->limits.max_write_same_sectors)
- return NULL;
-
- if (bio_sectors(bio) <= q->limits.max_write_same_sectors)
- return NULL;
-
- return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs);
-}
-
/*
* Return the maximum number of sectors from the start of a bio that may be
* submitted as a single request to a block device. If enough sectors remain,
@@ -351,10 +336,6 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio,
split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split,
nr_segs);
break;
- case REQ_OP_WRITE_SAME:
- split = blk_bio_write_same_split(q, *bio, &q->bio_split,
- nr_segs);
- break;
default:
split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
break;
@@ -368,8 +349,6 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio,
trace_block_split(split, (*bio)->bi_iter.bi_sector);
submit_bio_noacct(*bio);
*bio = split;
-
- blk_throtl_charge_bio_split(*bio);
}
}
@@ -416,8 +395,6 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
return 1;
case REQ_OP_WRITE_ZEROES:
return 0;
- case REQ_OP_WRITE_SAME:
- return 1;
}
rq_for_each_bvec(bv, rq, iter)
@@ -555,8 +532,6 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg);
- else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME)
- nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, last_sg);
else if (rq->bio)
nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg);
@@ -600,6 +575,9 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
unsigned int nr_phys_segs)
{
+ if (!blk_cgroup_mergeable(req, bio))
+ goto no_merge;
+
if (blk_integrity_merge_bio(req->q, req, bio) == false)
goto no_merge;
@@ -696,6 +674,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
if (total_phys_segments > blk_rq_get_max_segments(req))
return 0;
+ if (!blk_cgroup_mergeable(req, next->bio))
+ return 0;
+
if (blk_integrity_merge_rq(q, req, next) == false)
return 0;
@@ -757,13 +738,6 @@ static enum elv_merge blk_try_req_merge(struct request *req,
return ELEVATOR_NO_MERGE;
}
-static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
-{
- if (bio_page(a) == bio_page(b) && bio_offset(a) == bio_offset(b))
- return true;
- return false;
-}
-
/*
* For non-mq, this has to be called with the request spinlock acquired.
* For mq with scheduling, the appropriate queue wide lock should be held.
@@ -780,17 +754,6 @@ static struct request *attempt_merge(struct request_queue *q,
if (rq_data_dir(req) != rq_data_dir(next))
return NULL;
- if (req_op(req) == REQ_OP_WRITE_SAME &&
- !blk_write_same_mergeable(req->bio, next->bio))
- return NULL;
-
- /*
- * Don't allow merge of different write hints, or for a hint with
- * non-hint IO.
- */
- if (req->write_hint != next->write_hint)
- return NULL;
-
if (req->ioprio != next->ioprio)
return NULL;
@@ -904,6 +867,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (bio_data_dir(bio) != rq_data_dir(rq))
return false;
+ /* don't merge across cgroup boundaries */
+ if (!blk_cgroup_mergeable(rq, bio))
+ return false;
+
/* only merge integrity protected bio into ditto rq */
if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
return false;
@@ -912,18 +879,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (!bio_crypt_rq_ctx_compatible(rq, bio))
return false;
- /* must be using the same buffer */
- if (req_op(rq) == REQ_OP_WRITE_SAME &&
- !blk_write_same_mergeable(rq->bio, bio))
- return false;
-
- /*
- * Don't allow merge of different write hints, or for a hint with
- * non-hint IO.
- */
- if (rq->write_hint != bio->bi_write_hint)
- return false;
-
if (rq->ioprio != bio_prio(bio))
return false;
@@ -1089,12 +1044,20 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
if (!plug || rq_list_empty(plug->mq_list))
return false;
- /* check the previously added entry for a quick merge attempt */
- rq = rq_list_peek(&plug->mq_list);
- if (rq->q == q) {
- if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
- BIO_MERGE_OK)
- return true;
+ rq_list_for_each(&plug->mq_list, rq) {
+ if (rq->q == q) {
+ if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
+ BIO_MERGE_OK)
+ return true;
+ break;
+ }
+
+ /*
+ * Only keep iterating plug list for merges if we have multiple
+ * queues
+ */
+ if (!plug->multiple_queues)
+ break;
}
return false;
}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3a790eb4995c..aa0349e9f083 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -183,35 +183,11 @@ inval:
return count;
}
-static int queue_write_hint_show(void *data, struct seq_file *m)
-{
- struct request_queue *q = data;
- int i;
-
- for (i = 0; i < BLK_MAX_WRITE_HINTS; i++)
- seq_printf(m, "hint%d: %llu\n", i, q->write_hints[i]);
-
- return 0;
-}
-
-static ssize_t queue_write_hint_store(void *data, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct request_queue *q = data;
- int i;
-
- for (i = 0; i < BLK_MAX_WRITE_HINTS; i++)
- q->write_hints[i] = 0;
-
- return count;
-}
-
static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
{ "poll_stat", 0400, queue_poll_stat_show },
{ "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
{ "pm_only", 0600, queue_pm_only_show, NULL },
{ "state", 0600, queue_state_show, queue_state_write },
- { "write_hints", 0600, queue_write_hint_show, queue_write_hint_store },
{ "zone_wlock", 0400, queue_zone_wlock_show, NULL },
{ },
};
@@ -707,7 +683,7 @@ static void debugfs_create_files(struct dentry *parent, void *data,
void blk_mq_debugfs_register(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
@@ -780,7 +756,7 @@ void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx)
void blk_mq_debugfs_register_hctxs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_debugfs_register_hctx(q, hctx);
@@ -789,7 +765,7 @@ void blk_mq_debugfs_register_hctxs(struct request_queue *q)
void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_debugfs_unregister_hctx(hctx);
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index a68aa6041a10..69918f4170d6 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -6,6 +6,8 @@
#include <linux/seq_file.h>
+struct blk_mq_hw_ctx;
+
struct blk_mq_debugfs_attr {
const char *name;
umode_t mode;
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 55488ba97823..9e56a69422b6 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -180,11 +180,18 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
+ unsigned long end = jiffies + HZ;
int ret;
do {
ret = __blk_mq_do_dispatch_sched(hctx);
- } while (ret == 1);
+ if (ret != 1)
+ break;
+ if (need_resched() || time_is_before_jiffies(end)) {
+ blk_mq_delay_run_hw_queue(hctx, 0);
+ break;
+ }
+ } while (1);
return ret;
}
@@ -515,7 +522,7 @@ static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags) {
@@ -550,9 +557,10 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
- unsigned int i, flags = q->tag_set->flags;
+ unsigned int flags = q->tag_set->flags;
struct blk_mq_hw_ctx *hctx;
struct elevator_queue *eq;
+ unsigned long i;
int ret;
if (!e) {
@@ -618,7 +626,7 @@ err_free_map_and_rqs:
void blk_mq_sched_free_rqs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
if (blk_mq_is_shared_tags(q->tag_set->flags)) {
blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
@@ -635,7 +643,7 @@ void blk_mq_sched_free_rqs(struct request_queue *q)
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
struct blk_mq_hw_ctx *hctx;
- unsigned int i;
+ unsigned long i;
unsigned int flags = 0;
queue_for_each_hw_ctx(q, hctx, i) {
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 674786574075..c08426975856 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -206,7 +206,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
lockdep_assert_held(&q->sysfs_dir_lock);
@@ -255,7 +255,8 @@ void blk_mq_sysfs_init(struct request_queue *q)
int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int ret, i;
+ unsigned long i, j;
+ int ret;
WARN_ON_ONCE(!q->kobj.parent);
lockdep_assert_held(&q->sysfs_dir_lock);
@@ -278,8 +279,10 @@ out:
return ret;
unreg:
- while (--i >= 0)
- blk_mq_unregister_hctx(q->queue_hw_ctx[i]);
+ queue_for_each_hw_ctx(q, hctx, j) {
+ if (j < i)
+ blk_mq_unregister_hctx(hctx);
+ }
kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
kobject_del(q->mq_kobj);
@@ -290,7 +293,7 @@ unreg:
void blk_mq_sysfs_unregister(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
@@ -306,7 +309,8 @@ unlock:
int blk_mq_sysfs_register(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i, ret = 0;
+ unsigned long i;
+ int ret = 0;
mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 845f74e8dd7b..68ac23d0b640 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -107,7 +107,7 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
return BLK_MQ_NO_TAG;
if (data->shallow_depth)
- return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
+ return sbitmap_queue_get_shallow(bt, data->shallow_depth);
else
return __sbitmap_queue_get(bt);
}
@@ -498,7 +498,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
void *priv)
{
/*
- * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
+ * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table
* while the queue is frozen. So we can use q_usage_counter to avoid
* racing with it.
*/
@@ -515,7 +515,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
bt_for_each(NULL, q, btags, fn, priv, false);
} else {
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i) {
struct blk_mq_tags *tags = hctx->tags;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9a9185a0a2d1..84d749511f55 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -71,7 +71,8 @@ static int blk_mq_poll_stats_bkt(const struct request *rq)
static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
blk_qc_t qc)
{
- return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT];
+ return xa_load(&q->hctx_table,
+ (qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT);
}
static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
@@ -312,7 +313,7 @@ EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
void blk_mq_wake_waiters(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- unsigned int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
if (blk_mq_hw_queue_mapped(hctx))
@@ -573,7 +574,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
* If not tell the caller that it should skip this queue.
*/
ret = -EXDEV;
- data.hctx = q->queue_hw_ctx[hctx_idx];
+ data.hctx = xa_load(&q->hctx_table, hctx_idx);
if (!blk_mq_hw_queue_mapped(data.hctx))
goto out_queue_exit;
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
@@ -793,8 +794,11 @@ bool blk_update_request(struct request *req, blk_status_t error,
#endif
if (unlikely(error && !blk_rq_is_passthrough(req) &&
- !(req->rq_flags & RQF_QUIET)))
+ !(req->rq_flags & RQF_QUIET)) &&
+ !test_bit(GD_DEAD, &req->q->disk->state)) {
blk_print_req_error(req, error);
+ trace_block_rq_error(req, error, nr_bytes);
+ }
blk_account_io_completion(req, nr_bytes);
@@ -885,10 +889,15 @@ static inline void blk_account_io_done(struct request *req, u64 now)
static void __blk_account_io_start(struct request *rq)
{
- /* passthrough requests can hold bios that do not have ->bi_bdev set */
- if (rq->bio && rq->bio->bi_bdev)
+ /*
+ * All non-passthrough requests are created from a bio with one
+ * exception: when a flush command that is part of a flush sequence
+ * generated by the state machine in blk-flush.c is cloned onto the
+ * lower device by dm-multipath we can get here without a bio.
+ */
+ if (rq->bio)
rq->part = rq->bio->bi_bdev;
- else if (rq->q->disk)
+ else
rq->part = rq->q->disk->part0;
part_stat_lock();
@@ -1122,14 +1131,7 @@ void blk_mq_start_request(struct request *rq)
trace_block_rq_issue(rq);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
- u64 start_time;
-#ifdef CONFIG_BLK_CGROUP
- if (rq->bio)
- start_time = bio_issue_time(&rq->bio->bi_issue);
- else
-#endif
- start_time = ktime_get_ns();
- rq->io_start_time_ns = start_time;
+ rq->io_start_time_ns = ktime_get_ns();
rq->stats_sectors = blk_rq_sectors(rq);
rq->rq_flags |= RQF_STATS;
rq_qos_issue(q, rq);
@@ -1444,7 +1446,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
container_of(work, struct request_queue, timeout_work);
unsigned long next = 0;
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
/* A deadlock might occur if a request is stuck requiring a
* timeout at the same time a queue freeze is waiting
@@ -2145,7 +2147,7 @@ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
{
struct blk_mq_hw_ctx *hctx, *sq_hctx;
- int i;
+ unsigned long i;
sq_hctx = NULL;
if (blk_mq_has_sqsched(q))
@@ -2173,7 +2175,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
{
struct blk_mq_hw_ctx *hctx, *sq_hctx;
- int i;
+ unsigned long i;
sq_hctx = NULL;
if (blk_mq_has_sqsched(q))
@@ -2182,6 +2184,14 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
if (blk_mq_hctx_stopped(hctx))
continue;
/*
+ * If there is already a run_work pending, leave the
+ * pending delay untouched. Otherwise, a hctx can stall
+ * if another hctx is re-delaying the other's work
+ * before the work executes.
+ */
+ if (delayed_work_pending(&hctx->run_work))
+ continue;
+ /*
* Dispatch from this hctx either if there's no hctx preferred
* by IO scheduler or if it has requests that bypass the
* scheduler.
@@ -2203,7 +2213,7 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
bool blk_mq_queue_stopped(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
if (blk_mq_hctx_stopped(hctx))
@@ -2242,7 +2252,7 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queue);
void blk_mq_stop_hw_queues(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_stop_hw_queue(hctx);
@@ -2260,7 +2270,7 @@ EXPORT_SYMBOL(blk_mq_start_hw_queue);
void blk_mq_start_hw_queues(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_start_hw_queue(hctx);
@@ -2280,7 +2290,7 @@ EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_start_stopped_hw_queue(hctx, async);
@@ -2396,7 +2406,6 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
rq->cmd_flags |= REQ_FAILFAST_MASK;
rq->__sector = bio->bi_iter.bi_sector;
- rq->write_hint = bio->bi_write_hint;
blk_rq_bio_prep(rq, bio, nr_segs);
/* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
@@ -2561,13 +2570,36 @@ static void __blk_mq_flush_plug_list(struct request_queue *q,
q->mq_ops->queue_rqs(&plug->mq_list);
}
+static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
+{
+ struct blk_mq_hw_ctx *this_hctx = NULL;
+ struct blk_mq_ctx *this_ctx = NULL;
+ struct request *requeue_list = NULL;
+ unsigned int depth = 0;
+ LIST_HEAD(list);
+
+ do {
+ struct request *rq = rq_list_pop(&plug->mq_list);
+
+ if (!this_hctx) {
+ this_hctx = rq->mq_hctx;
+ this_ctx = rq->mq_ctx;
+ } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) {
+ rq_list_add(&requeue_list, rq);
+ continue;
+ }
+ list_add_tail(&rq->queuelist, &list);
+ depth++;
+ } while (!rq_list_empty(plug->mq_list));
+
+ plug->mq_list = requeue_list;
+ trace_block_unplug(this_hctx->queue, depth, !from_sched);
+ blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, from_sched);
+}
+
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
- struct blk_mq_hw_ctx *this_hctx;
- struct blk_mq_ctx *this_ctx;
struct request *rq;
- unsigned int depth;
- LIST_HEAD(list);
if (rq_list_empty(plug->mq_list))
return;
@@ -2603,35 +2635,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
return;
}
- this_hctx = NULL;
- this_ctx = NULL;
- depth = 0;
do {
- rq = rq_list_pop(&plug->mq_list);
-
- if (!this_hctx) {
- this_hctx = rq->mq_hctx;
- this_ctx = rq->mq_ctx;
- } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) {
- trace_block_unplug(this_hctx->queue, depth,
- !from_schedule);
- blk_mq_sched_insert_requests(this_hctx, this_ctx,
- &list, from_schedule);
- depth = 0;
- this_hctx = rq->mq_hctx;
- this_ctx = rq->mq_ctx;
-
- }
-
- list_add(&rq->queuelist, &list);
- depth++;
+ blk_mq_dispatch_plug_list(plug, from_schedule);
} while (!rq_list_empty(plug->mq_list));
-
- if (!list_empty(&list)) {
- trace_block_unplug(this_hctx->queue, depth, !from_schedule);
- blk_mq_sched_insert_requests(this_hctx, this_ctx, &list,
- from_schedule);
- }
}
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
@@ -2804,9 +2810,6 @@ void blk_mq_submit_bio(struct bio *bio)
unsigned int nr_segs = 1;
blk_status_t ret;
- if (unlikely(!blk_crypto_bio_prep(&bio)))
- return;
-
blk_queue_bounce(q, &bio);
if (blk_may_split(q, bio))
__blk_queue_split(q, &bio, &nr_segs);
@@ -2853,27 +2856,16 @@ void blk_mq_submit_bio(struct bio *bio)
blk_mq_try_issue_directly(rq->mq_hctx, rq));
}
+#ifdef CONFIG_BLK_MQ_STACKING
/**
- * blk_cloned_rq_check_limits - Helper function to check a cloned request
- * for the new queue limits
- * @q: the queue
- * @rq: the request being checked
- *
- * Description:
- * @rq may have been made based on weaker limitations of upper-level queues
- * in request stacking drivers, and it may violate the limitation of @q.
- * Since the block layer and the underlying device driver trust @rq
- * after it is inserted to @q, it should be checked against @q before
- * the insertion using this generic function.
- *
- * Request stacking drivers like request-based dm may change the queue
- * limits when retrying requests on other queues. Those requests need
- * to be checked against the new queue limits again during dispatch.
+ * blk_insert_cloned_request - Helper for stacking drivers to submit a request
+ * @rq: the request being queued
*/
-static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
- struct request *rq)
+blk_status_t blk_insert_cloned_request(struct request *rq)
{
+ struct request_queue *q = rq->q;
unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
+ blk_status_t ret;
if (blk_rq_sectors(rq) > max_sectors) {
/*
@@ -2905,24 +2897,7 @@ static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
return BLK_STS_IOERR;
}
- return BLK_STS_OK;
-}
-
-/**
- * blk_insert_cloned_request - Helper for stacking drivers to submit a request
- * @q: the queue to submit the request
- * @rq: the request being queued
- */
-blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
-{
- blk_status_t ret;
-
- ret = blk_cloned_rq_check_limits(q, rq);
- if (ret != BLK_STS_OK)
- return ret;
-
- if (rq->q->disk &&
- should_fail_request(rq->q->disk->part0, blk_rq_bytes(rq)))
+ if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq)))
return BLK_STS_IOERR;
if (blk_crypto_insert_cloned_request(rq))
@@ -2935,7 +2910,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
* bypass a potential scheduler on the bottom device for
* insert.
*/
- blk_mq_run_dispatch_ops(rq->q,
+ blk_mq_run_dispatch_ops(q,
ret = blk_mq_request_issue_directly(rq, true));
if (ret)
blk_account_io_done(rq, ktime_get_ns());
@@ -2990,10 +2965,10 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
bs = &fs_bio_set;
__rq_for_each_bio(bio_src, rq_src) {
- bio = bio_clone_fast(bio_src, gfp_mask, bs);
+ bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask,
+ bs);
if (!bio)
goto free_and_out;
- bio->bi_bdev = rq->q->disk->part0;
if (bio_ctr && bio_ctr(bio, bio_src, data))
goto free_and_out;
@@ -3030,6 +3005,7 @@ free_and_out:
return -ENOMEM;
}
EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
+#endif /* CONFIG_BLK_MQ_STACKING */
/*
* Steal bios from a request and add them to a bio list.
@@ -3100,6 +3076,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
struct blk_mq_tags *drv_tags;
struct page *page;
+ if (list_empty(&tags->page_list))
+ return;
+
if (blk_mq_is_shared_tags(set->flags))
drv_tags = set->shared_tags;
else
@@ -3142,15 +3121,41 @@ void blk_mq_free_rq_map(struct blk_mq_tags *tags)
blk_mq_free_tags(tags);
}
+static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx)
+{
+ int i;
+
+ for (i = 0; i < set->nr_maps; i++) {
+ unsigned int start = set->map[i].queue_offset;
+ unsigned int end = start + set->map[i].nr_queues;
+
+ if (hctx_idx >= start && hctx_idx < end)
+ break;
+ }
+
+ if (i >= set->nr_maps)
+ i = HCTX_TYPE_DEFAULT;
+
+ return i;
+}
+
+static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx)
+{
+ enum hctx_type type = hctx_idx_to_type(set, hctx_idx);
+
+ return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx);
+}
+
static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
unsigned int hctx_idx,
unsigned int nr_tags,
unsigned int reserved_tags)
{
+ int node = blk_mq_get_hctx_node(set, hctx_idx);
struct blk_mq_tags *tags;
- int node;
- node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
@@ -3199,10 +3204,9 @@ static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
unsigned int hctx_idx, unsigned int depth)
{
unsigned int i, j, entries_per_page, max_order = 4;
+ int node = blk_mq_get_hctx_node(set, hctx_idx);
size_t rq_size, left;
- int node;
- node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
@@ -3447,6 +3451,8 @@ static void blk_mq_exit_hctx(struct request_queue *q,
blk_mq_remove_cpuhp(hctx);
+ xa_erase(&q->hctx_table, hctx_idx);
+
spin_lock(&q->unused_hctx_lock);
list_add(&hctx->hctx_list, &q->unused_hctx_list);
spin_unlock(&q->unused_hctx_lock);
@@ -3456,12 +3462,11 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
struct blk_mq_tag_set *set, int nr_queue)
{
struct blk_mq_hw_ctx *hctx;
- unsigned int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i) {
if (i == nr_queue)
break;
- blk_mq_debugfs_unregister_hctx(hctx);
blk_mq_exit_hctx(q, set, hctx, i);
}
}
@@ -3486,8 +3491,15 @@ static int blk_mq_init_hctx(struct request_queue *q,
if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
hctx->numa_node))
goto exit_hctx;
+
+ if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
+ goto exit_flush_rq;
+
return 0;
+ exit_flush_rq:
+ if (set->ops->exit_request)
+ set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
exit_hctx:
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
@@ -3647,7 +3659,8 @@ static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
static void blk_mq_map_swqueue(struct request_queue *q)
{
- unsigned int i, j, hctx_idx;
+ unsigned int j, hctx_idx;
+ unsigned long i;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
@@ -3754,7 +3767,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
static void queue_set_hctx_shared(struct request_queue *q, bool shared)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i) {
if (shared) {
@@ -3854,7 +3867,7 @@ static int blk_mq_alloc_ctxs(struct request_queue *q)
void blk_mq_release(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx, *next;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
@@ -3865,7 +3878,7 @@ void blk_mq_release(struct request_queue *q)
kobject_put(&hctx->kobj);
}
- kfree(q->queue_hw_ctx);
+ xa_destroy(&q->hctx_table);
/*
* release .mq_kobj and sw queue's kobject now because
@@ -3954,52 +3967,28 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
struct request_queue *q)
{
- int i, j, end;
- struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
-
- if (q->nr_hw_queues < set->nr_hw_queues) {
- struct blk_mq_hw_ctx **new_hctxs;
-
- new_hctxs = kcalloc_node(set->nr_hw_queues,
- sizeof(*new_hctxs), GFP_KERNEL,
- set->numa_node);
- if (!new_hctxs)
- return;
- if (hctxs)
- memcpy(new_hctxs, hctxs, q->nr_hw_queues *
- sizeof(*hctxs));
- q->queue_hw_ctx = new_hctxs;
- kfree(hctxs);
- hctxs = new_hctxs;
- }
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i, j;
/* protect against switching io scheduler */
mutex_lock(&q->sysfs_lock);
for (i = 0; i < set->nr_hw_queues; i++) {
- int node;
- struct blk_mq_hw_ctx *hctx;
+ int old_node;
+ int node = blk_mq_get_hctx_node(set, i);
+ struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i);
- node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
- /*
- * If the hw queue has been mapped to another numa node,
- * we need to realloc the hctx. If allocation fails, fallback
- * to use the previous one.
- */
- if (hctxs[i] && (hctxs[i]->numa_node == node))
- continue;
+ if (old_hctx) {
+ old_node = old_hctx->numa_node;
+ blk_mq_exit_hctx(q, set, old_hctx, i);
+ }
- hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
- if (hctx) {
- if (hctxs[i])
- blk_mq_exit_hctx(q, set, hctxs[i], i);
- hctxs[i] = hctx;
- } else {
- if (hctxs[i])
- pr_warn("Allocate new hctx on node %d fails,\
- fallback to previous one on node %d\n",
- node, hctxs[i]->numa_node);
- else
+ if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) {
+ if (!old_hctx)
break;
+ pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
+ node, old_node);
+ hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node);
+ WARN_ON_ONCE(!hctx);
}
}
/*
@@ -4008,24 +3997,27 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
*/
if (i != set->nr_hw_queues) {
j = q->nr_hw_queues;
- end = i;
} else {
j = i;
- end = q->nr_hw_queues;
q->nr_hw_queues = set->nr_hw_queues;
}
- for (; j < end; j++) {
- struct blk_mq_hw_ctx *hctx = hctxs[j];
-
- if (hctx) {
- blk_mq_exit_hctx(q, set, hctx, j);
- hctxs[j] = NULL;
- }
- }
+ xa_for_each_start(&q->hctx_table, j, hctx, j)
+ blk_mq_exit_hctx(q, set, hctx, j);
mutex_unlock(&q->sysfs_lock);
}
+static void blk_mq_update_poll_flag(struct request_queue *q)
+{
+ struct blk_mq_tag_set *set = q->tag_set;
+
+ if (set->nr_maps > HCTX_TYPE_POLL &&
+ set->map[HCTX_TYPE_POLL].nr_queues)
+ blk_queue_flag_set(QUEUE_FLAG_POLL, q);
+ else
+ blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
+}
+
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q)
{
@@ -4050,6 +4042,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
INIT_LIST_HEAD(&q->unused_hctx_list);
spin_lock_init(&q->unused_hctx_lock);
+ xa_init(&q->hctx_table);
+
blk_mq_realloc_hw_ctxs(set, q);
if (!q->nr_hw_queues)
goto err_hctxs;
@@ -4060,9 +4054,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
q->tag_set = set;
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
- if (set->nr_maps > HCTX_TYPE_POLL &&
- set->map[HCTX_TYPE_POLL].nr_queues)
- blk_queue_flag_set(QUEUE_FLAG_POLL, q);
+ blk_mq_update_poll_flag(q);
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
INIT_LIST_HEAD(&q->requeue_list);
@@ -4081,7 +4073,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
return 0;
err_hctxs:
- kfree(q->queue_hw_ctx);
+ xa_destroy(&q->hctx_table);
q->nr_hw_queues = 0;
blk_mq_sysfs_deinit(q);
err_poll:
@@ -4369,7 +4361,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
{
struct blk_mq_tag_set *set = q->tag_set;
struct blk_mq_hw_ctx *hctx;
- int i, ret;
+ int ret;
+ unsigned long i;
if (!set)
return -EINVAL;
@@ -4463,21 +4456,28 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
return true;
}
-static void blk_mq_elv_switch_back(struct list_head *head,
- struct request_queue *q)
+static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head,
+ struct request_queue *q)
{
struct blk_mq_qe_pair *qe;
- struct elevator_type *t = NULL;
list_for_each_entry(qe, head, node)
- if (qe->q == q) {
- t = qe->type;
- break;
- }
+ if (qe->q == q)
+ return qe;
- if (!t)
- return;
+ return NULL;
+}
+
+static void blk_mq_elv_switch_back(struct list_head *head,
+ struct request_queue *q)
+{
+ struct blk_mq_qe_pair *qe;
+ struct elevator_type *t;
+ qe = blk_lookup_qe_pair(head, q);
+ if (!qe)
+ return;
+ t = qe->type;
list_del(&qe->node);
kfree(qe);
@@ -4528,6 +4528,7 @@ fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_realloc_hw_ctxs(set, q);
+ blk_mq_update_poll_flag(q);
if (q->nr_hw_queues != set->nr_hw_queues) {
int i = prev_nr_hw_queues;
@@ -4744,7 +4745,7 @@ void blk_mq_cancel_work_sync(struct request_queue *q)
{
if (queue_is_mq(q)) {
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
cancel_delayed_work_sync(&q->requeue_work);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 948791ea2a3e..2615bd58bad3 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -83,7 +83,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
enum hctx_type type,
unsigned int cpu)
{
- return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]];
+ return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]);
}
static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags)
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 3cfbc8668cba..68267007da1c 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -177,20 +177,20 @@ static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
__rq_qos_requeue(q->rq_qos, rq);
}
-static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio)
+static inline void rq_qos_done_bio(struct bio *bio)
{
- if (q->rq_qos)
- __rq_qos_done_bio(q->rq_qos, bio);
+ if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) ||
+ bio_flagged(bio, BIO_QOS_MERGED))) {
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ if (q->rq_qos)
+ __rq_qos_done_bio(q->rq_qos, bio);
+ }
}
static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
{
- /*
- * BIO_TRACKED lets controllers know that a bio went through the
- * normal rq_qos path.
- */
if (q->rq_qos) {
- bio_set_flag(bio, BIO_TRACKED);
+ bio_set_flag(bio, BIO_QOS_THROTTLED);
__rq_qos_throttle(q->rq_qos, bio);
}
}
@@ -205,8 +205,10 @@ static inline void rq_qos_track(struct request_queue *q, struct request *rq,
static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- if (q->rq_qos)
+ if (q->rq_qos) {
+ bio_set_flag(bio, BIO_QOS_MERGED);
__rq_qos_merge(q->rq_qos, rq, bio);
+ }
}
static inline void rq_qos_queue_depth_changed(struct request_queue *q)
diff --git a/block/blk-settings.c b/block/blk-settings.c
index b880c70e22e4..b83df3d2eebc 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -42,7 +42,6 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
lim->max_dev_sectors = 0;
lim->chunk_sectors = 0;
- lim->max_write_same_sectors = 0;
lim->max_write_zeroes_sectors = 0;
lim->max_zone_append_sectors = 0;
lim->max_discard_sectors = 0;
@@ -79,7 +78,6 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_segment_size = UINT_MAX;
lim->max_sectors = UINT_MAX;
lim->max_dev_sectors = UINT_MAX;
- lim->max_write_same_sectors = UINT_MAX;
lim->max_write_zeroes_sectors = UINT_MAX;
lim->max_zone_append_sectors = UINT_MAX;
}
@@ -179,18 +177,6 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
EXPORT_SYMBOL(blk_queue_max_discard_sectors);
/**
- * blk_queue_max_write_same_sectors - set max sectors for a single write same
- * @q: the request queue for the device
- * @max_write_same_sectors: maximum number of sectors to write per command
- **/
-void blk_queue_max_write_same_sectors(struct request_queue *q,
- unsigned int max_write_same_sectors)
-{
- q->limits.max_write_same_sectors = max_write_same_sectors;
-}
-EXPORT_SYMBOL(blk_queue_max_write_same_sectors);
-
-/**
* blk_queue_max_write_zeroes_sectors - set max sectors for a single
* write zeroes
* @q: the request queue for the device
@@ -519,8 +505,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
- t->max_write_same_sectors = min(t->max_write_same_sectors,
- b->max_write_same_sectors);
t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
b->max_write_zeroes_sectors);
t->max_zone_append_sectors = min(t->max_zone_append_sectors,
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9f32882ceb2f..88bd41d4cb59 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -10,7 +10,6 @@
#include <linux/backing-dev.h>
#include <linux/blktrace_api.h>
#include <linux/blk-mq.h>
-#include <linux/blk-cgroup.h>
#include <linux/debugfs.h>
#include "blk.h"
@@ -18,6 +17,7 @@
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-wbt.h"
+#include "blk-cgroup.h"
#include "blk-throttle.h"
struct queue_sysfs_entry {
@@ -214,8 +214,7 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag
static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
{
- return sprintf(page, "%llu\n",
- (unsigned long long)q->limits.max_write_same_sectors << 9);
+ return queue_var_show(0, page);
}
static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
@@ -739,27 +738,6 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
}
-/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
-static void blk_exit_queue(struct request_queue *q)
-{
- /*
- * Since the I/O scheduler exit code may access cgroup information,
- * perform I/O scheduler exit before disassociating from the block
- * cgroup controller.
- */
- if (q->elevator) {
- ioc_clear_queue(q);
- elevator_exit(q);
- }
-
- /*
- * Remove all references to @q from the block cgroup controller before
- * restoring @q->queue_lock to avoid that restoring this pointer causes
- * e.g. blkcg_print_blkgs() to crash.
- */
- blkcg_exit_queue(q);
-}
-
/**
* blk_release_queue - releases all allocated resources of the request_queue
* @kobj: pointer to a kobject, whose container is a request_queue
@@ -787,12 +765,12 @@ static void blk_release_queue(struct kobject *kobj)
might_sleep();
+ percpu_ref_exit(&q->q_usage_counter);
+
if (q->poll_stat)
blk_stat_remove_callback(q, q->poll_cb);
blk_stat_free_callback(q->poll_cb);
- blk_exit_queue(q);
-
blk_free_queue_stats(q->stats);
kfree(q->poll_stat);
@@ -880,6 +858,10 @@ int blk_register_queue(struct gendisk *disk)
goto put_dev;
}
+ ret = blk_crypto_sysfs_register(q);
+ if (ret)
+ goto put_dev;
+
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(q);
blk_throtl_register_queue(q);
@@ -910,6 +892,7 @@ unlock:
return ret;
put_dev:
+ elv_unregister_queue(q);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
@@ -954,16 +937,18 @@ void blk_unregister_queue(struct gendisk *disk)
*/
if (queue_is_mq(q))
blk_mq_unregister_dev(disk_to_dev(disk), q);
-
- kobject_uevent(&q->kobj, KOBJ_REMOVE);
- kobject_del(&q->kobj);
+ blk_crypto_sysfs_unregister(q);
blk_trace_remove_sysfs(disk_to_dev(disk));
mutex_lock(&q->sysfs_lock);
- if (q->elevator)
- elv_unregister_queue(q);
+ elv_unregister_queue(q);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
+
+ /* Now that we've deleted all child objects, we can delete the queue. */
+ kobject_uevent(&q->kobj, KOBJ_REMOVE);
+ kobject_del(&q->kobj);
+
mutex_unlock(&q->sysfs_dir_lock);
kobject_put(&disk_to_dev(disk)->kobj);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 7c462c006b26..469c483719be 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -10,7 +10,6 @@
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
-#include <linux/blk-cgroup.h>
#include "blk.h"
#include "blk-cgroup-rwstat.h"
#include "blk-stat.h"
@@ -42,11 +41,6 @@
/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
-enum tg_state_flags {
- THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
- THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
-};
-
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
/* We measure latency for request size from <= 4k to >= 1M */
@@ -426,12 +420,24 @@ static void tg_update_has_rules(struct throtl_grp *tg)
struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
struct throtl_data *td = tg->td;
int rw;
+ int has_iops_limit = 0;
+
+ for (rw = READ; rw <= WRITE; rw++) {
+ unsigned int iops_limit = tg_iops_limit(tg, rw);
- for (rw = READ; rw <= WRITE; rw++)
tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
(td->limit_valid[td->limit_index] &&
(tg_bps_limit(tg, rw) != U64_MAX ||
- tg_iops_limit(tg, rw) != UINT_MAX));
+ iops_limit != UINT_MAX));
+
+ if (iops_limit != UINT_MAX)
+ has_iops_limit = 1;
+ }
+
+ if (has_iops_limit)
+ tg->flags |= THROTL_TG_HAS_IOPS_LIMIT;
+ else
+ tg->flags &= ~THROTL_TG_HAS_IOPS_LIMIT;
}
static void throtl_pd_online(struct blkg_policy_data *pd)
@@ -634,8 +640,6 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
- atomic_set(&tg->io_split_cnt[rw], 0);
-
/*
* Previous slice has expired. We must have trimmed it after last
* bio dispatch. That means since start of last slice, we never used
@@ -659,8 +663,6 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
tg->slice_start[rw] = jiffies;
tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
- atomic_set(&tg->io_split_cnt[rw], 0);
-
throtl_log(&tg->service_queue,
"[%c] new slice start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -808,7 +810,8 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
unsigned int bio_size = throtl_bio_data_size(bio);
- if (bps_limit == U64_MAX) {
+ /* no need to throttle if this bio's bytes have been accounted */
+ if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED)) {
if (wait)
*wait = 0;
return true;
@@ -871,7 +874,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
/* If tg->bps = -1, then BW is unlimited */
- if (bps_limit == U64_MAX && iops_limit == UINT_MAX) {
+ if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) ||
+ tg->flags & THROTL_TG_CANCELING) {
if (wait)
*wait = 0;
return true;
@@ -893,9 +897,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
jiffies + tg->td->throtl_slice);
}
- if (iops_limit != UINT_MAX)
- tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0);
-
if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) &&
tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) {
if (wait)
@@ -920,9 +921,12 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
unsigned int bio_size = throtl_bio_data_size(bio);
/* Charge the bio to the group */
- tg->bytes_disp[rw] += bio_size;
+ if (!bio_flagged(bio, BIO_THROTTLED)) {
+ tg->bytes_disp[rw] += bio_size;
+ tg->last_bytes_disp[rw] += bio_size;
+ }
+
tg->io_disp[rw]++;
- tg->last_bytes_disp[rw] += bio_size;
tg->last_io_disp[rw]++;
/*
@@ -1134,12 +1138,22 @@ static void throtl_pending_timer_fn(struct timer_list *t)
struct throtl_service_queue *sq = from_timer(sq, t, pending_timer);
struct throtl_grp *tg = sq_to_tg(sq);
struct throtl_data *td = sq_to_td(sq);
- struct request_queue *q = td->queue;
struct throtl_service_queue *parent_sq;
+ struct request_queue *q;
bool dispatched;
int ret;
+ /* throtl_data may be gone, so figure out request queue by blkg */
+ if (tg)
+ q = tg->pd.blkg->q;
+ else
+ q = td->queue;
+
spin_lock_irq(&q->queue_lock);
+
+ if (!q->root_blkg)
+ goto out_unlock;
+
if (throtl_can_upgrade(td, NULL))
throtl_upgrade_state(td);
@@ -1219,7 +1233,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
if (!bio_list_empty(&bio_list_on_stack)) {
blk_start_plug(&plug);
while ((bio = bio_list_pop(&bio_list_on_stack)))
- submit_bio_noacct(bio);
+ submit_bio_noacct_nocheck(bio);
blk_finish_plug(&plug);
}
}
@@ -1763,6 +1777,39 @@ static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
return false;
}
+void blk_throtl_cancel_bios(struct request_queue *q)
+{
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+
+ spin_lock_irq(&q->queue_lock);
+ /*
+ * queue_lock is held, rcu lock is not needed here technically.
+ * However, rcu lock is still held to emphasize that following
+ * path need RCU protection and to prevent warning from lockdep.
+ */
+ rcu_read_lock();
+ blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+ struct throtl_service_queue *sq = &tg->service_queue;
+
+ /*
+ * Set the flag to make sure throtl_pending_timer_fn() won't
+ * stop until all throttled bios are dispatched.
+ */
+ blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING;
+ /*
+ * Update disptime after setting the above flag to make sure
+ * throtl_select_dispatch() won't exit without dispatching.
+ */
+ tg_update_disptime(tg);
+
+ throtl_schedule_pending_timer(sq, jiffies + 1);
+ }
+ rcu_read_unlock();
+ spin_unlock_irq(&q->queue_lock);
+}
+
static bool throtl_can_upgrade(struct throtl_data *td,
struct throtl_grp *this_tg)
{
@@ -1917,14 +1964,12 @@ static void throtl_downgrade_check(struct throtl_grp *tg)
}
if (tg->iops[READ][LIMIT_LOW]) {
- tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0);
iops = tg->last_io_disp[READ] * HZ / elapsed_time;
if (iops >= tg->iops[READ][LIMIT_LOW])
tg->last_low_overflow_time[READ] = now;
}
if (tg->iops[WRITE][LIMIT_LOW]) {
- tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0);
iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
if (iops >= tg->iops[WRITE][LIMIT_LOW])
tg->last_low_overflow_time[WRITE] = now;
@@ -2043,25 +2088,6 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
}
#endif
-void blk_throtl_charge_bio_split(struct bio *bio)
-{
- struct blkcg_gq *blkg = bio->bi_blkg;
- struct throtl_grp *parent = blkg_to_tg(blkg);
- struct throtl_service_queue *parent_sq;
- bool rw = bio_data_dir(bio);
-
- do {
- if (!parent->has_rules[rw])
- break;
-
- atomic_inc(&parent->io_split_cnt[rw]);
- atomic_inc(&parent->last_io_split_cnt[rw]);
-
- parent_sq = parent->service_queue.parent_sq;
- parent = sq_to_tg(parent_sq);
- } while (parent);
-}
-
bool __blk_throtl_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index 175f03abd9e4..c1b602996127 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -52,6 +52,13 @@ struct throtl_service_queue {
struct timer_list pending_timer; /* fires on first_pending_disptime */
};
+enum tg_state_flags {
+ THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
+ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
+ THROTL_TG_HAS_IOPS_LIMIT = 1 << 2, /* tg has iops limit */
+ THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */
+};
+
enum {
LIMIT_LOW,
LIMIT_MAX,
@@ -132,9 +139,6 @@ struct throtl_grp {
unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
unsigned long bio_cnt_reset_time;
- atomic_t io_split_cnt[2];
- atomic_t last_io_split_cnt[2];
-
struct blkg_rwstat stat_bytes;
struct blkg_rwstat stat_ios;
};
@@ -158,20 +162,23 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
static inline void blk_throtl_register_queue(struct request_queue *q) { }
-static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
+static inline void blk_throtl_cancel_bios(struct request_queue *q) { }
#else /* CONFIG_BLK_DEV_THROTTLING */
int blk_throtl_init(struct request_queue *q);
void blk_throtl_exit(struct request_queue *q);
void blk_throtl_register_queue(struct request_queue *q);
-void blk_throtl_charge_bio_split(struct bio *bio);
bool __blk_throtl_bio(struct bio *bio);
+void blk_throtl_cancel_bios(struct request_queue *q);
static inline bool blk_throtl_bio(struct bio *bio)
{
struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);
- if (bio_flagged(bio, BIO_THROTTLED))
+ /* no need to throttle bps any more if the bio has been throttled */
+ if (bio_flagged(bio, BIO_THROTTLED) &&
+ !(tg->flags & THROTL_TG_HAS_IOPS_LIMIT))
return false;
+
if (!tg->has_rules[bio_data_dir(bio)])
return false;
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 2eb01becde8c..7e44eccc676d 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -101,9 +101,6 @@ u64 wbt_default_latency_nsec(struct request_queue *);
#else
-static inline void wbt_track(struct request *rq, enum wbt_flags flags)
-{
-}
static inline int wbt_init(struct request_queue *q)
{
return -EINVAL;
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 774ecc598bee..38cd840d8838 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -65,7 +65,6 @@ bool blk_req_needs_zone_write_lock(struct request *rq)
switch (req_op(rq)) {
case REQ_OP_WRITE_ZEROES:
- case REQ_OP_WRITE_SAME:
case REQ_OP_WRITE:
return blk_rq_zone_is_seq(rq);
default:
@@ -215,9 +214,8 @@ static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
continue;
}
- bio = blk_next_bio(bio, 0, gfp_mask);
- bio_set_dev(bio, bdev);
- bio->bi_opf = REQ_OP_ZONE_RESET | REQ_SYNC;
+ bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC,
+ gfp_mask);
bio->bi_iter.bi_sector = sector;
sector += zone_sectors;
@@ -239,10 +237,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
{
struct bio bio;
- bio_init(&bio, NULL, 0);
- bio_set_dev(&bio, bdev);
- bio.bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC;
-
+ bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
return submit_bio_wait(&bio);
}
@@ -306,9 +301,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
}
while (sector < end_sector) {
- bio = blk_next_bio(bio, 0, gfp_mask);
- bio_set_dev(bio, bdev);
- bio->bi_opf = op | REQ_SYNC;
+ bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask);
bio->bi_iter.bi_sector = sector;
sector += zone_sectors;
diff --git a/block/blk.h b/block/blk.h
index 8bd43b3ad33d..8ccbc6e07636 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -46,7 +46,7 @@ void blk_freeze_queue(struct request_queue *q);
void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
void blk_queue_start_drain(struct request_queue *q);
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
-bool submit_bio_checks(struct bio *bio);
+void submit_bio_noacct_nocheck(struct bio *bio);
static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
{
@@ -286,7 +286,6 @@ static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
case REQ_OP_WRITE_ZEROES:
- case REQ_OP_WRITE_SAME:
return true; /* non-trivial splitting decisions */
default:
break;
@@ -325,7 +324,7 @@ int blk_dev_init(void);
*/
static inline bool blk_do_io_stat(struct request *rq)
{
- return (rq->rq_flags & RQF_IO_STAT) && rq->q->disk;
+ return (rq->rq_flags & RQF_IO_STAT) && !blk_rq_is_passthrough(rq);
}
void update_io_ticks(struct block_device *part, unsigned long now, bool end);
@@ -406,8 +405,6 @@ extern int blk_iolatency_init(struct request_queue *q);
static inline int blk_iolatency_init(struct request_queue *q) { return 0; }
#endif
-struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp);
-
#ifdef CONFIG_BLK_DEV_ZONED
void blk_queue_free_zone_bitmaps(struct request_queue *q);
void blk_queue_clear_zone_settings(struct request_queue *q);
@@ -426,6 +423,7 @@ int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
int bdev_del_partition(struct gendisk *disk, int partno);
int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
sector_t length);
+void blk_drop_partitions(struct gendisk *disk);
int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
@@ -445,6 +443,9 @@ int disk_alloc_events(struct gendisk *disk);
void disk_add_events(struct gendisk *disk);
void disk_del_events(struct gendisk *disk);
void disk_release_events(struct gendisk *disk);
+void disk_block_events(struct gendisk *disk);
+void disk_unblock_events(struct gendisk *disk);
+void disk_flush_events(struct gendisk *disk, unsigned int mask);
extern struct device_attribute dev_attr_events;
extern struct device_attribute dev_attr_events_async;
extern struct device_attribute dev_attr_events_poll_msecs;
diff --git a/block/bounce.c b/block/bounce.c
index 7af1a72835b9..467be46d0e65 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -14,7 +14,6 @@
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
-#include <linux/blk-cgroup.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/hash.h>
@@ -24,6 +23,7 @@
#include <trace/events/block.h>
#include "blk.h"
+#include "blk-cgroup.h"
#define POOL_SIZE 64
#define ISA_POOL_SIZE 16
@@ -162,17 +162,13 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
* that does not own the bio - reason being drivers don't use it for
* iterating over the biovec anymore, so expecting it to be kept up
* to date (i.e. for clones that share the parent biovec) is just
- * asking for trouble and would force extra work on
- * __bio_clone_fast() anyways.
+ * asking for trouble and would force extra work.
*/
- bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src),
- &bounce_bio_set);
- bio->bi_bdev = bio_src->bi_bdev;
+ bio = bio_alloc_bioset(bio_src->bi_bdev, bio_segments(bio_src),
+ bio_src->bi_opf, GFP_NOIO, &bounce_bio_set);
if (bio_flagged(bio_src, BIO_REMAPPED))
bio_set_flag(bio, BIO_REMAPPED);
- bio->bi_opf = bio_src->bi_opf;
bio->bi_ioprio = bio_src->bi_ioprio;
- bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
@@ -181,9 +177,6 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
case REQ_OP_SECURE_ERASE:
case REQ_OP_WRITE_ZEROES:
break;
- case REQ_OP_WRITE_SAME:
- bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
- break;
default:
bio_for_each_segment(bv, bio_src, iter)
bio->bi_io_vec[bio->bi_vcnt++] = bv;
diff --git a/block/disk-events.c b/block/disk-events.c
index 8d5496e7592a..aee25a7e1ab7 100644
--- a/block/disk-events.c
+++ b/block/disk-events.c
@@ -4,7 +4,7 @@
*/
#include <linux/export.h>
#include <linux/moduleparam.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include "blk.h"
struct disk_events {
diff --git a/block/elevator.c b/block/elevator.c
index 482df2a350fc..c319765892bb 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -35,7 +35,6 @@
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
-#include <linux/blk-cgroup.h>
#include <trace/events/block.h>
@@ -44,6 +43,7 @@
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-wbt.h"
+#include "blk-cgroup.h"
static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);
@@ -192,6 +192,9 @@ void elevator_exit(struct request_queue *q)
{
struct elevator_queue *e = q->elevator;
+ ioc_clear_queue(q);
+ blk_mq_sched_free_rqs(q);
+
mutex_lock(&e->sysfs_lock);
blk_mq_exit_sched(q, e);
mutex_unlock(&e->sysfs_lock);
@@ -516,9 +519,11 @@ int elv_register_queue(struct request_queue *q, bool uevent)
void elv_unregister_queue(struct request_queue *q)
{
+ struct elevator_queue *e = q->elevator;
+
lockdep_assert_held(&q->sysfs_lock);
- if (q) {
+ if (e && e->registered) {
struct elevator_queue *e = q->elevator;
kobject_uevent(&e->kobj, KOBJ_REMOVE);
@@ -591,11 +596,7 @@ int elevator_switch_mq(struct request_queue *q,
lockdep_assert_held(&q->sysfs_lock);
if (q->elevator) {
- if (q->elevator->registered)
- elv_unregister_queue(q);
-
- ioc_clear_queue(q);
- blk_mq_sched_free_rqs(q);
+ elv_unregister_queue(q);
elevator_exit(q);
}
@@ -606,7 +607,6 @@ int elevator_switch_mq(struct request_queue *q,
if (new_e) {
ret = elv_register_queue(q, true);
if (ret) {
- blk_mq_sched_free_rqs(q);
elevator_exit(q);
goto out;
}
diff --git a/block/fops.c b/block/fops.c
index a18e7fbd97b8..9f2ecec406b0 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -75,10 +75,14 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
return -ENOMEM;
}
- bio_init(&bio, vecs, nr_pages);
- bio_set_dev(&bio, bdev);
+ if (iov_iter_rw(iter) == READ) {
+ bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ);
+ if (iter_is_iovec(iter))
+ should_dirty = true;
+ } else {
+ bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb));
+ }
bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
- bio.bi_write_hint = iocb->ki_hint;
bio.bi_private = current;
bio.bi_end_io = blkdev_bio_end_io_simple;
bio.bi_ioprio = iocb->ki_ioprio;
@@ -88,14 +92,9 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
goto out;
ret = bio.bi_iter.bi_size;
- if (iov_iter_rw(iter) == READ) {
- bio.bi_opf = REQ_OP_READ;
- if (iter_is_iovec(iter))
- should_dirty = true;
- } else {
- bio.bi_opf = dio_bio_write_op(iocb);
+ if (iov_iter_rw(iter) == WRITE)
task_io_account_write(ret);
- }
+
if (iocb->ki_flags & IOCB_NOWAIT)
bio.bi_opf |= REQ_NOWAIT;
if (iocb->ki_flags & IOCB_HIPRI)
@@ -190,6 +189,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
struct blkdev_dio *dio;
struct bio *bio;
bool is_read = (iov_iter_rw(iter) == READ), is_sync;
+ unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
loff_t pos = iocb->ki_pos;
int ret = 0;
@@ -197,7 +197,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
(bdev_logical_block_size(bdev) - 1))
return -EINVAL;
- bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
+ bio = bio_alloc_kiocb(iocb, bdev, nr_pages, opf, &blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio);
atomic_set(&dio->ref, 1);
@@ -223,9 +223,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
blk_start_plug(&plug);
for (;;) {
- bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
- bio->bi_write_hint = iocb->ki_hint;
bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio;
@@ -238,11 +236,9 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
}
if (is_read) {
- bio->bi_opf = REQ_OP_READ;
if (dio->flags & DIO_SHOULD_DIRTY)
bio_set_pages_dirty(bio);
} else {
- bio->bi_opf = dio_bio_write_op(iocb);
task_io_account_write(bio->bi_iter.bi_size);
}
if (iocb->ki_flags & IOCB_NOWAIT)
@@ -258,7 +254,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
}
atomic_inc(&dio->ref);
submit_bio(bio);
- bio = bio_alloc(GFP_KERNEL, nr_pages);
+ bio = bio_alloc(bdev, nr_pages, opf, GFP_KERNEL);
}
blk_finish_plug(&plug);
@@ -313,6 +309,8 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
unsigned int nr_pages)
{
struct block_device *bdev = iocb->ki_filp->private_data;
+ bool is_read = iov_iter_rw(iter) == READ;
+ unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
struct blkdev_dio *dio;
struct bio *bio;
loff_t pos = iocb->ki_pos;
@@ -322,13 +320,11 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
(bdev_logical_block_size(bdev) - 1))
return -EINVAL;
- bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
+ bio = bio_alloc_kiocb(iocb, bdev, nr_pages, opf, &blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio);
dio->flags = 0;
dio->iocb = iocb;
- bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
- bio->bi_write_hint = iocb->ki_hint;
bio->bi_end_io = blkdev_bio_end_io_async;
bio->bi_ioprio = iocb->ki_ioprio;
@@ -349,14 +345,12 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
}
dio->size = bio->bi_iter.bi_size;
- if (iov_iter_rw(iter) == READ) {
- bio->bi_opf = REQ_OP_READ;
+ if (is_read) {
if (iter_is_iovec(iter)) {
dio->flags |= DIO_SHOULD_DIRTY;
bio_set_pages_dirty(bio);
}
} else {
- bio->bi_opf = dio_bio_write_op(iocb);
task_io_account_write(bio->bi_iter.bi_size);
}
@@ -431,7 +425,8 @@ static int blkdev_writepages(struct address_space *mapping,
}
const struct address_space_operations def_blk_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = blkdev_readpage,
.readahead = blkdev_readahead,
.writepage = blkdev_writepage,
diff --git a/block/genhd.c b/block/genhd.c
index 9eca1f7d35c9..b8b6759d670f 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -8,7 +8,6 @@
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/fs.h>
-#include <linux/genhd.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
@@ -26,10 +25,12 @@
#include <linux/pm_runtime.h>
#include <linux/badblocks.h>
#include <linux/part_stat.h>
+#include "blk-throttle.h"
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
+#include "blk-cgroup.h"
static struct kobject *block_depr;
@@ -185,7 +186,9 @@ static struct blk_major_name {
struct blk_major_name *next;
int major;
char name[16];
+#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
void (*probe)(dev_t devt);
+#endif
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
static DEFINE_MUTEX(major_names_lock);
static DEFINE_SPINLOCK(major_names_spinlock);
@@ -275,7 +278,9 @@ int __register_blkdev(unsigned int major, const char *name,
}
p->major = major;
+#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
p->probe = probe;
+#endif
strlcpy(p->name, name, sizeof(p->name));
p->next = NULL;
index = major_to_index(major);
@@ -330,7 +335,7 @@ int blk_alloc_ext_minor(void)
{
int idx;
- idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL);
+ idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT - 1, GFP_KERNEL);
if (idx == -ENOSPC)
return -EBUSY;
return idx;
@@ -407,6 +412,10 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
struct device *ddev = disk_to_dev(disk);
int ret;
+ /* Only makes sense for bio-based to set ->poll_bio */
+ if (queue_is_mq(disk->queue) && disk->fops->poll_bio)
+ return -EINVAL;
+
/*
* The disk queue should now be all set with enough information about
* the device for the elevator code to pick an adequate default
@@ -523,6 +532,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
disk_update_readahead(disk);
disk_add_events(disk);
+ set_bit(GD_ADDED, &disk->state);
return 0;
out_unregister_bdi:
@@ -636,7 +646,8 @@ void del_gendisk(struct gendisk *disk)
blk_mq_freeze_queue_wait(q);
- rq_qos_exit(q);
+ blk_throtl_cancel_bios(disk->queue);
+
blk_sync_queue(q);
blk_flush_integrity();
/*
@@ -693,6 +704,7 @@ static ssize_t disk_badblocks_store(struct device *dev,
return badblocks_store(disk->bb, page, len, 0);
}
+#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
void blk_request_module(dev_t devt)
{
unsigned int major = MAJOR(devt);
@@ -712,6 +724,7 @@ void blk_request_module(dev_t devt)
/* Make old-style 2.4 aliases work */
request_module("block-major-%d", MAJOR(devt));
}
+#endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */
/*
* print a full list of all partitions - intended for places where the root
@@ -927,12 +940,17 @@ ssize_t part_stat_show(struct device *dev,
struct disk_stats stat;
unsigned int inflight;
- part_stat_read_all(bdev, &stat);
if (queue_is_mq(q))
inflight = blk_mq_in_flight(q, bdev);
else
inflight = part_in_flight(bdev);
+ if (inflight) {
+ part_stat_lock();
+ update_io_ticks(bdev, jiffies, true);
+ part_stat_unlock();
+ }
+ part_stat_read_all(bdev, &stat);
return sprintf(buf,
"%8lu %8lu %8llu %8u "
"%8lu %8lu %8llu %8u "
@@ -1100,6 +1118,31 @@ static const struct attribute_group *disk_attr_groups[] = {
NULL
};
+static void disk_release_mq(struct request_queue *q)
+{
+ blk_mq_cancel_work_sync(q);
+
+ /*
+ * There can't be any non non-passthrough bios in flight here, but
+ * requests stay around longer, including passthrough ones so we
+ * still need to freeze the queue here.
+ */
+ blk_mq_freeze_queue(q);
+
+ /*
+ * Since the I/O scheduler exit code may access cgroup information,
+ * perform I/O scheduler exit before disassociating from the block
+ * cgroup controller.
+ */
+ if (q->elevator) {
+ mutex_lock(&q->sysfs_lock);
+ elevator_exit(q);
+ mutex_unlock(&q->sysfs_lock);
+ }
+ rq_qos_exit(q);
+ __blk_mq_unfreeze_queue(q, true);
+}
+
/**
* disk_release - releases all allocated resources of the gendisk
* @dev: the device representing this disk
@@ -1121,13 +1164,21 @@ static void disk_release(struct device *dev)
might_sleep();
WARN_ON_ONCE(disk_live(disk));
- blk_mq_cancel_work_sync(disk->queue);
+ if (queue_is_mq(disk->queue))
+ disk_release_mq(disk->queue);
+
+ blkcg_exit_queue(disk->queue);
disk_release_events(disk);
kfree(disk->random);
xa_destroy(&disk->part_tbl);
+
disk->queue->disk = NULL;
blk_put_queue(disk->queue);
+
+ if (test_bit(GD_ADDED, &disk->state) && disk->fops->free_disk)
+ disk->fops->free_disk(disk);
+
iput(disk->part0->bd_inode); /* frees the disk */
}
@@ -1188,12 +1239,17 @@ static int diskstats_show(struct seq_file *seqf, void *v)
xa_for_each(&gp->part_tbl, idx, hd) {
if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
continue;
- part_stat_read_all(hd, &stat);
if (queue_is_mq(gp->queue))
inflight = blk_mq_in_flight(gp->queue, hd);
else
inflight = part_in_flight(hd);
+ if (inflight) {
+ part_stat_lock();
+ update_io_ticks(hd, jiffies, true);
+ part_stat_unlock();
+ }
+ part_stat_read_all(hd, &stat);
seq_printf(seqf, "%4d %7d %pg "
"%lu %lu %lu %u "
"%lu %lu %lu %u "
@@ -1322,6 +1378,9 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
goto out_destroy_part_tbl;
+ if (blkcg_init_queue(q))
+ goto out_erase_part0;
+
rand_initialize_disk(disk);
disk_to_dev(disk)->class = &block_class;
disk_to_dev(disk)->type = &disk_type;
@@ -1334,6 +1393,8 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
#endif
return disk;
+out_erase_part0:
+ xa_erase(&disk->part_tbl, 0);
out_destroy_part_tbl:
xa_destroy(&disk->part_tbl);
disk->part0->bd_disk = NULL;
diff --git a/block/holder.c b/block/holder.c
index 27cddce1b446..8d750281a1cd 100644
--- a/block/holder.c
+++ b/block/holder.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include <linux/slab.h>
struct bd_holder_disk {
diff --git a/block/ioctl.c b/block/ioctl.c
index 4a86340133e4..f8703db99c73 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -629,7 +629,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return compat_put_long(argp,
(bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512);
case BLKGETSIZE:
- if (bdev_nr_sectors(bdev) > ~0UL)
+ if (bdev_nr_sectors(bdev) > ~(compat_ulong_t)0)
return -EFBIG;
return compat_put_ulong(argp, bdev_nr_sectors(bdev));
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 3ed5eaf3446a..6ed602b2f80a 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -742,6 +742,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
if (at_head) {
list_add(&rq->queuelist, &per_prio->dispatch);
+ rq->fifo_time = jiffies;
} else {
deadline_add_rq_rb(per_prio, rq);
diff --git a/block/partitions/check.h b/block/partitions/check.h
index d5b28e309d64..4ffa2359b1a3 100644
--- a/block/partitions/check.h
+++ b/block/partitions/check.h
@@ -1,7 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/pagemap.h>
#include <linux/blkdev.h>
-#include <linux/genhd.h>
#include "../blk.h"
/*
diff --git a/block/partitions/core.c b/block/partitions/core.c
index c2a1635922b1..2ef8dfa1e5c8 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -8,7 +8,6 @@
#include <linux/major.h>
#include <linux/slab.h>
#include <linux/ctype.h>
-#include <linux/genhd.h>
#include <linux/vmalloc.h>
#include <linux/blktrace_api.h>
#include <linux/raid/detect.h>
diff --git a/block/partitions/efi.h b/block/partitions/efi.h
index 8cc2b88d0aa8..84b9f36b9e47 100644
--- a/block/partitions/efi.h
+++ b/block/partitions/efi.h
@@ -13,7 +13,6 @@
#include <linux/types.h>
#include <linux/fs.h>
-#include <linux/genhd.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/string.h>
diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h
index 8693704dcf5e..0a747a0c782d 100644
--- a/block/partitions/ldm.h
+++ b/block/partitions/ldm.h
@@ -14,7 +14,6 @@
#include <linux/types.h>
#include <linux/list.h>
-#include <linux/genhd.h>
#include <linux/fs.h>
#include <asm/unaligned.h>
#include <asm/byteorder.h>
diff --git a/block/sed-opal.c b/block/sed-opal.c
index daafadbb88ca..9700197000f2 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -13,7 +13,7 @@
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/list.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <uapi/linux/sed-opal.h>
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 25a52a2a09a8..914d8cddd43a 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -7,8 +7,10 @@
#include <linux/t10-pi.h>
#include <linux/blk-integrity.h>
#include <linux/crc-t10dif.h>
+#include <linux/crc64.h>
#include <linux/module.h>
#include <net/checksum.h>
+#include <asm/unaligned.h>
typedef __be16 (csum_fn) (void *, unsigned int);
@@ -44,7 +46,7 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter,
pi->ref_tag = 0;
iter->data_buf += iter->interval;
- iter->prot_buf += sizeof(struct t10_pi_tuple);
+ iter->prot_buf += iter->tuple_size;
iter->seed++;
}
@@ -93,7 +95,7 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
next:
iter->data_buf += iter->interval;
- iter->prot_buf += sizeof(struct t10_pi_tuple);
+ iter->prot_buf += iter->tuple_size;
iter->seed++;
}
@@ -278,4 +280,196 @@ const struct blk_integrity_profile t10_pi_type3_ip = {
};
EXPORT_SYMBOL(t10_pi_type3_ip);
+static __be64 ext_pi_crc64(void *data, unsigned int len)
+{
+ return cpu_to_be64(crc64_rocksoft(data, len));
+}
+
+static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter,
+ enum t10_dif_type type)
+{
+ unsigned int i;
+
+ for (i = 0 ; i < iter->data_size ; i += iter->interval) {
+ struct crc64_pi_tuple *pi = iter->prot_buf;
+
+ pi->guard_tag = ext_pi_crc64(iter->data_buf, iter->interval);
+ pi->app_tag = 0;
+
+ if (type == T10_PI_TYPE1_PROTECTION)
+ put_unaligned_be48(iter->seed, pi->ref_tag);
+ else
+ put_unaligned_be48(0ULL, pi->ref_tag);
+
+ iter->data_buf += iter->interval;
+ iter->prot_buf += iter->tuple_size;
+ iter->seed++;
+ }
+
+ return BLK_STS_OK;
+}
+
+static bool ext_pi_ref_escape(u8 *ref_tag)
+{
+ static u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+
+ return memcmp(ref_tag, ref_escape, sizeof(ref_escape)) == 0;
+}
+
+static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
+ enum t10_dif_type type)
+{
+ unsigned int i;
+
+ for (i = 0; i < iter->data_size; i += iter->interval) {
+ struct crc64_pi_tuple *pi = iter->prot_buf;
+ u64 ref, seed;
+ __be64 csum;
+
+ if (type == T10_PI_TYPE1_PROTECTION) {
+ if (pi->app_tag == T10_PI_APP_ESCAPE)
+ goto next;
+
+ ref = get_unaligned_be48(pi->ref_tag);
+ seed = lower_48_bits(iter->seed);
+ if (ref != seed) {
+ pr_err("%s: ref tag error at location %llu (rcvd %llu)\n",
+ iter->disk_name, seed, ref);
+ return BLK_STS_PROTECTION;
+ }
+ } else if (type == T10_PI_TYPE3_PROTECTION) {
+ if (pi->app_tag == T10_PI_APP_ESCAPE &&
+ ext_pi_ref_escape(pi->ref_tag))
+ goto next;
+ }
+
+ csum = ext_pi_crc64(iter->data_buf, iter->interval);
+ if (pi->guard_tag != csum) {
+ pr_err("%s: guard tag error at sector %llu " \
+ "(rcvd %016llx, want %016llx)\n",
+ iter->disk_name, (unsigned long long)iter->seed,
+ be64_to_cpu(pi->guard_tag), be64_to_cpu(csum));
+ return BLK_STS_PROTECTION;
+ }
+
+next:
+ iter->data_buf += iter->interval;
+ iter->prot_buf += iter->tuple_size;
+ iter->seed++;
+ }
+
+ return BLK_STS_OK;
+}
+
+static blk_status_t ext_pi_type1_verify_crc64(struct blk_integrity_iter *iter)
+{
+ return ext_pi_crc64_verify(iter, T10_PI_TYPE1_PROTECTION);
+}
+
+static blk_status_t ext_pi_type1_generate_crc64(struct blk_integrity_iter *iter)
+{
+ return ext_pi_crc64_generate(iter, T10_PI_TYPE1_PROTECTION);
+}
+
+static void ext_pi_type1_prepare(struct request *rq)
+{
+ const int tuple_sz = rq->q->integrity.tuple_size;
+ u64 ref_tag = ext_pi_ref_tag(rq);
+ struct bio *bio;
+
+ __rq_for_each_bio(bio, rq) {
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+ u64 virt = lower_48_bits(bip_get_seed(bip));
+ struct bio_vec iv;
+ struct bvec_iter iter;
+
+ /* Already remapped? */
+ if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
+ break;
+
+ bip_for_each_vec(iv, bip, iter) {
+ unsigned int j;
+ void *p;
+
+ p = bvec_kmap_local(&iv);
+ for (j = 0; j < iv.bv_len; j += tuple_sz) {
+ struct crc64_pi_tuple *pi = p;
+ u64 ref = get_unaligned_be48(pi->ref_tag);
+
+ if (ref == virt)
+ put_unaligned_be48(ref_tag, pi->ref_tag);
+ virt++;
+ ref_tag++;
+ p += tuple_sz;
+ }
+ kunmap_local(p);
+ }
+
+ bip->bip_flags |= BIP_MAPPED_INTEGRITY;
+ }
+}
+
+static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
+{
+ unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp;
+ const int tuple_sz = rq->q->integrity.tuple_size;
+ u64 ref_tag = ext_pi_ref_tag(rq);
+ struct bio *bio;
+
+ __rq_for_each_bio(bio, rq) {
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+ u64 virt = lower_48_bits(bip_get_seed(bip));
+ struct bio_vec iv;
+ struct bvec_iter iter;
+
+ bip_for_each_vec(iv, bip, iter) {
+ unsigned int j;
+ void *p;
+
+ p = bvec_kmap_local(&iv);
+ for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
+ struct crc64_pi_tuple *pi = p;
+ u64 ref = get_unaligned_be48(pi->ref_tag);
+
+ if (ref == ref_tag)
+ put_unaligned_be48(virt, pi->ref_tag);
+ virt++;
+ ref_tag++;
+ intervals--;
+ p += tuple_sz;
+ }
+ kunmap_local(p);
+ }
+ }
+}
+
+static blk_status_t ext_pi_type3_verify_crc64(struct blk_integrity_iter *iter)
+{
+ return ext_pi_crc64_verify(iter, T10_PI_TYPE3_PROTECTION);
+}
+
+static blk_status_t ext_pi_type3_generate_crc64(struct blk_integrity_iter *iter)
+{
+ return ext_pi_crc64_generate(iter, T10_PI_TYPE3_PROTECTION);
+}
+
+const struct blk_integrity_profile ext_pi_type1_crc64 = {
+ .name = "EXT-DIF-TYPE1-CRC64",
+ .generate_fn = ext_pi_type1_generate_crc64,
+ .verify_fn = ext_pi_type1_verify_crc64,
+ .prepare_fn = ext_pi_type1_prepare,
+ .complete_fn = ext_pi_type1_complete,
+};
+EXPORT_SYMBOL_GPL(ext_pi_type1_crc64);
+
+const struct blk_integrity_profile ext_pi_type3_crc64 = {
+ .name = "EXT-DIF-TYPE3-CRC64",
+ .generate_fn = ext_pi_type3_generate_crc64,
+ .verify_fn = ext_pi_type3_verify_crc64,
+ .prepare_fn = t10_pi_type3_prepare,
+ .complete_fn = t10_pi_type3_complete,
+};
+EXPORT_SYMBOL_GPL(ext_pi_type3_crc64);
+
+MODULE_LICENSE("GPL");
MODULE_LICENSE("GPL");