From a91b2014fc31dc6eaa02ca33aa3b4d1b6e4a0207 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Sat, 25 Apr 2020 09:53:34 +0200
Subject: bcache: remove a duplicate ->make_request_fn assignment

The make_request_fn pointer should only be assigned by blk_alloc_queue.
Fix a leftover manual initialization.

Fixes: ff27668ce809 ("bcache: pass the make_request methods to blk_queue_make_request")
Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 drivers/md/bcache/request.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 71a90fbec314..77d1a2697517 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -1372,7 +1372,6 @@ void bch_flash_dev_request_init(struct bcache_device *d)
 {
         struct gendisk *g = d->disk;
 
-        g->queue->make_request_fn = flash_dev_make_request;
         g->queue->backing_dev_info->congested_fn = flash_dev_congested;
         d->cache_miss = flash_dev_cache_miss;
         d->ioctl = flash_dev_ioctl;
--
cgit

From ae3cc8d8ff061d3ffca96665685550e70a86472a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Sat, 25 Apr 2020 09:53:35 +0200
Subject: dm: remove the make_request_fn check in device_area_is_invalid

Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 drivers/md/dm-table.c | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 0a2cc197f62b..8277b959e00b 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -279,7 +279,6 @@ static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
 static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
                                   sector_t start, sector_t len, void *data)
 {
-        struct request_queue *q;
         struct queue_limits *limits = data;
         struct block_device *bdev = dev->bdev;
         sector_t dev_size =
@@ -288,22 +287,6 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
                 limits->logical_block_size >> SECTOR_SHIFT;
         char b[BDEVNAME_SIZE];
 
-        /*
-         * Some devices exist without request functions,
-         * such as loop devices not yet bound to backing files.
-         * Forbid the use of such devices.
-         */
-        q = bdev_get_queue(bdev);
-        if (!q || !q->make_request_fn) {
-                DMWARN("%s: %s is not yet initialised: "
-                       "start=%llu, len=%llu, dev_size=%llu",
-                       dm_device_name(ti->table->md), bdevname(bdev, b),
-                       (unsigned long long)start,
-                       (unsigned long long)len,
-                       (unsigned long long)dev_size);
-                return 1;
-        }
-
         if (!dev_size)
                 return 0;
--
cgit

From 8cf7961dab42c9177a556b719c15f5b9449c24d1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Sat, 25 Apr 2020 09:53:36 +0200
Subject: block: bypass ->make_request_fn for blk-mq drivers

Call blk_mq_make_request when no ->make_request_fn is set.  This is safe
now that blk_alloc_queue always sets up the pointer for make_request-based
drivers.  This avoids an indirect call in the blk-mq driver I/O fast path,
which is rather expensive due to Spectre mitigations.
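
In effect, the core dispatch becomes the following (a simplified sketch of
the blk-core.c side of the change, which this drivers/md-limited view does
not show):

        blk_qc_t ret;

        if (!q->make_request_fn)
                ret = blk_mq_make_request(q, bio);   /* direct call, no retpoline */
        else
                ret = q->make_request_fn(q, bio);    /* bio-based drivers */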
Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 drivers/md/dm.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index db9e46114653..0eb93da44ea2 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1788,6 +1788,9 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
         int srcu_idx;
         struct dm_table *map;
 
+        if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED)
+                return blk_mq_make_request(q, bio);
+
         map = dm_get_live_table(md, &srcu_idx);
 
         /* if we're suspended, we have to queue this io for later */
--
cgit

From a892c8d52c02284076fbbacae6692aa5c5807d11 Mon Sep 17 00:00:00 2001
From: Satya Tangirala
Date: Thu, 14 May 2020 00:37:18 +0000
Subject: block: Inline encryption support for blk-mq

We must have some way of letting a storage device driver know what
encryption context it should use for en/decrypting a request.  However,
it's the upper layers (like the filesystem/fscrypt) that know about and
manage encryption contexts.  As such, when the upper layer submits a bio
to the block layer, and this bio eventually reaches a device driver with
support for inline encryption, the device driver will need to have been
told the encryption context for that bio.

We want to communicate the encryption context from the upper layer to the
storage device along with the bio, when the bio is submitted to the block
layer.  To do this, we add a struct bio_crypt_ctx to struct bio, which can
represent an encryption context (note that we can't use the bi_private
field in struct bio to do this because that field does not function to
pass information across layers in the storage stack).  We also introduce
various functions to manipulate the bio_crypt_ctx and make the bio/request
merging logic aware of the bio_crypt_ctx.

We also make changes to blk-mq to make it handle bios with encryption
contexts.  blk-mq can merge many bios into the same request.  These bios
need to have contiguous data unit numbers (the necessary changes to
blk-merge are also made to ensure this); as such, it suffices to keep the
data unit number of just the first bio, since that's all a storage driver
needs to infer the data unit number to use for each data block in each
bio in a request.  blk-mq keeps track of the encryption context to be used
for all the bios in a request with the request's rq_crypt_ctx.  When the
first bio is added to an empty request, blk-mq will program the encryption
context of that bio into the request_queue's keyslot manager, and store
the returned keyslot in the request's rq_crypt_ctx.

All the functions to operate on encryption contexts are in blk-crypto.c.
Upper layers only need to call bio_crypt_set_ctx with the encryption key,
algorithm and data_unit_num; they don't have to worry about getting a
keyslot for each encryption context, as blk-mq/blk-crypto handles that.
Blk-crypto also makes it possible for request-based layered devices like
dm-rq to make use of inline encryption hardware by cloning the
rq_crypt_ctx and programming a keyslot in the new request_queue when
necessary.

Note that any user of the block layer can submit bios with an encryption
context, such as filesystems, device-mapper targets, etc.
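
An upper layer would attach a context roughly like this
(bio_crypt_set_ctx() and BLK_CRYPTO_DUN_ARRAY_SIZE come from this series;
the key setup and data unit number here are assumed for the sketch):

        u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE] = { first_data_unit };  /* e.g. file block index */

        /* key: a struct blk_crypto_key initialized elsewhere with algorithm + raw key */
        bio_crypt_set_ctx(bio, key, dun, GFP_NOIO);
        submit_bio(bio);  /* blk-mq/blk-crypto program a keyslot when the request is dispatched */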
Signed-off-by: Satya Tangirala
Reviewed-by: Eric Biggers
Reviewed-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 drivers/md/dm.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0eb93da44ea2..8921cd79422c 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -26,6 +26,7 @@
 #include <linux/pr.h>
 #include <linux/refcount.h>
 #include <linux/part_stat.h>
+#include <linux/blk-crypto.h>
 
 #define DM_MSG_PREFIX "core"
 
@@ -1334,6 +1335,8 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio,
 
         __bio_clone_fast(clone, bio);
 
+        bio_crypt_clone(clone, bio, GFP_NOIO);
+
         if (bio_integrity(bio)) {
                 int r;
--
cgit

From ac7c5675fa45a372fab27d78a72d2e10e4734959 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Sat, 16 May 2020 20:28:01 +0200
Subject: blk-mq: allow blk_mq_make_request to consume the q_usage_counter reference

blk_mq_make_request currently needs to grab a q_usage_counter reference
when allocating a request.  This is because the block layer grabs one
before calling blk_mq_make_request, but also releases it as soon as
blk_mq_make_request returns.  Remove the blk_queue_exit call after
blk_mq_make_request returns, and instead let it consume the reference.
This works perfectly fine for the block layer caller; only device mapper
needs an extra reference, as the old problem still persists there.

Open code blk_queue_enter_live in device mapper, as there should be no
other callers and this allows better documenting why we do a non-try get.

Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 drivers/md/dm.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8921cd79422c..f215b8666448 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1791,8 +1791,17 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
         int srcu_idx;
         struct dm_table *map;
 
-        if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED)
+        if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) {
+                /*
+                 * We are called with a live reference on q_usage_counter, but
+                 * that one will be released as soon as we return.  Grab an
+                 * extra one as blk_mq_make_request expects to be able to
+                 * consume a reference (which lives until the request is freed
+                 * in case a request is allocated).
+                 */
+                percpu_ref_get(&q->q_usage_counter);
                 return blk_mq_make_request(q, bio);
+        }
 
         map = dm_get_live_table(md, &srcu_idx);
--
cgit

From 9398554fb3979852512ff4f1405e759889b45c16 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 13 May 2020 14:36:00 +0200
Subject: block: remove the error_sector argument to blkdev_issue_flush

The argument isn't used by any caller, and drivers don't fill out
bi_sector for flush requests either.
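
With the argument gone, a typical caller reduces to the following (a
sketch based on the new two-argument signature visible in the diffs
below; the error handling is illustrative):

        int ret = blkdev_issue_flush(bdev, GFP_KERNEL);

        if (ret)
                pr_warn("disk cache flush failed: %d\n", ret);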
Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 drivers/md/dm-integrity.c      | 2 +-
 drivers/md/dm-zoned-metadata.c | 6 +++---
 drivers/md/raid5-ppl.c         | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 4094c47eca7f..84cb04904fab 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -2657,7 +2657,7 @@ static void bitmap_flush_work(struct work_struct *work)
         dm_integrity_flush_buffers(ic);
 
         if (ic->meta_dev)
-                blkdev_issue_flush(ic->dev->bdev, GFP_NOIO, NULL);
+                blkdev_issue_flush(ic->dev->bdev, GFP_NOIO);
 
         limit = ic->provided_data_sectors;
         if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 369de15c4e80..bf2245370305 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -661,7 +661,7 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
 
         ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page);
         if (ret == 0)
-                ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
+                ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO);
 
         return ret;
 }
@@ -703,7 +703,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
 
         /* Flush drive cache (this will also sync data) */
         if (ret == 0)
-                ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
+                ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO);
 
         return ret;
 }
@@ -772,7 +772,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
 
         /* If there are no dirty metadata blocks, just flush the device cache */
         if (list_empty(&write_list)) {
-                ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
+                ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO);
                 goto err;
         }
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index d50238d0a85d..a750f4bbb5d9 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -1037,7 +1037,7 @@ static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr,
         }
 
         /* flush the disk cache after recovery if necessary */
-        ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL, NULL);
+        ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL);
 out:
         __free_page(page);
         return ret;
--
cgit

From 85750aeb748fd17a393d1f510f7b33e3336c395e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 27 May 2020 07:24:08 +0200
Subject: bcache: use bio_{start,end}_io_acct

Switch bcache to use the nicer bio accounting helpers, and call them at
the point where the start time is sampled so that the accounting results
are coherent.
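
The pattern the helpers enable, in sketch form (bio_start_io_acct() and
bio_end_io_acct() are the real helpers; the surrounding driver code is
hypothetical):

        unsigned long start_time = bio_start_io_acct(bio);  /* samples start time, starts accounting */
        /* ... submit and complete the I/O ... */
        bio_end_io_acct(bio, start_time);                   /* accounts the duration */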
Signed-off-by: Christoph Hellwig
Reviewed-by: Konstantin Khlebnikov
Acked-by: Coly Li
Signed-off-by: Jens Axboe
---
 drivers/md/bcache/request.c | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 77d1a2697517..22b483527176 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -668,9 +668,7 @@ static void backing_request_endio(struct bio *bio)
 static void bio_complete(struct search *s)
 {
         if (s->orig_bio) {
-                generic_end_io_acct(s->d->disk->queue, bio_op(s->orig_bio),
-                                    &s->d->disk->part0, s->start_time);
-
+                bio_end_io_acct(s->orig_bio, s->start_time);
                 trace_bcache_request_end(s->d, s->orig_bio);
                 s->orig_bio->bi_status = s->iop.status;
                 bio_endio(s->orig_bio);
@@ -730,7 +728,7 @@ static inline struct search *search_alloc(struct bio *bio,
         s->recoverable = 1;
         s->write = op_is_write(bio_op(bio));
         s->read_dirty_data = 0;
-        s->start_time = jiffies;
+        s->start_time = bio_start_io_acct(bio);
 
         s->iop.c = d->c;
         s->iop.bio = NULL;
@@ -1082,8 +1080,7 @@ static void detached_dev_end_io(struct bio *bio)
         bio->bi_end_io = ddip->bi_end_io;
         bio->bi_private = ddip->bi_private;
 
-        generic_end_io_acct(ddip->d->disk->queue, bio_op(bio),
-                            &ddip->d->disk->part0, ddip->start_time);
+        bio_end_io_acct(bio, ddip->start_time);
 
         if (bio->bi_status) {
                 struct cached_dev *dc = container_of(ddip->d,
@@ -1108,7 +1105,7 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
          */
         ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
         ddip->d = d;
-        ddip->start_time = jiffies;
+        ddip->start_time = bio_start_io_acct(bio);
         ddip->bi_end_io = bio->bi_end_io;
         ddip->bi_private = bio->bi_private;
         bio->bi_end_io = detached_dev_end_io;
@@ -1190,11 +1187,6 @@ blk_qc_t cached_dev_make_request(struct request_queue *q, struct bio *bio)
                 }
         }
 
-        generic_start_io_acct(q,
-                              bio_op(bio),
-                              bio_sectors(bio),
-                              &d->disk->part0);
-
         bio_set_dev(bio, dc->bdev);
         bio->bi_iter.bi_sector += dc->sb.data_offset;
 
@@ -1311,8 +1303,6 @@ blk_qc_t flash_dev_make_request(struct request_queue *q, struct bio *bio)
                 return BLK_QC_T_NONE;
         }
 
-        generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
-
         s = search_alloc(bio, d);
         cl = &s->cl;
         bio = &s->bio.bio;
--
cgit

From 86240d5b6813d6855845959b133d5b4a95c60f92 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 27 May 2020 07:24:09 +0200
Subject: dm: use bio_{start,end}_io_acct

Switch dm to use the nicer bio accounting helpers.
Signed-off-by: Christoph Hellwig
Reviewed-by: Konstantin Khlebnikov
Signed-off-by: Jens Axboe
---
 drivers/md/dm.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f215b8666448..3f39fa1ac756 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -681,11 +681,7 @@ static void start_io_acct(struct dm_io *io)
         struct mapped_device *md = io->md;
         struct bio *bio = io->orig_bio;
 
-        io->start_time = jiffies;
-
-        generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
-                              &dm_disk(md)->part0);
-
+        io->start_time = bio_start_io_acct(bio);
         if (unlikely(dm_stats_used(&md->stats)))
                 dm_stats_account_io(&md->stats, bio_data_dir(bio),
                                     bio->bi_iter.bi_sector, bio_sectors(bio),
@@ -698,8 +694,7 @@ static void end_io_acct(struct dm_io *io)
         struct bio *bio = io->orig_bio;
         unsigned long duration = jiffies - io->start_time;
 
-        generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
-                            io->start_time);
+        bio_end_io_acct(bio, io->start_time);
 
         if (unlikely(dm_stats_used(&md->stats)))
                 dm_stats_account_io(&md->stats, bio_data_dir(bio),
--
cgit

From bf0beec0607db3c6f6fb7bd2c6d503792b05cf3f Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 29 May 2020 15:53:15 +0200
Subject: blk-mq: drain I/O when all CPUs in a hctx are offline

Most blk-mq drivers depend on managed IRQs' auto-affinity to set up
queue mapping.  Thomas mentioned the following point[1]:

  "That was the constraint of managed interrupts from the very
   beginning:

   The driver/subsystem has to quiesce the interrupt line and the
   associated queue _before_ it gets shutdown in CPU unplug and not
   fiddle with it until it's restarted by the core when the CPU is
   plugged in again."

However, the current blk-mq implementation doesn't quiesce the hw queue
before the last CPU in the hctx is shut down.  Even worse,
CPUHP_BLK_MQ_DEAD is a cpuhp state handled after the CPU is down, so
there isn't any chance to quiesce the hctx before shutting down the CPU.

Add a new CPUHP_AP_BLK_MQ_ONLINE state to stop allocating from blk-mq
hctxs where the last CPU goes away, and wait for completion of in-flight
requests.  This guarantees that there is no in-flight I/O before shutting
down the managed IRQ.

Add a BLK_MQ_F_STACKING flag and set it for dm-rq and loop, so we don't
need to wait for completion of in-flight requests from these drivers to
avoid a potential deadlock.  It is safe to do this for stacking drivers
as those do not use interrupts at all and their I/O completions are
triggered by the underlying devices' I/O completions.

[1] https://lore.kernel.org/linux-block/alpine.DEB.2.21.1904051331270.1802@nanos.tec.linutronix.de/

[hch: different retry mechanism, merged two patches, minor cleanups]

Signed-off-by: Ming Lei
Signed-off-by: Christoph Hellwig
Reviewed-by: Hannes Reinecke
Reviewed-by: Daniel Wagner
Signed-off-by: Jens Axboe
---
 drivers/md/dm-rq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 3f8577e2c13b..f60c02512121 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -547,7 +547,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
         md->tag_set->ops = &dm_mq_ops;
         md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
         md->tag_set->numa_node = md->numa_node_id;
-        md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
+        md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
         md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
         md->tag_set->driver_data = md;
--
cgit
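
A rough sketch of the two sides of the new mechanism (condensed; the flag,
state, and cpuhp names come from the patch description, the drain helper
is hypothetical, and the real blk-mq code is structured differently):

        /* hctx setup: stacking drivers skip the new hotplug state entirely */
        if (!(hctx->flags & BLK_MQ_F_STACKING))
                cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
                                                 &hctx->cpuhp_online);

        /* offline callback, when the last mapped CPU of a hctx goes away:
         * stop new allocations from this hctx, then wait out in-flight I/O */
        set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
        wait_for_inflight_to_drain(hctx);       /* hypothetical helper: retries until idle */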