From c4e47bbb00dad9240f4c054859950e962042ecb8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Jan 2024 14:14:59 -0700 Subject: block: move cgroup time handling code into blk.h In preparation for moving time keeping into blk.h, move the cgroup related code for timestamps in here too. This will help avoid a circular dependency, and also moves it into a more appropriate header as this one is private to the block layer code. Leave struct bio_issue in blk_types.h as it's a proper time definition. Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 42 ------------------------------------------ 1 file changed, 42 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f288c94374b3..1c07848dea7e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -206,52 +206,10 @@ static inline bool blk_path_error(blk_status_t error) return true; } -/* - * From most significant bit: - * 1 bit: reserved for other usage, see below - * 12 bits: original size of bio - * 51 bits: issue time of bio - */ -#define BIO_ISSUE_RES_BITS 1 -#define BIO_ISSUE_SIZE_BITS 12 -#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) -#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) -#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) -#define BIO_ISSUE_SIZE_MASK \ - (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) -#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) - -/* Reserved bit for blk-throtl */ -#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) - struct bio_issue { u64 value; }; -static inline u64 __bio_issue_time(u64 time) -{ - return time & BIO_ISSUE_TIME_MASK; -} - -static inline u64 bio_issue_time(struct bio_issue *issue) -{ - return __bio_issue_time(issue->value); -} - -static inline sector_t bio_issue_size(struct bio_issue *issue) -{ - return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); -} - -static inline void bio_issue_init(struct bio_issue *issue, - sector_t size) -{ - size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; - issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | - (ktime_get_ns() & BIO_ISSUE_TIME_MASK) | - ((u64)size << BIO_ISSUE_SIZE_SHIFT)); -} - typedef __u32 __bitwise blk_opf_t; typedef unsigned int blk_qc_t; -- cgit From da4c8c3d0975f031ef82d39927102e39fa6ddfac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 15 Jan 2024 14:46:03 -0700 Subject: block: cache current nsec time in struct blk_plug Querying the current time is the most costly thing we do in the block layer per IO, and depending on kernel config settings, we may do it many times per IO. None of the callers actually need nsec granularity. Take advantage of that by caching the current time in the plug, with the assumption here being that any time checking will be temporally close enough that the slight loss of precision doesn't matter. If the block plug gets flushed, eg on preempt or schedule out, then we invalidate the cached clock. On a basic peak IOPS test case with iostats enabled, this changes the performance from: IOPS=108.41M, BW=52.93GiB/s, IOS/call=31/31 IOPS=108.43M, BW=52.94GiB/s, IOS/call=32/32 IOPS=108.29M, BW=52.88GiB/s, IOS/call=31/32 IOPS=108.35M, BW=52.91GiB/s, IOS/call=32/32 IOPS=108.42M, BW=52.94GiB/s, IOS/call=31/31 IOPS=108.40M, BW=52.93GiB/s, IOS/call=32/32 IOPS=108.31M, BW=52.89GiB/s, IOS/call=32/31 to IOPS=118.79M, BW=58.00GiB/s, IOS/call=31/32 IOPS=118.62M, BW=57.92GiB/s, IOS/call=31/31 IOPS=118.80M, BW=58.01GiB/s, IOS/call=32/31 IOPS=118.78M, BW=58.00GiB/s, IOS/call=32/32 IOPS=118.69M, BW=57.95GiB/s, IOS/call=32/31 IOPS=118.62M, BW=57.92GiB/s, IOS/call=32/31 IOPS=118.63M, BW=57.92GiB/s, IOS/call=31/32 which is more than a 9% improvement in performance. Looking at perf diff, we can see a huge reduction in time overhead: 10.55% -9.88% [kernel.vmlinux] [k] read_tsc 1.31% -1.22% [kernel.vmlinux] [k] ktime_get Note that since this relies on blk_plug for the caching, it's only applicable to the issue side. But this is where most of the time calls happen anyway. On the completion side, cached time stamping is done with struct io_comp patch, as long as the driver supports it. It's also worth noting that the above testing doesn't enable any of the higher cost CPU items on the block layer side, like wbt, cgroups, iocost, etc, which all would add additional time querying and hence overhead. IOW, results would likely look even better in comparison with those enabled, as distros would do. Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99e4f5e72213..996d2ad756ff 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -942,6 +942,7 @@ struct blk_plug { /* if ios_left is > 1, we can batch tag/rq allocations */ struct request *cached_rq; + u64 cur_ktime; unsigned short nr_ios; unsigned short rq_count; -- cgit From 06b23f92af87a84d70881b2ecaa72e00f7838264 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Jan 2024 09:18:39 -0700 Subject: block: update cached timestamp post schedule/preemption Mark the task as having a cached timestamp when set assign it, so we can efficiently check if it needs updating post being scheduled back in. This covers both the actual schedule out case, which would've flushed the plug, and the preemption case which doesn't touch the plugged requests (for many reasons, one of them being then we'd need to have preemption disabled around plug state manipulation). Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 16 ++++++++++++++++ include/linux/sched.h | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 996d2ad756ff..d7cac3de65b3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -973,6 +973,18 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) __blk_flush_plug(plug, async); } +/* + * tsk == current here + */ +static inline void blk_plug_invalidate_ts(struct task_struct *tsk) +{ + struct blk_plug *plug = tsk->plug; + + if (plug) + plug->cur_ktime = 0; + current->flags &= ~PF_BLOCK_TS; +} + int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ @@ -996,6 +1008,10 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) { } +static inline void blk_plug_invalidate_ts(struct task_struct *tsk) +{ +} + static inline int blkdev_issue_flush(struct block_device *bdev) { return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index ffe8f618ab86..15b7cb478d16 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1642,7 +1642,7 @@ extern struct pid *cad_pid; #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ -#define PF__HOLE__20000000 0x20000000 +#define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ -- cgit From 71f4ecdbb42addf82b01b734b122a02707fed521 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Sun, 28 Jan 2024 23:52:20 -0800 Subject: block: remove gfp_flags from blkdev_zone_mgmt Now that all callers pass in GFP_KERNEL to blkdev_zone_mgmt() and use memalloc_no{io,fs}_{save,restore}() to define the allocation scope, we can drop the gfp_mask parameter from blkdev_zone_mgmt() as well as blkdev_zone_reset_all() and blkdev_zone_reset_all_emulated(). Signed-off-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20240128-zonefs_nofs-v3-5-ae3b7c8def61@wdc.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d7cac3de65b3..fac580976e3a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -325,7 +325,7 @@ void disk_set_zoned(struct gendisk *disk); int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, - sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask); + sector_t sectors, sector_t nr_sectors); int blk_revalidate_disk_zones(struct gendisk *disk, void (*update_driver_data)(struct gendisk *disk)); -- cgit From 60d21aac52e26531affdadb7543fe5b93f58b450 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Thu, 1 Feb 2024 18:31:25 +0530 Subject: block: support PI at non-zero offset within metadata Block layer integrity processing assumes that protection information (PI) is placed in the first bytes of each metadata block. Remove this limitation and include the metadata before the PI in the calculation of the guard tag. Signed-off-by: Kanchan Joshi Signed-off-by: Chinmay Gameti Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240201130126.211402-3-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 1 + include/linux/blkdev.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 378b2459efe2..e253e7bd0d17 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -20,6 +20,7 @@ struct blk_integrity_iter { unsigned int data_size; unsigned short interval; unsigned char tuple_size; + unsigned char pi_offset; const char *disk_name; }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fac580976e3a..0058783a4c43 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -108,6 +108,7 @@ struct blk_integrity { const struct blk_integrity_profile *profile; unsigned char flags; unsigned char tuple_size; + unsigned char pi_offset; unsigned char interval_exp; unsigned char tag_size; }; -- cgit From 8c4955c069ea3b77dc63b55d13afa9341e894849 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:11 +0100 Subject: block: move max_{open,active}_zones to struct queue_limits The maximum number of open and active zones is a limit on the queue and should be places there so that we can including it in the upcoming queue limits batch update API. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0058783a4c43..251a11d2d2ae 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -190,8 +190,6 @@ struct gendisk { * blk_mq_unfreeze_queue(). */ unsigned int nr_zones; - unsigned int max_open_zones; - unsigned int max_active_zones; unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; #endif /* CONFIG_BLK_DEV_ZONED */ @@ -308,6 +306,8 @@ struct queue_limits { unsigned char discard_misaligned; unsigned char raid_partial_stripes_expensive; bool zoned; + unsigned int max_open_zones; + unsigned int max_active_zones; /* * Drivers that set dma_alignment to less than 511 must be prepared to @@ -640,23 +640,23 @@ static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector) static inline void disk_set_max_open_zones(struct gendisk *disk, unsigned int max_open_zones) { - disk->max_open_zones = max_open_zones; + disk->queue->limits.max_open_zones = max_open_zones; } static inline void disk_set_max_active_zones(struct gendisk *disk, unsigned int max_active_zones) { - disk->max_active_zones = max_active_zones; + disk->queue->limits.max_active_zones = max_active_zones; } static inline unsigned int bdev_max_open_zones(struct block_device *bdev) { - return bdev->bd_disk->max_open_zones; + return bdev->bd_disk->queue->limits.max_open_zones; } static inline unsigned int bdev_max_active_zones(struct block_device *bdev) { - return bdev->bd_disk->max_active_zones; + return bdev->bd_disk->queue->limits.max_active_zones; } #else /* CONFIG_BLK_DEV_ZONED */ -- cgit From d690cb8ae14bd377d422b7905b6959c7e7a45b95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:14 +0100 Subject: block: add an API to atomically update queue limits Add a new queue_limits_{start,commit}_update pair of functions that allows taking an atomic snapshot of queue limits, update it, and commit it if it passes validity checking. Also use the low-level validation helper to implement blk_set_default_limits instead of duplicating the initialization. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 251a11d2d2ae..d41d7fe93457 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -474,6 +474,7 @@ struct request_queue { struct mutex sysfs_lock; struct mutex sysfs_dir_lock; + struct mutex limits_lock; /* * for reusing dead hctx instance in case of updating @@ -862,6 +863,28 @@ static inline unsigned int blk_chunk_sectors_left(sector_t offset, return chunk_sectors - (offset & (chunk_sectors - 1)); } +/** + * queue_limits_start_update - start an atomic update of queue limits + * @q: queue to update + * + * This functions starts an atomic update of the queue limits. It takes a lock + * to prevent other updates and returns a snapshot of the current limits that + * the caller can modify. The caller must call queue_limits_commit_update() + * to finish the update. + * + * Context: process context. The caller must have frozen the queue or ensured + * that there is outstanding I/O by other means. + */ +static inline struct queue_limits +queue_limits_start_update(struct request_queue *q) + __acquires(q->limits_lock) +{ + mutex_lock(&q->limits_lock); + return q->limits; +} +int queue_limits_commit_update(struct request_queue *q, + struct queue_limits *lim); + /* * Access functions for manipulating queue properties */ -- cgit From 4f563a64732dabb2677c7d1232a8f714a18b41b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:16 +0100 Subject: block: add a max_user_discard_sectors queue limit Add a new max_user_discard_sectors limit that mirrors max_user_sectors and stores the value that the user manually set. This now allows updates of the max_hw_discard_sectors to not worry about the user limit. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-7-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d41d7fe93457..45746ba73670 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -291,6 +291,7 @@ struct queue_limits { unsigned int io_opt; unsigned int max_discard_sectors; unsigned int max_hw_discard_sectors; + unsigned int max_user_discard_sectors; unsigned int max_secure_erase_sectors; unsigned int max_write_zeroes_sectors; unsigned int max_zone_append_sectors; -- cgit From 9ac4dd8c47d533eb420af6a679e66ec74771125c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:19 +0100 Subject: block: pass a queue_limits argument to blk_mq_init_queue Pass a queue_limits to blk_mq_init_queue and apply it if non-NULL. This will allow allocating queues with valid queue limits instead of setting the values one at a time later. Also rename the function to blk_mq_alloc_queue as that is a much better name for a function that allocates a queue and always pass the queuedata argument instead of having a separate version for the extra argument. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: John Garry Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-10-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7a8150a5f051..7d42c359e2ab 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -692,7 +692,8 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, }) struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass); -struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); +struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, + struct queue_limits *lim, void *queuedata); int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); void blk_mq_destroy_queue(struct request_queue *); -- cgit From 27e32cd23fed1ab88098897897dcb9ec2bdba4de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:20 +0100 Subject: block: pass a queue_limits argument to blk_mq_alloc_disk Pass a queue_limits to blk_mq_alloc_disk and apply it if non-NULL. This will allow allocating queues with valid queue limits instead of setting the values one at a time later. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: John Garry Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-11-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7d42c359e2ab..390d35fa0032 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -682,13 +682,14 @@ enum { #define BLK_MQ_NO_HCTX_IDX (-1U) -struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, + struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass); -#define blk_mq_alloc_disk(set, queuedata) \ +#define blk_mq_alloc_disk(set, lim, queuedata) \ ({ \ static struct lock_class_key __key; \ \ - __blk_mq_alloc_disk(set, queuedata, &__key); \ + __blk_mq_alloc_disk(set, lim, queuedata, &__key); \ }) struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass); -- cgit From 74fa8f9c553f7b5ccab7d103acae63cc2e080465 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Feb 2024 08:10:47 +0100 Subject: block: pass a queue_limits argument to blk_alloc_disk Pass a queue_limits to blk_alloc_disk and apply it if non-NULL. This will allow allocating queues with valid queue limits instead of setting the values one at a time later. Also change blk_alloc_disk to return an ERR_PTR instead of just NULL which can't distinguish errors. Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Himanshu Madhani Link: https://lore.kernel.org/r/20240215071055.2201424-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 45746ba73670..a14ea9344138 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -766,22 +766,26 @@ static inline u64 sb_bdev_nr_blocks(struct super_block *sb) int bdev_disk_changed(struct gendisk *disk, bool invalidate); void put_disk(struct gendisk *disk); -struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); +struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, + struct lock_class_key *lkclass); /** * blk_alloc_disk - allocate a gendisk structure + * @lim: queue limits to be used for this disk. * @node_id: numa node to allocate on * * Allocate and pre-initialize a gendisk structure for use with BIO based * drivers. * + * Returns an ERR_PTR on error, else the allocated disk. + * * Context: can sleep */ -#define blk_alloc_disk(node_id) \ +#define blk_alloc_disk(lim, node_id) \ ({ \ static struct lock_class_key __key; \ \ - __blk_alloc_disk(node_id, &__key); \ + __blk_alloc_disk(lim, node_id, &__key); \ }) int __register_blkdev(unsigned int major, const char *name, -- cgit From b361c9027b4e4159e7bcca4eb64fd26507c19994 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Fri, 23 Feb 2024 15:57:48 +0000 Subject: sched: Add a new function to compare if two cpus have the same capacity The new helper function is needed to help blk-mq check if it needs to dispatch the softirq on another CPU to match the performance level the IO requester is running at. This is important on HMP systems where not all CPUs have the same compute capacity. Signed-off-by: Qais Yousef Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240223155749.2958009-2-qyousef@layalina.io Signed-off-by: Jens Axboe --- include/linux/sched/topology.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index a6e04b4a21d7..11e0e00e0bb9 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -176,6 +176,7 @@ extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], cpumask_var_t *alloc_sched_domains(unsigned int ndoms); void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); +bool cpus_equal_capacity(int this_cpu, int that_cpu); bool cpus_share_cache(int this_cpu, int that_cpu); bool cpus_share_resources(int this_cpu, int that_cpu); @@ -226,6 +227,11 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], { } +static inline bool cpus_equal_capacity(int this_cpu, int that_cpu) +{ + return true; +} + static inline bool cpus_share_cache(int this_cpu, int that_cpu) { return true; -- cgit From 13fe8e6825e44129b6cbeee41d3012554bf8d687 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 23 Feb 2024 15:55:39 +0800 Subject: ublk: add UBLK_CMD_DEL_DEV_ASYNC The current command UBLK_CMD_DEL_DEV won't return until the device is released, this way looks more reliable, but makes userspace more difficult to implement, especially about orders: unmap command buffer(which holds one ublkc reference), ublkc close, io_uring_file_unregister, ublkb close. Add UBLK_CMD_DEL_DEV_ASYNC so that device deletion won't wait release, then userspace needn't worry about the above order. Actually both loop and nbd is deleted in this async way. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20240223075539.89945-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/uapi/linux/ublk_cmd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index b9cfc5c96268..c8dc5f8ea699 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -49,6 +49,8 @@ _IOR('u', UBLK_CMD_GET_DEV_INFO2, struct ublksrv_ctrl_cmd) #define UBLK_U_CMD_GET_FEATURES \ _IOR('u', 0x13, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_DEL_DEV_ASYNC \ + _IOR('u', 0x14, struct ublksrv_ctrl_cmd) /* * 64bits are enough now, and it should be easy to extend in case of -- cgit From 631d4efb8009df64deae8c1382b8cf43879a4e22 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Feb 2024 14:56:40 -0800 Subject: block: add a queue_limits_set helper Add a small wrapper around queue_limits_commit_update for stacking drivers that don't want to update existing limits, but set an entirely new set. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240228225653.947152-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a14ea9344138..dd510ad7ce4b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -889,6 +889,7 @@ queue_limits_start_update(struct request_queue *q) } int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim); +int queue_limits_set(struct request_queue *q, struct queue_limits *lim); /* * Access functions for manipulating queue properties -- cgit From c1373f1cf452e4c7553a9d3bc05d87ec15c4f85f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Feb 2024 14:56:41 -0800 Subject: block: add a queue_limits_stack_bdev helper Add a small wrapper around blk_stack_limits that allows passing a bdev for the bottom device and prints an error in case of misaligned device. The name fits into the new queue limits API and the intent is to eventually replace disk_stack_limits. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240228225653.947152-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dd510ad7ce4b..285e82723d64 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -924,6 +924,8 @@ extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); +void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, + sector_t offset, const char *pfx); extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); -- cgit From 25802f3ab7a589d2b5d9e1266acffad263d9893e Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 23 Jan 2024 16:40:25 +0200 Subject: nvme-rdma: move NVME_RDMA_IP_PORT from common file The correct place for this definition is the nvme rdma header file and not the common nvme header file. Reviewed-by: Christoph Hellwig Reviewed-by: Israel Rukshin Reviewed-by: Sagi Grimberg Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- include/linux/nvme-rdma.h | 2 ++ include/linux/nvme.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index 4dd7e6fe92fb..146dd2223a5f 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -6,6 +6,8 @@ #ifndef _LINUX_NVME_RDMA_H #define _LINUX_NVME_RDMA_H +#define NVME_RDMA_IP_PORT 4420 + #define NVME_RDMA_MAX_QUEUE_SIZE 128 enum nvme_rdma_cm_fmt { diff --git a/include/linux/nvme.h b/include/linux/nvme.h index bc605ec4a3fd..ce0c1143c7e4 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -23,8 +23,6 @@ #define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery" -#define NVME_RDMA_IP_PORT 4420 - #define NVME_NSID_ALL 0xffffffff enum nvme_subsys_type { -- cgit From 36144964062b8676ee64281852de2a2c1b193aca Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 23 Jan 2024 16:40:29 +0200 Subject: nvme-rdma: introduce NVME_RDMA_MAX_METADATA_QUEUE_SIZE definition This definition will be used by controllers that are configured with metadata support. For now, both regular and metadata controllers have the same maximal queue size but later commit will increase the maximal queue size for regular RDMA controllers to 256. We'll keep the maximal queue size for metadata controllers to be 128 since there are more resources that are needed for metadata operations and 128 is the optimal size found for metadata controllers base on testing. Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Israel Rukshin Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- include/linux/nvme-rdma.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index 146dd2223a5f..d0b9941911a1 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -8,7 +8,8 @@ #define NVME_RDMA_IP_PORT 4420 -#define NVME_RDMA_MAX_QUEUE_SIZE 128 +#define NVME_RDMA_MAX_QUEUE_SIZE 128 +#define NVME_RDMA_MAX_METADATA_QUEUE_SIZE 128 enum nvme_rdma_cm_fmt { NVME_RDMA_CM_FMT_1_0 = 0x0, -- cgit From f096ba3286f5e773c496cf81667d01f2e8a2a37b Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 23 Jan 2024 16:40:32 +0200 Subject: nvmet-rdma: set max_queue_size for RDMA transport A new port configuration was added to set max_queue_size. Clamp user configuration to RDMA transport limits. Increase the maximal queue size of RDMA controllers from 128 to 256 (the default size stays 128 same as before). Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Israel Rukshin Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- include/linux/nvme-rdma.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index d0b9941911a1..eb2f04d636c8 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -8,8 +8,9 @@ #define NVME_RDMA_IP_PORT 4420 -#define NVME_RDMA_MAX_QUEUE_SIZE 128 +#define NVME_RDMA_MAX_QUEUE_SIZE 256 #define NVME_RDMA_MAX_METADATA_QUEUE_SIZE 128 +#define NVME_RDMA_DEFAULT_QUEUE_SIZE 128 enum nvme_rdma_cm_fmt { NVME_RDMA_CM_FMT_1_0 = 0x0, -- cgit From f8c7511db009d42e2c24e48eeb04e3f1b67ab209 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 5 Mar 2024 16:32:16 -0300 Subject: block: make block_class constant Since commit 43a7206b0963 ("driver core: class: make class_register() take a const *"), the driver core allows for struct class to be in read-only memory, so move the block_class structure to be declared at build time placing it into read-only memory, instead of having to be dynamically allocated at boot time. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. Marliere Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240305-class_cleanup-block-v1-1-130bb27b9c72@marliere.net Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 285e82723d64..19c7596f4ebf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -42,7 +42,7 @@ struct blk_crypto_profile; extern const struct device_type disk_type; extern const struct device_type part_type; -extern struct class block_class; +extern const struct class block_class; /* * Maximum number of blkcg policies allowed to be registered concurrently. -- cgit From dd27a84b06aa9ea6a94b0f3e59dc768f981962e1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 3 Mar 2024 07:01:50 -0700 Subject: block: remove disk_stack_limits disk_stack_limits is unused now, remove it. Signed-off-by: Christoph Hellwig Reviewed--by: Song Liu Tested-by: Song Liu Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240303140150.5435-12-hch@lst.de --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 285e82723d64..75c909865a8b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -926,8 +926,6 @@ extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, sector_t offset, const char *pfx); -extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, - sector_t offset); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); extern void blk_queue_virt_boundary(struct request_queue *, unsigned long); -- cgit