From ab6c340eeac426fb649ddb4f23b7c752f0092204 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:36 +0200 Subject: block: use memzero_page in zero_fill_bio Use memzero_bvec to zero each segment in the bio instead of manually mapping and zeroing the data. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20210727055646.118787-6-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 1fab762e079b..2e436bccb1e2 100644 --- a/block/bio.c +++ b/block/bio.c @@ -495,16 +495,11 @@ EXPORT_SYMBOL(bio_kmalloc); void zero_fill_bio(struct bio *bio) { - unsigned long flags; struct bio_vec bv; struct bvec_iter iter; - bio_for_each_segment(bv, bio, iter) { - char *data = bvec_kmap_irq(&bv, &flags); - memset(data, 0, bv.bv_len); - flush_dcache_page(bv.bv_page); - bvec_kunmap_irq(data, &flags); - } + bio_for_each_segment(bv, bio, iter) + memzero_bvec(&bv); } EXPORT_SYMBOL(zero_fill_bio); -- cgit From f8b679a070c536600c64a78c83b96aa617f8fa71 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:41 +0200 Subject: block: rewrite bio_copy_data_iter to use bvec_kmap_local and memcpy_to_bvec Use the proper helpers instead of open coding the copy. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20210727055646.118787-11-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 2e436bccb1e2..0c89fa2f7a85 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1186,27 +1186,15 @@ EXPORT_SYMBOL(bio_advance); void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) { - struct bio_vec src_bv, dst_bv; - void *src_p, *dst_p; - unsigned bytes; - while (src_iter->bi_size && dst_iter->bi_size) { - src_bv = bio_iter_iovec(src, *src_iter); - dst_bv = bio_iter_iovec(dst, *dst_iter); - - bytes = min(src_bv.bv_len, dst_bv.bv_len); - - src_p = kmap_atomic(src_bv.bv_page); - dst_p = kmap_atomic(dst_bv.bv_page); - - memcpy(dst_p + dst_bv.bv_offset, - src_p + src_bv.bv_offset, - bytes); - - kunmap_atomic(dst_p); - kunmap_atomic(src_p); - - flush_dcache_page(dst_bv.bv_page); + struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); + struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); + unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); + void *src_buf; + + src_buf = bvec_kmap_local(&src_bv); + memcpy_to_bvec(&dst_bv, src_buf); + kunmap_local(src_buf); bio_advance_iter_single(src, src_iter, bytes); bio_advance_iter_single(dst, dst_iter, bytes); -- cgit From f434cdc78e01e40fcfb8ef7e6752e3e405b84b58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:42 +0200 Subject: block: use memcpy_to_bvec in copy_to_high_bio_irq Use memcpy_to_bvec instead of opencoding the logic. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20210727055646.118787-12-hch@lst.de Signed-off-by: Jens Axboe --- block/bounce.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) (limited to 'block') diff --git a/block/bounce.c b/block/bounce.c index 94081e013c58..7e9e666c04dc 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -67,18 +67,6 @@ static __init int init_emergency_pool(void) __initcall(init_emergency_pool); -/* - * highmem version, map in to vec - */ -static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) -{ - unsigned char *vto; - - vto = kmap_atomic(to->bv_page); - memcpy(vto + to->bv_offset, vfrom, to->bv_len); - kunmap_atomic(vto); -} - /* * Simple bounce buffer support for highmem pages. Depending on the * queue gfp mask set, *to may or may not be a highmem page. kmap it @@ -86,7 +74,6 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) */ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) { - unsigned char *vfrom; struct bio_vec tovec, fromvec; struct bvec_iter iter; /* @@ -104,11 +91,8 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) * been modified by the block layer, so use the original * copy, bounce_copy_vec already uses tovec->bv_len */ - vfrom = page_address(fromvec.bv_page) + - tovec.bv_offset; - - bounce_copy_vec(&tovec, vfrom); - flush_dcache_page(tovec.bv_page); + memcpy_to_bvec(&tovec, page_address(fromvec.bv_page) + + tovec.bv_offset); } bio_advance_iter(from, &from_iter, tovec.bv_len); } -- cgit From d24920e20ca66780d4059e2ece9f858cbae02310 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:43 +0200 Subject: block: use memcpy_from_bvec in bio_copy_kern_endio_read Use memcpy_from_bvec instead of open coding the logic. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20210727055646.118787-13-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index 3743158ddaeb..d1448aaad980 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -400,7 +400,7 @@ static void bio_copy_kern_endio_read(struct bio *bio) struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) { - memcpy(p, page_address(bvec->bv_page), bvec->bv_len); + memcpy_from_bvec(p, bvec); p += bvec->bv_len; } -- cgit From 4aebe8596ab77b0b7125e3584ed0259c4657a06d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:44 +0200 Subject: block: use memcpy_from_bvec in __blk_queue_bounce Rewrite the actual bounce buffering loop in __blk_queue_bounce so that the memcpy_from_bvec helper can be used to perform the data copies. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727055646.118787-14-hch@lst.de Signed-off-by: Jens Axboe --- block/bounce.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/bounce.c b/block/bounce.c index 7e9e666c04dc..05fc7148489d 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -239,24 +239,19 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) * because the 'bio' is single-page bvec. 
*/ for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) { - struct page *page = to->bv_page; + struct page *bounce_page; - if (!PageHighMem(page)) + if (!PageHighMem(to->bv_page)) continue; - to->bv_page = mempool_alloc(&page_pool, GFP_NOIO); - inc_zone_page_state(to->bv_page, NR_BOUNCE); + bounce_page = mempool_alloc(&page_pool, GFP_NOIO); + inc_zone_page_state(bounce_page, NR_BOUNCE); if (rw == WRITE) { - char *vto, *vfrom; - - flush_dcache_page(page); - - vto = page_address(to->bv_page) + to->bv_offset; - vfrom = kmap_atomic(page) + to->bv_offset; - memcpy(vto, vfrom, to->bv_len); - kunmap_atomic(vfrom); + flush_dcache_page(to->bv_page); + memcpy_from_bvec(page_address(bounce_page), to); } + to->bv_page = bounce_page; } trace_block_bio_bounce(*bio_orig); -- cgit From 8aec120a9ca80c14ce002505cea1e1639f8e9ea5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:45 +0200 Subject: block: use bvec_kmap_local in t10_pi_type1_{prepare,complete} Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20210727055646.118787-15-hch@lst.de Signed-off-by: Jens Axboe --- block/t10-pi.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/t10-pi.c b/block/t10-pi.c index d910534b3a41..00c203b2a921 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -147,11 +147,10 @@ static void t10_pi_type1_prepare(struct request *rq) break; bip_for_each_vec(iv, bip, iter) { - void *p, *pmap; unsigned int j; + void *p; - pmap = kmap_atomic(iv.bv_page); - p = pmap + iv.bv_offset; + p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len; j += tuple_sz) { struct t10_pi_tuple *pi = p; @@ -161,8 +160,7 @@ static void t10_pi_type1_prepare(struct request *rq) ref_tag++; p += tuple_sz; } - - kunmap_atomic(pmap); + kunmap_local(p); } bip->bip_flags |= BIP_MAPPED_INTEGRITY; @@ -195,11 +193,10 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) struct bvec_iter iter; bip_for_each_vec(iv, bip, iter) { - void *p, *pmap; unsigned int j; + void *p; - pmap = kmap_atomic(iv.bv_page); - p = pmap + iv.bv_offset; + p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { struct t10_pi_tuple *pi = p; @@ -210,8 +207,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) intervals--; p += tuple_sz; } - - kunmap_atomic(pmap); + kunmap_local(p); } } } -- cgit From 503469b5b30f76169c6302d1469e69a2fb67faf9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:46 +0200 Subject: block: use bvec_kmap_local in bio_integrity_process Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20210727055646.118787-16-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4b4eb8964a6f..8f54d49dc500 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -172,18 +172,16 @@ static blk_status_t bio_integrity_process(struct bio *bio, iter.prot_buf = prot_buf; __bio_for_each_segment(bv, bio, bviter, *proc_iter) { - void *kaddr = kmap_atomic(bv.bv_page); + void *kaddr = bvec_kmap_local(&bv); - iter.data_buf = kaddr + bv.bv_offset; + iter.data_buf = kaddr; iter.data_size = bv.bv_len; - ret = proc_fn(&iter); - if (ret) { - kunmap_atomic(kaddr); - return ret; - } + kunmap_local(kaddr); + + if (ret) + break; - kunmap_atomic(kaddr); } return ret; } -- cgit From a45e43cad798173b41e0d6f119784826d3ead02c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:53:55 +0200 Subject: block: assert the locking state in delete_partition Add a lockdep assert instead of the outdated locking comment. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Reviewed-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210722075402.983367-3-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/partitions/core.c b/block/partitions/core.c index 4230d4f71879..9902b1635b7d 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -281,12 +281,10 @@ struct device_type part_type = { .uevent = part_uevent, }; -/* - * Must be called either with open_mutex held, before a disk can be opened or - * after all disk users are gone. 
- */ static void delete_partition(struct block_device *part) { + lockdep_assert_held(&part->bd_disk->open_mutex); + fsync_bdev(part); __invalidate_device(part, true); -- cgit From d7a66574b34e0b354442140927f9b787efccabfd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:53:56 +0200 Subject: block: unhash the whole device inode earlier Unhash the whole device inode early in del_gendisk. This allows to remove the first GENHD_FL_UP check in the open path as we simply won't find a just removed inode. The second non-racy check after taking open_mutex is still kept. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210722075402.983367-4-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 298ee78c1bda..716f5ca479ad 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -585,6 +585,7 @@ void del_gendisk(struct gendisk *disk) disk_del_events(disk); mutex_lock(&disk->open_mutex); + remove_inode_hash(disk->part0->bd_inode); disk->flags &= ~GENHD_FL_UP; blk_drop_partitions(disk); mutex_unlock(&disk->open_mutex); @@ -592,12 +593,6 @@ void del_gendisk(struct gendisk *disk) fsync_bdev(disk->part0); __invalidate_device(disk->part0, true); - /* - * Unhash the bdev inode for this device so that it can't be looked - * up any more even if openers still hold references to it. - */ - remove_inode_hash(disk->part0->bd_inode); - set_capacity(disk, 0); if (!(disk->flags & GENHD_FL_HIDDEN)) { -- cgit From 0468c5323413c6903e4cbcef841a55e6c5578cd2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:53:57 +0200 Subject: block: allocate bd_meta_info later in add_partitions Move the allocation of bd_meta_info after initializing the struct device to avoid the special bdput error handling path. 
Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210722075402.983367-5-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/partitions/core.c b/block/partitions/core.c index 9902b1635b7d..09c58a110a89 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -356,13 +356,6 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); - if (info) { - err = -ENOMEM; - bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); - if (!bdev->bd_meta_info) - goto out_bdput; - } - pdev = &bdev->bd_device; dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) @@ -386,6 +379,13 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, } pdev->devt = devt; + if (info) { + err = -ENOMEM; + bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); + if (!bdev->bd_meta_info) + goto out_put; + } + /* delay uevent until 'holders' subdir is created */ dev_set_uevent_suppress(pdev, 1); err = device_add(pdev); @@ -415,9 +415,6 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, kobject_uevent(&pdev->kobj, KOBJ_ADD); return bdev; -out_bdput: - bdput(bdev); - return ERR_PTR(err); out_del: kobject_put(bdev->bd_holder_dir); device_del(pdev); -- cgit From 9d3b8813895d737fcef4ec8df518f67e5cc381b8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:53:58 +0200 Subject: block: change the refcounting for partitions Instead of acquiring an inode reference on open make sure partitions always hold device model references to the disk while alive, and switch open to grab only a device model reference to the opened block device. If that is a partition the disk reference is transitively held by the partition already. 
Link: https://lore.kernel.org/r/20210722075402.983367-6-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/partitions/core.c b/block/partitions/core.c index 09c58a110a89..4f7a1a9cd544 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -261,6 +261,7 @@ static void part_release(struct device *dev) { if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) blk_free_ext_minor(MINOR(dev->devt)); + put_disk(dev_to_bdev(dev)->bd_disk); bdput(dev_to_bdev(dev)); } @@ -349,9 +350,13 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, if (xa_load(&disk->part_tbl, partno)) return ERR_PTR(-EBUSY); + /* ensure we always have a reference to the whole disk */ + get_device(disk_to_dev(disk)); + + err = -ENOMEM; bdev = bdev_alloc(disk, partno); if (!bdev) - return ERR_PTR(-ENOMEM); + goto out_put_disk; bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); @@ -420,6 +425,8 @@ out_del: device_del(pdev); out_put: put_device(pdev); +out_put_disk: + put_disk(disk); return ERR_PTR(err); } -- cgit From 2f4731dcd0bb73379fbb9e3eb07ae7324125caef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:54:02 +0200 Subject: block: remove bdput Now that we've stopped using inode references for anything meaningful in the block layer get rid of the helper to put it and just open code the call to iput on the block_device inode. 
Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210722075402.983367-10-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 4 ++-- block/partitions/core.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 716f5ca479ad..5dbb99b57b33 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1076,7 +1076,7 @@ static void disk_release(struct device *dev) xa_destroy(&disk->part_tbl); if (test_bit(GD_QUEUE_REF, &disk->state) && disk->queue) blk_put_queue(disk->queue); - bdput(disk->part0); /* frees the disk */ + iput(disk->part0->bd_inode); /* frees the disk */ } struct class block_class = { .name = "block", @@ -1261,7 +1261,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) out_destroy_part_tbl: xa_destroy(&disk->part_tbl); - bdput(disk->part0); + iput(disk->part0->bd_inode); out_free_disk: kfree(disk); return NULL; diff --git a/block/partitions/core.c b/block/partitions/core.c index 4f7a1a9cd544..2415bffc2771 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -262,7 +262,7 @@ static void part_release(struct device *dev) if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) blk_free_ext_minor(MINOR(dev->devt)); put_disk(dev_to_bdev(dev)->bd_disk); - bdput(dev_to_bdev(dev)); + iput(dev_to_bdev(dev)->bd_inode); } static int part_uevent(struct device *dev, struct kobj_uevent_env *env) -- cgit From 26e2d7a362f6a83146ea3eaa8f17ca9ce35388d3 Mon Sep 17 00:00:00 2001 From: Abd-Alrhman Masalkhi Date: Tue, 27 Jul 2021 08:25:13 +0200 Subject: block: reduce stack usage in diskstats_show MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I have compiled the kernel with a cross compiler "hppa-linux-gnu-" v9.3.0 on x86-64 host machine. 
I got the following warning: block/genhd.c: In function ‘diskstats_show’: block/genhd.c:1227:1: warning: the frame size of 1688 bytes is larger than 1280 bytes [-Wframe-larger-than=] 1227 | } Reduce the stack footprint by using the %pg printk specifier instead of disk_name to remove the need for the on-stack buffer. Signed-off-by: Abd-Alrhman Masalkhi Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727062518.122108-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 5dbb99b57b33..cf705cf95440 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1111,7 +1111,6 @@ static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; struct block_device *hd; - char buf[BDEVNAME_SIZE]; unsigned int inflight; struct disk_stats stat; unsigned long idx; @@ -1134,15 +1133,14 @@ static int diskstats_show(struct seq_file *seqf, void *v) else inflight = part_in_flight(hd); - seq_printf(seqf, "%4d %7d %s " + seq_printf(seqf, "%4d %7d %pg " "%lu %lu %lu %u " "%lu %lu %lu %u " "%u %u %u " "%lu %lu %lu %u " "%lu %u" "\n", - MAJOR(hd->bd_dev), MINOR(hd->bd_dev), - disk_name(gp, hd->bd_partno, buf), + MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd, stat.ios[STAT_READ], stat.merges[STAT_READ], stat.sectors[STAT_READ], -- cgit From a9e7bc3de4051d037a8e6f2d30448c347263737e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 08:25:14 +0200 Subject: block: use the %pg format specifier in printk_all_partitions Simplify printing the partition name by using the %pg format specifier that is equivalent to a bdevname call. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20210727062518.122108-3-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index cf705cf95440..770f21b4fd1a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -678,7 +678,6 @@ void __init printk_all_partitions(void) while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); struct block_device *part; - char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; unsigned long idx; @@ -698,11 +697,10 @@ void __init printk_all_partitions(void) xa_for_each(&disk->part_tbl, idx, part) { if (!bdev_nr_sectors(part)) continue; - printk("%s%s %10llu %s %s", + printk("%s%s %10llu %pg %s", bdev_is_partition(part) ? " " : "", bdevt_str(part->bd_dev, devt_buf), - bdev_nr_sectors(part) >> 1, - disk_name(disk, part->bd_partno, name_buf), + bdev_nr_sectors(part) >> 1, part, part->bd_meta_info ? part->bd_meta_info->uuid : ""); if (bdev_is_partition(part)) -- cgit From a291bb43e5c9fdedc4be3dfd496e64e7c5a78b1f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 08:25:15 +0200 Subject: block: use the %pg format specifier in show_partition Simplify printing the partition name by using the %pg format specifier that is equivalent to a bdevname call. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20210727062518.122108-4-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 770f21b4fd1a..6ed58fda2c05 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -778,7 +778,6 @@ static int show_partition(struct seq_file *seqf, void *v) struct gendisk *sgp = v; struct block_device *part; unsigned long idx; - char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ if (!get_capacity(sgp) || (!disk_max_parts(sgp) && @@ -791,10 +790,9 @@ static int show_partition(struct seq_file *seqf, void *v) xa_for_each(&sgp->part_tbl, idx, part) { if (!bdev_nr_sectors(part)) continue; - seq_printf(seqf, "%4d %7d %10llu %s\n", + seq_printf(seqf, "%4d %7d %10llu %pg\n", MAJOR(part->bd_dev), MINOR(part->bd_dev), - bdev_nr_sectors(part) >> 1, - disk_name(sgp, part->bd_partno, buf)); + bdev_nr_sectors(part) >> 1, part); } rcu_read_unlock(); return 0; -- cgit From 453b8ab696b32cfd8bad80a5501937440d1cf214 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 08:25:16 +0200 Subject: block: simplify printing the device names disk_stack_limits Printk ->disk_name directly for the disk and use the %pg format specifier for the block device, which is equivalent to a bdevname call. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20210727062518.122108-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 902c40d67120..109012719aa0 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -661,15 +661,9 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, struct request_queue *t = disk->queue; if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits, - get_start_sect(bdev) + (offset >> 9)) < 0) { - char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; - - disk_name(disk, 0, top); - bdevname(bdev, bottom); - - printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", - top, bottom); - } + get_start_sect(bdev) + (offset >> 9)) < 0) + pr_notice("%s: Warning: Device %pg is misaligned\n", + disk->disk_name, bdev); blk_queue_update_readahead(disk->queue); } -- cgit From 1d7035478f64c040441c9cb2aa32e0d7fae526d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 08:25:17 +0200 Subject: block: simplify disk name formatting in check_partition disk_name for partition 0 just copies out the disk_name field. Replace the call to disk_name with a %s format specifier. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20210727062518.122108-6-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/partitions/core.c b/block/partitions/core.c index 2415bffc2771..fb3a556cacce 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -136,7 +136,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd) state->pp_buf[0] = '\0'; state->bdev = hd->part0; - disk_name(hd, 0, state->name); + snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name); snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); -- cgit From abd2864a3e46368a58f3718491521779099bfc14 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 08:25:18 +0200 Subject: block: remove disk_name() Remove the disk_name function now that all users are gone. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20210727062518.122108-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 1 - block/genhd.c | 17 +++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index 4b885c0f6708..56f33fbcde59 100644 --- a/block/blk.h +++ b/block/blk.h @@ -344,7 +344,6 @@ static inline void blk_queue_clear_zone_settings(struct request_queue *q) {} int blk_alloc_ext_minor(void); void blk_free_ext_minor(unsigned int minor); -char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 diff --git a/block/genhd.c b/block/genhd.c index 6ed58fda2c05..38f053074159 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -78,11 +78,17 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) EXPORT_SYMBOL_GPL(set_capacity_and_notify); /* - * Format the device name of the indicated disk into the supplied buffer and - * return a pointer to that same buffer for convenience. + * Format the device name of the indicated block device into the supplied buffer + * and return a pointer to that same buffer for convenience. + * + * Note: do not use this in new code, use the %pg specifier to sprintf and + * printk instead. 
*/ -char *disk_name(struct gendisk *hd, int partno, char *buf) +const char *bdevname(struct block_device *bdev, char *buf) { + struct gendisk *hd = bdev->bd_disk; + int partno = bdev->bd_partno; + if (!partno) snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) @@ -92,11 +98,6 @@ char *disk_name(struct gendisk *hd, int partno, char *buf) return buf; } - -const char *bdevname(struct block_device *bdev, char *buf) -{ - return disk_name(bdev->bd_disk, bdev->bd_partno, buf); -} EXPORT_SYMBOL(bdevname); static void part_stat_read_all(struct block_device *part, -- cgit From 2164877c7f373e14e55fca20b7c4a9c436fe4462 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jul 2021 07:37:56 +0200 Subject: block: remove cmdline-parser.c cmdline-parser.c is only used by the cmdline faux partition format, so merge the code into that and avoid an indirect call. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210728053756.409654-1-hch@lst.de Signed-off-by: Jens Axboe --- block/Kconfig | 10 -- block/Makefile | 1 - block/cmdline-parser.c | 255 ------------------------------------------- block/partitions/Kconfig | 1 - block/partitions/cmdline.c | 267 ++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 262 insertions(+), 272 deletions(-) delete mode 100644 block/cmdline-parser.c (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index fd732aede922..15dfb7660645 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -114,16 +114,6 @@ config BLK_DEV_THROTTLING_LOW Note, this is an experimental interface and could be changed someday. -config BLK_CMDLINE_PARSER - bool "Block device command line partition parser" - help - Enabling this option allows you to specify the partition layout from - the kernel boot args. This is typically of use for embedded devices - which don't otherwise have any standardized method for listing the - partitions on a block device. 
- - See Documentation/block/cmdline-partition.rst for more information. - config BLK_WBT bool "Enable support for block device writeback throttling" help diff --git a/block/Makefile b/block/Makefile index bfbe4e13ca1e..c72592b4cf31 100644 --- a/block/Makefile +++ b/block/Makefile @@ -28,7 +28,6 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o obj-$(CONFIG_BLK_DEV_INTEGRITY_T10) += t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o diff --git a/block/cmdline-parser.c b/block/cmdline-parser.c deleted file mode 100644 index f2a14571882b..000000000000 --- a/block/cmdline-parser.c +++ /dev/null @@ -1,255 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Parse command line, get partition information - * - * Written by Cai Zhiyong - * - */ -#include -#include - -static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) -{ - int ret = 0; - struct cmdline_subpart *new_subpart; - - *subpart = NULL; - - new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); - if (!new_subpart) - return -ENOMEM; - - if (*partdef == '-') { - new_subpart->size = (sector_t)(~0ULL); - partdef++; - } else { - new_subpart->size = (sector_t)memparse(partdef, &partdef); - if (new_subpart->size < (sector_t)PAGE_SIZE) { - pr_warn("cmdline partition size is invalid."); - ret = -EINVAL; - goto fail; - } - } - - if (*partdef == '@') { - partdef++; - new_subpart->from = (sector_t)memparse(partdef, &partdef); - } else { - new_subpart->from = (sector_t)(~0ULL); - } - - if (*partdef == '(') { - int length; - char *next = strchr(++partdef, ')'); - - if (!next) { - pr_warn("cmdline partition format is invalid."); - ret = -EINVAL; - goto fail; - } - - length = min_t(int, next - partdef, - sizeof(new_subpart->name) - 1); - strncpy(new_subpart->name, partdef, length); - 
new_subpart->name[length] = '\0'; - - partdef = ++next; - } else - new_subpart->name[0] = '\0'; - - new_subpart->flags = 0; - - if (!strncmp(partdef, "ro", 2)) { - new_subpart->flags |= PF_RDONLY; - partdef += 2; - } - - if (!strncmp(partdef, "lk", 2)) { - new_subpart->flags |= PF_POWERUP_LOCK; - partdef += 2; - } - - *subpart = new_subpart; - return 0; -fail: - kfree(new_subpart); - return ret; -} - -static void free_subpart(struct cmdline_parts *parts) -{ - struct cmdline_subpart *subpart; - - while (parts->subpart) { - subpart = parts->subpart; - parts->subpart = subpart->next_subpart; - kfree(subpart); - } -} - -static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) -{ - int ret = -EINVAL; - char *next; - int length; - struct cmdline_subpart **next_subpart; - struct cmdline_parts *newparts; - char buf[BDEVNAME_SIZE + 32 + 4]; - - *parts = NULL; - - newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); - if (!newparts) - return -ENOMEM; - - next = strchr(bdevdef, ':'); - if (!next) { - pr_warn("cmdline partition has no block device."); - goto fail; - } - - length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); - strncpy(newparts->name, bdevdef, length); - newparts->name[length] = '\0'; - newparts->nr_subparts = 0; - - next_subpart = &newparts->subpart; - - while (next && *(++next)) { - bdevdef = next; - next = strchr(bdevdef, ','); - - length = (!next) ? 
(sizeof(buf) - 1) : - min_t(int, next - bdevdef, sizeof(buf) - 1); - - strncpy(buf, bdevdef, length); - buf[length] = '\0'; - - ret = parse_subpart(next_subpart, buf); - if (ret) - goto fail; - - newparts->nr_subparts++; - next_subpart = &(*next_subpart)->next_subpart; - } - - if (!newparts->subpart) { - pr_warn("cmdline partition has no valid partition."); - ret = -EINVAL; - goto fail; - } - - *parts = newparts; - - return 0; -fail: - free_subpart(newparts); - kfree(newparts); - return ret; -} - -void cmdline_parts_free(struct cmdline_parts **parts) -{ - struct cmdline_parts *next_parts; - - while (*parts) { - next_parts = (*parts)->next_parts; - free_subpart(*parts); - kfree(*parts); - *parts = next_parts; - } -} -EXPORT_SYMBOL(cmdline_parts_free); - -int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline) -{ - int ret; - char *buf; - char *pbuf; - char *next; - struct cmdline_parts **next_parts; - - *parts = NULL; - - next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - next_parts = parts; - - while (next && *pbuf) { - next = strchr(pbuf, ';'); - if (next) - *next = '\0'; - - ret = parse_parts(next_parts, pbuf); - if (ret) - goto fail; - - if (next) - pbuf = ++next; - - next_parts = &(*next_parts)->next_parts; - } - - if (!*parts) { - pr_warn("cmdline partition has no valid partition."); - ret = -EINVAL; - goto fail; - } - - ret = 0; -done: - kfree(buf); - return ret; - -fail: - cmdline_parts_free(parts); - goto done; -} -EXPORT_SYMBOL(cmdline_parts_parse); - -struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, - const char *bdev) -{ - while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) - parts = parts->next_parts; - return parts; -} -EXPORT_SYMBOL(cmdline_parts_find); - -/* - * add_part() - * 0 success. - * 1 can not add so many partitions. 
- */ -int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, - int slot, - int (*add_part)(int, struct cmdline_subpart *, void *), - void *param) -{ - sector_t from = 0; - struct cmdline_subpart *subpart; - - for (subpart = parts->subpart; subpart; - subpart = subpart->next_subpart, slot++) { - if (subpart->from == (sector_t)(~0ULL)) - subpart->from = from; - else - from = subpart->from; - - if (from >= disk_size) - break; - - if (subpart->size > (disk_size - from)) - subpart->size = disk_size - from; - - from += subpart->size; - - if (add_part(slot, subpart, param)) - break; - } - - return slot; -} -EXPORT_SYMBOL(cmdline_parts_set); diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 6e2a649669e5..278593b8e4e9 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -264,7 +264,6 @@ config SYSV68_PARTITION config CMDLINE_PARTITION bool "Command line partition support" if PARTITION_ADVANCED - select BLK_CMDLINE_PARSER help Say Y here if you want to read the partition table from bootargs. The format for the command line is just like mtdparts. 
diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 8f545c36cde4..482a29e95dbd 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -14,20 +14,248 @@ * For further information, see "Documentation/block/cmdline-partition.rst" * */ +#include +#include +#include +#include "check.h" -#include -#include "check.h" +/* partition flags */ +#define PF_RDONLY 0x01 /* Device is read only */ +#define PF_POWERUP_LOCK 0x02 /* Always locked after reset */ + +struct cmdline_subpart { + char name[BDEVNAME_SIZE]; /* partition name, such as 'rootfs' */ + sector_t from; + sector_t size; + int flags; + struct cmdline_subpart *next_subpart; +}; + +struct cmdline_parts { + char name[BDEVNAME_SIZE]; /* block device, such as 'mmcblk0' */ + unsigned int nr_subparts; + struct cmdline_subpart *subpart; + struct cmdline_parts *next_parts; +}; + +static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) +{ + int ret = 0; + struct cmdline_subpart *new_subpart; + + *subpart = NULL; + + new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); + if (!new_subpart) + return -ENOMEM; + + if (*partdef == '-') { + new_subpart->size = (sector_t)(~0ULL); + partdef++; + } else { + new_subpart->size = (sector_t)memparse(partdef, &partdef); + if (new_subpart->size < (sector_t)PAGE_SIZE) { + pr_warn("cmdline partition size is invalid."); + ret = -EINVAL; + goto fail; + } + } + + if (*partdef == '@') { + partdef++; + new_subpart->from = (sector_t)memparse(partdef, &partdef); + } else { + new_subpart->from = (sector_t)(~0ULL); + } + + if (*partdef == '(') { + int length; + char *next = strchr(++partdef, ')'); + + if (!next) { + pr_warn("cmdline partition format is invalid."); + ret = -EINVAL; + goto fail; + } + + length = min_t(int, next - partdef, + sizeof(new_subpart->name) - 1); + strncpy(new_subpart->name, partdef, length); + new_subpart->name[length] = '\0'; + + partdef = ++next; + } else + new_subpart->name[0] = '\0'; + + 
new_subpart->flags = 0; + + if (!strncmp(partdef, "ro", 2)) { + new_subpart->flags |= PF_RDONLY; + partdef += 2; + } + + if (!strncmp(partdef, "lk", 2)) { + new_subpart->flags |= PF_POWERUP_LOCK; + partdef += 2; + } + + *subpart = new_subpart; + return 0; +fail: + kfree(new_subpart); + return ret; +} + +static void free_subpart(struct cmdline_parts *parts) +{ + struct cmdline_subpart *subpart; + + while (parts->subpart) { + subpart = parts->subpart; + parts->subpart = subpart->next_subpart; + kfree(subpart); + } +} + +static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) +{ + int ret = -EINVAL; + char *next; + int length; + struct cmdline_subpart **next_subpart; + struct cmdline_parts *newparts; + char buf[BDEVNAME_SIZE + 32 + 4]; + + *parts = NULL; + + newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); + if (!newparts) + return -ENOMEM; + + next = strchr(bdevdef, ':'); + if (!next) { + pr_warn("cmdline partition has no block device."); + goto fail; + } + + length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); + strncpy(newparts->name, bdevdef, length); + newparts->name[length] = '\0'; + newparts->nr_subparts = 0; + + next_subpart = &newparts->subpart; + + while (next && *(++next)) { + bdevdef = next; + next = strchr(bdevdef, ','); + + length = (!next) ? 
(sizeof(buf) - 1) : + min_t(int, next - bdevdef, sizeof(buf) - 1); + + strncpy(buf, bdevdef, length); + buf[length] = '\0'; + + ret = parse_subpart(next_subpart, buf); + if (ret) + goto fail; + + newparts->nr_subparts++; + next_subpart = &(*next_subpart)->next_subpart; + } + + if (!newparts->subpart) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + *parts = newparts; + + return 0; +fail: + free_subpart(newparts); + kfree(newparts); + return ret; +} + +static void cmdline_parts_free(struct cmdline_parts **parts) +{ + struct cmdline_parts *next_parts; + + while (*parts) { + next_parts = (*parts)->next_parts; + free_subpart(*parts); + kfree(*parts); + *parts = next_parts; + } +} + +static int cmdline_parts_parse(struct cmdline_parts **parts, + const char *cmdline) +{ + int ret; + char *buf; + char *pbuf; + char *next; + struct cmdline_parts **next_parts; + + *parts = NULL; + + next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + next_parts = parts; + + while (next && *pbuf) { + next = strchr(pbuf, ';'); + if (next) + *next = '\0'; + + ret = parse_parts(next_parts, pbuf); + if (ret) + goto fail; + + if (next) + pbuf = ++next; + + next_parts = &(*next_parts)->next_parts; + } + + if (!*parts) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + ret = 0; +done: + kfree(buf); + return ret; + +fail: + cmdline_parts_free(parts); + goto done; +} + +static struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, + const char *bdev) +{ + while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) + parts = parts->next_parts; + return parts; +} static char *cmdline; static struct cmdline_parts *bdev_parts; -static int add_part(int slot, struct cmdline_subpart *subpart, void *param) +static int add_part(int slot, struct cmdline_subpart *subpart, + struct parsed_partitions *state) { int label_min; struct partition_meta_info *info; char 
tmp[sizeof(info->volname) + 4]; - struct parsed_partitions *state = (struct parsed_partitions *)param; if (slot >= state->limit) return 1; @@ -50,6 +278,35 @@ static int add_part(int slot, struct cmdline_subpart *subpart, void *param) return 0; } +static int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, + struct parsed_partitions *state) +{ + sector_t from = 0; + struct cmdline_subpart *subpart; + int slot = 1; + + for (subpart = parts->subpart; subpart; + subpart = subpart->next_subpart, slot++) { + if (subpart->from == (sector_t)(~0ULL)) + subpart->from = from; + else + from = subpart->from; + + if (from >= disk_size) + break; + + if (subpart->size > (disk_size - from)) + subpart->size = disk_size - from; + + from += subpart->size; + + if (add_part(slot, subpart, state)) + break; + } + + return slot; +} + static int __init cmdline_parts_setup(char *s) { cmdline = s; @@ -147,7 +404,7 @@ int cmdline_partition(struct parsed_partitions *state) disk_size = get_capacity(state->bdev->bd_disk) << 9; - cmdline_parts_set(parts, disk_size, 1, add_part, (void *)state); + cmdline_parts_set(parts, disk_size, state); cmdline_parts_verifier(1, state); strlcat(state->pp_buf, "\n", PAGE_SIZE); -- cgit From cf179948554a2e0d2b622317bf6bf33138ac36e5 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 13 Jul 2021 01:05:25 +0200 Subject: block: add disk sequence number Associating uevents with block devices in userspace is difficult and racy: the uevent netlink socket is lossy, and on slow and overloaded systems has a very high latency. Block devices do not have exclusive owners in userspace, any process can set one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0 can be reused again and again). A userspace process setting up a block device and watching for its events cannot thus reliably tell whether an event relates to the device it just set up or another earlier instance with the same name. 
Being able to set a UUID on a loop device would solve the race conditions. But it does not allow to derive orderings from uevents: if you see a uevent with a UUID that does not match the device you are waiting for, you cannot tell whether it's because the right uevent has not arrived yet, or it was already sent and you missed it. So you cannot tell whether you should wait for it or not. Associating a unique, monotonically increasing sequential number to the lifetime of each block device, which can be retrieved with an ioctl immediately upon setting it up, allows to solve the race conditions with uevents, and also allows userspace processes to know whether they should wait for the uevent they need or if it was dropped and thus they should move on. Additionally, increment the disk sequence number when the media change, i.e. on DISK_EVENT_MEDIA_CHANGE event. Reviewed-by: Christoph Hellwig Signed-off-by: Matteo Croce Tested-by: Luca Boccassi Link: https://lore.kernel.org/r/20210712230530.29323-2-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe --- block/disk-events.c | 3 +++ block/genhd.c | 24 ++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'block') diff --git a/block/disk-events.c b/block/disk-events.c index a75931ff5da4..04c52f3992ed 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -190,6 +190,9 @@ static void disk_check_events(struct disk_events *ev, spin_unlock_irq(&ev->lock); + if (events & DISK_EVENT_MEDIA_CHANGE) + inc_diskseq(disk); + /* * Tell userland about new events. Only the events listed in * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT diff --git a/block/genhd.c b/block/genhd.c index 38f053074159..ceb08af72c1a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -29,6 +29,23 @@ static struct kobject *block_depr; +/* + * Unique, monotonically increasing sequential number associated with block + * devices instances (i.e. incremented each time a device is attached). 
+ * Associating uevents with block devices in userspace is difficult and racy: + * the uevent netlink socket is lossy, and on slow and overloaded systems has + * a very high latency. + * Block devices do not have exclusive owners in userspace, any process can set + * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0 + * can be reused again and again). + * A userspace process setting up a block device and watching for its events + * cannot thus reliably tell whether an event relates to the device it just set + * up or another earlier instance with the same name. + * This sequential number allows userspace processes to solve this problem, and + * uniquely associate a uevent with the lifetime of a device. + */ +static atomic64_t diskseq; + /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) static DEFINE_IDA(ext_devt_ida); @@ -1252,6 +1269,8 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); + inc_diskseq(disk); + return disk; out_destroy_part_tbl: @@ -1352,3 +1371,8 @@ int bdev_read_only(struct block_device *bdev) return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); } EXPORT_SYMBOL(bdev_read_only); + +void inc_diskseq(struct gendisk *disk) +{ + disk->diskseq = atomic64_inc_return(&diskseq); +} -- cgit From 87eb710747126ca6606f064deef93d045486ebbe Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 13 Jul 2021 01:05:26 +0200 Subject: block: export the diskseq in uevents Export the newly introduced diskseq in uevents: $ udevadm info /sys/class/block/* |grep -e DEVNAME -e DISKSEQ E: DEVNAME=/dev/loop0 E: DISKSEQ=1 E: DEVNAME=/dev/loop1 E: DISKSEQ=2 E: DEVNAME=/dev/loop2 E: DISKSEQ=3 E: DEVNAME=/dev/loop3 E: DISKSEQ=4 E: DEVNAME=/dev/loop4 E: DISKSEQ=5 E: DEVNAME=/dev/loop5 E: DISKSEQ=6 E: DEVNAME=/dev/loop6 E: DISKSEQ=7 E: DEVNAME=/dev/loop7 E:
DISKSEQ=8 E: DEVNAME=/dev/nvme0n1 E: DISKSEQ=9 E: DEVNAME=/dev/nvme0n1p1 E: DISKSEQ=9 E: DEVNAME=/dev/nvme0n1p2 E: DISKSEQ=9 E: DEVNAME=/dev/nvme0n1p3 E: DISKSEQ=9 E: DEVNAME=/dev/nvme0n1p4 E: DISKSEQ=9 E: DEVNAME=/dev/nvme0n1p5 E: DISKSEQ=9 E: DEVNAME=/dev/sda E: DISKSEQ=10 E: DEVNAME=/dev/sda1 E: DISKSEQ=10 E: DEVNAME=/dev/sda2 E: DISKSEQ=10 Reviewed-by: Christoph Hellwig Signed-off-by: Matteo Croce Tested-by: Luca Boccassi Link: https://lore.kernel.org/r/20210712230530.29323-3-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe --- block/genhd.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index ceb08af72c1a..e1b2f898d790 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1092,8 +1092,17 @@ static void disk_release(struct device *dev) blk_put_queue(disk->queue); iput(disk->part0->bd_inode); /* frees the disk */ } + +static int block_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct gendisk *disk = dev_to_disk(dev); + + return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq); +} + struct class block_class = { .name = "block", + .dev_uevent = block_uevent, }; static char *block_devnode(struct device *dev, umode_t *mode, -- cgit From 7957d93bf32bc211415827e44fdd9cdf1388df59 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 13 Jul 2021 01:05:27 +0200 Subject: block: add ioctl to read the disk sequence number Add a new BLKGETDISKSEQ ioctl which retrieves the disk sequence number from the genhd structure. 
# ./getdiskseq /dev/loop* /dev/loop0: 13 /dev/loop0p1: 13 /dev/loop0p2: 13 /dev/loop0p3: 13 /dev/loop1: 14 /dev/loop1p1: 14 /dev/loop1p2: 14 /dev/loop2: 5 /dev/loop3: 6 Reviewed-by: Christoph Hellwig Signed-off-by: Matteo Croce Tested-by: Luca Boccassi Link: https://lore.kernel.org/r/20210712230530.29323-4-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe --- block/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 24beec9ca9c9..0c3a4a53fa11 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -469,6 +469,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, BLKDEV_DISCARD_SECURE); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); + case BLKGETDISKSEQ: + return put_u64(argp, bdev->bd_disk->diskseq); case BLKREPORTZONE: return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); case BLKRESETZONE: -- cgit From 13927b31b13f3c6556221eff3487247bd3c7a245 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 13 Jul 2021 01:05:28 +0200 Subject: block: export diskseq in sysfs Add a new sysfs handle to export the new diskseq value. Place it in <sysfs>/block/<disk>/diskseq and document it. $ grep .
/sys/class/block/*/diskseq /sys/class/block/loop0/diskseq:13 /sys/class/block/loop1/diskseq:14 /sys/class/block/loop2/diskseq:5 /sys/class/block/loop3/diskseq:6 /sys/class/block/ram0/diskseq:1 /sys/class/block/ram1/diskseq:2 /sys/class/block/vda/diskseq:7 Reviewed-by: Christoph Hellwig Signed-off-by: Matteo Croce Tested-by: Luca Boccassi Link: https://lore.kernel.org/r/20210712230530.29323-5-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe --- block/genhd.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index e1b2f898d790..a4817e42f3a3 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -977,6 +977,14 @@ static ssize_t disk_discard_alignment_show(struct device *dev, return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); } +static ssize_t diskseq_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%llu\n", disk->diskseq); +} + static DEVICE_ATTR(range, 0444, disk_range_show, NULL); static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); @@ -989,6 +997,7 @@ static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); +static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, @@ -1034,6 +1043,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_events.attr, &dev_attr_events_async.attr, &dev_attr_events_poll_msecs.attr, + &dev_attr_diskseq.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif -- cgit From e6138dc12de9df17cbda9c40314d69592855ac5e Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 13 Jul 2021 01:05:29 +0200 Subject: block: add a 
helper to raise a media changed event Refactor disk_check_events() and move some code into disk_event_uevent(). Then add disk_force_media_change(), a helper which will be used by devices to force issuing a DISK_EVENT_MEDIA_CHANGE event. Co-developed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig Signed-off-by: Matteo Croce Tested-by: Luca Boccassi Link: https://lore.kernel.org/r/20210712230530.29323-6-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe --- block/disk-events.c | 61 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 15 deletions(-) (limited to 'block') diff --git a/block/disk-events.c b/block/disk-events.c index 04c52f3992ed..7445b8ff2775 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -163,15 +163,31 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask) spin_unlock_irq(&ev->lock); } +/* + * Tell userland about new events. Only the events listed in @disk->events are + * reported, and only if DISK_EVENT_FLAG_UEVENT is set. Otherwise, events are + * processed internally but never get reported to userland. + */ +static void disk_event_uevent(struct gendisk *disk, unsigned int events) +{ + char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; + int nr_events = 0, i; + + for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) + if (events & disk->events & (1 << i)) + envp[nr_events++] = disk_uevents[i]; + + if (nr_events) + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); +} + static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr) { struct gendisk *disk = ev->disk; - char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; unsigned int clearing = *clearing_ptr; unsigned int events; unsigned long intv; - int nr_events = 0, i; /* check events */ events = disk->fops->check_events(disk, clearing); @@ -193,19 +209,8 @@ static void disk_check_events(struct disk_events *ev, if (events & DISK_EVENT_MEDIA_CHANGE) inc_diskseq(disk); - /* - * Tell userland about new events. 
Only the events listed in - * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT - * is set. Otherwise, events are processed internally but never - * get reported to userland. - */ - for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) - if ((events & disk->events & (1 << i)) && - (disk->event_flags & DISK_EVENT_FLAG_UEVENT)) - envp[nr_events++] = disk_uevents[i]; - - if (nr_events) - kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); + if (disk->event_flags & DISK_EVENT_FLAG_UEVENT) + disk_event_uevent(disk, events); } /** @@ -284,6 +289,32 @@ bool bdev_check_media_change(struct block_device *bdev) } EXPORT_SYMBOL(bdev_check_media_change); +/** + * disk_force_media_change - force a media change event + * @disk: the disk which will raise the event + * @events: the events to raise + * + * Generate uevents for the disk. If DISK_EVENT_MEDIA_CHANGE is present, + * attempt to free all dentries and inodes and invalidates all block + * device page cache entries in that case. + * + * Returns %true if DISK_EVENT_MEDIA_CHANGE was raised, or %false if not. + */ +bool disk_force_media_change(struct gendisk *disk, unsigned int events) +{ + disk_event_uevent(disk, events); + + if (!(events & DISK_EVENT_MEDIA_CHANGE)) + return false; + + if (__invalidate_device(disk->part0, true)) + pr_warn("VFS: busy inodes on changed media %s\n", + disk->disk_name); + set_bit(GD_NEED_PART_SCAN, &disk->state); + return true; +} +EXPORT_SYMBOL_GPL(disk_force_media_change); + /* * Separate this part out so that a different pointer for clearing_ptr can be * passed in for disk_clear_events. -- cgit From 2bc1f6e442eec88fa60f1ee6bef2c9871227cf8a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 13 Jul 2021 17:18:37 +0900 Subject: block: remove blk-mq-sysfs dead code In block/blk-mq-sysfs.c, struct blk_mq_ctx_sysfs_entry is not used to define any attribute since the "mq" sysfs directory contains only sub-directories (no attribute files). 
As a result, blk_mq_sysfs_show(), blk_mq_sysfs_store(), and struct sysfs_ops blk_mq_sysfs_ops are all unused and unnecessary. Remove all this unused code. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210713081837.524422-1-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 55 ---------------------------------------------------- 1 file changed, 55 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 7b52e7657b2d..253c857cba47 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -45,60 +45,12 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj) kfree(hctx); } -struct blk_mq_ctx_sysfs_entry { - struct attribute attr; - ssize_t (*show)(struct blk_mq_ctx *, char *); - ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t); -}; - struct blk_mq_hw_ctx_sysfs_entry { struct attribute attr; ssize_t (*show)(struct blk_mq_hw_ctx *, char *); ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); }; -static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, - char *page) -{ - struct blk_mq_ctx_sysfs_entry *entry; - struct blk_mq_ctx *ctx; - struct request_queue *q; - ssize_t res; - - entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); - ctx = container_of(kobj, struct blk_mq_ctx, kobj); - q = ctx->queue; - - if (!entry->show) - return -EIO; - - mutex_lock(&q->sysfs_lock); - res = entry->show(ctx, page); - mutex_unlock(&q->sysfs_lock); - return res; -} - -static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, - const char *page, size_t length) -{ - struct blk_mq_ctx_sysfs_entry *entry; - struct blk_mq_ctx *ctx; - struct request_queue *q; - ssize_t res; - - entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); - ctx = container_of(kobj, struct blk_mq_ctx, kobj); - q = ctx->queue; - - if (!entry->store) - return -EIO; - - mutex_lock(&q->sysfs_lock); - res = entry->store(ctx, page, length); - 
mutex_unlock(&q->sysfs_lock); - return res; -} - static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, struct attribute *attr, char *page) { @@ -198,23 +150,16 @@ static struct attribute *default_hw_ctx_attrs[] = { }; ATTRIBUTE_GROUPS(default_hw_ctx); -static const struct sysfs_ops blk_mq_sysfs_ops = { - .show = blk_mq_sysfs_show, - .store = blk_mq_sysfs_store, -}; - static const struct sysfs_ops blk_mq_hw_sysfs_ops = { .show = blk_mq_hw_sysfs_show, .store = blk_mq_hw_sysfs_store, }; static struct kobj_type blk_mq_ktype = { - .sysfs_ops = &blk_mq_sysfs_ops, .release = blk_mq_sysfs_release, }; static struct kobj_type blk_mq_ctx_ktype = { - .sysfs_ops = &blk_mq_sysfs_ops, .release = blk_mq_ctx_sysfs_release, }; -- cgit From 90b7198001f23ea37d3b46dc631bdaa2357a20b1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 5 Aug 2021 10:41:59 -0700 Subject: blk-mq: Introduce the BLK_MQ_F_NO_SCHED_BY_DEFAULT flag elevator_get_default() uses the following algorithm to select an I/O scheduler from inside add_disk(): - In case of a single hardware queue or if sharing hardware queues across multiple request queues (BLK_MQ_F_TAG_HCTX_SHARED), use mq-deadline. - Otherwise, use 'none'. This is a good choice for most but not for all block drivers. Make it possible to override the selection of mq-deadline with a new flag, namely BLK_MQ_F_NO_SCHED_BY_DEFAULT. 
Cc: Christoph Hellwig Cc: Ming Lei Cc: Tetsuo Handa Cc: Martijn Coenen Cc: Jaegeuk Kim Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210805174200.3250718-2-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/elevator.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 52ada14cfe45..d0295e68f481 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -630,6 +630,9 @@ static inline bool elv_support_iosched(struct request_queue *q) */ static struct elevator_type *elevator_get_default(struct request_queue *q) { + if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) + return NULL; + if (q->nr_hw_queues != 1 && !blk_mq_is_sbitmap_shared(q->tag_set->flags)) return NULL; -- cgit From c66fd019713e9cf7d6f1243c378cd177d01fe18a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:40 +0200 Subject: block: make the block holder code optional Move the block holder code into a separate file as it is not in any way related to the other block_dev.c code, and add a new selectable config option for it so that we don't have to build it without any remapped drivers selected. The Kconfig symbol contains a _DEPRECATED suffix to match the comments added in commit 49731baa41df ("block: restore multiple bd_link_disk_holder() support"). 
Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20210804094147.459763-2-hch@lst.de Signed-off-by: Jens Axboe --- block/Kconfig | 4 ++ block/Makefile | 1 + block/holder.c | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 144 insertions(+) create mode 100644 block/holder.c (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 15dfb7660645..bac87d773c54 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -241,4 +241,8 @@ config BLK_MQ_RDMA config BLK_PM def_bool BLOCK && PM +# do not use in new code +config BLOCK_HOLDER_DEPRECATED + bool + source "block/Kconfig.iosched" diff --git a/block/Makefile b/block/Makefile index c72592b4cf31..0d951adce796 100644 --- a/block/Makefile +++ b/block/Makefile @@ -41,3 +41,4 @@ obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o +obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o diff --git a/block/holder.c b/block/holder.c new file mode 100644 index 000000000000..904a1dcd5c12 --- /dev/null +++ b/block/holder.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include + +struct bd_holder_disk { + struct list_head list; + struct gendisk *disk; + int refcnt; +}; + +static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, + struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + list_for_each_entry(holder, &bdev->bd_holder_disks, list) + if (holder->disk == disk) + return holder; + return NULL; +} + +static int add_symlink(struct kobject *from, struct kobject *to) +{ + return sysfs_create_link(from, to, kobject_name(to)); +} + +static void del_symlink(struct kobject *from, struct kobject *to) +{ + sysfs_remove_link(from, kobject_name(to)); +} + +/** + * bd_link_disk_holder - create symlinks between holding disk and slave bdev + * @bdev: the claimed slave 
bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * This function creates the following sysfs symlinks. + * + * - from "slaves" directory of the holder @disk to the claimed @bdev + * - from "holders" directory of the @bdev to the holder @disk + * + * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is + * passed to bd_link_disk_holder(), then: + * + * /sys/block/dm-0/slaves/sda --> /sys/block/sda + * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 + * + * The caller must have claimed @bdev before calling this function and + * ensure that both @bdev and @disk are valid during the creation and + * lifetime of these symlinks. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + struct bd_holder_disk *holder; + int ret = 0; + + mutex_lock(&bdev->bd_disk->open_mutex); + + WARN_ON_ONCE(!bdev->bd_holder); + + /* FIXME: remove the following once add_disk() handles errors */ + if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir)) + goto out_unlock; + + holder = bd_find_holder_disk(bdev, disk); + if (holder) { + holder->refcnt++; + goto out_unlock; + } + + holder = kzalloc(sizeof(*holder), GFP_KERNEL); + if (!holder) { + ret = -ENOMEM; + goto out_unlock; + } + + INIT_LIST_HEAD(&holder->list); + holder->disk = disk; + holder->refcnt = 1; + + ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); + if (ret) + goto out_free; + + ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + if (ret) + goto out_del; + /* + * bdev could be deleted beneath us which would implicitly destroy + * the holder directory. Hold on to it.
+ */ + kobject_get(bdev->bd_holder_dir); + + list_add(&holder->list, &bdev->bd_holder_disks); + goto out_unlock; + +out_del: + del_symlink(disk->slave_dir, bdev_kobj(bdev)); +out_free: + kfree(holder); +out_unlock: + mutex_unlock(&bdev->bd_disk->open_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(bd_link_disk_holder); + +/** + * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() + * @bdev: the claimed slave bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * CONTEXT: + * Might sleep. + */ +void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + mutex_lock(&bdev->bd_disk->open_mutex); + holder = bd_find_holder_disk(bdev, disk); + if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { + del_symlink(disk->slave_dir, bdev_kobj(bdev)); + del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + kobject_put(bdev->bd_holder_dir); + list_del_init(&holder->list); + kfree(holder); + } + mutex_unlock(&bdev->bd_disk->open_mutex); +} +EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); -- cgit From fbd9a39542ecdd2ade55869c13856b2590db3df8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:41 +0200 Subject: block: remove the extra kobject reference in bd_link_disk_holder Since commit 0d02129e76ed ("block: merge struct block_device and struct hd_struct") there is no way for the bdev to go away as long as there is a holder, so remove the extra references.
Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20210804094147.459763-3-hch@lst.de Signed-off-by: Jens Axboe --- block/holder.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'block') diff --git a/block/holder.c b/block/holder.c index 904a1dcd5c12..960654a71342 100644 --- a/block/holder.c +++ b/block/holder.c @@ -92,11 +92,6 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); if (ret) goto out_del; - /* - * bdev could be deleted beneath us which would implicitly destroy - * the holder directory. Hold on to it. - */ - kobject_get(bdev->bd_holder_dir); list_add(&holder->list, &bdev->bd_holder_disks); goto out_unlock; @@ -130,7 +125,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { del_symlink(disk->slave_dir, bdev_kobj(bdev)); del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - kobject_put(bdev->bd_holder_dir); list_del_init(&holder->list); kfree(holder); } -- cgit From 0dbcfe247f22a6d73302dfa691c48b3c14d31c4c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:42 +0200 Subject: block: look up holders by bdev Invert the way the holder relations are tracked. This very slightly reduces the memory overhead for partitioned devices. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210804094147.459763-4-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 4 +++- block/holder.c | 18 +++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index a4817e42f3a3..cd4eab744667 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1289,7 +1289,9 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); inc_diskseq(disk); - +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED + INIT_LIST_HEAD(&disk->slave_bdevs); +#endif return disk; out_destroy_part_tbl: diff --git a/block/holder.c b/block/holder.c index 960654a71342..11e65d99a9fb 100644 --- a/block/holder.c +++ b/block/holder.c @@ -3,7 +3,7 @@ struct bd_holder_disk { struct list_head list; - struct gendisk *disk; + struct block_device *bdev; int refcnt; }; @@ -12,8 +12,8 @@ static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, { struct bd_holder_disk *holder; - list_for_each_entry(holder, &bdev->bd_holder_disks, list) - if (holder->disk == disk) + list_for_each_entry(holder, &disk->slave_bdevs, list) + if (holder->bdev == bdev) return holder; return NULL; } @@ -61,7 +61,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) struct bd_holder_disk *holder; int ret = 0; - mutex_lock(&bdev->bd_disk->open_mutex); + mutex_lock(&disk->open_mutex); WARN_ON_ONCE(!bdev->bd_holder); @@ -82,7 +82,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) } INIT_LIST_HEAD(&holder->list); - holder->disk = disk; + holder->bdev = bdev; holder->refcnt = 1; ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); @@ -93,7 +93,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) if (ret) goto out_del; - list_add(&holder->list, &bdev->bd_holder_disks); + list_add(&holder->list, &disk->slave_bdevs); goto out_unlock; out_del: @@ 
-101,7 +101,7 @@ out_del: out_free: kfree(holder); out_unlock: - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); @@ -120,7 +120,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder; - mutex_lock(&bdev->bd_disk->open_mutex); + mutex_lock(&disk->open_mutex); holder = bd_find_holder_disk(bdev, disk); if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { del_symlink(disk->slave_dir, bdev_kobj(bdev)); @@ -128,6 +128,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) list_del_init(&holder->list); kfree(holder); } - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); -- cgit From d626338735909bc2b2e7cafc332f44ed41cfdeee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:43 +0200 Subject: block: support delayed holder registration device mapper needs to register holders before it is ready to do I/O. Currently it does so by registering the disk early, which can leave the disk and queue in a weird half state where the queue is registered with the disk, except for sysfs and the elevator. And this state has been a bit problematic before, and will get more so when sorting out the responsibilities between the queue and the disk. Support registering holders on an initialized but not registered disk instead by delaying the sysfs registration until the disk is registered. 
Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20210804094147.459763-5-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 10 +++++++++ block/holder.c | 68 +++++++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 61 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index cd4eab744667..db916f779077 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -447,6 +447,16 @@ static void register_disk(struct device *parent, struct gendisk *disk, kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + /* + * XXX: this is a mess, can't wait for real error handling in add_disk. + * Make sure ->slave_dir is NULL if we failed some of the registration + * so that the cleanup in bd_unlink_disk_holder works properly. + */ + if (bd_register_pending_holders(disk) < 0) { + kobject_put(disk->slave_dir); + disk->slave_dir = NULL; + } + if (disk->flags & GENHD_FL_HIDDEN) return; diff --git a/block/holder.c b/block/holder.c index 11e65d99a9fb..4568cc4f6827 100644 --- a/block/holder.c +++ b/block/holder.c @@ -28,6 +28,19 @@ static void del_symlink(struct kobject *from, struct kobject *to) sysfs_remove_link(from, kobject_name(to)); } +static int __link_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + int ret; + + ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); + if (ret) + return ret; + ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + if (ret) + del_symlink(disk->slave_dir, bdev_kobj(bdev)); + return ret; +} + /** * bd_link_disk_holder - create symlinks between holding disk and slave bdev * @bdev: the claimed slave bdev @@ -66,7 +79,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) WARN_ON_ONCE(!bdev->bd_holder); /* FIXME: remove the following once add_disk() handles errors */ - if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir)) + if 
(WARN_ON(!bdev->bd_holder_dir)) goto out_unlock; holder = bd_find_holder_disk(bdev, disk); @@ -84,28 +97,28 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) INIT_LIST_HEAD(&holder->list); holder->bdev = bdev; holder->refcnt = 1; - - ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); - if (ret) - goto out_free; - - ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - if (ret) - goto out_del; + if (disk->slave_dir) { + ret = __link_disk_holder(bdev, disk); + if (ret) { + kfree(holder); + goto out_unlock; + } + } list_add(&holder->list, &disk->slave_bdevs); - goto out_unlock; - -out_del: - del_symlink(disk->slave_dir, bdev_kobj(bdev)); -out_free: - kfree(holder); out_unlock: mutex_unlock(&disk->open_mutex); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); +static void __unlink_disk_holder(struct block_device *bdev, + struct gendisk *disk) +{ + del_symlink(disk->slave_dir, bdev_kobj(bdev)); + del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); +} + /** * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() * @bdev: the calimed slave bdev @@ -123,11 +136,32 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) mutex_lock(&disk->open_mutex); holder = bd_find_holder_disk(bdev, disk); if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { - del_symlink(disk->slave_dir, bdev_kobj(bdev)); - del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + if (disk->slave_dir) + __unlink_disk_holder(bdev, disk); list_del_init(&holder->list); kfree(holder); } mutex_unlock(&disk->open_mutex); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); + +int bd_register_pending_holders(struct gendisk *disk) +{ + struct bd_holder_disk *holder; + int ret; + + mutex_lock(&disk->open_mutex); + list_for_each_entry(holder, &disk->slave_bdevs, list) { + ret = __link_disk_holder(holder->bdev, disk); + if (ret) + goto out_undo; + } + mutex_unlock(&disk->open_mutex); + return 0; + +out_undo: + 
list_for_each_entry_continue_reverse(holder, &disk->slave_bdevs, list) + __unlink_disk_holder(holder->bdev, disk); + mutex_unlock(&disk->open_mutex); + return ret; +} -- cgit From d1254a8749711e0d7441036a74ce592341f89697 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:47 +0200 Subject: block: remove support for delayed queue registrations Now that device mapper has been changed to register the disk once it is fully ready all this code is unused. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20210804094147.459763-9-hch@lst.de Signed-off-by: Jens Axboe --- block/elevator.c | 1 - block/genhd.c | 29 +++++++---------------------- 2 files changed, 7 insertions(+), 23 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index d0295e68f481..9beaafd238e0 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -705,7 +705,6 @@ void elevator_init_mq(struct request_queue *q) elevator_put(e); } } -EXPORT_SYMBOL_GPL(elevator_init_mq); /* only for dm-rq */ /* * switch to new_e io scheduler. be careful not to introduce deadlocks - diff --git a/block/genhd.c b/block/genhd.c index db916f779077..b0b6e0caa389 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -475,20 +475,20 @@ static void register_disk(struct device *parent, struct gendisk *disk, } /** - * __device_add_disk - add disk information to kernel list + * device_add_disk - add disk information to kernel list * @parent: parent device for the disk * @disk: per-device partitioning information * @groups: Additional per-device sysfs groups - * @register_queue: register the queue if set to true * * This function registers the partitioning information in @disk * with the kernel. 
* * FIXME: error handling */ -static void __device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups, - bool register_queue) + +void device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups) + { int ret; @@ -498,8 +498,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, * elevator if one is needed, that is, for devices requesting queue * registration. */ - if (register_queue) - elevator_init_mq(disk->queue); + elevator_init_mq(disk->queue); /* * If the driver provides an explicit major number it also must provide @@ -553,8 +552,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, bdev_add(disk->part0, dev->devt); } register_disk(parent, disk, groups); - if (register_queue) - blk_register_queue(disk); + blk_register_queue(disk); /* * Take an extra ref on queue which will be put on disk_release() @@ -568,21 +566,8 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, disk_add_events(disk); blk_integrity_add(disk); } - -void device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups) - -{ - __device_add_disk(parent, disk, groups, true); -} EXPORT_SYMBOL(device_add_disk); -void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) -{ - __device_add_disk(parent, disk, NULL, false); -} -EXPORT_SYMBOL(device_add_disk_no_queue_reg); - /** * del_gendisk - remove the gendisk * @disk: the struct gendisk to remove -- cgit From 5ed964f8e54eb3191b8b7b45aeb52672a0c995dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 16:17:40 +0200 Subject: mm: hide laptop_mode_wb_timer entirely behind the BDI API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't leak the details of the timer into the block layer, instead initialize the timer in bdi_alloc and delete it in bdi_unregister. 
Note that this means the timer is initialized (but not armed) for non-block queues as well now. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210809141744.1203023-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 04477697ee4b..5897bc37467d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -394,10 +394,7 @@ void blk_cleanup_queue(struct request_queue *q) /* for synchronous bio-based driver finish in-flight integrity i/o */ blk_flush_integrity(); - /* @q won't process any more request, flush async actions */ - del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer); blk_sync_queue(q); - if (queue_is_mq(q)) blk_mq_exit_queue(q); @@ -546,8 +543,6 @@ struct request_queue *blk_alloc_queue(int node_id) atomic_set(&q->nr_active_requests_shared_sbitmap, 0); - timer_setup(&q->backing_dev_info->laptop_mode_wb_timer, - laptop_mode_timer_fn, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, blk_timeout_work); INIT_LIST_HEAD(&q->icq_list); -- cgit From 471aa704db4904f7af5a50019ca3b5b018c0cf62 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 16:17:41 +0200 Subject: block: pass a gendisk to blk_queue_update_readahead .. and rename the function to disk_update_readahead. This is in preparation for moving the BDI from the request_queue to the gendisk. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210809141744.1203023-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 8 +++++--- block/blk-sysfs.c | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 109012719aa0..44aaef9bf736 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -380,8 +380,10 @@ void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) } EXPORT_SYMBOL(blk_queue_alignment_offset); -void blk_queue_update_readahead(struct request_queue *q) +void disk_update_readahead(struct gendisk *disk) { + struct request_queue *q = disk->queue; + /* * For read-ahead of large files to be effective, we need to read ahead * at least twice the optimal I/O size. @@ -391,7 +393,7 @@ void blk_queue_update_readahead(struct request_queue *q) q->backing_dev_info->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9); } -EXPORT_SYMBOL_GPL(blk_queue_update_readahead); +EXPORT_SYMBOL_GPL(disk_update_readahead); /** * blk_limits_io_min - set minimum request size for a device @@ -665,7 +667,7 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, pr_notice("%s: Warning: Device %pg is misaligned\n", disk->disk_name, bdev); - blk_queue_update_readahead(disk->queue); + disk_update_readahead(disk); } EXPORT_SYMBOL(disk_stack_limits); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 370d83c18057..3af2ab7d5086 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -866,7 +866,7 @@ int blk_register_queue(struct gendisk *disk) "%s is registering an already registered queue\n", kobject_name(&dev->kobj)); - blk_queue_update_readahead(q); + disk_update_readahead(disk); ret = blk_trace_init_sysfs(dev); if (ret) -- cgit From edb0872f44ec9976ea6d052cb4b93cd2d23ac2ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 16:17:43 +0200 Subject: 
block: move the bdi from the request_queue to the gendisk The backing device information only makes sense for file system I/O, and thus belongs into the gendisk and not the lower level request_queue structure. Move it there. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210809141744.1203023-5-hch@lst.de Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 4 ++-- block/blk-cgroup.c | 7 +++---- block/blk-core.c | 13 +++---------- block/blk-mq.c | 2 +- block/blk-settings.c | 14 +++++++++----- block/blk-sysfs.c | 26 ++++++++++++-------------- block/blk-wbt.c | 10 +++++----- block/genhd.c | 23 ++++++++++++++--------- 8 files changed, 49 insertions(+), 50 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 727955918563..1576e858d3a5 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5266,8 +5266,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) switch (ioprio_class) { default: pr_err("bdi %s: bfq: bad prio class %d\n", - bdi_dev_name(bfqq->bfqd->queue->backing_dev_info), - ioprio_class); + bdi_dev_name(queue_to_disk(bfqq->bfqd->queue)->bdi), + ioprio_class); fallthrough; case IOPRIO_CLASS_NONE: /* diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 575d7a2e7203..db034e35ae20 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -489,10 +489,9 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, const char *blkg_dev_name(struct blkcg_gq *blkg) { - /* some drivers (floppy) instantiate a queue w/o disk registered */ - if (blkg->q->backing_dev_info->dev) - return bdi_dev_name(blkg->q->backing_dev_info); - return NULL; + if (!queue_has_disk(blkg->q) || !queue_to_disk(blkg->q)->bdi->dev) + return NULL; + return bdi_dev_name(queue_to_disk(blkg->q)->bdi); } /** diff --git a/block/blk-core.c b/block/blk-core.c index 5897bc37467d..0874bc2fcdb4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -14,7 +14,6 @@ */ #include 
#include -#include #include #include #include @@ -531,13 +530,9 @@ struct request_queue *blk_alloc_queue(int node_id) if (ret) goto fail_id; - q->backing_dev_info = bdi_alloc(node_id); - if (!q->backing_dev_info) - goto fail_split; - q->stats = blk_alloc_queue_stats(); if (!q->stats) - goto fail_stats; + goto fail_split; q->node = node_id; @@ -567,7 +562,7 @@ struct request_queue *blk_alloc_queue(int node_id) if (percpu_ref_init(&q->q_usage_counter, blk_queue_usage_counter_release, PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) - goto fail_bdi; + goto fail_stats; if (blkcg_init_queue(q)) goto fail_ref; @@ -580,10 +575,8 @@ struct request_queue *blk_alloc_queue(int node_id) fail_ref: percpu_ref_exit(&q->q_usage_counter); -fail_bdi: - blk_free_queue_stats(q->stats); fail_stats: - bdi_put(q->backing_dev_info); + blk_free_queue_stats(q->stats); fail_split: bioset_exit(&q->bio_split); fail_id: diff --git a/block/blk-mq.c b/block/blk-mq.c index 2c4ac51e54eb..d2725f94491d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -525,7 +525,7 @@ void blk_mq_free_request(struct request *rq) __blk_mq_dec_active_requests(hctx); if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) - laptop_io_completion(q->backing_dev_info); + laptop_io_completion(queue_to_disk(q)->bdi); rq_qos_done(q, rq); diff --git a/block/blk-settings.c b/block/blk-settings.c index 44aaef9bf736..3613d2cc0688 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -140,7 +141,9 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto limits->logical_block_size >> SECTOR_SHIFT); limits->max_sectors = max_sectors; - q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9); + if (!queue_has_disk(q)) + return; + queue_to_disk(q)->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -388,10 +391,9 @@ void disk_update_readahead(struct gendisk *disk) * For 
read-ahead of large files to be effective, we need to read ahead * at least twice the optimal I/O size. */ - q->backing_dev_info->ra_pages = + disk->bdi->ra_pages = max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); - q->backing_dev_info->io_pages = - queue_max_sectors(q) >> (PAGE_SHIFT - 9); + disk->bdi->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL_GPL(disk_update_readahead); @@ -473,7 +475,9 @@ EXPORT_SYMBOL(blk_limits_io_opt); void blk_queue_io_opt(struct request_queue *q, unsigned int opt) { blk_limits_io_opt(&q->limits, opt); - q->backing_dev_info->ra_pages = + if (!queue_has_disk(q)) + return; + queue_to_disk(q)->bdi->ra_pages = max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); } EXPORT_SYMBOL(blk_queue_io_opt); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3af2ab7d5086..1832587dce3a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -88,9 +88,11 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_ra_show(struct request_queue *q, char *page) { - unsigned long ra_kb = q->backing_dev_info->ra_pages << - (PAGE_SHIFT - 10); + unsigned long ra_kb; + if (!queue_has_disk(q)) + return -EINVAL; + ra_kb = queue_to_disk(q)->bdi->ra_pages << (PAGE_SHIFT - 10); return queue_var_show(ra_kb, page); } @@ -98,13 +100,14 @@ static ssize_t queue_ra_store(struct request_queue *q, const char *page, size_t count) { unsigned long ra_kb; - ssize_t ret = queue_var_store(&ra_kb, page, count); + ssize_t ret; + if (!queue_has_disk(q)) + return -EINVAL; + ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; - - q->backing_dev_info->ra_pages = ra_kb >> (PAGE_SHIFT - 10); - + queue_to_disk(q)->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); return ret; } @@ -251,7 +254,9 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) spin_lock_irq(&q->queue_lock); q->limits.max_sectors = max_sectors_kb << 1; - q->backing_dev_info->io_pages = 
max_sectors_kb >> (PAGE_SHIFT - 10); + if (queue_has_disk(q)) + queue_to_disk(q)->bdi->io_pages = + max_sectors_kb >> (PAGE_SHIFT - 10); spin_unlock_irq(&q->queue_lock); return ret; @@ -766,13 +771,6 @@ static void blk_exit_queue(struct request_queue *q) * e.g. blkcg_print_blkgs() to crash. */ blkcg_exit_queue(q); - - /* - * Since the cgroup code may dereference the @q->backing_dev_info - * pointer, only decrease its reference count after having removed the - * association with the block cgroup controller. - */ - bdi_put(q->backing_dev_info); } /** diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 3ed71b8da887..31086afaad9c 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -97,7 +97,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb; + struct bdi_writeback *wb = &queue_to_disk(rwb->rqos.q)->bdi->wb; return time_before(jiffies, wb->dirty_sleep + HZ); } @@ -234,7 +234,7 @@ enum { static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { - struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct backing_dev_info *bdi = queue_to_disk(rwb->rqos.q)->bdi; struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; @@ -287,7 +287,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { - struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct backing_dev_info *bdi = queue_to_disk(rwb->rqos.q)->bdi; struct rq_depth *rqd = &rwb->rq_depth; trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, @@ -359,8 +359,8 @@ static void wb_timer_fn(struct blk_stat_callback *cb) status = latency_exceeded(rwb, cb->stat); - trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step, - inflight); + trace_wbt_timer(queue_to_disk(rwb->rqos.q)->bdi, status, + rqd->scale_step, inflight); /* * If we exceeded the latency 
target, step down. If we did not, diff --git a/block/genhd.c b/block/genhd.c index b0b6e0caa389..f8def1129501 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -466,10 +466,9 @@ static void register_disk(struct device *parent, struct gendisk *disk, dev_set_uevent_suppress(ddev, 0); disk_uevent(disk, KOBJ_ADD); - if (disk->queue->backing_dev_info->dev) { - err = sysfs_create_link(&ddev->kobj, - &disk->queue->backing_dev_info->dev->kobj, - "bdi"); + if (disk->bdi->dev) { + err = sysfs_create_link(&ddev->kobj, &disk->bdi->dev->kobj, + "bdi"); WARN_ON(err); } } @@ -540,15 +539,14 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; disk->flags |= GENHD_FL_NO_PART_SCAN; } else { - struct backing_dev_info *bdi = disk->queue->backing_dev_info; struct device *dev = disk_to_dev(disk); /* Register BDI before referencing it from bdev */ dev->devt = MKDEV(disk->major, disk->first_minor); - ret = bdi_register(bdi, "%u:%u", + ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); WARN_ON(ret); - bdi_set_owner(bdi, dev); + bdi_set_owner(disk->bdi, dev); bdev_add(disk->part0, dev->devt); } register_disk(parent, disk, groups); @@ -615,7 +613,7 @@ void del_gendisk(struct gendisk *disk) * Unregister bdi before releasing device numbers (as they can * get reused and we'd get clashes in sysfs). 
*/ - bdi_unregister(disk->queue->backing_dev_info); + bdi_unregister(disk->bdi); } blk_unregister_queue(disk); @@ -1088,6 +1086,7 @@ static void disk_release(struct device *dev) might_sleep(); + bdi_put(disk->bdi); if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) blk_free_ext_minor(MINOR(dev->devt)); disk_release_events(disk); @@ -1268,9 +1267,13 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) if (!disk) return NULL; + disk->bdi = bdi_alloc(node_id); + if (!disk->bdi) + goto out_free_disk; + disk->part0 = bdev_alloc(disk, 0); if (!disk->part0) - goto out_free_disk; + goto out_free_bdi; disk->node_id = node_id; mutex_init(&disk->open_mutex); @@ -1292,6 +1295,8 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) out_destroy_part_tbl: xa_destroy(&disk->part_tbl); iput(disk->part0->bd_inode); +out_free_bdi: + bdi_put(disk->bdi); out_free_disk: kfree(disk); return NULL; -- cgit From a11d7fc2d05fb509cd9e33d4093507d6eda3ad53 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 16:17:44 +0200 Subject: block: remove the bd_bdi in struct block_device Just retrieve the bdi from the disk. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210809141744.1203023-6-hch@lst.de Signed-off-by: Jens Axboe --- block/ioctl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 0c3a4a53fa11..fff161eaab42 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -506,7 +506,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) return -EACCES; - bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE; + bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKRRPART: return blkdev_reread_part(bdev, mode); @@ -556,7 +556,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKFRAGET: if (!argp) return -EINVAL; - return put_long(argp, (bdev->bd_bdi->ra_pages*PAGE_SIZE) / 512); + return put_long(argp, + (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) @@ -628,7 +629,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) if (!argp) return -EINVAL; return compat_put_long(argp, - (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512); + (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) -- cgit From 866663b7b52d2da267b28e12eed89ee781b8fed1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 29 Jul 2021 11:42:26 +0800 Subject: block: return ELEVATOR_DISCARD_MERGE if possible When merging one bio to request, if they are discard IO and the queue supports multi-range discard, we need to return ELEVATOR_DISCARD_MERGE because both block core and related drivers (nvme, virtio-blk) don't handle mixed discard io merge (traditional IO merge together with discard merge) well. 
Fix the issue by returning ELEVATOR_DISCARD_MERGE in this situation, so both blk-mq and drivers just need to handle multi-range discard. Reported-by: Oleksandr Natalenko Signed-off-by: Ming Lei Tested-by: Oleksandr Natalenko Fixes: 2705dfb20947 ("block: fix discard request merge") Link: https://lore.kernel.org/r/20210729034226.1591070-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 +++ block/blk-merge.c | 16 ---------------- block/elevator.c | 3 +++ block/mq-deadline-main.c | 2 ++ 4 files changed, 8 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 1576e858d3a5..e4a61eda2d0f 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2361,6 +2361,9 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, __rq = bfq_find_rq_fmerge(bfqd, bio, q); if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; + + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; return ELEVATOR_FRONT_MERGE; } diff --git a/block/blk-merge.c b/block/blk-merge.c index a11b3b53717e..f8707ff7e2fc 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -705,22 +705,6 @@ static void blk_account_io_merge_request(struct request *req) } } -/* - * Two cases of handling DISCARD merge: - * If max_discard_segments > 1, the driver takes every bio - * as a range and send them to controller together. The ranges - * needn't to be contiguous. - * Otherwise, the bios/requests will be handled as same as - * others which should be contiguous. 
- */ -static inline bool blk_discard_mergable(struct request *req) -{ - if (req_op(req) == REQ_OP_DISCARD && - queue_max_discard_segments(req->q) > 1) - return true; - return false; -} - static enum elv_merge blk_try_req_merge(struct request *req, struct request *next) { diff --git a/block/elevator.c b/block/elevator.c index 9beaafd238e0..ff45d8388f48 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -336,6 +336,9 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector); if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; + + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; return ELEVATOR_BACK_MERGE; } diff --git a/block/mq-deadline-main.c b/block/mq-deadline-main.c index 6f612e6dc82b..294be0c0db65 100644 --- a/block/mq-deadline-main.c +++ b/block/mq-deadline-main.c @@ -677,6 +677,8 @@ static int dd_request_merge(struct request_queue *q, struct request **rq, if (elv_bio_merge_ok(__rq, bio)) { *rq = __rq; + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; return ELEVATOR_FRONT_MERGE; } } -- cgit From 50b4aecfbbb09869db967e4a26212a47e10c0088 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 08:40:28 +0200 Subject: block: remove GENHD_FL_UP Just check inode_unhashed on the whole device bdev inode instead, and provide a helper to check for that information. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210809064028.1198327-9-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 6 ++---- block/partitions/core.c | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index f8def1129501..9d6b3aeea288 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -77,7 +77,8 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) * initial capacity during probing. 
*/ if (size == capacity || - (disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) + !disk_live(disk) || + (disk->flags & GENHD_FL_HIDDEN)) return false; pr_info("%s: detected capacity change from %lld to %lld\n", @@ -527,8 +528,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_EXT_DEVT; } - disk->flags |= GENHD_FL_UP; - disk_alloc_events(disk); if (disk->flags & GENHD_FL_HIDDEN) { @@ -597,7 +596,6 @@ void del_gendisk(struct gendisk *disk) mutex_lock(&disk->open_mutex); remove_inode_hash(disk->part0->bd_inode); - disk->flags &= ~GENHD_FL_UP; blk_drop_partitions(disk); mutex_unlock(&disk->open_mutex); diff --git a/block/partitions/core.c b/block/partitions/core.c index fb3a556cacce..c6738ccbcee5 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -459,7 +459,7 @@ int bdev_add_partition(struct block_device *bdev, int partno, int ret; mutex_lock(&disk->open_mutex); - if (!(disk->flags & GENHD_FL_UP)) { + if (!disk_live(disk)) { ret = -ENXIO; goto out; } @@ -669,7 +669,7 @@ int bdev_disk_changed(struct gendisk *disk, bool invalidate) lockdep_assert_held(&disk->open_mutex); - if (!(disk->flags & GENHD_FL_UP)) + if (!disk_live(disk)) return -ENXIO; rescan: -- cgit From a08aa9bccdc282b5e8d133bf8c239473f057b464 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 17:45:09 +0200 Subject: block: store a gendisk in struct parsed_partitions Partition scanning only happens on the whole device, so pass a struct gendisk instead of the whole device block_device to the scanners. This allows to simplify printing the device name in various places as the disk name is available in disk->name. 
Signed-off-by: Christoph Hellwig Reviewed-by: Stefan Haberland Link: https://lore.kernel.org/r/20210810154512.1809898-2-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/acorn.c | 4 ++-- block/partitions/aix.c | 20 ++------------------ block/partitions/amiga.c | 7 +++---- block/partitions/atari.c | 4 ++-- block/partitions/check.h | 2 +- block/partitions/cmdline.c | 6 ++---- block/partitions/core.c | 6 +++--- block/partitions/efi.c | 36 +++++++++++++++++------------------- block/partitions/ibm.c | 4 ++-- block/partitions/ldm.c | 18 +++++++++--------- block/partitions/mac.c | 2 +- block/partitions/msdos.c | 6 ++++-- block/partitions/sgi.c | 5 ++--- block/partitions/sun.c | 5 ++--- 14 files changed, 52 insertions(+), 73 deletions(-) (limited to 'block') diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c index c64c57b958bf..2c381c694c57 100644 --- a/block/partitions/acorn.c +++ b/block/partitions/acorn.c @@ -275,7 +275,7 @@ int adfspart_check_ADFS(struct parsed_partitions *state) /* * Work out start of non-adfs partition. */ - nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect; + nr_sects = get_capacity(state->disk) - start_sect; if (start_sect) { switch (id) { @@ -540,7 +540,7 @@ int adfspart_check_EESOX(struct parsed_partitions *state) if (i != 0) { sector_t size; - size = get_capacity(state->bdev->bd_disk); + size = get_capacity(state->disk); put_partition(state, slot++, start, size - start); strlcat(state->pp_buf, "\n", PAGE_SIZE); } diff --git a/block/partitions/aix.c b/block/partitions/aix.c index c7b4fd1a4a97..85f4b967565e 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -66,22 +66,6 @@ struct pvd { #define LVM_MAXLVS 256 -/** - * last_lba(): return number of last logical block of device - * @bdev: block device - * - * Description: Returns last LBA value on success, 0 on error. 
- * This is stored (by sd and ide-geometry) in - * the part[0] entry for this disk, and is the number of - * physical sectors available on the disk. - */ -static u64 last_lba(struct block_device *bdev) -{ - if (!bdev || !bdev->bd_inode) - return 0; - return (bdev->bd_inode->i_size >> 9) - 1ULL; -} - /** * read_lba(): Read bytes from disk, starting at given LBA * @state @@ -89,7 +73,7 @@ static u64 last_lba(struct block_device *bdev) * @buffer * @count * - * Description: Reads @count bytes from @state->bdev into @buffer. + * Description: Reads @count bytes from @state->disk into @buffer. * Returns number of bytes read on success, 0 on error. */ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, @@ -97,7 +81,7 @@ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, { size_t totalreadcount = 0; - if (!buffer || lba + count / 512 > last_lba(state->bdev)) + if (!buffer || lba + count / 512 > get_capacity(state->disk) - 1ULL) return 0; while (count) { diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c index 9526491d9aed..5c8624e26a54 100644 --- a/block/partitions/amiga.c +++ b/block/partitions/amiga.c @@ -34,7 +34,6 @@ int amiga_partition(struct parsed_partitions *state) int start_sect, nr_sects, blk, part, res = 0; int blksize = 1; /* Multiplier for disk block size */ int slot = 1; - char b[BDEVNAME_SIZE]; for (blk = 0; ; blk++, put_dev_sector(sect)) { if (blk == RDB_ALLOCATION_LIMIT) @@ -42,7 +41,7 @@ int amiga_partition(struct parsed_partitions *state) data = read_part_sector(state, blk, &sect); if (!data) { pr_err("Dev %s: unable to read RDB block %d\n", - bdevname(state->bdev, b), blk); + state->disk->disk_name, blk); res = -1; goto rdb_done; } @@ -64,7 +63,7 @@ int amiga_partition(struct parsed_partitions *state) } pr_err("Dev %s: RDB in block %d has bad checksum\n", - bdevname(state->bdev, b), blk); + state->disk->disk_name, blk); } /* blksize is blocks per 512 byte standard block */ @@ -84,7 +83,7 @@ int
amiga_partition(struct parsed_partitions *state) data = read_part_sector(state, blk, &sect); if (!data) { pr_err("Dev %s: unable to read partition block %d\n", - bdevname(state->bdev, b), blk); + state->disk->disk_name, blk); res = -1; goto rdb_done; } diff --git a/block/partitions/atari.c b/block/partitions/atari.c index 2305840c8522..da5994175416 100644 --- a/block/partitions/atari.c +++ b/block/partitions/atari.c @@ -47,7 +47,7 @@ int atari_partition(struct parsed_partitions *state) * ATARI partition scheme supports 512 lba only. If this is not * the case, bail early to avoid miscalculating hd_size. */ - if (bdev_logical_block_size(state->bdev) != 512) + if (queue_logical_block_size(state->disk->queue) != 512) return 0; rs = read_part_sector(state, 0, &sect); @@ -55,7 +55,7 @@ int atari_partition(struct parsed_partitions *state) return -1; /* Verify this is an Atari rootsector: */ - hd_size = state->bdev->bd_inode->i_size >> 9; + hd_size = get_capacity(state->disk); if (!VALID_PARTITION(&rs->part[0], hd_size) && !VALID_PARTITION(&rs->part[1], hd_size) && !VALID_PARTITION(&rs->part[2], hd_size) && diff --git a/block/partitions/check.h b/block/partitions/check.h index c577e9ee67f0..d5b28e309d64 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -9,7 +9,7 @@ * description.
*/ struct parsed_partitions { - struct block_device *bdev; + struct gendisk *disk; char name[BDEVNAME_SIZE]; struct { sector_t from; diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 482a29e95dbd..1af610f0ba8c 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -380,7 +380,6 @@ static void cmdline_parts_verifier(int slot, struct parsed_partitions *state) int cmdline_partition(struct parsed_partitions *state) { sector_t disk_size; - char bdev[BDEVNAME_SIZE]; struct cmdline_parts *parts; if (cmdline) { @@ -397,12 +396,11 @@ int cmdline_partition(struct parsed_partitions *state) if (!bdev_parts) return 0; - bdevname(state->bdev, bdev); - parts = cmdline_parts_find(bdev_parts, bdev); + parts = cmdline_parts_find(bdev_parts, state->disk->disk_name); if (!parts) return 0; - disk_size = get_capacity(state->bdev->bd_disk) << 9; + disk_size = get_capacity(state->disk) << 9; cmdline_parts_set(parts, disk_size, state); cmdline_parts_verifier(1, state); diff --git a/block/partitions/core.c b/block/partitions/core.c index c6738ccbcee5..5dd1cd1a163d 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -135,7 +135,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd) } state->pp_buf[0] = '\0'; - state->bdev = hd->part0; + state->disk = hd; snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name); snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) @@ -717,10 +717,10 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed); void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { - struct address_space *mapping = state->bdev->bd_inode->i_mapping; + struct address_space *mapping = state->disk->part0->bd_inode->i_mapping; struct page *page; - if (n >= get_capacity(state->bdev->bd_disk)) { + if (n >= get_capacity(state->disk)) { state->access_beyond_eod = true; return NULL; } diff --git a/block/partitions/efi.c b/block/partitions/efi.c 
index e2716792ecc1..aaa3dc487cb5 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -124,19 +124,17 @@ efi_crc32(const void *buf, unsigned long len) /** * last_lba(): return number of last logical block of device - * @bdev: block device + * @disk: block device * * Description: Returns last LBA value on success, 0 on error. * This is stored (by sd and ide-geometry) in * the part[0] entry for this disk, and is the number of * physical sectors available on the disk. */ -static u64 last_lba(struct block_device *bdev) +static u64 last_lba(struct gendisk *disk) { - if (!bdev || !bdev->bd_inode) - return 0; - return div_u64(bdev->bd_inode->i_size, - bdev_logical_block_size(bdev)) - 1ULL; + return div_u64(disk->part0->bd_inode->i_size, + queue_logical_block_size(disk->queue)) - 1ULL; } static inline int pmbr_part_valid(gpt_mbr_record *part) @@ -231,17 +229,17 @@ done: * @buffer: destination buffer * @count: bytes to read * - * Description: Reads @count bytes from @state->bdev into @buffer. + * Description: Reads @count bytes from @state->disk into @buffer. * Returns number of bytes read on success, 0 on error. */ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, size_t count) { size_t totalreadcount = 0; - struct block_device *bdev = state->bdev; - sector_t n = lba * (bdev_logical_block_size(bdev) / 512); + sector_t n = lba * + (queue_logical_block_size(state->disk->queue) / 512); - if (!buffer || lba > last_lba(bdev)) + if (!buffer || lba > last_lba(state->disk)) return 0; while (count) { @@ -302,14 +300,14 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, * @lba: the Logical Block Address of the partition table * * Description: returns GPT header on success, NULL on error. Allocates - * and fills a GPT header starting at @ from @state->bdev. + * and fills a GPT header starting at @ from @state->disk. * Note: remember to free gpt when finished with it. 
*/ static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, u64 lba) { gpt_header *gpt; - unsigned ssz = bdev_logical_block_size(state->bdev); + unsigned ssz = queue_logical_block_size(state->disk->queue); gpt = kmalloc(ssz, GFP_KERNEL); if (!gpt) @@ -356,10 +354,10 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, /* Check the GUID Partition Table header size is too big */ if (le32_to_cpu((*gpt)->header_size) > - bdev_logical_block_size(state->bdev)) { + queue_logical_block_size(state->disk->queue)) { pr_debug("GUID Partition Table Header size is too large: %u > %u\n", le32_to_cpu((*gpt)->header_size), - bdev_logical_block_size(state->bdev)); + queue_logical_block_size(state->disk->queue)); goto fail; } @@ -395,7 +393,7 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, /* Check the first_usable_lba and last_usable_lba are * within the disk. */ - lastlba = last_lba(state->bdev); + lastlba = last_lba(state->disk); if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), @@ -587,13 +585,13 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; - sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9; + sector_t total_sectors = get_capacity(state->disk); u64 lastlba; if (!ptes) return 0; - lastlba = last_lba(state->bdev); + lastlba = last_lba(state->disk); if (!force_gpt) { /* This will be added to the EFI Spec. per Intel after v1.02. 
*/ legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL); @@ -705,7 +703,7 @@ int efi_partition(struct parsed_partitions *state) gpt_header *gpt = NULL; gpt_entry *ptes = NULL; u32 i; - unsigned ssz = bdev_logical_block_size(state->bdev) / 512; + unsigned ssz = queue_logical_block_size(state->disk->queue) / 512; if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); @@ -722,7 +720,7 @@ int efi_partition(struct parsed_partitions *state) u64 size = le64_to_cpu(ptes[i].ending_lba) - le64_to_cpu(ptes[i].starting_lba) + 1ULL; - if (!is_pte_valid(&ptes[i], last_lba(state->bdev))) + if (!is_pte_valid(&ptes[i], last_lba(state->disk))) continue; put_partition(state, i+1, start * ssz, size * ssz); diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 4b044e620d35..9bca396aef4a 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -290,8 +290,8 @@ static int find_cms1_partitions(struct parsed_partitions *state, int ibm_partition(struct parsed_partitions *state) { int (*fn)(struct gendisk *disk, dasd_information2_t *info); - struct block_device *bdev = state->bdev; - struct gendisk *disk = bdev->bd_disk; + struct gendisk *disk = state->disk; + struct block_device *bdev = disk->part0; int blocksize, res; loff_t i_size, offset, size; dasd_information2_t *info; diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index cc86534c80ad..a6f0c9eaebe9 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -304,7 +304,7 @@ static bool ldm_validate_privheads(struct parsed_partitions *state, } } - num_sects = state->bdev->bd_inode->i_size >> 9; + num_sects = get_capacity(state->disk); if ((ph[0]->config_start > num_sects) || ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { @@ -339,11 +339,11 @@ out: /** * ldm_validate_tocblocks - Validate the table of contents and its backups * @state: Partition check state including device holding the LDM Database - * @base: Offset, into @state->bdev, of the database + * @base: 
Offset, into @state->disk, of the database * @ldb: Cache of the database structures * * Find and compare the four tables of contents of the LDM Database stored on - * @state->bdev and return the parsed information into @toc1. + * @state->disk and return the parsed information into @toc1. * * The offsets and sizes of the configs are range-checked against a privhead. * @@ -486,8 +486,8 @@ out: * only likely to happen if the underlying device is strange. If that IS * the case we should return zero to let someone else try. * - * Return: 'true' @state->bdev is a dynamic disk - * 'false' @state->bdev is not a dynamic disk, or an error occurred + * Return: 'true' @state->disk is a dynamic disk + * 'false' @state->disk is not a dynamic disk, or an error occurred */ static bool ldm_validate_partition_table(struct parsed_partitions *state) { @@ -1340,7 +1340,7 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) /** * ldm_get_vblks - Read the on-disk database of VBLKs into memory * @state: Partition check state including device holding the LDM Database - * @base: Offset, into @state->bdev, of the database + * @base: Offset, into @state->disk, of the database * @ldb: Cache of the database structures * * To use the information from the VBLKs, they need to be read from the disk, @@ -1432,10 +1432,10 @@ static void ldm_free_vblks (struct list_head *lh) * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, * and so on: the actual data containing partitions. 
* - * Return: 1 Success, @state->bdev is a dynamic disk and we handled it - * 0 Success, @state->bdev is not a dynamic disk + * Return: 1 Success, @state->disk is a dynamic disk and we handled it + * 0 Success, @state->disk is not a dynamic disk * -1 An error occurred before enough information had been read - * Or @state->bdev is a dynamic disk, but it may be corrupted + * Or @state->disk is a dynamic disk, but it may be corrupted */ int ldm_partition(struct parsed_partitions *state) { diff --git a/block/partitions/mac.c b/block/partitions/mac.c index b6095335636c..7b521df00a39 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -133,7 +133,7 @@ int mac_partition(struct parsed_partitions *state) } #ifdef CONFIG_PPC_PMAC if (found_root_goodness) - note_bootable_part(state->bdev->bd_dev, found_root, + note_bootable_part(state->disk->part0->bd_dev, found_root, found_root_goodness); #endif diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index f5102596a984..b5d5c229cc3b 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -135,11 +135,12 @@ static void parse_extended(struct parsed_partitions *state, Sector sect; unsigned char *data; sector_t this_sector, this_size; - sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + sector_t sector_size; int loopct = 0; /* number of links followed without finding a data partition */ int i; + sector_size = queue_logical_block_size(state->disk->queue) / 512; this_sector = first_sector; this_size = first_size; @@ -579,7 +580,7 @@ static struct { int msdos_partition(struct parsed_partitions *state) { - sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + sector_t sector_size; Sector sect; unsigned char *data; struct msdos_partition *p; @@ -587,6 +588,7 @@ int msdos_partition(struct parsed_partitions *state) int slot; u32 disksig; + sector_size = queue_logical_block_size(state->disk->queue) / 512; data = read_part_sector(state, 0, &sect); if (!data) return -1;
diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c index 4273f1bb0515..9cc6b8c1eea4 100644 --- a/block/partitions/sgi.c +++ b/block/partitions/sgi.c @@ -43,7 +43,6 @@ int sgi_partition(struct parsed_partitions *state) Sector sect; struct sgi_disklabel *label; struct sgi_partition *p; - char b[BDEVNAME_SIZE]; label = read_part_sector(state, 0, &sect); if (!label) @@ -52,7 +51,7 @@ int sgi_partition(struct parsed_partitions *state) magic = label->magic_mushroom; if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { /*printk("Dev %s SGI disklabel: bad magic %08x\n", - bdevname(bdev, b), be32_to_cpu(magic));*/ + state->disk->disk_name, be32_to_cpu(magic));*/ put_dev_sector(sect); return 0; } @@ -63,7 +62,7 @@ int sgi_partition(struct parsed_partitions *state) } if(csum) { printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", - bdevname(state->bdev, b)); + state->disk->disk_name); put_dev_sector(sect); return 0; } diff --git a/block/partitions/sun.c b/block/partitions/sun.c index 47dc53eccf77..ddf9e6def4b2 100644 --- a/block/partitions/sun.c +++ b/block/partitions/sun.c @@ -65,7 +65,6 @@ int sun_partition(struct parsed_partitions *state) } * label; struct sun_partition *p; unsigned long spc; - char b[BDEVNAME_SIZE]; int use_vtoc; int nparts; @@ -76,7 +75,7 @@ int sun_partition(struct parsed_partitions *state) p = label->partitions; if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { /* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", - bdevname(bdev, b), be16_to_cpu(label->magic)); */ + state->disk->disk_name, be16_to_cpu(label->magic)); */ put_dev_sector(sect); return 0; } @@ -86,7 +85,7 @@ int sun_partition(struct parsed_partitions *state) csum ^= *ush--; if (csum) { printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", - bdevname(state->bdev, b)); + state->disk->disk_name); put_dev_sector(sect); return 0; } -- cgit From 7f6be3765e113e0d4b8e6b65e1074982de94377e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021
17:45:10 +0200 Subject: block: pass a gendisk to bdev_add_partition bdev_add_partition can only operate on the whole device. Make that clear by passing a gendisk instead of a block_device. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210810154512.1809898-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 4 ++-- block/ioctl.c | 3 ++- block/partitions/core.c | 5 ++--- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index 56f33fbcde59..c0486f609978 100644 --- a/block/blk.h +++ b/block/blk.h @@ -347,8 +347,8 @@ void blk_free_ext_minor(unsigned int minor); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 -int bdev_add_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length); +int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length); int bdev_del_partition(struct block_device *bdev, int partno); int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); diff --git a/block/ioctl.c b/block/ioctl.c index fff161eaab42..36e0ec76b3b2 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -16,6 +16,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition __user *upart, int op) { + struct gendisk *disk = bdev->bd_disk; struct blkpg_partition p; long long start, length; @@ -40,7 +41,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, /* check if partition is aligned to blocksize */ if (p.start & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - return bdev_add_partition(bdev, p.pno, start, length); + return bdev_add_partition(disk, p.pno, start, length); case BLKPG_RESIZE_PARTITION: return bdev_resize_partition(bdev, p.pno, start, length); default: diff --git a/block/partitions/core.c b/block/partitions/core.c index 5dd1cd1a163d..7b227c114297 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -451,11 
+451,10 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, return overlap; } -int bdev_add_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length) +int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length) { struct block_device *part; - struct gendisk *disk = bdev->bd_disk; int ret; mutex_lock(&disk->open_mutex); -- cgit From 926fbb1677e0d963dd96dae3c0305e855590d524 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 17:45:11 +0200 Subject: block: pass a gendisk to bdev_del_partition bdev_del_partition can only operate on the whole device. Make that clear by passing a gendisk instead of a block_device. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210810154512.1809898-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 2 +- block/ioctl.c | 2 +- block/partitions/core.c | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index c0486f609978..21c441eb6773 100644 --- a/block/blk.h +++ b/block/blk.h @@ -349,7 +349,7 @@ void blk_free_ext_minor(unsigned int minor); #define ADDPART_FLAG_WHOLEDISK 2 int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); -int bdev_del_partition(struct block_device *bdev, int partno); +int bdev_del_partition(struct gendisk *disk, int partno); int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); diff --git a/block/ioctl.c b/block/ioctl.c index 36e0ec76b3b2..8f57b276b2f1 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -31,7 +31,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, return -EINVAL; if (op == BLKPG_DEL_PARTITION) - return bdev_del_partition(bdev, p.pno); + return bdev_del_partition(disk, p.pno); start = p.start >> SECTOR_SHIFT; length = p.length >> SECTOR_SHIFT; diff --git a/block/partitions/core.c b/block/partitions/core.c index 
7b227c114297..8c7abf0ee0ea 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -476,13 +476,13 @@ out: return ret; } -int bdev_del_partition(struct block_device *bdev, int partno) +int bdev_del_partition(struct gendisk *disk, int partno) { struct block_device *part = NULL; int ret = -ENXIO; - mutex_lock(&bdev->bd_disk->open_mutex); - part = xa_load(&bdev->bd_disk->part_tbl, partno); + mutex_lock(&disk->open_mutex); + part = xa_load(&disk->part_tbl, partno); if (!part) goto out_unlock; @@ -493,7 +493,7 @@ int bdev_del_partition(struct block_device *bdev, int partno) delete_partition(part); ret = 0; out_unlock: - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); return ret; } -- cgit From 3d2e79894bd7adc7d14638a0c72ceb8b722d1fa3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 17:45:12 +0200 Subject: block: pass a gendisk to bdev_resize_partition bdev_resize_partition can only operate on the whole device. Make that clear by passing a gendisk instead of a block_device. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210810154512.1809898-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 4 ++-- block/ioctl.c | 2 +- block/partitions/core.c | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index 21c441eb6773..db6f82bbb683 100644 --- a/block/blk.h +++ b/block/blk.h @@ -350,8 +350,8 @@ void blk_free_ext_minor(unsigned int minor); int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); int bdev_del_partition(struct gendisk *disk, int partno); -int bdev_resize_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length); +int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length); int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, diff --git a/block/ioctl.c b/block/ioctl.c index 8f57b276b2f1..eb0491e90b9a 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -43,7 +43,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, return -EINVAL; return bdev_add_partition(disk, p.pno, start, length); case BLKPG_RESIZE_PARTITION: - return bdev_resize_partition(bdev, p.pno, start, length); + return bdev_resize_partition(disk, p.pno, start, length); default: return -EINVAL; } diff --git a/block/partitions/core.c b/block/partitions/core.c index 8c7abf0ee0ea..9265936df77e 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -497,14 +497,14 @@ out_unlock: return ret; } -int bdev_resize_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length) +int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length) { struct block_device *part = NULL; int ret = -ENXIO; - mutex_lock(&bdev->bd_disk->open_mutex); - part = xa_load(&bdev->bd_disk->part_tbl, partno); + mutex_lock(&disk->open_mutex); + part = xa_load(&disk->part_tbl, 
partno); if (!part) goto out_unlock; @@ -513,14 +513,14 @@ int bdev_resize_partition(struct block_device *bdev, int partno, goto out_unlock; ret = -EBUSY; - if (partition_overlaps(bdev->bd_disk, start, length, partno)) + if (partition_overlaps(disk, start, length, partno)) goto out_unlock; bdev_set_nr_sectors(part, length); ret = 0; out_unlock: - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); return ret; } -- cgit From 4f1e9630afe6332de7286820fedd019f19eac057 Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Mon, 2 Aug 2021 11:51:56 +0800 Subject: blk-throtl: optimize IOPS throttle for large IO scenarios After patch 54efd50 (block: make generic_make_request handle arbitrarily sized bios), the IO through io-throttle may be larger, and these IOs may be further split into more small IOs. However, IOPS throttle does not seem to be aware of this change, which makes the calculation of IOPS of large IOs incomplete, resulting in disk-side IOPS that does not meet expectations. Maybe we should fix this problem. We can reproduce it by setting max_sectors_kb of the disk to 128, setting blkio.write_iops_throttle to 100, running a dd instance inside blkio and using iostat to watch IOPS: dd if=/dev/zero of=/dev/sdb bs=1M count=1000 oflag=direct As a result, without this change the average IOPS is 1995, with this change the IOPS is 98.
Signed-off-by: Chunguang Xu Acked-by: Tejun Heo Link: https://lore.kernel.org/r/65869aaad05475797d63b4c3fed4f529febe3c26.1627876014.git.brookxu@tencent.com Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 ++ block/blk-throttle.c | 32 ++++++++++++++++++++++++++++++++ block/blk.h | 2 ++ 3 files changed, 36 insertions(+) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index f8707ff7e2fc..eeba8422ae82 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -348,6 +348,8 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) trace_block_split(split, (*bio)->bi_iter.bi_sector); submit_bio_noacct(*bio); *bio = split; + + blk_throtl_charge_bio_split(*bio); } } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b1b22d863bdf..55c49015e533 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -178,6 +178,9 @@ struct throtl_grp { unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ unsigned long bio_cnt_reset_time; + atomic_t io_split_cnt[2]; + atomic_t last_io_split_cnt[2]; + struct blkg_rwstat stat_bytes; struct blkg_rwstat stat_ios; }; @@ -777,6 +780,8 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; + atomic_set(&tg->io_split_cnt[rw], 0); + /* * Previous slice has expired. We must have trimmed it after last * bio dispatch. That means since start of last slice, we never used @@ -799,6 +804,9 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; + + atomic_set(&tg->io_split_cnt[rw], 0); + throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", rw == READ ? 
'R' : 'W', tg->slice_start[rw], @@ -1031,6 +1039,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, jiffies + tg->td->throtl_slice); } + if (iops_limit != UINT_MAX) + tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0); + if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) && tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) { if (wait) @@ -2052,12 +2063,14 @@ static void throtl_downgrade_check(struct throtl_grp *tg) } if (tg->iops[READ][LIMIT_LOW]) { + tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0); iops = tg->last_io_disp[READ] * HZ / elapsed_time; if (iops >= tg->iops[READ][LIMIT_LOW]) tg->last_low_overflow_time[READ] = now; } if (tg->iops[WRITE][LIMIT_LOW]) { + tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0); iops = tg->last_io_disp[WRITE] * HZ / elapsed_time; if (iops >= tg->iops[WRITE][LIMIT_LOW]) tg->last_low_overflow_time[WRITE] = now; @@ -2176,6 +2189,25 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) } #endif +void blk_throtl_charge_bio_split(struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + struct throtl_grp *parent = blkg_to_tg(blkg); + struct throtl_service_queue *parent_sq; + bool rw = bio_data_dir(bio); + + do { + if (!parent->has_rules[rw]) + break; + + atomic_inc(&parent->io_split_cnt[rw]); + atomic_inc(&parent->last_io_split_cnt[rw]); + + parent_sq = parent->service_queue.parent_sq; + parent = sq_to_tg(parent_sq); + } while (parent); +} + bool blk_throtl_bio(struct bio *bio) { struct request_queue *q = bio->bi_bdev->bd_disk->queue; diff --git a/block/blk.h b/block/blk.h index db6f82bbb683..148bdcd3aa08 100644 --- a/block/blk.h +++ b/block/blk.h @@ -293,11 +293,13 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); extern void blk_throtl_register_queue(struct request_queue *q); +extern 
void blk_throtl_charge_bio_split(struct bio *bio); bool blk_throtl_bio(struct bio *bio); #else /* CONFIG_BLK_DEV_THROTTLING */ static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } static inline void blk_throtl_register_queue(struct request_queue *q) { } +static inline void blk_throtl_charge_bio_split(struct bio *bio) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } #endif /* CONFIG_BLK_DEV_THROTTLING */ #ifdef CONFIG_BLK_DEV_THROTTLING_LOW -- cgit From 9451aa0aacaf7ea13d1acfd5de8b63a6e0b24fac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 14:26:13 +0200 Subject: block: free the extended dev_t minor later The dev_t is used as the inode hash, so we should only release it once the block device inode is gone from the inode cache. Move it to bdev_free_inode to ensure that. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816122614.601358-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 -- block/partitions/core.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 9d6b3aeea288..ed58ddf6258b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1085,8 +1085,6 @@ static void disk_release(struct device *dev) might_sleep(); bdi_put(disk->bdi); - if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) - blk_free_ext_minor(MINOR(dev->devt)); disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); diff --git a/block/partitions/core.c b/block/partitions/core.c index 9265936df77e..58c4c362c94f 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -259,8 +259,6 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { - if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) - blk_free_ext_minor(MINOR(dev->devt)); put_disk(dev_to_bdev(dev)->bd_disk); iput(dev_to_bdev(dev)->bd_inode); } -- cgit From 
889c05cc5834a1eef2dbe1e639cfd7a81c4f4c6d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 14:26:14 +0200 Subject: block: ensure the bdi is freed after inode_detach_wb inode_detach_wb references the "main" bdi of the inode. With the recent change to move the bdi from the request_queue to the gendisk this causes a guaranteed use after free when using certain cgroup configurations. The bug itself is older though, as any non-default inode reference (e.g. an open file descriptor) could have injected this use after free even before that. Fixes: 52ebea749aae ("writeback: make backing_dev_info host cgroup-specific bdi_writebacks") Reported-by: Qian Cai Reported-by: syzbot Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816122614.601358-3-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index ed58ddf6258b..731a46063132 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1084,7 +1084,6 @@ static void disk_release(struct device *dev) might_sleep(); - bdi_put(disk->bdi); disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); -- cgit From b93ef45350c0119ddc275601438c89231b198414 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:21 +0200 Subject: block: use bvec_virt in bio_integrity_{process,free} Use the bvec_virt helper to clean up the bio integrity processing a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Acked-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20210804095634.460779-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 8f54d49dc500..6b47cddbbca1 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -104,8 +104,7 @@ void bio_integrity_free(struct bio *bio) struct bio_set *bs = bio->bi_pool; if (bip->bip_flags & BIP_BLOCK_INTEGRITY) - kfree(page_address(bip->bip_vec->bv_page) + - bip->bip_vec->bv_offset); + kfree(bvec_virt(bip->bip_vec)); __bio_integrity_free(bs, bip); bio->bi_integrity = NULL; @@ -163,13 +162,11 @@ static blk_status_t bio_integrity_process(struct bio *bio, struct bio_vec bv; struct bio_integrity_payload *bip = bio_integrity(bio); blk_status_t ret = BLK_STS_OK; - void *prot_buf = page_address(bip->bip_vec->bv_page) + - bip->bip_vec->bv_offset; iter.disk_name = bio->bi_bdev->bd_disk->disk_name; iter.interval = 1 << bi->interval_exp; iter.seed = proc_iter->bi_sector; - iter.prot_buf = prot_buf; + iter.prot_buf = bvec_virt(bip->bip_vec); __bio_for_each_segment(bv, bio, bviter, *proc_iter) { void *kaddr = bvec_kmap_local(&bv); -- cgit From 49cb5168a7c6abf9835f9acdce6263bc2deefeb6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 17:26:22 +0200 Subject: blk-cgroup: refactor blkcg_print_stat Factor out a helper to deal with a single blkcg_gq to make the code a little bit easier to follow. 
Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20210810152623.1796144-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 148 ++++++++++++++++++++++++++--------------------------- 1 file changed, 74 insertions(+), 74 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index db034e35ae20..52aa0540ccaf 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -870,97 +870,97 @@ static void blkcg_fill_root_iostats(void) } } -static int blkcg_print_stat(struct seq_file *sf, void *v) +static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) { - struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); - struct blkcg_gq *blkg; - - if (!seq_css(sf)->parent) - blkcg_fill_root_iostats(); - else - cgroup_rstat_flush(blkcg->css.cgroup); - - rcu_read_lock(); + struct blkg_iostat_set *bis = &blkg->iostat; + u64 rbytes, wbytes, rios, wios, dbytes, dios; + bool has_stats = false; + const char *dname; + unsigned seq; + char *buf; + size_t size = seq_get_buf(s, &buf), off = 0; + int i; - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - struct blkg_iostat_set *bis = &blkg->iostat; - const char *dname; - char *buf; - u64 rbytes, wbytes, rios, wios, dbytes, dios; - size_t size = seq_get_buf(sf, &buf), off = 0; - int i; - bool has_stats = false; - unsigned seq; + if (!blkg->online) + return; - spin_lock_irq(&blkg->q->queue_lock); + dname = blkg_dev_name(blkg); + if (!dname) + return; - if (!blkg->online) - goto skip; + /* + * Hooray string manipulation, count is the size written NOT + * INCLUDING THE \0, so size is now count+1 less than what we + * had before, but we want to start writing the next bit from + * the \0 so we only add count to buf. 
+ */ + off += scnprintf(buf+off, size-off, "%s ", dname); - dname = blkg_dev_name(blkg); - if (!dname) - goto skip; + do { + seq = u64_stats_fetch_begin(&bis->sync); + + rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; + wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; + dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; + rios = bis->cur.ios[BLKG_IOSTAT_READ]; + wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; + dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; + } while (u64_stats_fetch_retry(&bis->sync, seq)); + + if (rbytes || wbytes || rios || wios) { + has_stats = true; + off += scnprintf(buf+off, size-off, + "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", + rbytes, wbytes, rios, wios, + dbytes, dios); + } - /* - * Hooray string manipulation, count is the size written NOT - * INCLUDING THE \0, so size is now count+1 less than what we - * had before, but we want to start writing the next bit from - * the \0 so we only add count to buf. - */ - off += scnprintf(buf+off, size-off, "%s ", dname); + if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { + has_stats = true; + off += scnprintf(buf+off, size-off, " use_delay=%d delay_nsec=%llu", + atomic_read(&blkg->use_delay), + atomic64_read(&blkg->delay_nsec)); + } - do { - seq = u64_stats_fetch_begin(&bis->sync); + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + size_t written; - rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; - wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; - dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; - rios = bis->cur.ios[BLKG_IOSTAT_READ]; - wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; - dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; - } while (u64_stats_fetch_retry(&bis->sync, seq)); + if (!blkg->pd[i] || !pol->pd_stat_fn) + continue; - if (rbytes || wbytes || rios || wios) { + written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off); + if (written) has_stats = true; - off += scnprintf(buf+off, size-off, - "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", - 
rbytes, wbytes, rios, wios, - dbytes, dios); - } + off += written; + } - if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { - has_stats = true; - off += scnprintf(buf+off, size-off, - " use_delay=%d delay_nsec=%llu", - atomic_read(&blkg->use_delay), - (unsigned long long)atomic64_read(&blkg->delay_nsec)); + if (has_stats) { + if (off < size - 1) { + off += scnprintf(buf+off, size-off, "\n"); + seq_commit(s, off); + } else { + seq_commit(s, -1); } + } +} - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - size_t written; - - if (!blkg->pd[i] || !pol->pd_stat_fn) - continue; +static int blkcg_print_stat(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct blkcg_gq *blkg; - written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off); - if (written) - has_stats = true; - off += written; - } + if (!seq_css(sf)->parent) + blkcg_fill_root_iostats(); + else + cgroup_rstat_flush(blkcg->css.cgroup); - if (has_stats) { - if (off < size - 1) { - off += scnprintf(buf+off, size-off, "\n"); - seq_commit(sf, off); - } else { - seq_commit(sf, -1); - } - } - skip: + rcu_read_lock(); + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + spin_lock_irq(&blkg->q->queue_lock); + blkcg_print_one_stat(blkg, sf); spin_unlock_irq(&blkg->q->queue_lock); } - rcu_read_unlock(); return 0; } -- cgit From 252c651a4c854b328445a536bd1892e999103fca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 17:26:23 +0200 Subject: blk-cgroup: stop using seq_get_buf seq_get_buf is a crutch that undoes all the memory safety of the seq_file interface. Use the normal seq_printf interfaces instead. 
Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20210810152623.1796144-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 30 ++++++------------------------ block/blk-iocost.c | 23 +++++++++-------------- block/blk-iolatency.c | 38 +++++++++++++++++++------------------- block/mq-deadline-cgroup.c | 8 +++----- 4 files changed, 37 insertions(+), 62 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 52aa0540ccaf..b8ec47dcce42 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -877,8 +877,6 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) bool has_stats = false; const char *dname; unsigned seq; - char *buf; - size_t size = seq_get_buf(s, &buf), off = 0; int i; if (!blkg->online) @@ -888,13 +886,7 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) if (!dname) return; - /* - * Hooray string manipulation, count is the size written NOT - * INCLUDING THE \0, so size is now count+1 less than what we - * had before, but we want to start writing the next bit from - * the \0 so we only add count to buf. 
- */ - off += scnprintf(buf+off, size-off, "%s ", dname); + seq_printf(s, "%s ", dname); do { seq = u64_stats_fetch_begin(&bis->sync); @@ -909,40 +901,30 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) if (rbytes || wbytes || rios || wios) { has_stats = true; - off += scnprintf(buf+off, size-off, - "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", + seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", rbytes, wbytes, rios, wios, dbytes, dios); } if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { has_stats = true; - off += scnprintf(buf+off, size-off, " use_delay=%d delay_nsec=%llu", + seq_printf(s, " use_delay=%d delay_nsec=%llu", atomic_read(&blkg->use_delay), atomic64_read(&blkg->delay_nsec)); } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - size_t written; if (!blkg->pd[i] || !pol->pd_stat_fn) continue; - written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off); - if (written) + if (pol->pd_stat_fn(blkg->pd[i], s)) has_stats = true; - off += written; } - if (has_stats) { - if (off < size - 1) { - off += scnprintf(buf+off, size-off, "\n"); - seq_commit(s, off); - } else { - seq_commit(s, -1); - } - } + if (has_stats) + seq_printf(s, "\n"); } static int blkcg_print_stat(struct seq_file *sf, void *v) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 5fac3757e6e0..89b21a360b2c 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2988,34 +2988,29 @@ static void ioc_pd_free(struct blkg_policy_data *pd) kfree(iocg); } -static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) +static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct ioc_gq *iocg = pd_to_iocg(pd); struct ioc *ioc = iocg->ioc; - size_t pos = 0; if (!ioc->enabled) - return 0; + return false; if (iocg->level == 0) { unsigned vp10k = DIV64_U64_ROUND_CLOSEST( ioc->vtime_base_rate * 10000, VTIME_PER_USEC); - pos += 
scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u", - vp10k / 100, vp10k % 100); + seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100); } - pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu", - iocg->last_stat.usage_us); + seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us); if (blkcg_debug_stats) - pos += scnprintf(buf + pos, size - pos, - " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", - iocg->last_stat.wait_us, - iocg->last_stat.indebt_us, - iocg->last_stat.indelay_us); - - return pos; + seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", + iocg->last_stat.wait_us, + iocg->last_stat.indebt_us, + iocg->last_stat.indelay_us); + return true; } static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 81be0096411d..4c06fafb7411 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -886,8 +886,7 @@ static int iolatency_print_limit(struct seq_file *sf, void *v) return 0; } -static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf, - size_t size) +static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) { struct latency_stat stat; int cpu; @@ -902,39 +901,40 @@ static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf, preempt_enable(); if (iolat->rq_depth.max_depth == UINT_MAX) - return scnprintf(buf, size, " missed=%llu total=%llu depth=max", - (unsigned long long)stat.ps.missed, - (unsigned long long)stat.ps.total); - return scnprintf(buf, size, " missed=%llu total=%llu depth=%u", - (unsigned long long)stat.ps.missed, - (unsigned long long)stat.ps.total, - iolat->rq_depth.max_depth); + seq_printf(s, " missed=%llu total=%llu depth=max", + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total); + else + seq_printf(s, " missed=%llu total=%llu depth=%u", + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total, + 
iolat->rq_depth.max_depth); + return true; } -static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, - size_t size) +static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct iolatency_grp *iolat = pd_to_lat(pd); unsigned long long avg_lat; unsigned long long cur_win; if (!blkcg_debug_stats) - return 0; + return false; if (iolat->ssd) - return iolatency_ssd_stat(iolat, buf, size); + return iolatency_ssd_stat(iolat, s); avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); if (iolat->rq_depth.max_depth == UINT_MAX) - return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu", - avg_lat, cur_win); - - return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu", - iolat->rq_depth.max_depth, avg_lat, cur_win); + seq_printf(s, " depth=max avg_lat=%llu win=%llu", + avg_lat, cur_win); + else + seq_printf(s, " depth=%u avg_lat=%llu win=%llu", + iolat->rq_depth.max_depth, avg_lat, cur_win); + return true; } - static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, struct request_queue *q, struct blkcg *blkcg) diff --git a/block/mq-deadline-cgroup.c b/block/mq-deadline-cgroup.c index 3b4bfddec39f..b48a4b962f90 100644 --- a/block/mq-deadline-cgroup.c +++ b/block/mq-deadline-cgroup.c @@ -52,7 +52,7 @@ struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio) return dd_blkcg_from_pd(pd); } -static size_t dd_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) +static bool dd_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { static const char *const prio_class_name[] = { [IOPRIO_CLASS_NONE] = "NONE", @@ -61,12 +61,10 @@ static size_t dd_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) [IOPRIO_CLASS_IDLE] = "IDLE", }; struct dd_blkcg *blkcg = dd_blkcg_from_pd(pd); - int res = 0; u8 prio; for (prio = 0; prio < ARRAY_SIZE(blkcg->stats->stats); prio++) - res += scnprintf(buf + res, size - res, - " [%s] dispatched=%u inserted=%u merged=%u", + 
seq_printf(s, " [%s] dispatched=%u inserted=%u merged=%u", prio_class_name[prio], ddcg_sum(blkcg, dispatched, prio) + ddcg_sum(blkcg, merged, prio) - @@ -75,7 +73,7 @@ static size_t dd_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) ddcg_sum(blkcg, completed, prio), ddcg_sum(blkcg, merged, prio)); - return res; + return true; } static struct blkg_policy_data *dd_pd_alloc(gfp_t gfp, struct request_queue *q, -- cgit From 69f87cc7086558ad84f20001256474aa611fc0eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 14:36:49 +0200 Subject: block: unexport blk_register_queue Not actually used in any modular code. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816123649.601591-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1832587dce3a..586507a5b8c2 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -939,7 +939,6 @@ unlock: return ret; } -EXPORT_SYMBOL_GPL(blk_register_queue); /** * blk_unregister_queue - counterpart of blk_register_queue() -- cgit From a680dd72ec336b81511e3bff48efac6dbfa563e7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:36:57 +0900 Subject: block: bfq: fix bfq_set_next_ioprio_data() For a request that has a priority level equal to or larger than IOPRIO_BE_NR, bfq_set_next_ioprio_data() prints a critical warning but defaults to setting the request new_ioprio field to IOPRIO_BE_NR. This is not consistent with the warning and the allowed values for priority levels. Fix this by setting the request new_ioprio field to IOPRIO_BE_NR - 1, the lowest priority level allowed. 
Cc: Fixes: aee69d78dec0 ("block, bfq: introduce the BFQ-v0 I/O scheduler as an extra scheduler") Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210811033702.368488-2-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index e4a61eda2d0f..e546a5f4bff9 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5296,7 +5296,7 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) if (bfqq->new_ioprio >= IOPRIO_BE_NR) { pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", bfqq->new_ioprio); - bfqq->new_ioprio = IOPRIO_BE_NR; + bfqq->new_ioprio = IOPRIO_BE_NR - 1; } bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); -- cgit From 202bc942c5cd4340d37b06c4e0b8b03f9925d818 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:37:01 +0900 Subject: block: Introduce IOPRIO_NR_LEVELS The BFQ scheduler and ioprio_check_cap() both assume that the RT priority class (IOPRIO_CLASS_RT) can have up to 8 different priority levels, similarly to the BE class (IOPRIO_CLASS_BE). This is controlled using the IOPRIO_BE_NR macro, which is badly named as the number of levels also applies to the RT class. Introduce the class-independent IOPRIO_NR_LEVELS macro, defined to 8, to make things clear. Keep the old IOPRIO_BE_NR macro definition as an alias for IOPRIO_NR_LEVELS. 
Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210811033702.368488-6-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 8 ++++---- block/bfq-iosched.h | 4 ++-- block/bfq-wf2q.c | 6 +++--- block/ioprio.c | 3 +-- 4 files changed, 10 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index e546a5f4bff9..4b434369e411 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2508,7 +2508,7 @@ void bfq_end_wr_async_queues(struct bfq_data *bfqd, int i, j; for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) if (bfqg->async_bfqq[i][j]) bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); if (bfqg->async_idle_bfqq) @@ -5293,10 +5293,10 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) break; } - if (bfqq->new_ioprio >= IOPRIO_BE_NR) { + if (bfqq->new_ioprio >= IOPRIO_NR_LEVELS) { pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", bfqq->new_ioprio); - bfqq->new_ioprio = IOPRIO_BE_NR - 1; + bfqq->new_ioprio = IOPRIO_NR_LEVELS - 1; } bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); @@ -6825,7 +6825,7 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) int i, j; for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 99c2a3cb081e..385e28a843d1 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -931,7 +931,7 @@ struct bfq_group { void *bfqd; - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; struct bfq_queue *async_idle_bfqq; struct bfq_entity *my_entity; @@ -948,7 +948,7 @@ struct bfq_group { struct bfq_entity entity; struct bfq_sched_data sched_data; - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + 
struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; struct bfq_queue *async_idle_bfqq; struct rb_root rq_pos_tree; diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 7a462df71f68..b74cc0da118e 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -505,7 +505,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, */ unsigned short bfq_ioprio_to_weight(int ioprio) { - return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; + return (IOPRIO_NR_LEVELS - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; } /** @@ -514,12 +514,12 @@ unsigned short bfq_ioprio_to_weight(int ioprio) * * To preserve as much as possible the old only-ioprio user interface, * 0 is used as an escape ioprio value for weights (numerically) equal or - * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. + * larger than IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF. */ static unsigned short bfq_weight_to_ioprio(int weight) { return max_t(int, 0, - IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight); + IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF - weight); } static void bfq_get_entity(struct bfq_entity *entity) diff --git a/block/ioprio.c b/block/ioprio.c index bee628f9f1b2..ca6b136c5586 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -74,9 +74,8 @@ int ioprio_check_cap(int ioprio) fallthrough; /* rt has prio field too */ case IOPRIO_CLASS_BE: - if (data >= IOPRIO_BE_NR || data < 0) + if (data >= IOPRIO_NR_LEVELS || data < 0) return -EINVAL; - break; case IOPRIO_CLASS_IDLE: break; -- cgit From e70344c05995a190a56bbd1a23dc2218bcc8c924 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:37:02 +0900 Subject: block: fix default IO priority handling The default IO priority is the best effort (BE) class with the normal priority level IOPRIO_NORM (4). However, get_task_ioprio() returns IOPRIO_CLASS_NONE/IOPRIO_NORM as the default priority and get_current_ioprio() returns IOPRIO_CLASS_NONE/0. 
Let's be consistent with the defined default and have both of these functions return the default priority IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM) when the user did not define another default IO priority for the task. In include/uapi/linux/ioprio.h, introduce the IOPRIO_BE_NORM macro as an alias to IOPRIO_NORM to clarify that this default level applies to the BE priority class. In include/linux/ioprio.h, define the macro IOPRIO_DEFAULT as IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM) and use this new macro when setting a priority to the default. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210811033702.368488-7-damien.lemoal@wdc.com [axboe: drop unnecessary lightnvm change] Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- block/ioprio.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 4b434369e411..e92bc0348433 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5411,7 +5411,7 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, case IOPRIO_CLASS_RT: return &bfqg->async_bfqq[0][ioprio]; case IOPRIO_CLASS_NONE: - ioprio = IOPRIO_NORM; + ioprio = IOPRIO_BE_NORM; fallthrough; case IOPRIO_CLASS_BE: return &bfqg->async_bfqq[1][ioprio]; diff --git a/block/ioprio.c b/block/ioprio.c index ca6b136c5586..0e4ff245f2bf 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -170,7 +170,7 @@ static int get_task_ioprio(struct task_struct *p) ret = security_task_getioprio(p); if (ret) goto out; - ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + ret = IOPRIO_DEFAULT; task_lock(p); if (p->io_context) ret = p->io_context->ioprio; @@ -182,9 +182,9 @@ out: int ioprio_best(unsigned short aprio, unsigned short bprio) { if (!ioprio_valid(aprio)) - aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + aprio = IOPRIO_DEFAULT; if (!ioprio_valid(bprio)) - bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + bprio = 
IOPRIO_DEFAULT; return min(aprio, bprio); } -- cgit From 759e0fd4b67766c96b33a114bba0c7d7521fecd0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Aug 2021 11:49:29 +0200 Subject: block: add back the bd_holder_dir reference in bd_link_disk_holder This essentially reverts "block: remove the extra kobject reference in bd_link_disk_holder". That commit dropped the extra reference because the condition in the comment can't be true. But it turns out that comment did not actually describe the problematic situation, so add back the extra reference and document it properly. Fixes: fbd9a39542ec ("block: remove the extra kobject reference in bd_link_disk_holder") Reported-by: Tushar Sugandhi Reviewed-by: Mike Snitzer Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/holder.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'block') diff --git a/block/holder.c b/block/holder.c index 4568cc4f6827..9dc084182337 100644 --- a/block/holder.c +++ b/block/holder.c @@ -106,6 +106,12 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) } list_add(&holder->list, &disk->slave_bdevs); + /* + * del_gendisk drops the initial reference to bd_holder_dir, so we need + * to keep our own here to allow for cleanup past that point. 
+ */ + kobject_get(bdev->bd_holder_dir); + out_unlock: mutex_unlock(&disk->open_mutex); return ret; @@ -138,6 +144,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { if (disk->slave_dir) __unlink_disk_holder(bdev, disk); + kobject_put(bdev->bd_holder_dir); list_del_init(&holder->list); kfree(holder); } -- cgit From 4dcc4874deb41a11ece9c6e8858385235463c1ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:05 +0200 Subject: block: cleanup the lockdep handling in *alloc_disk Pass the lockdep name to the low-level __blk_alloc_disk helper and hardcode the name for it given that the number of minors or node_id are not very useful information. While this passes a pointless argument for non-lockdep builds that is not really an issue as disk allocation is a probe time only slow path. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816131910.615153-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++-- block/genhd.c | 8 +++++--- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index d2725f94491d..4c56e43e6992 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3133,7 +3133,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_init_queue); -struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata) +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, + struct lock_class_key *lkclass) { struct request_queue *q; struct gendisk *disk; @@ -3142,7 +3143,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata) if (IS_ERR(q)) return ERR_CAST(q); - disk = __alloc_disk_node(0, set->numa_node); + disk = __alloc_disk_node(0, set->numa_node, lkclass); if (!disk) { blk_cleanup_queue(q); return ERR_PTR(-ENOMEM); diff --git a/block/genhd.c b/block/genhd.c 
index 731a46063132..2ad2b25dfc87 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1254,7 +1254,8 @@ dev_t blk_lookup_devt(const char *name, int partno) return devt; } -struct gendisk *__alloc_disk_node(int minors, int node_id) +struct gendisk *__alloc_disk_node(int minors, int node_id, + struct lock_class_key *lkclass) { struct gendisk *disk; @@ -1282,6 +1283,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); inc_diskseq(disk); + lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&disk->slave_bdevs); #endif @@ -1298,7 +1300,7 @@ out_free_disk: } EXPORT_SYMBOL(__alloc_disk_node); -struct gendisk *__blk_alloc_disk(int node) +struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) { struct request_queue *q; struct gendisk *disk; @@ -1307,7 +1309,7 @@ struct gendisk *__blk_alloc_disk(int node) if (!q) return NULL; - disk = __alloc_disk_node(0, node); + disk = __alloc_disk_node(0, node, lkclass); if (!disk) { blk_cleanup_queue(q); return NULL; -- cgit From a58bd7683fcb60ae24c8572f932b48bc65719b7c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:07 +0200 Subject: block: remove the minors argument to __alloc_disk_node This was a leftover from the legacy alloc_disk interface. Switch the scsi ULPs and dasd to set ->minors directly like all other drivers and remove the argument. 
Signed-off-by: Christoph Hellwig Reviewed-by: Stefan Haberland [dasd] Link: https://lore.kernel.org/r/20210816131910.615153-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- block/genhd.c | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4c56e43e6992..8ac30c343c06 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3143,7 +3143,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, if (IS_ERR(q)) return ERR_CAST(q); - disk = __alloc_disk_node(0, set->numa_node, lkclass); + disk = __alloc_disk_node(set->numa_node, lkclass); if (!disk) { blk_cleanup_queue(q); return ERR_PTR(-ENOMEM); diff --git a/block/genhd.c b/block/genhd.c index 2ad2b25dfc87..caeda726189c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1254,8 +1254,7 @@ dev_t blk_lookup_devt(const char *name, int partno) return devt; } -struct gendisk *__alloc_disk_node(int minors, int node_id, - struct lock_class_key *lkclass) +struct gendisk *__alloc_disk_node(int node_id, struct lock_class_key *lkclass) { struct gendisk *disk; @@ -1277,7 +1276,6 @@ struct gendisk *__alloc_disk_node(int minors, int node_id, if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) goto out_destroy_part_tbl; - disk->minors = minors; rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; @@ -1309,7 +1307,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) if (!q) return NULL; - disk = __alloc_disk_node(0, node, lkclass); + disk = __alloc_disk_node(node, lkclass); if (!disk) { blk_cleanup_queue(q); return NULL; -- cgit From 4a1fa41d304c7129328d4d5c7f31715b95e23b29 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:08 +0200 Subject: block: pass a request_queue to __blk_alloc_disk Pass in a request_queue and assign disk->queue in __blk_alloc_disk to ensure struct gendisk always has a valid ->queue 
pointer. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816131910.615153-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +-- block/genhd.c | 7 ++++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 8ac30c343c06..2ca7e7c94b18 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3143,12 +3143,11 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, if (IS_ERR(q)) return ERR_CAST(q); - disk = __alloc_disk_node(set->numa_node, lkclass); + disk = __alloc_disk_node(q, set->numa_node, lkclass); if (!disk) { blk_cleanup_queue(q); return ERR_PTR(-ENOMEM); } - disk->queue = q; return disk; } EXPORT_SYMBOL(__blk_mq_alloc_disk); diff --git a/block/genhd.c b/block/genhd.c index caeda726189c..f18122ee2778 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1254,7 +1254,8 @@ dev_t blk_lookup_devt(const char *name, int partno) return devt; } -struct gendisk *__alloc_disk_node(int node_id, struct lock_class_key *lkclass) +struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, + struct lock_class_key *lkclass) { struct gendisk *disk; @@ -1281,6 +1282,7 @@ struct gendisk *__alloc_disk_node(int node_id, struct lock_class_key *lkclass) disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); inc_diskseq(disk); + disk->queue = q; lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&disk->slave_bdevs); @@ -1307,12 +1309,11 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) if (!q) return NULL; - disk = __alloc_disk_node(node, lkclass); + disk = __alloc_disk_node(q, node, lkclass); if (!disk) { blk_cleanup_queue(q); return NULL; } - disk->queue = q; return disk; } EXPORT_SYMBOL(__blk_alloc_disk); -- cgit From 61a35cfc26334fe1c8e970ca8fafeae2daae257d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 
15:19:09 +0200 Subject: block: hold a request_queue reference for the lifetime of struct gendisk Acquire the queue ref dropped in disk_release in __blk_alloc_disk so any allocated gendisk always has a queue reference. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816131910.615153-9-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index f18122ee2778..6294517cebe6 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -551,15 +551,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk, register_disk(parent, disk, groups); blk_register_queue(disk); - /* - * Take an extra ref on queue which will be put on disk_release() - * so that it sticks around as long as @disk is there. - */ - if (blk_get_queue(disk->queue)) - set_bit(GD_QUEUE_REF, &disk->state); - else - WARN_ON_ONCE(1); - disk_add_events(disk); blk_integrity_add(disk); } @@ -1087,8 +1078,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); - if (test_bit(GD_QUEUE_REF, &disk->state) && disk->queue) - blk_put_queue(disk->queue); + blk_put_queue(disk->queue); iput(disk->part0->bd_inode); /* frees the disk */ } @@ -1259,9 +1249,12 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, { struct gendisk *disk; + if (!blk_get_queue(q)) + return NULL; + disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (!disk) - return NULL; + goto out_put_queue; disk->bdi = bdi_alloc(node_id); if (!disk->bdi) @@ -1296,6 +1289,8 @@ out_free_bdi: bdi_put(disk->bdi); out_free_disk: kfree(disk); +out_put_queue: + blk_put_queue(q); return NULL; } EXPORT_SYMBOL(__alloc_disk_node); -- cgit From d152c682f03ceb65c0d9663d4ba6ee2d46aa784d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:46:24 +0200 Subject: block: add an explicit ->disk
backpointer to the request_queue Replace the magic lookup through the kobject tree with an explicit backpointer, given that the device model links are set up and torn down at times when I/O is still possible, leading to potential NULL or invalid pointer dereferences. Fixes: edb0872f44ec ("block: move the bdi from the request_queue to the gendisk") Reported-by: syzbot Signed-off-by: Christoph Hellwig Tested-by: Sven Schnelle Link: https://lore.kernel.org/r/20210816134624.GA24234@lst.de Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- block/blk-cgroup.c | 4 ++-- block/blk-mq.c | 2 +- block/blk-settings.c | 8 ++++---- block/blk-sysfs.c | 13 ++++++------- block/blk-wbt.c | 10 +++++----- block/genhd.c | 2 ++ 7 files changed, 21 insertions(+), 20 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index e92bc0348433..480e1a134859 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5269,7 +5269,7 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) switch (ioprio_class) { default: pr_err("bdi %s: bfq: bad prio class %d\n", - bdi_dev_name(queue_to_disk(bfqq->bfqd->queue)->bdi), + bdi_dev_name(bfqq->bfqd->queue->disk->bdi), ioprio_class); fallthrough; case IOPRIO_CLASS_NONE: diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b8ec47dcce42..f575aa42922b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -489,9 +489,9 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, const char *blkg_dev_name(struct blkcg_gq *blkg) { - if (!queue_has_disk(blkg->q) || !queue_to_disk(blkg->q)->bdi->dev) + if (!blkg->q->disk || !blkg->q->disk->bdi->dev) return NULL; - return bdi_dev_name(queue_to_disk(blkg->q)->bdi); + return bdi_dev_name(blkg->q->disk->bdi); } /** diff --git a/block/blk-mq.c b/block/blk-mq.c index 2ca7e7c94b18..0a33d16a7298 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -525,7 +525,7 @@ void blk_mq_free_request(struct request *rq) __blk_mq_dec_active_requests(hctx); 
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) - laptop_io_completion(queue_to_disk(q)->bdi); + laptop_io_completion(q->disk->bdi); rq_qos_done(q, rq); diff --git a/block/blk-settings.c b/block/blk-settings.c index 3613d2cc0688..a7c857ad7d10 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -141,9 +141,9 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto limits->logical_block_size >> SECTOR_SHIFT); limits->max_sectors = max_sectors; - if (!queue_has_disk(q)) + if (!q->disk) return; - queue_to_disk(q)->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9); + q->disk->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -475,9 +475,9 @@ EXPORT_SYMBOL(blk_limits_io_opt); void blk_queue_io_opt(struct request_queue *q, unsigned int opt) { blk_limits_io_opt(&q->limits, opt); - if (!queue_has_disk(q)) + if (!q->disk) return; - queue_to_disk(q)->bdi->ra_pages = + q->disk->bdi->ra_pages = max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); } EXPORT_SYMBOL(blk_queue_io_opt); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 586507a5b8c2..7fd99487300c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -90,9 +90,9 @@ static ssize_t queue_ra_show(struct request_queue *q, char *page) { unsigned long ra_kb; - if (!queue_has_disk(q)) + if (!q->disk) return -EINVAL; - ra_kb = queue_to_disk(q)->bdi->ra_pages << (PAGE_SHIFT - 10); + ra_kb = q->disk->bdi->ra_pages << (PAGE_SHIFT - 10); return queue_var_show(ra_kb, page); } @@ -102,12 +102,12 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count) unsigned long ra_kb; ssize_t ret; - if (!queue_has_disk(q)) + if (!q->disk) return -EINVAL; ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; - queue_to_disk(q)->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); + q->disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); return ret; } @@ -254,9 +254,8 @@ queue_max_sectors_store(struct request_queue *q, 
const char *page, size_t count) spin_lock_irq(&q->queue_lock); q->limits.max_sectors = max_sectors_kb << 1; - if (queue_has_disk(q)) - queue_to_disk(q)->bdi->io_pages = - max_sectors_kb >> (PAGE_SHIFT - 10); + if (q->disk) + q->disk->bdi->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); spin_unlock_irq(&q->queue_lock); return ret; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 31086afaad9c..874c1c37bf0c 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -97,7 +97,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &queue_to_disk(rwb->rqos.q)->bdi->wb; + struct bdi_writeback *wb = &rwb->rqos.q->disk->bdi->wb; return time_before(jiffies, wb->dirty_sleep + HZ); } @@ -234,7 +234,7 @@ enum { static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { - struct backing_dev_info *bdi = queue_to_disk(rwb->rqos.q)->bdi; + struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; @@ -287,7 +287,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { - struct backing_dev_info *bdi = queue_to_disk(rwb->rqos.q)->bdi; + struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, @@ -359,8 +359,8 @@ static void wb_timer_fn(struct blk_stat_callback *cb) status = latency_exceeded(rwb, cb->stat); - trace_wbt_timer(queue_to_disk(rwb->rqos.q)->bdi, status, - rqd->scale_step, inflight); + trace_wbt_timer(rwb->rqos.q->disk->bdi, status, rqd->scale_step, + inflight); /* * If we exceeded the latency target, step down. 
If we did not, diff --git a/block/genhd.c b/block/genhd.c index 6294517cebe6..02cd9ec93e52 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1078,6 +1078,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); + disk->queue->disk = NULL; blk_put_queue(disk->queue); iput(disk->part0->bd_inode); /* frees the disk */ } @@ -1276,6 +1277,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, device_initialize(disk_to_dev(disk)); inc_diskseq(disk); disk->queue = q; + q->disk = disk; lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&disk->slave_bdevs); -- cgit From 40b3a52ffc5bc3b5427d5d35b035cfb19d03fdd6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2021 16:45:32 +0200 Subject: block: add a sanity check for a live disk in del_gendisk Add a sanity check to del_gendisk to do nothing when the disk wasn't successfully added. This papers over the complete lack of add_disk error handling, which is about to get fixed gradually. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 02cd9ec93e52..935f74c652f1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -579,7 +579,7 @@ void del_gendisk(struct gendisk *disk) { might_sleep(); - if (WARN_ON_ONCE(!disk->queue)) + if (WARN_ON_ONCE(!disk_live(disk))) return; blk_integrity_del(disk); -- cgit From 52b85909f85d06efa69aaf4210e72467f1f58d2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2021 16:45:33 +0200 Subject: block: fold register_disk into device_add_disk There is no real reason these should be separate. Also simplify the groups assignment a bit. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210818144542.19305-3-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 131 +++++++++++++++++++++++++++------------------------------- 1 file changed, 60 insertions(+), 71 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 935f74c652f1..ec4be5889fbf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -409,71 +409,6 @@ static void disk_scan_partitions(struct gendisk *disk) blkdev_put(bdev, FMODE_READ); } -static void register_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups) -{ - struct device *ddev = disk_to_dev(disk); - int err; - - ddev->parent = parent; - - dev_set_name(ddev, "%s", disk->disk_name); - - /* delay uevents, until we scanned partition table */ - dev_set_uevent_suppress(ddev, 1); - - if (groups) { - WARN_ON(ddev->groups); - ddev->groups = groups; - } - if (device_add(ddev)) - return; - if (!sysfs_deprecated) { - err = sysfs_create_link(block_depr, &ddev->kobj, - kobject_name(&ddev->kobj)); - if (err) { - device_del(ddev); - return; - } - } - - /* - * avoid probable deadlock caused by allocating memory with - * GFP_KERNEL in runtime_resume callback of its all ancestor - * devices - */ - pm_runtime_set_memalloc_noio(ddev, true); - - disk->part0->bd_holder_dir = - kobject_create_and_add("holders", &ddev->kobj); - disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); - - /* - * XXX: this is a mess, can't wait for real error handling in add_disk. - * Make sure ->slave_dir is NULL if we failed some of the registration - * so that the cleanup in bd_unlink_disk_holder works properly. 
- */ - if (bd_register_pending_holders(disk) < 0) { - kobject_put(disk->slave_dir); - disk->slave_dir = NULL; - } - - if (disk->flags & GENHD_FL_HIDDEN) - return; - - disk_scan_partitions(disk); - - /* announce the disk and partitions after all partitions are created */ - dev_set_uevent_suppress(ddev, 0); - disk_uevent(disk, KOBJ_ADD); - - if (disk->bdi->dev) { - err = sysfs_create_link(&ddev->kobj, &disk->bdi->dev->kobj, - "bdi"); - WARN_ON(err); - } -} - /** * device_add_disk - add disk information to kernel list * @parent: parent device for the disk @@ -490,6 +425,7 @@ void device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { + struct device *ddev = disk_to_dev(disk); int ret; /* @@ -538,17 +474,70 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; disk->flags |= GENHD_FL_NO_PART_SCAN; } else { - struct device *dev = disk_to_dev(disk); - /* Register BDI before referencing it from bdev */ - dev->devt = MKDEV(disk->major, disk->first_minor); + ddev->devt = MKDEV(disk->major, disk->first_minor); ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); WARN_ON(ret); - bdi_set_owner(disk->bdi, dev); - bdev_add(disk->part0, dev->devt); + bdi_set_owner(disk->bdi, ddev); + bdev_add(disk->part0, ddev->devt); + } + + /* delay uevents, until we scanned partition table */ + dev_set_uevent_suppress(ddev, 1); + + ddev->parent = parent; + ddev->groups = groups; + dev_set_name(ddev, "%s", disk->disk_name); + if (device_add(ddev)) + return; + if (!sysfs_deprecated) { + ret = sysfs_create_link(block_depr, &ddev->kobj, + kobject_name(&ddev->kobj)); + if (ret) { + device_del(ddev); + return; + } } - register_disk(parent, disk, groups); + + /* + * avoid probable deadlock caused by allocating memory with + * GFP_KERNEL in runtime_resume callback of its all ancestor + * devices + */ + pm_runtime_set_memalloc_noio(ddev, true); + + disk->part0->bd_holder_dir 
= + kobject_create_and_add("holders", &ddev->kobj); + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + + /* + * XXX: this is a mess, can't wait for real error handling in add_disk. + * Make sure ->slave_dir is NULL if we failed some of the registration + * so that the cleanup in bd_unlink_disk_holder works properly. + */ + if (bd_register_pending_holders(disk) < 0) { + kobject_put(disk->slave_dir); + disk->slave_dir = NULL; + } + + if (!(disk->flags & GENHD_FL_HIDDEN)) { + disk_scan_partitions(disk); + + /* + * Announce the disk and partitions after all partitions are + * created. + */ + dev_set_uevent_suppress(ddev, 0); + disk_uevent(disk, KOBJ_ADD); + + if (disk->bdi->dev) { + ret = sysfs_create_link(&ddev->kobj, + &disk->bdi->dev->kobj, "bdi"); + WARN_ON(ret); + } + } + blk_register_queue(disk); disk_add_events(disk); -- cgit From 8235b5c1e8c1c0537f03a21a2e380098bed25248 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2021 16:45:34 +0200 Subject: block: call bdev_add later in device_add_disk Once bdev_add is called userspace can open the block device. Ensure that the struct device, which is used for refcounting of the disk besides various other things, is fully setup at that point. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-4-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index ec4be5889fbf..ab455f110be2 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -466,29 +466,14 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk_alloc_events(disk); - if (disk->flags & GENHD_FL_HIDDEN) { - /* - * Don't let hidden disks show up in /proc/partitions, - * and don't bother scanning for partitions either. 
- */ - disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; - disk->flags |= GENHD_FL_NO_PART_SCAN; - } else { - /* Register BDI before referencing it from bdev */ - ddev->devt = MKDEV(disk->major, disk->first_minor); - ret = bdi_register(disk->bdi, "%u:%u", - disk->major, disk->first_minor); - WARN_ON(ret); - bdi_set_owner(disk->bdi, ddev); - bdev_add(disk->part0, ddev->devt); - } - /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); ddev->parent = parent; ddev->groups = groups; dev_set_name(ddev, "%s", disk->disk_name); + if (!(disk->flags & GENHD_FL_HIDDEN)) + ddev->devt = MKDEV(disk->major, disk->first_minor); if (device_add(ddev)) return; if (!sysfs_deprecated) { @@ -521,12 +506,25 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->slave_dir = NULL; } - if (!(disk->flags & GENHD_FL_HIDDEN)) { + if (disk->flags & GENHD_FL_HIDDEN) { + /* + * Don't let hidden disks show up in /proc/partitions, + * and don't bother scanning for partitions either. + */ + disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->flags |= GENHD_FL_NO_PART_SCAN; + } else { + ret = bdi_register(disk->bdi, "%u:%u", + disk->major, disk->first_minor); + WARN_ON(ret); + bdi_set_owner(disk->bdi, ddev); + bdev_add(disk->part0, ddev->devt); + disk_scan_partitions(disk); /* * Announce the disk and partitions after all partitions are - * created. + * created. (for hidden disks uevents remain suppressed forever) */ dev_set_uevent_suppress(ddev, 0); disk_uevent(disk, KOBJ_ADD); -- cgit From 9d5ee6767c85762205b788ed1245f21fafd6c504 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2021 16:45:35 +0200 Subject: block: create the bdi link earlier in device_add_disk This will simplify error handling going forward. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-5-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index ab455f110be2..f05e58f214d2 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -518,8 +518,13 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->major, disk->first_minor); WARN_ON(ret); bdi_set_owner(disk->bdi, ddev); - bdev_add(disk->part0, ddev->devt); + if (disk->bdi->dev) { + ret = sysfs_create_link(&ddev->kobj, + &disk->bdi->dev->kobj, "bdi"); + WARN_ON(ret); + } + bdev_add(disk->part0, ddev->devt); disk_scan_partitions(disk); /* @@ -528,12 +533,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk, */ dev_set_uevent_suppress(ddev, 0); disk_uevent(disk, KOBJ_ADD); - - if (disk->bdi->dev) { - ret = sysfs_create_link(&ddev->kobj, - &disk->bdi->dev->kobj, "bdi"); - WARN_ON(ret); - } } blk_register_queue(disk); -- cgit From bab53f6b617d9f530978d6e3693f88e586d81a8a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2021 16:45:36 +0200 Subject: block: call blk_integrity_add earlier in device_add_disk Doing all the sysfs file creation before adding the bdev and thus allowing it to be opened will simplify the about to be added error handling. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-6-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index f05e58f214d2..75d900e4c82f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -492,6 +492,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk, */ pm_runtime_set_memalloc_noio(ddev, true); + blk_integrity_add(disk); + disk->part0->bd_holder_dir = kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); @@ -538,7 +540,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk, blk_register_queue(disk); disk_add_events(disk); - blk_integrity_add(disk); } EXPORT_SYMBOL(device_add_disk); -- cgit From 75f4dca59694dfe288ae6a48d7b147b60d11c95c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2021 16:45:37 +0200 Subject: block: call blk_register_queue earlier in device_add_disk Ensure that all the sysfs bits are set up before bdev_add is called, as that will make the upcoming error handling much easier. However this means the call to disk_update_readahead has to be split as that requires a bdi. Also remove various sanity checks that don't make sense now that blk_register_queue only has a single caller.
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210818144542.19305-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 9 --------- block/genhd.c | 5 +++-- 2 files changed, 3 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7fd99487300c..614d9d47de36 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -856,15 +856,6 @@ int blk_register_queue(struct gendisk *disk) struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; - if (WARN_ON(!q)) - return -ENXIO; - - WARN_ONCE(blk_queue_registered(q), - "%s is registering an already registered queue\n", - kobject_name(&dev->kobj)); - - disk_update_readahead(disk); - ret = blk_trace_init_sysfs(dev); if (ret) return ret; diff --git a/block/genhd.c b/block/genhd.c index 75d900e4c82f..a54b4849242c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -508,6 +508,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->slave_dir = NULL; } + blk_register_queue(disk); + if (disk->flags & GENHD_FL_HIDDEN) { /* * Don't let hidden disks show up in /proc/partitions, @@ -537,8 +539,7 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk_uevent(disk, KOBJ_ADD); } - blk_register_queue(disk); - + disk_update_readahead(disk); disk_add_events(disk); } EXPORT_SYMBOL(device_add_disk); -- cgit From 614310c9c8ca15359f4e71a5bbd9165897b4d54e Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 18 Aug 2021 16:45:38 +0200 Subject: block: return errors from blk_integrity_add Prepare for proper error handling in add_disk. 
Signed-off-by: Luis Chamberlain [hch: split from a larger patch] Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-integrity.c | 12 +++++++----- block/blk.h | 5 +++-- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 410da060d1f5..69a12177dfb6 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -431,13 +431,15 @@ void blk_integrity_unregister(struct gendisk *disk) } EXPORT_SYMBOL(blk_integrity_unregister); -void blk_integrity_add(struct gendisk *disk) +int blk_integrity_add(struct gendisk *disk) { - if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, - &disk_to_dev(disk)->kobj, "%s", "integrity")) - return; + int ret; - kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); + ret = kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, + &disk_to_dev(disk)->kobj, "%s", "integrity"); + if (!ret) + kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); + return ret; } void blk_integrity_del(struct gendisk *disk) diff --git a/block/blk.h b/block/blk.h index 148bdcd3aa08..c9727dd56da7 100644 --- a/block/blk.h +++ b/block/blk.h @@ -132,7 +132,7 @@ static inline bool integrity_req_gap_front_merge(struct request *req, bip_next->bip_vec[0].bv_offset); } -void blk_integrity_add(struct gendisk *); +int blk_integrity_add(struct gendisk *disk); void blk_integrity_del(struct gendisk *); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline bool blk_integrity_merge_rq(struct request_queue *rq, @@ -166,8 +166,9 @@ static inline bool bio_integrity_endio(struct bio *bio) static inline void bio_integrity_free(struct bio *bio) { } -static inline void blk_integrity_add(struct gendisk *disk) +static inline int blk_integrity_add(struct gendisk *disk) { + return 0; } static inline void blk_integrity_del(struct gendisk *disk) { -- cgit From 
92e7755ebc69233e25a2d1b760aeff536dc4016b Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 18 Aug 2021 16:45:39 +0200 Subject: block: return errors from disk_alloc_events Prepare for proper error handling in add_disk. Signed-off-by: Luis Chamberlain [hch: split from a larger patch] Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-9-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 2 +- block/disk-events.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index c9727dd56da7..bbbcc1a64a2d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -362,7 +362,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct request_queue *blk_alloc_queue(int node_id); -void disk_alloc_events(struct gendisk *disk); +int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); void disk_del_events(struct gendisk *disk); void disk_release_events(struct gendisk *disk); diff --git a/block/disk-events.c b/block/disk-events.c index 7445b8ff2775..8d5496e7592a 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -444,17 +444,17 @@ module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, /* * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. 
*/ -void disk_alloc_events(struct gendisk *disk) +int disk_alloc_events(struct gendisk *disk) { struct disk_events *ev; if (!disk->fops->check_events || !disk->events) - return; + return 0; ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) { pr_warn("%s: failed to initialize events\n", disk->disk_name); - return; + return -ENOMEM; } INIT_LIST_HEAD(&ev->node); @@ -466,6 +466,7 @@ void disk_alloc_events(struct gendisk *disk) INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); disk->ev = ev; + return 0; } void disk_add_events(struct gendisk *disk) -- cgit From 83cbce9574462c6b4eed6797bdaf18fae6859ab3 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 18 Aug 2021 16:45:40 +0200 Subject: block: add error handling for device_add_disk / add_disk Properly unwind on errors in device_add_disk. This is the initial work as drivers are not converted yet, which will follow in separate patches. Signed-off-by: Luis Chamberlain [hch: major rebase. All bugs are probably mine] Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-10-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 92 +++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 58 insertions(+), 34 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index a54b4849242c..a925f773145f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -417,11 +417,8 @@ static void disk_scan_partitions(struct gendisk *disk) * * This function registers the partitioning information in @disk * with the kernel. - * - * FIXME: error handling */ - -void device_add_disk(struct device *parent, struct gendisk *disk, +int device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { @@ -444,7 +441,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk, * and all partitions from the extended dev_t space. 
*/ if (disk->major) { - WARN_ON(!disk->minors); + if (WARN_ON(!disk->minors)) + return -EINVAL; if (disk->minors > DISK_MAX_PARTS) { pr_err("block: can't allocate more than %d partitions\n", @@ -452,19 +450,20 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->minors = DISK_MAX_PARTS; } } else { - WARN_ON(disk->minors); + if (WARN_ON(disk->minors)) + return -EINVAL; ret = blk_alloc_ext_minor(); - if (ret < 0) { - WARN_ON(1); - return; - } + if (ret < 0) + return ret; disk->major = BLOCK_EXT_MAJOR; disk->first_minor = MINOR(ret); disk->flags |= GENHD_FL_EXT_DEVT; } - disk_alloc_events(disk); + ret = disk_alloc_events(disk); + if (ret) + goto out_free_ext_minor; /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); @@ -474,15 +473,14 @@ void device_add_disk(struct device *parent, struct gendisk *disk, dev_set_name(ddev, "%s", disk->disk_name); if (!(disk->flags & GENHD_FL_HIDDEN)) ddev->devt = MKDEV(disk->major, disk->first_minor); - if (device_add(ddev)) - return; + ret = device_add(ddev); + if (ret) + goto out_disk_release_events; if (!sysfs_deprecated) { ret = sysfs_create_link(block_depr, &ddev->kobj, kobject_name(&ddev->kobj)); - if (ret) { - device_del(ddev); - return; - } + if (ret) + goto out_device_del; } /* @@ -492,23 +490,25 @@ void device_add_disk(struct device *parent, struct gendisk *disk, */ pm_runtime_set_memalloc_noio(ddev, true); - blk_integrity_add(disk); + ret = blk_integrity_add(disk); + if (ret) + goto out_del_block_link; disk->part0->bd_holder_dir = kobject_create_and_add("holders", &ddev->kobj); + if (!disk->part0->bd_holder_dir) + goto out_del_integrity; disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + if (!disk->slave_dir) + goto out_put_holder_dir; - /* - * XXX: this is a mess, can't wait for real error handling in add_disk. - * Make sure ->slave_dir is NULL if we failed some of the registration - * so that the cleanup in bd_unlink_disk_holder works properly. 
- */ - if (bd_register_pending_holders(disk) < 0) { - kobject_put(disk->slave_dir); - disk->slave_dir = NULL; - } + ret = bd_register_pending_holders(disk); + if (ret < 0) + goto out_put_slave_dir; - blk_register_queue(disk); + ret = blk_register_queue(disk); + if (ret) + goto out_put_slave_dir; if (disk->flags & GENHD_FL_HIDDEN) { /* @@ -520,13 +520,13 @@ void device_add_disk(struct device *parent, struct gendisk *disk, } else { ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); - WARN_ON(ret); + if (ret) + goto out_unregister_queue; bdi_set_owner(disk->bdi, ddev); - if (disk->bdi->dev) { - ret = sysfs_create_link(&ddev->kobj, - &disk->bdi->dev->kobj, "bdi"); - WARN_ON(ret); - } + ret = sysfs_create_link(&ddev->kobj, + &disk->bdi->dev->kobj, "bdi"); + if (ret) + goto out_unregister_bdi; bdev_add(disk->part0, ddev->devt); disk_scan_partitions(disk); @@ -541,6 +541,30 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk_update_readahead(disk); disk_add_events(disk); + return 0; + +out_unregister_bdi: + if (!(disk->flags & GENHD_FL_HIDDEN)) + bdi_unregister(disk->bdi); +out_unregister_queue: + blk_unregister_queue(disk); +out_put_slave_dir: + kobject_put(disk->slave_dir); +out_put_holder_dir: + kobject_put(disk->part0->bd_holder_dir); +out_del_integrity: + blk_integrity_del(disk); +out_del_block_link: + if (!sysfs_deprecated) + sysfs_remove_link(block_depr, dev_name(ddev)); +out_device_del: + device_del(ddev); +out_disk_release_events: + disk_release_events(disk); +out_free_ext_minor: + if (disk->major == BLOCK_EXT_MAJOR) + blk_free_ext_minor(disk->first_minor); + return WARN_ON_ONCE(ret); /* keep until all callers handle errors */ } EXPORT_SYMBOL(device_add_disk); -- cgit From 539711d7d6fe382a73254cc966602e63242a6fb3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Aug 2021 09:52:15 +0200 Subject: block: remove a pointless call to MINOR() in device_add_disk blk_alloc_ext_minor already returns just a minor 
number, so no need to mask the high bits. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210824075216.1179406-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index a925f773145f..f13b7eb0238b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -457,7 +457,7 @@ int device_add_disk(struct device *parent, struct gendisk *disk, if (ret < 0) return ret; disk->major = BLOCK_EXT_MAJOR; - disk->first_minor = MINOR(ret); + disk->first_minor = ret; disk->flags |= GENHD_FL_EXT_DEVT; } -- cgit From c4b2b7d150d2b155b317b3e2f66492c6befab2b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Aug 2021 09:52:16 +0200 Subject: block: remove CONFIG_DEBUG_BLOCK_EXT_DEVT This might have been a neat debug aid when the extended dev_t was added, but that time is long gone. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210824075216.1179406-3-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 43 ++++--------------------------------------- 1 file changed, 4 insertions(+), 39 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index f13b7eb0238b..6a5b65c86c4b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -313,54 +313,19 @@ void unregister_blkdev(unsigned int major, const char *name) EXPORT_SYMBOL(unregister_blkdev); -/** - * blk_mangle_minor - scatter minor numbers apart - * @minor: minor number to mangle - * - * Scatter consecutively allocated @minor number apart if MANGLE_DEVT - * is enabled. Mangling twice gives the original value. - * - * RETURNS: - * Mangled value. - * - * CONTEXT: - * Don't care. 
- */ -static int blk_mangle_minor(int minor) -{ -#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT - int i; - - for (i = 0; i < MINORBITS / 2; i++) { - int low = minor & (1 << i); - int high = minor & (1 << (MINORBITS - 1 - i)); - int distance = MINORBITS - 1 - 2 * i; - - minor ^= low | high; /* clear both bits */ - low <<= distance; /* swap the positions */ - high >>= distance; - minor |= low | high; /* and set */ - } -#endif - return minor; -} - int blk_alloc_ext_minor(void) { int idx; idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL); - if (idx < 0) { - if (idx == -ENOSPC) - return -EBUSY; - return idx; - } - return blk_mangle_minor(idx); + if (idx == -ENOSPC) + return -EBUSY; + return idx; } void blk_free_ext_minor(unsigned int minor) { - ida_free(&ext_devt_ida, blk_mangle_minor(minor)); + ida_free(&ext_devt_ida, minor); } static char *bdevt_str(dev_t devt, char *buf) -- cgit From d9cf3bd531844ffbfe94b16e417037a16efc988d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 19 Jul 2021 11:53:00 +0100 Subject: bio: fix page leak bio_add_hw_page failure __bio_iov_append_get_pages() doesn't put the pages that were not appended when bio_add_hw_page() fails, potentially leaking them. Fix it. Also, do the same for __bio_iov_iter_get_pages(), even though it looks like it can't be triggered by userspace in this case.
Fixes: 0512a75b98f8 ("block: Introduce REQ_OP_ZONE_APPEND") Cc: stable@vger.kernel.org # 5.8+ Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1edfa6a2ffd66d55e6345a477df5387d2c1415d0.1626653825.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- block/bio.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 0c89fa2f7a85..265bff6b549a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -974,6 +974,14 @@ static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter) return 0; } +static void bio_put_pages(struct page **pages, size_t size, size_t off) +{ + size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); + + for (i = 0; i < nr; i++) + put_page(pages[i]); +} + #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) /** @@ -1018,8 +1026,10 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) if (same_page) put_page(page); } else { - if (WARN_ON_ONCE(bio_full(bio, len))) - return -EINVAL; + if (WARN_ON_ONCE(bio_full(bio, len))) { + bio_put_pages(pages + i, left, offset); + return -EINVAL; + } __bio_add_page(bio, page, len, offset); } offset = 0; @@ -1064,6 +1074,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) len = min_t(size_t, PAGE_SIZE - offset, left); if (bio_add_hw_page(q, bio, page, len, offset, max_append_sectors, &same_page) != len) { + bio_put_pages(pages + i, left, offset); ret = -EINVAL; break; } -- cgit From 466d9c4904deb25e2e8dcd29d3a998f4e3fa7c17 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Fri, 20 Aug 2021 03:45:34 +0300 Subject: partitions/efi: Support non-standard GPT location Support looking up GPT at a non-standard location specified by a block device driver. 
Acked-by: Davidlohr Bueso Reviewed-by: Christoph Hellwig Signed-off-by: Dmitry Osipenko Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210820004536.15791-3-digetx@gmail.com Signed-off-by: Jens Axboe --- block/partitions/efi.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'block') diff --git a/block/partitions/efi.c b/block/partitions/efi.c index aaa3dc487cb5..7ca5c4c374d4 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -585,6 +585,8 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; + struct gendisk *disk = state->disk; + const struct block_device_operations *fops = disk->fops; sector_t total_sectors = get_capacity(state->disk); u64 lastlba; @@ -619,6 +621,16 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, if (!good_agpt && force_gpt) good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); + if (!good_agpt && force_gpt && fops->alternative_gpt_sector) { + sector_t agpt_sector; + int err; + + err = fops->alternative_gpt_sector(disk, &agpt_sector); + if (!err) + good_agpt = is_gpt_valid(state, agpt_sector, + &agpt, &aptes); + } + /* The obviously unsuccessful case */ if (!good_pgpt && !good_agpt) goto fail; -- cgit From 9f2869921f2a102e209297d4f742f34b46ed3d36 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Aug 2021 16:43:10 +0200 Subject: block: refine the disk_live check in del_gendisk hidden gendisks will never be marked live. 
Fixes: 40b3a52ffc5b ("block: add a sanity check for a live disk in del_gendisk") Reported-by: Bruno Goncalves Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210824144310.1487816-1-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 6a5b65c86c4b..567549a011d1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -556,7 +556,7 @@ void del_gendisk(struct gendisk *disk) { might_sleep(); - if (WARN_ON_ONCE(!disk_live(disk))) + if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN))) return; blk_integrity_del(disk); -- cgit From ead3b768bb51259e3a5f2287ff5fc9041eb6f450 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 11 Aug 2021 11:05:18 +0000 Subject: blk-zoned: allow zone management send operations without CAP_SYS_ADMIN Zone management send operations (BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE) should be allowed under the same permissions as write(). (write() does not require CAP_SYS_ADMIN). Additionally, other ioctls like BLKSECDISCARD and BLKZEROOUT only check if the fd was successfully opened with FMODE_WRITE. (They do not require CAP_SYS_ADMIN). Currently, zone management send operations require both CAP_SYS_ADMIN and that the fd was successfully opened with FMODE_WRITE. Remove the CAP_SYS_ADMIN requirement, so that zone management send operations match the access control requirement of write(), BLKSECDISCARD and BLKZEROOUT. 
Fixes: 3ed05a987e0f ("blk-zoned: implement ioctls") Signed-off-by: Niklas Cassel Reviewed-by: Damien Le Moal Reviewed-by: Aravind Ramesh Reviewed-by: Adam Manzanares Reviewed-by: Himanshu Madhani Reviewed-by: Johannes Thumshirn Cc: stable@vger.kernel.org # v4.10+ Link: https://lore.kernel.org/r/20210811110505.29649-2-Niklas.Cassel@wdc.com Signed-off-by: Jens Axboe --- block/blk-zoned.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 86fce751bb17..8a60dbeb44be 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -421,9 +421,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, if (!blk_queue_is_zoned(q)) return -ENOTTY; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (!(mode & FMODE_WRITE)) return -EBADF; -- cgit From 4d643b66089591b4769bcdb6fd1bfeff2fe301b8 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 11 Aug 2021 11:05:19 +0000 Subject: blk-zoned: allow BLKREPORTZONE without CAP_SYS_ADMIN A user space process should not need the CAP_SYS_ADMIN capability set in order to perform a BLKREPORTZONE ioctl. Getting the zone report is required in order to get the write pointer. Neither read() nor write() requires CAP_SYS_ADMIN, so it is reasonable that a user space process that can read/write from/to the device, also can get the write pointer. (Since e.g. writes have to be at the write pointer.) 
Fixes: 3ed05a987e0f ("blk-zoned: implement ioctls") Signed-off-by: Niklas Cassel Reviewed-by: Damien Le Moal Reviewed-by: Aravind Ramesh Reviewed-by: Adam Manzanares Reviewed-by: Himanshu Madhani Reviewed-by: Johannes Thumshirn Cc: stable@vger.kernel.org # v4.10+ Link: https://lore.kernel.org/r/20210811110505.29649-3-Niklas.Cassel@wdc.com Signed-off-by: Jens Axboe --- block/blk-zoned.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 8a60dbeb44be..1d0c76c18fc5 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -360,9 +360,6 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, if (!blk_queue_is_zoned(q)) return -ENOTTY; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) return -EFAULT; -- cgit From cc40b7225151f611ef837f6403cfaeadc7af214a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 24 Aug 2021 22:59:18 -0700 Subject: blk-crypto: fix check for too-large dun_bytes dun_bytes needs to be less than or equal to the IV size of the encryption mode, not just less than or equal to BLK_CRYPTO_MAX_IV_SIZE. Currently this doesn't matter since blk_crypto_init_key() is never actually passed invalid values, but we might as well fix this. 
Fixes: a892c8d52c02 ("block: Inline encryption support for blk-mq") Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20210825055918.51975-1-ebiggers@kernel.org Signed-off-by: Jens Axboe --- block/blk-crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-crypto.c b/block/blk-crypto.c index c5bdaafffa29..103c2e2d50d6 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -332,7 +332,7 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, if (mode->keysize == 0) return -EINVAL; - if (dun_bytes == 0 || dun_bytes > BLK_CRYPTO_MAX_IV_SIZE) + if (dun_bytes == 0 || dun_bytes > mode->ivsize) return -EINVAL; if (!is_power_of_2(data_unit_size)) -- cgit From 1e294970fc00f45c1f17fb442c26a7e3fc9789b1 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Wed, 25 Aug 2021 14:19:51 +0800 Subject: block, bfq: cleanup the repeated declaration Function 'bfq_entity_to_bfqq' is declared twice, so remove the repeated declaration and blank line. Cc: Paolo Valente Cc: Jens Axboe Signed-off-by: Shaokun Zhang Link: https://lore.kernel.org/r/1629872391-46399-1-git-send-email-zhangshaokun@hisilicon.com Signed-off-by: Jens Axboe --- block/bfq-iosched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 385e28a843d1..a73488eec8a4 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -955,8 +955,6 @@ struct bfq_group { }; #endif -struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); - /* --------------- main algorithm interface ----------------- */ #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ -- cgit