From 4a9d6d667f0bafed55a9e9f5ae8bceb3680749d7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:10 +0100 Subject: block: don't call into the driver for BLKFLSBUF BLKFLSBUF is entirely contained in the block core, and there is no good reason to give the driver a hook into processing it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 3fbc382eb926..c6d8863f0409 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -369,15 +369,8 @@ static inline int is_unrecognized_ioctl(int ret) static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { - int ret; - if (!capable(CAP_SYS_ADMIN)) return -EACCES; - - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - if (!is_unrecognized_ioctl(ret)) - return ret; - fsync_bdev(bdev); invalidate_bdev(bdev); return 0; -- cgit From e00adcadf3af7a8335026d71ab9f0e0a922191ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:11 +0100 Subject: block: add a new set_read_only method Add a new method to allow for driver-specific processing when setting or clearing the block device read-only state. This allows replacing the cumbersome and error-prone override of the whole ioctl implementation. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index c6d8863f0409..a6fa16b97705 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -389,6 +389,11 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, return ret; if (get_user(n, (int __user *)arg)) return -EFAULT; + if (bdev->bd_disk->fops->set_read_only) { + ret = bdev->bd_disk->fops->set_read_only(bdev, n); + if (ret) + return ret; + } set_device_ro(bdev, n); return 0; } -- cgit From 732e12d805a77f74c907c0a28ece271ef1e81e01 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:15 +0100 Subject: block: don't call into the driver for BLKROSET Now that all drivers that want to hook into setting or clearing the read-only flag use the set_read_only method, this code can be removed. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index a6fa16b97705..96cb45447364 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -346,26 +346,6 @@ static int blkdev_pr_clear(struct block_device *bdev, return ops->pr_clear(bdev, c.key); } -/* - * Is it an unrecognized ioctl? The correct returns are either - * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a - * fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl - * code before returning. - * - * Confused drivers sometimes return EINVAL, which is wrong. It - * means "I understood the ioctl command, but the parameters to - * it were wrong". - * - * We should aim to just fix the broken drivers, the EINVAL case - * should go away. 
- */ -static inline int is_unrecognized_ioctl(int ret) -{ - return ret == -EINVAL || - ret == -ENOTTY || - ret == -ENOIOCTLCMD; -} - static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { @@ -384,9 +364,6 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, if (!capable(CAP_SYS_ADMIN)) return -EACCES; - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - if (!is_unrecognized_ioctl(ret)) - return ret; if (get_user(n, (int __user *)arg)) return -EFAULT; if (bdev->bd_disk->fops->set_read_only) { -- cgit From 98f49b63e84d4ee1a5c327d0b5f4e8699f6c70fe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:17 +0100 Subject: block: remove set_device_ro Fold set_device_ro into its only remaining caller. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 7 ------- block/ioctl.c | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 9387f050c248..b85db1f2233c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1846,13 +1846,6 @@ static void set_disk_ro_uevent(struct gendisk *gd, int ro) kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); } -void set_device_ro(struct block_device *bdev, int flag) -{ - bdev->bd_part->policy = flag; -} - -EXPORT_SYMBOL(set_device_ro); - void set_disk_ro(struct gendisk *disk, int flag) { struct disk_part_iter piter; diff --git a/block/ioctl.c b/block/ioctl.c index 96cb45447364..04255dc5f3bf 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -371,7 +371,7 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, if (ret) return ret; } - set_device_ro(bdev, n); + bdev->bd_part->policy = n; return 0; } -- cgit From a7cb3d2f09c8405aed59d97a7d02cebea43cd3c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:18 +0100 Subject: block: remove __blkdev_driver_ioctl Just open code it in the few callers. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 04255dc5f3bf..6b785181344f 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -219,23 +219,6 @@ static int compat_put_ulong(compat_ulong_t __user *argp, compat_ulong_t val) } #endif -int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg) -{ - struct gendisk *disk = bdev->bd_disk; - - if (disk->fops->ioctl) - return disk->fops->ioctl(bdev, mode, cmd, arg); - - return -ENOTTY; -} -/* - * For the record: _GPL here is only because somebody decided to slap it - * on the previous export. Sheer idiocy, since it wasn't copyrightable - * at all and could be open-coded without any exports by anybody who cares. - */ -EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); - #ifdef CONFIG_COMPAT /* * This is the equivalent of compat_ptr_ioctl(), to be used by block @@ -594,10 +577,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, } ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); - if (ret == -ENOIOCTLCMD) - return __blkdev_driver_ioctl(bdev, mode, cmd, arg); + if (ret != -ENOIOCTLCMD) + return ret; - return ret; + if (!bdev->bd_disk->fops->ioctl) + return -ENOTTY; + return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); } EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */ -- cgit From 6b3ba9762f9f9f651873af34481ca20e4a6791e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:24 +0100 Subject: block: cleanup del_gendisk a bit Merge three hidden gendisk checks into one. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index b85db1f2233c..d41176eb1f36 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -895,6 +895,9 @@ void del_gendisk(struct gendisk *disk) might_sleep(); + if (WARN_ON_ONCE(!disk->queue)) + return; + blk_integrity_del(disk); disk_del_events(disk); @@ -917,20 +920,18 @@ void del_gendisk(struct gendisk *disk) disk->flags &= ~GENHD_FL_UP; up_write(&disk->lookup_sem); - if (!(disk->flags & GENHD_FL_HIDDEN)) + if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); - if (disk->queue) { + /* * Unregister bdi before releasing device numbers (as they can * get reused and we'd get clashes in sysfs). */ - if (!(disk->flags & GENHD_FL_HIDDEN)) - bdi_unregister(disk->queue->backing_dev_info); - blk_unregister_queue(disk); - } else { - WARN_ON(1); + bdi_unregister(disk->queue->backing_dev_info); } + blk_unregister_queue(disk); + if (!(disk->flags & GENHD_FL_HIDDEN)) blk_unregister_region(disk_devt(disk), disk->minors); /* -- cgit From 62b508f8b6b1b52843cd90f0b2068ed963f25bd3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:25 +0100 Subject: block: open code kobj_map in block/genhd.c Copy and paste the kobj_map functionality in the block code in preparation for completely rewriting it. 
Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jens Axboe --- block/genhd.c | 130 ++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 117 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index d41176eb1f36..667d1d6fd70a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -29,6 +28,16 @@ static DEFINE_MUTEX(block_class_lock); static struct kobject *block_depr; +struct bdev_map { + struct bdev_map *next; + dev_t dev; + unsigned long range; + struct module *owner; + struct kobject *(*probe)(dev_t, int *, void *); + int (*lock)(dev_t, void *); + void *data; +} *bdev_map[255]; + /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) @@ -520,8 +529,6 @@ void unregister_blkdev(unsigned int major, const char *name) EXPORT_SYMBOL(unregister_blkdev); -static struct kobj_map *bdev_map; - /** * blk_mangle_minor - scatter minor numbers apart * @minor: minor number to mangle @@ -648,16 +655,60 @@ void blk_register_region(dev_t devt, unsigned long range, struct module *module, struct kobject *(*probe)(dev_t, int *, void *), int (*lock)(dev_t, void *), void *data) { - kobj_map(bdev_map, devt, range, module, probe, lock, data); -} + unsigned n = MAJOR(devt + range - 1) - MAJOR(devt) + 1; + unsigned index = MAJOR(devt); + unsigned i; + struct bdev_map *p; + + n = min(n, 255u); + p = kmalloc_array(n, sizeof(struct bdev_map), GFP_KERNEL); + if (p == NULL) + return; + for (i = 0; i < n; i++, p++) { + p->owner = module; + p->probe = probe; + p->lock = lock; + p->dev = devt; + p->range = range; + p->data = data; + } + + mutex_lock(&block_class_lock); + for (i = 0, p -= n; i < n; i++, p++, index++) { + struct bdev_map **s = &bdev_map[index % 255]; + while (*s && (*s)->range < range) + s = &(*s)->next; + p->next = *s; + *s = p; + } + 
mutex_unlock(&block_class_lock); +} EXPORT_SYMBOL(blk_register_region); void blk_unregister_region(dev_t devt, unsigned long range) { - kobj_unmap(bdev_map, devt, range); -} + unsigned n = MAJOR(devt + range - 1) - MAJOR(devt) + 1; + unsigned index = MAJOR(devt); + unsigned i; + struct bdev_map *found = NULL; + mutex_lock(&block_class_lock); + for (i = 0; i < min(n, 255u); i++, index++) { + struct bdev_map **s; + for (s = &bdev_map[index % 255]; *s; s = &(*s)->next) { + struct bdev_map *p = *s; + if (p->dev == devt && p->range == range) { + *s = p->next; + if (!found) + found = p; + break; + } + } + } + mutex_unlock(&block_class_lock); + kfree(found); +} EXPORT_SYMBOL(blk_unregister_region); static struct kobject *exact_match(dev_t devt, int *partno, void *data) @@ -979,6 +1030,47 @@ static ssize_t disk_badblocks_store(struct device *dev, return badblocks_store(disk->bb, page, len, 0); } +static struct gendisk *lookup_gendisk(dev_t dev, int *partno) +{ + struct kobject *kobj; + struct bdev_map *p; + unsigned long best = ~0UL; + +retry: + mutex_lock(&block_class_lock); + for (p = bdev_map[MAJOR(dev) % 255]; p; p = p->next) { + struct kobject *(*probe)(dev_t, int *, void *); + struct module *owner; + void *data; + + if (p->dev > dev || p->dev + p->range - 1 < dev) + continue; + if (p->range - 1 >= best) + break; + if (!try_module_get(p->owner)) + continue; + owner = p->owner; + data = p->data; + probe = p->probe; + best = p->range - 1; + *partno = dev - p->dev; + if (p->lock && p->lock(dev, data) < 0) { + module_put(owner); + continue; + } + mutex_unlock(&block_class_lock); + kobj = probe(dev, partno, data); + /* Currently ->owner protects _only_ ->probe() itself. 
*/ + module_put(owner); + if (kobj) + return dev_to_disk(kobj_to_dev(kobj)); + goto retry; + } + mutex_unlock(&block_class_lock); + return NULL; +} + + /** * get_gendisk - get partitioning information for a given device * @devt: device to get partitioning information for @@ -996,11 +1088,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) might_sleep(); if (MAJOR(devt) != BLOCK_EXT_MAJOR) { - struct kobject *kobj; - - kobj = kobj_lookup(bdev_map, devt, partno); - if (kobj) - disk = dev_to_disk(kobj_to_dev(kobj)); + disk = lookup_gendisk(devt, partno); } else { struct hd_struct *part; @@ -1213,6 +1301,22 @@ static struct kobject *base_probe(dev_t devt, int *partno, void *data) return NULL; } +static void bdev_map_init(void) +{ + struct bdev_map *base; + int i; + + base = kzalloc(sizeof(*base), GFP_KERNEL); + if (!base) + panic("cannot allocate bdev_map"); + + base->dev = 1; + base->range = ~0 ; + base->probe = base_probe; + for (i = 0; i < 255; i++) + bdev_map[i] = base; +} + static int __init genhd_device_init(void) { int error; @@ -1221,7 +1325,7 @@ static int __init genhd_device_init(void) error = class_register(&block_class); if (unlikely(error)) return error; - bdev_map = kobj_map_init(base_probe, &block_class_lock); + bdev_map_init(); blk_dev_init(); register_blkdev(BLOCK_EXT_MAJOR, "blkext"); -- cgit From e49fbbbf0aa14f011ab037086f37f58bd058a6ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:26 +0100 Subject: block: split block_class_lock Split the block_class_lock mutex into one each to protect bdev_map and major_names. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 667d1d6fd70a..8226add353be 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -25,7 +25,6 @@ #include "blk.h" -static DEFINE_MUTEX(block_class_lock); static struct kobject *block_depr; struct bdev_map { @@ -37,6 +36,7 @@ struct bdev_map { int (*lock)(dev_t, void *); void *data; } *bdev_map[255]; +static DEFINE_MUTEX(bdev_map_lock); /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) @@ -403,6 +403,7 @@ static struct blk_major_name { int major; char name[16]; } *major_names[BLKDEV_MAJOR_HASH_SIZE]; +static DEFINE_MUTEX(major_names_lock); /* index in the above - for now: assume no multimajor ranges */ static inline int major_to_index(unsigned major) @@ -415,11 +416,11 @@ void blkdev_show(struct seq_file *seqf, off_t offset) { struct blk_major_name *dp; - mutex_lock(&block_class_lock); + mutex_lock(&major_names_lock); for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next) if (dp->major == offset) seq_printf(seqf, "%3d %s\n", dp->major, dp->name); - mutex_unlock(&block_class_lock); + mutex_unlock(&major_names_lock); } #endif /* CONFIG_PROC_FS */ @@ -448,7 +449,7 @@ int register_blkdev(unsigned int major, const char *name) struct blk_major_name **n, *p; int index, ret = 0; - mutex_lock(&block_class_lock); + mutex_lock(&major_names_lock); /* temporary */ if (major == 0) { @@ -501,7 +502,7 @@ int register_blkdev(unsigned int major, const char *name) kfree(p); } out: - mutex_unlock(&block_class_lock); + mutex_unlock(&major_names_lock); return ret; } @@ -513,7 +514,7 @@ void unregister_blkdev(unsigned int major, const char *name) struct blk_major_name *p = NULL; int index = major_to_index(major); - mutex_lock(&block_class_lock); + 
mutex_lock(&major_names_lock); for (n = &major_names[index]; *n; n = &(*n)->next) if ((*n)->major == major) break; @@ -523,7 +524,7 @@ void unregister_blkdev(unsigned int major, const char *name) p = *n; *n = p->next; } - mutex_unlock(&block_class_lock); + mutex_unlock(&major_names_lock); kfree(p); } @@ -674,7 +675,7 @@ void blk_register_region(dev_t devt, unsigned long range, struct module *module, p->data = data; } - mutex_lock(&block_class_lock); + mutex_lock(&bdev_map_lock); for (i = 0, p -= n; i < n; i++, p++, index++) { struct bdev_map **s = &bdev_map[index % 255]; while (*s && (*s)->range < range) @@ -682,7 +683,7 @@ void blk_register_region(dev_t devt, unsigned long range, struct module *module, p->next = *s; *s = p; } - mutex_unlock(&block_class_lock); + mutex_unlock(&bdev_map_lock); } EXPORT_SYMBOL(blk_register_region); @@ -693,7 +694,7 @@ void blk_unregister_region(dev_t devt, unsigned long range) unsigned i; struct bdev_map *found = NULL; - mutex_lock(&block_class_lock); + mutex_lock(&bdev_map_lock); for (i = 0; i < min(n, 255u); i++, index++) { struct bdev_map **s; for (s = &bdev_map[index % 255]; *s; s = &(*s)->next) { @@ -706,7 +707,7 @@ void blk_unregister_region(dev_t devt, unsigned long range) } } } - mutex_unlock(&block_class_lock); + mutex_unlock(&bdev_map_lock); kfree(found); } EXPORT_SYMBOL(blk_unregister_region); @@ -1037,7 +1038,7 @@ static struct gendisk *lookup_gendisk(dev_t dev, int *partno) unsigned long best = ~0UL; retry: - mutex_lock(&block_class_lock); + mutex_lock(&bdev_map_lock); for (p = bdev_map[MAJOR(dev) % 255]; p; p = p->next) { struct kobject *(*probe)(dev_t, int *, void *); struct module *owner; @@ -1058,7 +1059,7 @@ retry: module_put(owner); continue; } - mutex_unlock(&block_class_lock); + mutex_unlock(&bdev_map_lock); kobj = probe(dev, partno, data); /* Currently ->owner protects _only_ ->probe() itself. 
*/ module_put(owner); @@ -1066,7 +1067,7 @@ retry: return dev_to_disk(kobj_to_dev(kobj)); goto retry; } - mutex_unlock(&block_class_lock); + mutex_unlock(&bdev_map_lock); return NULL; } -- cgit From bd8eff3ba2caca53ea72cf3cc87a7797771dd7d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:27 +0100 Subject: block: rework requesting modules for unclaimed devices Instead of reusing the ranges in bdev_map, add a new helper that is called if no range was found. This is a first step to unpeel and eventually remove the complex ranges structure. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 8226add353be..81017bd3b333 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1031,6 +1031,13 @@ static ssize_t disk_badblocks_store(struct device *dev, return badblocks_store(disk->bb, page, len, 0); } +static void request_gendisk_module(dev_t devt) +{ + if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) + /* Make old-style 2.4 aliases work */ + request_module("block-major-%d", MAJOR(devt)); +} + static struct gendisk *lookup_gendisk(dev_t dev, int *partno) { struct kobject *kobj; @@ -1055,6 +1062,14 @@ retry: probe = p->probe; best = p->range - 1; *partno = dev - p->dev; + + if (!probe) { + mutex_unlock(&bdev_map_lock); + module_put(owner); + request_gendisk_module(dev); + goto retry; + } + if (p->lock && p->lock(dev, data) < 0) { module_put(owner); continue; @@ -1293,15 +1308,6 @@ static const struct seq_operations partitions_op = { }; #endif - -static struct kobject *base_probe(dev_t devt, int *partno, void *data) -{ - if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) - /* Make old-style 2.4 aliases work */ - request_module("block-major-%d", MAJOR(devt)); - return NULL; -} - static void bdev_map_init(void) 
{ struct bdev_map *base; @@ -1313,7 +1319,6 @@ static void bdev_map_init(void) base->dev = 1; base->range = ~0 ; - base->probe = base_probe; for (i = 0; i < 255; i++) bdev_map[i] = base; } -- cgit From a160c6159d4a0cf82f28bc1658a958e278ec3688 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:28 +0100 Subject: block: add an optional probe callback to major_names Add a callback to the major_names array that allows a driver to override how to probe for dev_t that doesn't currently have a gendisk registered. This will help separating the lookup of the gendisk by dev_t vs probe action for a not currently registered dev_t. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 81017bd3b333..20521163fd06 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -402,6 +402,7 @@ static struct blk_major_name { struct blk_major_name *next; int major; char name[16]; + void (*probe)(dev_t devt); } *major_names[BLKDEV_MAJOR_HASH_SIZE]; static DEFINE_MUTEX(major_names_lock); @@ -444,7 +445,8 @@ void blkdev_show(struct seq_file *seqf, off_t offset) * See Documentation/admin-guide/devices.txt for the list of allocated * major numbers. 
*/ -int register_blkdev(unsigned int major, const char *name) +int __register_blkdev(unsigned int major, const char *name, + void (*probe)(dev_t devt)) { struct blk_major_name **n, *p; int index, ret = 0; @@ -483,6 +485,7 @@ int register_blkdev(unsigned int major, const char *name) } p->major = major; + p->probe = probe; strlcpy(p->name, name, sizeof(p->name)); p->next = NULL; index = major_to_index(major); @@ -505,8 +508,7 @@ out: mutex_unlock(&major_names_lock); return ret; } - -EXPORT_SYMBOL(register_blkdev); +EXPORT_SYMBOL(__register_blkdev); void unregister_blkdev(unsigned int major, const char *name) { @@ -1033,6 +1035,19 @@ static ssize_t disk_badblocks_store(struct device *dev, static void request_gendisk_module(dev_t devt) { + unsigned int major = MAJOR(devt); + struct blk_major_name **n; + + mutex_lock(&major_names_lock); + for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) { + if ((*n)->major == major && (*n)->probe) { + (*n)->probe(devt); + mutex_unlock(&major_names_lock); + return; + } + } + mutex_unlock(&major_names_lock); + if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) /* Make old-style 2.4 aliases work */ request_module("block-major-%d", MAJOR(devt)); -- cgit From e418de3abcda8b102f737919e830024d1455938f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:41 +0100 Subject: block: switch gendisk lookup to a simple xarray Now that bdev_map is only used for finding gendisks, we can use a simple xarray instead of the regions tracking structure for it. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jens Axboe --- block/genhd.c | 208 +++++++++++----------------------------------------------- 1 file changed, 37 insertions(+), 171 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 20521163fd06..01d146598fe7 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -27,15 +27,7 @@ static struct kobject *block_depr; -struct bdev_map { - struct bdev_map *next; - dev_t dev; - unsigned long range; - struct module *owner; - struct kobject *(*probe)(dev_t, int *, void *); - int (*lock)(dev_t, void *); - void *data; -} *bdev_map[255]; +static DEFINE_XARRAY(bdev_map); static DEFINE_MUTEX(bdev_map_lock); /* for extended dynamic devt allocation, currently only one major is used */ @@ -649,85 +641,26 @@ static char *bdevt_str(dev_t devt, char *buf) return buf; } -/* - * Register device numbers dev..(dev+range-1) - * range must be nonzero - * The hash chain is sorted on range, so that subranges can override. 
- */ -void blk_register_region(dev_t devt, unsigned long range, struct module *module, - struct kobject *(*probe)(dev_t, int *, void *), - int (*lock)(dev_t, void *), void *data) -{ - unsigned n = MAJOR(devt + range - 1) - MAJOR(devt) + 1; - unsigned index = MAJOR(devt); - unsigned i; - struct bdev_map *p; - - n = min(n, 255u); - p = kmalloc_array(n, sizeof(struct bdev_map), GFP_KERNEL); - if (p == NULL) - return; - - for (i = 0; i < n; i++, p++) { - p->owner = module; - p->probe = probe; - p->lock = lock; - p->dev = devt; - p->range = range; - p->data = data; - } +static void blk_register_region(struct gendisk *disk) +{ + int i; mutex_lock(&bdev_map_lock); - for (i = 0, p -= n; i < n; i++, p++, index++) { - struct bdev_map **s = &bdev_map[index % 255]; - while (*s && (*s)->range < range) - s = &(*s)->next; - p->next = *s; - *s = p; + for (i = 0; i < disk->minors; i++) { + if (xa_insert(&bdev_map, disk_devt(disk) + i, disk, GFP_KERNEL)) + WARN_ON_ONCE(1); } mutex_unlock(&bdev_map_lock); } -EXPORT_SYMBOL(blk_register_region); -void blk_unregister_region(dev_t devt, unsigned long range) +static void blk_unregister_region(struct gendisk *disk) { - unsigned n = MAJOR(devt + range - 1) - MAJOR(devt) + 1; - unsigned index = MAJOR(devt); - unsigned i; - struct bdev_map *found = NULL; + int i; mutex_lock(&bdev_map_lock); - for (i = 0; i < min(n, 255u); i++, index++) { - struct bdev_map **s; - for (s = &bdev_map[index % 255]; *s; s = &(*s)->next) { - struct bdev_map *p = *s; - if (p->dev == devt && p->range == range) { - *s = p->next; - if (!found) - found = p; - break; - } - } - } + for (i = 0; i < disk->minors; i++) + xa_erase(&bdev_map, disk_devt(disk) + i); mutex_unlock(&bdev_map_lock); - kfree(found); -} -EXPORT_SYMBOL(blk_unregister_region); - -static struct kobject *exact_match(dev_t devt, int *partno, void *data) -{ - struct gendisk *p = data; - - return &disk_to_dev(p)->kobj; -} - -static int exact_lock(dev_t devt, void *data) -{ - struct gendisk *p = data; - - if 
(!get_disk_and_module(p)) - return -1; - return 0; } static void disk_scan_partitions(struct gendisk *disk) @@ -873,8 +806,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); WARN_ON(ret); bdi_set_owner(bdi, dev); - blk_register_region(disk_devt(disk), disk->minors, NULL, - exact_match, exact_lock, disk); + blk_register_region(disk); } register_disk(parent, disk, groups); if (register_queue) @@ -987,7 +919,7 @@ void del_gendisk(struct gendisk *disk) blk_unregister_queue(disk); if (!(disk->flags & GENHD_FL_HIDDEN)) - blk_unregister_region(disk_devt(disk), disk->minors); + blk_unregister_region(disk); /* * Remove gendisk pointer from idr so that it cannot be looked up * while RCU period before freeing gendisk is running to prevent @@ -1053,54 +985,22 @@ static void request_gendisk_module(dev_t devt) request_module("block-major-%d", MAJOR(devt)); } -static struct gendisk *lookup_gendisk(dev_t dev, int *partno) +static bool get_disk_and_module(struct gendisk *disk) { - struct kobject *kobj; - struct bdev_map *p; - unsigned long best = ~0UL; - -retry: - mutex_lock(&bdev_map_lock); - for (p = bdev_map[MAJOR(dev) % 255]; p; p = p->next) { - struct kobject *(*probe)(dev_t, int *, void *); - struct module *owner; - void *data; - - if (p->dev > dev || p->dev + p->range - 1 < dev) - continue; - if (p->range - 1 >= best) - break; - if (!try_module_get(p->owner)) - continue; - owner = p->owner; - data = p->data; - probe = p->probe; - best = p->range - 1; - *partno = dev - p->dev; - - if (!probe) { - mutex_unlock(&bdev_map_lock); - module_put(owner); - request_gendisk_module(dev); - goto retry; - } + struct module *owner; - if (p->lock && p->lock(dev, data) < 0) { - module_put(owner); - continue; - } - mutex_unlock(&bdev_map_lock); - kobj = probe(dev, partno, data); - /* Currently ->owner protects _only_ ->probe() itself. 
*/ + if (!disk->fops) + return false; + owner = disk->fops->owner; + if (owner && !try_module_get(owner)) + return false; + if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) { module_put(owner); - if (kobj) - return dev_to_disk(kobj_to_dev(kobj)); - goto retry; + return false; } - mutex_unlock(&bdev_map_lock); - return NULL; -} + return true; +} /** * get_gendisk - get partitioning information for a given device @@ -1119,7 +1019,19 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) might_sleep(); if (MAJOR(devt) != BLOCK_EXT_MAJOR) { - disk = lookup_gendisk(devt, partno); + mutex_lock(&bdev_map_lock); + disk = xa_load(&bdev_map, devt); + if (!disk) { + mutex_unlock(&bdev_map_lock); + request_gendisk_module(devt); + mutex_lock(&bdev_map_lock); + disk = xa_load(&bdev_map, devt); + } + if (disk && !get_disk_and_module(disk)) + disk = NULL; + if (disk) + *partno = devt - disk_devt(disk); + mutex_unlock(&bdev_map_lock); } else { struct hd_struct *part; @@ -1323,21 +1235,6 @@ static const struct seq_operations partitions_op = { }; #endif -static void bdev_map_init(void) -{ - struct bdev_map *base; - int i; - - base = kzalloc(sizeof(*base), GFP_KERNEL); - if (!base) - panic("cannot allocate bdev_map"); - - base->dev = 1; - base->range = ~0 ; - for (i = 0; i < 255; i++) - bdev_map[i] = base; -} - static int __init genhd_device_init(void) { int error; @@ -1346,7 +1243,6 @@ static int __init genhd_device_init(void) error = class_register(&block_class); if (unlikely(error)) return error; - bdev_map_init(); blk_dev_init(); register_blkdev(BLOCK_EXT_MAJOR, "blkext"); @@ -1895,35 +1791,6 @@ out_free_disk: } EXPORT_SYMBOL(__alloc_disk_node); -/** - * get_disk_and_module - increments the gendisk and gendisk fops module refcount - * @disk: the struct gendisk to increment the refcount for - * - * This increments the refcount for the struct gendisk, and the gendisk's - * fops module owner. - * - * Context: Any context. 
- */ -struct kobject *get_disk_and_module(struct gendisk *disk) -{ - struct module *owner; - struct kobject *kobj; - - if (!disk->fops) - return NULL; - owner = disk->fops->owner; - if (owner && !try_module_get(owner)) - return NULL; - kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj); - if (kobj == NULL) { - module_put(owner); - return NULL; - } - return kobj; - -} -EXPORT_SYMBOL(get_disk_and_module); - /** * put_disk - decrements the gendisk refcount * @disk: the struct gendisk to decrement the refcount for @@ -1960,7 +1827,6 @@ void put_disk_and_module(struct gendisk *disk) module_put(owner); } } -EXPORT_SYMBOL(put_disk_and_module); static void set_disk_ro_uevent(struct gendisk *gd, int ro) { -- cgit From e2b6b301871719d1db0b1ed7a1ed9e06750c80fc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 14 Nov 2020 18:08:21 +0100 Subject: block: fix the kerneldoc comment for __register_blkdev Switch the comment to talk about __register_blkdev instead of register_blkdev and document the new probe parameter. Fixes: 3da1a61e7046 ("block: add an optional probe callback to major_names") Reported-by: Stephen Rothwell Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 01d146598fe7..ec2a24799cd9 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -418,11 +418,12 @@ void blkdev_show(struct seq_file *seqf, off_t offset) #endif /* CONFIG_PROC_FS */ /** - * register_blkdev - register a new block device + * __register_blkdev - register a new block device * * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If * @major = 0, try to allocate any unused major number. * @name: the name of the new block device as a zero terminated string + * @probe: callback that is called on access to any minor number of @major * * The @name must be unique within the system. 
* @@ -436,6 +437,8 @@ void blkdev_show(struct seq_file *seqf, off_t offset) * * See Documentation/admin-guide/devices.txt for the list of allocated * major numbers. + * + * Use register_blkdev instead for any new code. */ int __register_blkdev(unsigned int major, const char *name, void (*probe)(dev_t devt)) -- cgit From 449f4ec9892ebc2f37a7eae6d97db2cf7c65e09a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:56:56 +0100 Subject: block: remove the update_bdev parameter to set_capacity_revalidate_and_notify The update_bdev argument is always set to true, so remove it. Also rename the function to the slightly less verbose set_capacity_and_notify, as propagating the disk size to the block device isn't really revalidation. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Petr Vorel Signed-off-by: Jens Axboe --- block/genhd.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index ec2a24799cd9..4e039524f92b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -47,17 +47,15 @@ static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); /* - * Set disk capacity and notify if the size is not currently - * zero and will not be set to zero + * Set disk capacity and notify if the size is not currently zero and will not + * be set to zero. Returns true if a uevent was sent, otherwise false. 
*/ -bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, - bool update_bdev) +bool set_capacity_and_notify(struct gendisk *disk, sector_t size) { sector_t capacity = get_capacity(disk); set_capacity(disk, size); - if (update_bdev) - revalidate_disk_size(disk, true); + revalidate_disk_size(disk, true); if (capacity != size && capacity != 0 && size != 0) { char *envp[] = { "RESIZE=1", NULL }; @@ -68,8 +66,7 @@ bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, return false; } - -EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify); +EXPORT_SYMBOL_GPL(set_capacity_and_notify); /* * Format the device name of the indicated disk into the supplied buffer and -- cgit From 5a20d073ec54a72d9a732fa44bfe14954eb6332f Mon Sep 17 00:00:00 2001 From: Lei Chen Date: Mon, 30 Nov 2020 10:20:52 +0800 Subject: block: wbt: Remove unnecessary invoking of wbt_update_limits in wbt_init It's unnecessary to call wbt_update_limits explicitly within wbt_init, because it will be called in the following function wbt_queue_depth_changed. Signed-off-by: Lei Chen Signed-off-by: Jens Axboe --- block/blk-wbt.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-wbt.c b/block/blk-wbt.c index fd410086fe1d..0321ca83e73f 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -835,7 +835,6 @@ int wbt_init(struct request_queue *q) rwb->enable_state = WBT_STATE_ON_DEFAULT; rwb->wc = 1; rwb->rq_depth.default_depth = RWB_DEF_DEPTH; - wbt_update_limits(rwb); /* * Assign rwb and add the stats callback. -- cgit From 3f50b95e0edd22824b2650eb65466bf7060f7488 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:43:52 +0100 Subject: block: remove a superflous check in blkpg_do_ioctl sector_t is now always a u64, so this check is not needed. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/ioctl.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 6b785181344f..0c09bb7a6ff3 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -35,15 +35,6 @@ static int blkpg_do_ioctl(struct block_device *bdev, start = p.start >> SECTOR_SHIFT; length = p.length >> SECTOR_SHIFT; - /* check for fit in a hd_struct */ - if (sizeof(sector_t) < sizeof(long long)) { - long pstart = start, plength = length; - - if (pstart != start || plength != length || pstart < 0 || - plength < 0 || p.pno > 65535) - return -EINVAL; - } - switch (op) { case BLKPG_ADD_PARTITION: /* check if partition is aligned to blocksize */ -- cgit From e79319af6d8cfd7311fef1bfbb1c59c94e6e10a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Nov 2020 06:48:53 +0100 Subject: block: use disk_part_iter_exit in disk_part_iter_next Call disk_part_iter_exit in disk_part_iter_next instead of duplicating the functionality. 
Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 4e039524f92b..0bd9c41dd4cb 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -227,8 +227,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) int inc, end; /* put the last partition */ - disk_put_part(piter->part); - piter->part = NULL; + disk_part_iter_exit(piter); /* get part_tbl */ rcu_read_lock(); -- cgit From efdc41c8d49fc1ff9bbef8f68f1cf1d8d59164a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Nov 2020 07:25:37 +0100 Subject: block: use put_device in put_disk Use put_device to put the device instead of poking into the internals and using kobject_put. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 0bd9c41dd4cb..f46e89226fdf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1803,7 +1803,7 @@ EXPORT_SYMBOL(__alloc_disk_node); void put_disk(struct gendisk *disk) { if (disk) - kobject_put(&disk_to_dev(disk)->kobj); + put_device(disk_to_dev(disk)); } EXPORT_SYMBOL(put_disk); -- cgit From 4e7b5671c6a883d94b5428e1a9c141bbd56cb2a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 13:38:40 +0100 Subject: block: remove i_bdev Switch the block device lookup interfaces to directly work with a dev_t so that struct block_device references are only acquired by the blkdev_get variants (and the blk-cgroup special case). 
This means that we now don't need an extra reference in the inode and can generally simplify handling of struct block_device to keep the lookups contained in the core block layer code. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Tejun Heo Acked-by: Coly Li [bcache] Signed-off-by: Jens Axboe --- block/ioctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 0c09bb7a6ff3..a6d8171221c7 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -590,8 +590,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { int ret; void __user *argp = compat_ptr(arg); - struct inode *inode = file->f_mapping->host; - struct block_device *bdev = inode->i_bdev; + struct block_device *bdev = I_BDEV(file->f_mapping->host); struct gendisk *disk = bdev->bd_disk; fmode_t mode = file->f_mode; loff_t size; -- cgit From 22ae8ce8b89241c94ac00c237752c0ffa37ba5ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2020 09:23:26 +0100 Subject: block: simplify bdev/disk lookup in blkdev_get To simplify block device lookup and a few other upcoming areas, make sure that we always have a struct block_device available for each disk and each partition, and only find existing block devices in bdget. The only downside of this is that each device and partition uses a little more memory. The upside will be that a lot of code can be simplified. With that all we need to look up the block device is to lookup the inode and do a few sanity checks on the gendisk, instead of the separate lookup for the gendisk. For blk-cgroup which wants to access a gendisk without opening it, a new blkdev_{get,put}_no_open low-level interface is added to replace the previous get_gendisk use. 
Note that the change to look up block device directly instead of the two step lookup using struct gendisk causes a subtle change in behavior: accessing a non-existing partition on an existing block device can now cause a call to request_module. That call is harmless, and in practice no recent system will access these nodes as they aren't created by udev and static /dev/ setups are unusual. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 42 +++++----- block/blk-iocost.c | 36 ++++----- block/blk.h | 2 +- block/genhd.c | 210 ++++++------------------------------------------ block/partitions/core.c | 29 ++++--- 5 files changed, 81 insertions(+), 238 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c68bdf58c9a6..ad02289a4f7f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -556,22 +556,22 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, } /** - * blkg_conf_prep - parse and prepare for per-blkg config update + * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update * @inputp: input string pointer * * Parse the device node prefix part, MAJ:MIN, of per-blkg config update - * from @input and get and return the matching gendisk. *@inputp is + * from @input and get and return the matching bdev. *@inputp is * updated to point past the device node prefix. Returns an ERR_PTR() * value on error. * * Use this function iff blkg_conf_prep() can't be used for some reason.
*/ -struct gendisk *blkcg_conf_get_disk(char **inputp) +struct block_device *blkcg_conf_open_bdev(char **inputp) { char *input = *inputp; unsigned int major, minor; - struct gendisk *disk; - int key_len, part; + struct block_device *bdev; + int key_len; if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return ERR_PTR(-EINVAL); @@ -581,16 +581,16 @@ struct gendisk *blkcg_conf_get_disk(char **inputp) return ERR_PTR(-EINVAL); input = skip_spaces(input); - disk = get_gendisk(MKDEV(major, minor), &part); - if (!disk) + bdev = blkdev_get_no_open(MKDEV(major, minor)); + if (!bdev) return ERR_PTR(-ENODEV); - if (part) { - put_disk_and_module(disk); + if (bdev_is_partition(bdev)) { + blkdev_put_no_open(bdev); return ERR_PTR(-ENODEV); } *inputp = input; - return disk; + return bdev; } /** @@ -607,18 +607,18 @@ struct gendisk *blkcg_conf_get_disk(char **inputp) */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx) - __acquires(rcu) __acquires(&disk->queue->queue_lock) + __acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock) { - struct gendisk *disk; + struct block_device *bdev; struct request_queue *q; struct blkcg_gq *blkg; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + bdev = blkcg_conf_open_bdev(&input); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); - q = disk->queue; + q = bdev->bd_disk->queue; rcu_read_lock(); spin_lock_irq(&q->queue_lock); @@ -689,7 +689,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, goto success; } success: - ctx->disk = disk; + ctx->bdev = bdev; ctx->blkg = blkg; ctx->body = input; return 0; @@ -700,7 +700,7 @@ fail_unlock: spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); fail: - put_disk_and_module(disk); + blkdev_put_no_open(bdev); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). 
It isn't strictly necessary but queue @@ -723,11 +723,11 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); * with blkg_conf_prep(). */ void blkg_conf_finish(struct blkg_conf_ctx *ctx) - __releases(&ctx->disk->queue->queue_lock) __releases(rcu) + __releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu) { - spin_unlock_irq(&ctx->disk->queue->queue_lock); + spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock); rcu_read_unlock(); - put_disk_and_module(ctx->disk); + blkdev_put_no_open(ctx->bdev); } EXPORT_SYMBOL_GPL(blkg_conf_finish); diff --git a/block/blk-iocost.c b/block/blk-iocost.c index bbe86d1199dc..8e20fe4bddec 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3120,23 +3120,23 @@ static const match_table_t qos_tokens = { static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { - struct gendisk *disk; + struct block_device *bdev; struct ioc *ioc; u32 qos[NR_QOS_PARAMS]; bool enable, user; char *p; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + bdev = blkcg_conf_open_bdev(&input); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); if (!ioc) { - ret = blk_iocost_init(disk->queue); + ret = blk_iocost_init(bdev->bd_disk->queue); if (ret) goto err; - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); } spin_lock_irq(&ioc->lock); @@ -3231,12 +3231,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return nbytes; einval: ret = -EINVAL; err: - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return ret; } @@ -3287,23 +3287,23 @@ static const match_table_t i_lcoef_tokens = { static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { - struct gendisk *disk; + struct block_device *bdev; struct ioc 
*ioc; u64 u[NR_I_LCOEFS]; bool user; char *p; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + bdev = blkcg_conf_open_bdev(&input); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); if (!ioc) { - ret = blk_iocost_init(disk->queue); + ret = blk_iocost_init(bdev->bd_disk->queue); if (ret) goto err; - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); } spin_lock_irq(&ioc->lock); @@ -3356,13 +3356,13 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return nbytes; einval: ret = -EINVAL; err: - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return ret; } diff --git a/block/blk.h b/block/blk.h index dfab98465db9..c4839abcfa27 100644 --- a/block/blk.h +++ b/block/blk.h @@ -352,7 +352,6 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); int blk_alloc_devt(struct hd_struct *part, dev_t *devt); void blk_free_devt(dev_t devt); -void blk_invalidate_devt(dev_t devt); char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 @@ -384,6 +383,7 @@ static inline void hd_free_part(struct hd_struct *part) { free_percpu(part->dkstats); kfree(part->info); + bdput(part->bdev); percpu_ref_exit(&part->ref); } diff --git a/block/genhd.c b/block/genhd.c index f46e89226fdf..bf8fa82f135f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -27,17 +27,11 @@ static struct kobject *block_depr; -static DEFINE_XARRAY(bdev_map); -static DEFINE_MUTEX(bdev_map_lock); +DECLARE_RWSEM(bdev_lookup_sem); /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) - -/* For extended devt allocation. ext_devt_lock prevents look up - * results from going away underneath its user. 
- */ -static DEFINE_SPINLOCK(ext_devt_lock); -static DEFINE_IDR(ext_devt_idr); +static DEFINE_IDA(ext_devt_ida); static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr); @@ -580,14 +574,7 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) return 0; } - /* allocate ext devt */ - idr_preload(GFP_KERNEL); - - spin_lock_bh(&ext_devt_lock); - idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT); - spin_unlock_bh(&ext_devt_lock); - - idr_preload_end(); + idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL); if (idx < 0) return idx == -ENOSPC ? -EBUSY : idx; @@ -606,26 +593,8 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) */ void blk_free_devt(dev_t devt) { - if (devt == MKDEV(0, 0)) - return; - - if (MAJOR(devt) == BLOCK_EXT_MAJOR) { - spin_lock_bh(&ext_devt_lock); - idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - spin_unlock_bh(&ext_devt_lock); - } -} - -/* - * We invalidate devt by assigning NULL pointer for devt in idr. 
- */ -void blk_invalidate_devt(dev_t devt) -{ - if (MAJOR(devt) == BLOCK_EXT_MAJOR) { - spin_lock_bh(&ext_devt_lock); - idr_replace(&ext_devt_idr, NULL, blk_mangle_minor(MINOR(devt))); - spin_unlock_bh(&ext_devt_lock); - } + if (MAJOR(devt) == BLOCK_EXT_MAJOR) + ida_free(&ext_devt_ida, blk_mangle_minor(MINOR(devt))); } static char *bdevt_str(dev_t devt, char *buf) @@ -640,28 +609,6 @@ static char *bdevt_str(dev_t devt, char *buf) return buf; } -static void blk_register_region(struct gendisk *disk) -{ - int i; - - mutex_lock(&bdev_map_lock); - for (i = 0; i < disk->minors; i++) { - if (xa_insert(&bdev_map, disk_devt(disk) + i, disk, GFP_KERNEL)) - WARN_ON_ONCE(1); - } - mutex_unlock(&bdev_map_lock); -} - -static void blk_unregister_region(struct gendisk *disk) -{ - int i; - - mutex_lock(&bdev_map_lock); - for (i = 0; i < disk->minors; i++) - xa_erase(&bdev_map, disk_devt(disk) + i); - mutex_unlock(&bdev_map_lock); -} - static void disk_scan_partitions(struct gendisk *disk) { struct block_device *bdev; @@ -805,7 +752,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); WARN_ON(ret); bdi_set_owner(bdi, dev); - blk_register_region(disk); + bdev_add(disk->part0.bdev, devt); } register_disk(parent, disk, groups); if (register_queue) @@ -847,8 +794,8 @@ static void invalidate_partition(struct gendisk *disk, int partno) __invalidate_device(bdev, true); /* - * Unhash the bdev inode for this device so that it gets evicted as soon - * as last inode reference is dropped. + * Unhash the bdev inode for this device so that it can't be looked + * up any more even if openers still hold references to it. */ remove_inode_hash(bdev->bd_inode); bdput(bdev); @@ -890,7 +837,8 @@ void del_gendisk(struct gendisk *disk) * Block lookups of the disk until all bdevs are unhashed and the * disk is marked as dead (GENHD_FL_UP cleared). 
*/ - down_write(&disk->lookup_sem); + down_write(&bdev_lookup_sem); + /* invalidate stuff */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); @@ -903,7 +851,7 @@ void del_gendisk(struct gendisk *disk) invalidate_partition(disk, 0); set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; - up_write(&disk->lookup_sem); + up_write(&bdev_lookup_sem); if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); @@ -916,16 +864,6 @@ void del_gendisk(struct gendisk *disk) } blk_unregister_queue(disk); - - if (!(disk->flags & GENHD_FL_HIDDEN)) - blk_unregister_region(disk); - /* - * Remove gendisk pointer from idr so that it cannot be looked up - * while RCU period before freeing gendisk is running to prevent - * use-after-free issues. Note that the device number stays - * "in-use" until we really free the gendisk. - */ - blk_invalidate_devt(disk_devt(disk)); kobject_put(disk->part0.holder_dir); kobject_put(disk->slave_dir); @@ -964,7 +902,7 @@ static ssize_t disk_badblocks_store(struct device *dev, return badblocks_store(disk->bb, page, len, 0); } -static void request_gendisk_module(dev_t devt) +void blk_request_module(dev_t devt) { unsigned int major = MAJOR(devt); struct blk_major_name **n; @@ -984,84 +922,6 @@ static void request_gendisk_module(dev_t devt) request_module("block-major-%d", MAJOR(devt)); } -static bool get_disk_and_module(struct gendisk *disk) -{ - struct module *owner; - - if (!disk->fops) - return false; - owner = disk->fops->owner; - if (owner && !try_module_get(owner)) - return false; - if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) { - module_put(owner); - return false; - } - return true; - -} - -/** - * get_gendisk - get partitioning information for a given device - * @devt: device to get partitioning information for - * @partno: returned partition index - * - * This function gets the structure containing partitioning - * information for the given device @devt. 
- * - * Context: can sleep - */ -struct gendisk *get_gendisk(dev_t devt, int *partno) -{ - struct gendisk *disk = NULL; - - might_sleep(); - - if (MAJOR(devt) != BLOCK_EXT_MAJOR) { - mutex_lock(&bdev_map_lock); - disk = xa_load(&bdev_map, devt); - if (!disk) { - mutex_unlock(&bdev_map_lock); - request_gendisk_module(devt); - mutex_lock(&bdev_map_lock); - disk = xa_load(&bdev_map, devt); - } - if (disk && !get_disk_and_module(disk)) - disk = NULL; - if (disk) - *partno = devt - disk_devt(disk); - mutex_unlock(&bdev_map_lock); - } else { - struct hd_struct *part; - - spin_lock_bh(&ext_devt_lock); - part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - if (part && get_disk_and_module(part_to_disk(part))) { - *partno = part->partno; - disk = part_to_disk(part); - } - spin_unlock_bh(&ext_devt_lock); - } - - if (!disk) - return NULL; - - /* - * Synchronize with del_gendisk() to not return disk that is being - * destroyed. - */ - down_read(&disk->lookup_sem); - if (unlikely((disk->flags & GENHD_FL_HIDDEN) || - !(disk->flags & GENHD_FL_UP))) { - up_read(&disk->lookup_sem); - put_disk_and_module(disk); - disk = NULL; - } else { - up_read(&disk->lookup_sem); - } - return disk; -} - /** * bdget_disk - do bdget() by gendisk and partition number * @disk: gendisk of interest @@ -1559,11 +1419,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) * * This function releases all allocated resources of the gendisk. * - * The struct gendisk refcount is incremented with get_gendisk() or - * get_disk_and_module(), and its refcount is decremented with - * put_disk_and_module() or put_disk(). Once the refcount reaches 0 this - * function is called. - * * Drivers which used __device_add_disk() have a gendisk with a request_queue * assigned. 
Since the request_queue sits on top of the gendisk for these * drivers we also call blk_put_queue() for them, and we expect the @@ -1748,16 +1603,17 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) if (!disk) return NULL; + disk->part0.bdev = bdev_alloc(disk, 0); + if (!disk->part0.bdev) + goto out_free_disk; + disk->part0.dkstats = alloc_percpu(struct disk_stats); if (!disk->part0.dkstats) - goto out_free_disk; + goto out_bdput; - init_rwsem(&disk->lookup_sem); disk->node_id = node_id; - if (disk_expand_part_tbl(disk, 0)) { - free_percpu(disk->part0.dkstats); - goto out_free_disk; - } + if (disk_expand_part_tbl(disk, 0)) + goto out_free_bdstats; ptbl = rcu_dereference_protected(disk->part_tbl, 1); rcu_assign_pointer(ptbl->part[0], &disk->part0); @@ -1773,7 +1629,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) */ hd_sects_seq_init(&disk->part0); if (hd_ref_init(&disk->part0)) - goto out_free_part0; + goto out_free_bdstats; disk->minors = minors; rand_initialize_disk(disk); @@ -1782,8 +1638,10 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) device_initialize(disk_to_dev(disk)); return disk; -out_free_part0: - hd_free_part(&disk->part0); +out_free_bdstats: + free_percpu(disk->part0.dkstats); +out_bdput: + bdput(disk->part0.bdev); out_free_disk: kfree(disk); return NULL; @@ -1807,26 +1665,6 @@ void put_disk(struct gendisk *disk) } EXPORT_SYMBOL(put_disk); -/** - * put_disk_and_module - decrements the module and gendisk refcount - * @disk: the struct gendisk to decrement the refcount for - * - * This is a counterpart of get_disk_and_module() and thus also of - * get_gendisk(). - * - * Context: Any context, but the last reference must not be dropped from - * atomic context. 
- */ -void put_disk_and_module(struct gendisk *disk) -{ - if (disk) { - struct module *owner = disk->fops->owner; - - put_disk(disk); - module_put(owner); - } -} - static void set_disk_ro_uevent(struct gendisk *gd, int ro) { char event[] = "DISK_RO=1"; diff --git a/block/partitions/core.c b/block/partitions/core.c index a02e22411594..696bd9ff63c6 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -340,12 +340,11 @@ void delete_partition(struct hd_struct *part) device_del(part_to_dev(part)); /* - * Remove gendisk pointer from idr so that it cannot be looked up - * while RCU period before freeing gendisk is running to prevent - * use-after-free issues. Note that the device number stays - * "in-use" until we really free the gendisk. + * Remove the block device from the inode hash, so that it cannot be + * looked up any more even when openers still hold references. */ - blk_invalidate_devt(part_devt(part)); + remove_inode_hash(part->bdev->bd_inode); + percpu_ref_kill(&part->ref); } @@ -368,6 +367,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, dev_t devt = MKDEV(0, 0); struct device *ddev = disk_to_dev(disk); struct device *pdev; + struct block_device *bdev; struct disk_part_tbl *ptbl; const char *dname; int err; @@ -402,11 +402,15 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!p) return ERR_PTR(-EBUSY); + err = -ENOMEM; p->dkstats = alloc_percpu(struct disk_stats); - if (!p->dkstats) { - err = -ENOMEM; + if (!p->dkstats) goto out_free; - } + + bdev = bdev_alloc(disk, partno); + if (!bdev) + goto out_free_stats; + p->bdev = bdev; hd_sects_seq_init(p); pdev = part_to_dev(p); @@ -420,10 +424,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, struct partition_meta_info *pinfo; pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id); - if (!pinfo) { - err = -ENOMEM; - goto out_free_stats; - } + if (!pinfo) + goto out_bdput; memcpy(pinfo, info, 
sizeof(*info)); p->info = pinfo; } @@ -470,6 +472,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, } /* everything is up and running, commence */ + bdev_add(bdev, devt); rcu_assign_pointer(ptbl->part[partno], p); /* suppress uevent if the disk suppresses it */ @@ -479,6 +482,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, out_free_info: kfree(p->info); +out_bdput: + bdput(bdev); out_free_stats: free_percpu(p->dkstats); out_free: -- cgit From a782483cc1f875355690625d8253a232f2581418 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2020 18:43:37 +0100 Subject: block: remove the nr_sects field in struct hd_struct Now that the hd_struct always has a block device attached to it, there is no need for having two size field that just get out of sync. Additionally the field in hd_struct did not use proper serialization, possibly allowing for torn writes. By only using the block_device field this problem also gets fixed. 
Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Coly Li [bcache] Acked-by: Chao Yu [f2fs] Signed-off-by: Jens Axboe --- block/bio.c | 4 ++-- block/blk-core.c | 2 +- block/blk.h | 53 -------------------------------------------- block/genhd.c | 59 +++++++++++++++++++++++++++++-------------------- block/partitions/core.c | 17 +++++++++----- 5 files changed, 49 insertions(+), 86 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index fa01bef35bb1..669bb47a3198 100644 --- a/block/bio.c +++ b/block/bio.c @@ -613,8 +613,8 @@ void guard_bio_eod(struct bio *bio) rcu_read_lock(); part = __disk_get_part(bio->bi_disk, bio->bi_partno); if (part) - maxsector = part_nr_sects_read(part); - else + maxsector = bdev_nr_sectors(part->bdev); + else maxsector = get_capacity(bio->bi_disk); rcu_read_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index 2db8bda43b6e..988f45094a38 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -755,7 +755,7 @@ static inline int blk_partition_remap(struct bio *bio) goto out; if (bio_sectors(bio)) { - if (bio_check_eod(bio, part_nr_sects_read(p))) + if (bio_check_eod(bio, bdev_nr_sectors(p->bdev))) goto out; bio->bi_iter.bi_sector += p->start_sect; trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), diff --git a/block/blk.h b/block/blk.h index c4839abcfa27..09cee7024fb4 100644 --- a/block/blk.h +++ b/block/blk.h @@ -387,59 +387,6 @@ static inline void hd_free_part(struct hd_struct *part) percpu_ref_exit(&part->ref); } -/* - * Any access of part->nr_sects which is not protected by partition - * bd_mutex or gendisk bdev bd_mutex, should be done using this - * accessor function. - * - * Code written along the lines of i_size_read() and i_size_write(). - * CONFIG_PREEMPTION case optimizes the case of UP kernel with preemption - * on. 
- */ -static inline sector_t part_nr_sects_read(struct hd_struct *part) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - sector_t nr_sects; - unsigned seq; - do { - seq = read_seqcount_begin(&part->nr_sects_seq); - nr_sects = part->nr_sects; - } while (read_seqcount_retry(&part->nr_sects_seq, seq)); - return nr_sects; -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) - sector_t nr_sects; - - preempt_disable(); - nr_sects = part->nr_sects; - preempt_enable(); - return nr_sects; -#else - return part->nr_sects; -#endif -} - -/* - * Should be called with mutex lock held (typically bd_mutex) of partition - * to provide mutual exlusion among writers otherwise seqcount might be - * left in wrong state leaving the readers spinning infinitely. - */ -static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - preempt_disable(); - write_seqcount_begin(&part->nr_sects_seq); - part->nr_sects = size; - write_seqcount_end(&part->nr_sects_seq); - preempt_enable(); -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) - preempt_disable(); - part->nr_sects = size; - preempt_enable(); -#else - part->nr_sects = size; -#endif -} - int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); diff --git a/block/genhd.c b/block/genhd.c index bf8fa82f135f..c65f485b9db5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -40,6 +40,16 @@ static void disk_add_events(struct gendisk *disk); static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); +void set_capacity(struct gendisk *disk, sector_t sectors) +{ + struct block_device *bdev = disk->part0.bdev; + + spin_lock(&bdev->bd_size_lock); + i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + spin_unlock(&bdev->bd_size_lock); +} +EXPORT_SYMBOL(set_capacity); + /* * Set disk capacity and notify if the size is 
not currently zero and will not * be set to zero. Returns true if a uevent was sent, otherwise false. @@ -47,18 +57,30 @@ static void disk_release_events(struct gendisk *disk); bool set_capacity_and_notify(struct gendisk *disk, sector_t size) { sector_t capacity = get_capacity(disk); + char *envp[] = { "RESIZE=1", NULL }; set_capacity(disk, size); - revalidate_disk_size(disk, true); - if (capacity != size && capacity != 0 && size != 0) { - char *envp[] = { "RESIZE=1", NULL }; + /* + * Only print a message and send a uevent if the gendisk is user visible + * and alive. This avoids spamming the log and udev when setting the + * initial capacity during probing. + */ + if (size == capacity || + (disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) + return false; - kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); - return true; - } + pr_info("%s: detected capacity change from %lld to %lld\n", + disk->disk_name, size, capacity); - return false; + /* + * Historically we did not send a uevent for changes to/from an empty + * device. + */ + if (!capacity || !size) + return false; + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); + return true; } EXPORT_SYMBOL_GPL(set_capacity_and_notify); @@ -247,7 +269,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) part = rcu_dereference(ptbl->part[piter->idx]); if (!part) continue; - if (!part_nr_sects_read(part) && + if (!bdev_nr_sectors(part->bdev) && !(piter->flags & DISK_PITER_INCL_EMPTY) && !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && piter->idx == 0)) @@ -284,7 +306,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit); static inline int sector_in_part(struct hd_struct *part, sector_t sector) { return part->start_sect <= sector && - sector < part->start_sect + part_nr_sects_read(part); + sector < part->start_sect + bdev_nr_sectors(part->bdev); } /** @@ -986,8 +1008,8 @@ void __init printk_all_partitions(void) printk("%s%s %10llu %s %s", is_part0 ? 
"" : " ", bdevt_str(part_devt(part), devt_buf), - (unsigned long long)part_nr_sects_read(part) >> 1 - , disk_name(disk, part->partno, name_buf), + bdev_nr_sectors(part->bdev) >> 1, + disk_name(disk, part->partno, name_buf), part->info ? part->info->uuid : ""); if (is_part0) { if (dev->parent && dev->parent->driver) @@ -1079,7 +1101,7 @@ static int show_partition(struct seq_file *seqf, void *v) while ((part = disk_part_iter_next(&piter))) seq_printf(seqf, "%4d %7d %10llu %s\n", MAJOR(part_devt(part)), MINOR(part_devt(part)), - (unsigned long long)part_nr_sects_read(part) >> 1, + bdev_nr_sectors(part->bdev) >> 1, disk_name(sgp, part->partno, buf)); disk_part_iter_exit(&piter); @@ -1161,8 +1183,7 @@ ssize_t part_size_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n", - (unsigned long long)part_nr_sects_read(p)); + return sprintf(buf, "%llu\n", bdev_nr_sectors(p->bdev)); } ssize_t part_stat_show(struct device *dev, @@ -1618,16 +1639,6 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) ptbl = rcu_dereference_protected(disk->part_tbl, 1); rcu_assign_pointer(ptbl->part[0], &disk->part0); - /* - * set_capacity() and get_capacity() currently don't use - * seqcounter to read/update the part0->nr_sects. Still init - * the counter as we can read the sectors in IO submission - * patch using seqence counters. - * - * TODO: Ideally set_capacity() and get_capacity() should be - * converted to make use of bd_mutex and sequence counters. 
- */ - hd_sects_seq_init(&disk->part0); if (hd_ref_init(&disk->part0)) goto out_free_bdstats; diff --git a/block/partitions/core.c b/block/partitions/core.c index 696bd9ff63c6..bcfa8215bd5e 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -85,6 +85,13 @@ static int (*check_part[])(struct parsed_partitions *) = { NULL }; +static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) +{ + spin_lock(&bdev->bd_size_lock); + i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + spin_unlock(&bdev->bd_size_lock); +} + static struct parsed_partitions *allocate_partitions(struct gendisk *hd) { struct parsed_partitions *state; @@ -295,7 +302,7 @@ static void hd_struct_free_work(struct work_struct *work) put_device(disk_to_dev(disk)); part->start_sect = 0; - part->nr_sects = 0; + bdev_set_nr_sectors(part->bdev, 0); part_stat_set_all(part, 0); put_device(part_to_dev(part)); } @@ -412,11 +419,10 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_free_stats; p->bdev = bdev; - hd_sects_seq_init(p); pdev = part_to_dev(p); p->start_sect = start; - p->nr_sects = len; + bdev_set_nr_sectors(bdev, len); p->partno = partno; p->policy = get_disk_ro(disk); @@ -509,7 +515,7 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { if (part->partno == skip_partno || - start >= part->start_sect + part->nr_sects || + start >= part->start_sect + bdev_nr_sectors(part->bdev) || start + length <= part->start_sect) continue; overlap = true; @@ -600,8 +606,7 @@ int bdev_resize_partition(struct block_device *bdev, int partno, if (partition_overlaps(bdev->bd_disk, start, length, partno)) goto out_unlock; - part_nr_sects_write(part, length); - bd_set_nr_sectors(bdevp, length); + bdev_set_nr_sectors(bdevp, length); ret = 0; out_unlock: -- cgit From 15e3d2c5cd53298272e59ad9072d3468f9dd3781 Mon Sep 17 
00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:34:00 +0100 Subject: block: move disk stat accounting to struct block_device Move the dkstats and stamp field to struct block_device in preparation of killing struct hd_struct. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-core.c | 4 ++-- block/blk.h | 1 - block/genhd.c | 14 ++++---------- block/partitions/core.c | 9 +-------- 5 files changed, 8 insertions(+), 22 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index ad02289a4f7f..79aa96240cec 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -830,7 +830,7 @@ static void blkcg_fill_root_iostats(void) for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; - cpu_dkstats = per_cpu_ptr(part->dkstats, cpu); + cpu_dkstats = per_cpu_ptr(part->bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += cpu_dkstats->ios[STAT_READ]; tmp.ios[BLKG_IOSTAT_WRITE] += diff --git a/block/blk-core.c b/block/blk-core.c index 988f45094a38..d2c9cb24e087 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1264,9 +1264,9 @@ static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) { unsigned long stamp; again: - stamp = READ_ONCE(part->stamp); + stamp = READ_ONCE(part->bdev->bd_stamp); if (unlikely(stamp != now)) { - if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) + if (likely(cmpxchg(&part->bdev->bd_stamp, stamp, now) == stamp)) __part_stat_add(part, io_ticks, end ? 
now - stamp : 1); } if (part->partno) { diff --git a/block/blk.h b/block/blk.h index 09cee7024fb4..3f801f6e86f8 100644 --- a/block/blk.h +++ b/block/blk.h @@ -381,7 +381,6 @@ static inline void hd_struct_put(struct hd_struct *part) static inline void hd_free_part(struct hd_struct *part) { - free_percpu(part->dkstats); kfree(part->info); bdput(part->bdev); percpu_ref_exit(&part->ref); diff --git a/block/genhd.c b/block/genhd.c index c65f485b9db5..2cbda8139556 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -112,7 +112,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) memset(stat, 0, sizeof(struct disk_stats)); for_each_possible_cpu(cpu) { - struct disk_stats *ptr = per_cpu_ptr(part->dkstats, cpu); + struct disk_stats *ptr = per_cpu_ptr(part->bdev->bd_stats, cpu); int group; for (group = 0; group < NR_STAT_GROUPS; group++) { @@ -891,7 +891,7 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->slave_dir); part_stat_set_all(&disk->part0, 0); - disk->part0.stamp = 0; + disk->part0.bdev->bd_stamp = 0; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -1628,19 +1628,15 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) if (!disk->part0.bdev) goto out_free_disk; - disk->part0.dkstats = alloc_percpu(struct disk_stats); - if (!disk->part0.dkstats) - goto out_bdput; - disk->node_id = node_id; if (disk_expand_part_tbl(disk, 0)) - goto out_free_bdstats; + goto out_bdput; ptbl = rcu_dereference_protected(disk->part_tbl, 1); rcu_assign_pointer(ptbl->part[0], &disk->part0); if (hd_ref_init(&disk->part0)) - goto out_free_bdstats; + goto out_bdput; disk->minors = minors; rand_initialize_disk(disk); @@ -1649,8 +1645,6 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) device_initialize(disk_to_dev(disk)); return disk; -out_free_bdstats: - free_percpu(disk->part0.dkstats); out_bdput: bdput(disk->part0.bdev); out_free_disk: diff 
--git a/block/partitions/core.c b/block/partitions/core.c index bcfa8215bd5e..8924e1ea8b2a 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -409,14 +409,9 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!p) return ERR_PTR(-EBUSY); - err = -ENOMEM; - p->dkstats = alloc_percpu(struct disk_stats); - if (!p->dkstats) - goto out_free; - bdev = bdev_alloc(disk, partno); if (!bdev) - goto out_free_stats; + goto out_free; p->bdev = bdev; pdev = part_to_dev(p); @@ -490,8 +485,6 @@ out_free_info: kfree(p->info); out_bdput: bdput(bdev); -out_free_stats: - free_percpu(p->dkstats); out_free: kfree(p); return ERR_PTR(err); -- cgit From 29ff57c61094e7bbd921ab10b5a99dce9a0132e0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:34:24 +0100 Subject: block: move the start_sect field to struct block_device Move the start_sect field to struct block_device in preparation of killing struct hd_struct. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++-- block/blk-lib.c | 2 +- block/genhd.c | 4 ++-- block/partitions/core.c | 17 +++++++++-------- 4 files changed, 15 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index d2c9cb24e087..9a3793d5ce38 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -757,9 +757,10 @@ static inline int blk_partition_remap(struct bio *bio) if (bio_sectors(bio)) { if (bio_check_eod(bio, bdev_nr_sectors(p->bdev))) goto out; - bio->bi_iter.bi_sector += p->start_sect; + bio->bi_iter.bi_sector += p->bdev->bd_start_sect; trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), - bio->bi_iter.bi_sector - p->start_sect); + bio->bi_iter.bi_sector - + p->bdev->bd_start_sect); } bio->bi_partno = 0; ret = 0; diff --git a/block/blk-lib.c b/block/blk-lib.c index e90614fd8d6a..752f9c722062 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -65,7 
+65,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, /* In case the discard request is in a partition */ if (bdev_is_partition(bdev)) - part_offset = bdev->bd_part->start_sect; + part_offset = bdev->bd_start_sect; while (nr_sects) { sector_t granularity_aligned_lba, req_sects; diff --git a/block/genhd.c b/block/genhd.c index 2cbda8139556..5efb2df1f079 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -305,8 +305,8 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit); static inline int sector_in_part(struct hd_struct *part, sector_t sector) { - return part->start_sect <= sector && - sector < part->start_sect + bdev_nr_sectors(part->bdev); + return part->bdev->bd_start_sect <= sector && + sector < part->bdev->bd_start_sect + bdev_nr_sectors(part->bdev); } /** diff --git a/block/partitions/core.c b/block/partitions/core.c index 8924e1ea8b2a..460a745812c6 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -192,7 +192,7 @@ static ssize_t part_start_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); + return sprintf(buf, "%llu\n", p->bdev->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, @@ -209,7 +209,7 @@ static ssize_t part_alignment_offset_show(struct device *dev, return sprintf(buf, "%u\n", queue_limit_alignment_offset(&part_to_disk(p)->queue->limits, - p->start_sect)); + p->bdev->bd_start_sect)); } static ssize_t part_discard_alignment_show(struct device *dev, @@ -219,7 +219,7 @@ static ssize_t part_discard_alignment_show(struct device *dev, return sprintf(buf, "%u\n", queue_limit_discard_alignment(&part_to_disk(p)->queue->limits, - p->start_sect)); + p->bdev->bd_start_sect)); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); @@ -301,7 +301,7 @@ static void hd_struct_free_work(struct work_struct *work) */ put_device(disk_to_dev(disk)); - part->start_sect = 0; + part->bdev->bd_start_sect = 0; 
bdev_set_nr_sectors(part->bdev, 0); part_stat_set_all(part, 0); put_device(part_to_dev(part)); @@ -416,7 +416,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev = part_to_dev(p); - p->start_sect = start; + bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); p->partno = partno; p->policy = get_disk_ro(disk); @@ -508,8 +508,9 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { if (part->partno == skip_partno || - start >= part->start_sect + bdev_nr_sectors(part->bdev) || - start + length <= part->start_sect) + start >= part->bdev->bd_start_sect + + bdev_nr_sectors(part->bdev) || + start + length <= part->bdev->bd_start_sect) continue; overlap = true; break; @@ -592,7 +593,7 @@ int bdev_resize_partition(struct block_device *bdev, int partno, mutex_lock_nested(&bdev->bd_mutex, 1); ret = -EINVAL; - if (start != part->start_sect) + if (start != part->bdev->bd_start_sect) goto out_unlock; ret = -EBUSY; -- cgit From 231926dbf0f084211e4ec4f4c006f0bf1f47809a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 12:01:45 +0100 Subject: block: move the partition_meta_info to struct block_device Move the partition_meta_info to struct block_device in preparation for killing struct hd_struct. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk.h | 1 - block/genhd.c | 3 ++- block/partitions/core.c | 18 +++++++----------- 3 files changed, 9 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index 3f801f6e86f8..0bd4b58bcbaf 100644 --- a/block/blk.h +++ b/block/blk.h @@ -381,7 +381,6 @@ static inline void hd_struct_put(struct hd_struct *part) static inline void hd_free_part(struct hd_struct *part) { - kfree(part->info); bdput(part->bdev); percpu_ref_exit(&part->ref); } diff --git a/block/genhd.c b/block/genhd.c index 5efb2df1f079..4273e89f07e8 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1010,7 +1010,8 @@ void __init printk_all_partitions(void) bdevt_str(part_devt(part), devt_buf), bdev_nr_sectors(part->bdev) >> 1, disk_name(disk, part->partno, name_buf), - part->info ? part->info->uuid : ""); + part->bdev->bd_meta_info ? + part->bdev->bd_meta_info->uuid : ""); if (is_part0) { if (dev->parent && dev->parent->driver) printk(" driver: %s\n", diff --git a/block/partitions/core.c b/block/partitions/core.c index 460a745812c6..07df9ff55462 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -275,8 +275,9 @@ static int part_uevent(struct device *dev, struct kobj_uevent_env *env) struct hd_struct *part = dev_to_part(dev); add_uevent_var(env, "PARTN=%u", part->partno); - if (part->info && part->info->volname[0]) - add_uevent_var(env, "PARTNAME=%s", part->info->volname); + if (part->bdev->bd_meta_info && part->bdev->bd_meta_info->volname[0]) + add_uevent_var(env, "PARTNAME=%s", + part->bdev->bd_meta_info->volname); return 0; } @@ -422,13 +423,10 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, p->policy = get_disk_ro(disk); if (info) { - struct partition_meta_info *pinfo; - - pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id); - if (!pinfo) + err = -ENOMEM; + bdev->bd_meta_info = kmemdup(info, 
sizeof(*info), GFP_KERNEL); + if (!bdev->bd_meta_info) goto out_bdput; - memcpy(pinfo, info, sizeof(*info)); - p->info = pinfo; } dname = dev_name(ddev); @@ -444,7 +442,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, err = blk_alloc_devt(p, &devt); if (err) - goto out_free_info; + goto out_bdput; pdev->devt = devt; /* delay uevent until 'holders' subdir is created */ @@ -481,8 +479,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, kobject_uevent(&pdev->kobj, KOBJ_ADD); return p; -out_free_info: - kfree(p->info); out_bdput: bdput(bdev); out_free: -- cgit From 1bdd5ae0251d678488dffcf455d4633c2beef1bc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 19:00:13 +0100 Subject: block: move holder_dir to struct block_device Move the holder_dir field to struct block_device in preparation for killing struct hd_struct. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 5 +++-- block/partitions/core.c | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 4273e89f07e8..0bd7026cee62 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -681,7 +681,8 @@ static void register_disk(struct device *parent, struct gendisk *disk, */ pm_runtime_set_memalloc_noio(ddev, true); - disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); + disk->part0.bdev->bd_holder_dir = + kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); if (disk->flags & GENHD_FL_HIDDEN) { @@ -887,7 +888,7 @@ void del_gendisk(struct gendisk *disk) blk_unregister_queue(disk); - kobject_put(disk->part0.holder_dir); + kobject_put(disk->part0.bdev->bd_holder_dir); kobject_put(disk->slave_dir); part_stat_set_all(&disk->part0, 0); diff --git a/block/partitions/core.c b/block/partitions/core.c index 07df9ff55462..c068471fa654
100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -344,7 +344,7 @@ void delete_partition(struct hd_struct *part) */ get_device(disk_to_dev(disk)); rcu_assign_pointer(ptbl->part[part->partno], NULL); - kobject_put(part->holder_dir); + kobject_put(part->bdev->bd_holder_dir); device_del(part_to_dev(part)); /* @@ -452,8 +452,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_put; err = -ENOMEM; - p->holder_dir = kobject_create_and_add("holders", &pdev->kobj); - if (!p->holder_dir) + bdev->bd_holder_dir = kobject_create_and_add("holders", &pdev->kobj); + if (!bdev->bd_holder_dir) goto out_del; dev_set_uevent_suppress(pdev, 0); @@ -487,7 +487,7 @@ out_free: out_remove_file: device_remove_file(pdev, &dev_attr_whole_disk); out_del: - kobject_put(p->holder_dir); + kobject_put(bdev->bd_holder_dir); device_del(pdev); out_put: put_device(pdev); -- cgit From b309e9936347232c724eaa13f70533128b4864e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 16:28:47 +0100 Subject: block: move make_it_fail to struct block_device Move the make_it_fail flag to struct block_device and turn it into a bool in preparation of killing struct hd_struct.
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 3 ++- block/genhd.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 9a3793d5ce38..9121390be97a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -668,7 +668,8 @@ __setup("fail_make_request=", setup_fail_make_request); static bool should_fail_request(struct hd_struct *part, unsigned int bytes) { - return part->make_it_fail && should_fail(&fail_make_request, bytes); + return part->bdev->bd_make_it_fail && + should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) diff --git a/block/genhd.c b/block/genhd.c index 0bd7026cee62..f9c957739d4b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1292,7 +1292,7 @@ ssize_t part_fail_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->make_it_fail); + return sprintf(buf, "%d\n", p->bdev->bd_make_it_fail); } ssize_t part_fail_store(struct device *dev, @@ -1303,7 +1303,7 @@ ssize_t part_fail_store(struct device *dev, int i; if (count > 0 && sscanf(buf, "%d", &i) > 0) - p->make_it_fail = (i == 0) ? 0 : 1; + p->bdev->bd_make_it_fail = i; return count; } -- cgit From 83950d359010a493462d58c712b1124c877d1b3b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 16:36:02 +0100 Subject: block: move the policy field to struct block_device Move the policy field to struct block_device and rename it to the more descriptive bd_read_only. Also turn the field into a bool as it is used as such. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/genhd.c | 8 ++++---- block/ioctl.c | 2 +- block/partitions/core.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 9121390be97a..d64ffcb6f9ae 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -696,7 +696,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) { const int op = bio_op(bio); - if (part->policy && op_is_write(op)) { + if (part->bdev->bd_read_only && op_is_write(op)) { char b[BDEVNAME_SIZE]; if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) diff --git a/block/genhd.c b/block/genhd.c index f9c957739d4b..2db1204920a9 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1687,14 +1687,14 @@ void set_disk_ro(struct gendisk *disk, int flag) struct disk_part_iter piter; struct hd_struct *part; - if (disk->part0.policy != flag) { + if (disk->part0.bdev->bd_read_only != flag) { set_disk_ro_uevent(disk, flag); - disk->part0.policy = flag; + disk->part0.bdev->bd_read_only = flag; } disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - part->policy = flag; + part->bdev->bd_read_only = flag; disk_part_iter_exit(&piter); } @@ -1704,7 +1704,7 @@ int bdev_read_only(struct block_device *bdev) { if (!bdev) return 0; - return bdev->bd_part->policy; + return bdev->bd_read_only; } EXPORT_SYMBOL(bdev_read_only); diff --git a/block/ioctl.c b/block/ioctl.c index a6d8171221c7..d61d652078f4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -345,7 +345,7 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, if (ret) return ret; } - bdev->bd_part->policy = n; + bdev->bd_read_only = n; return 0; } diff --git a/block/partitions/core.c b/block/partitions/core.c index c068471fa654..060c1be13cd8 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -199,7 
+199,7 @@ static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->policy ? 1 : 0); + return sprintf(buf, "%d\n", p->bdev->bd_read_only); } static ssize_t part_alignment_offset_show(struct device *dev, @@ -420,7 +420,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); p->partno = partno; - p->policy = get_disk_ro(disk); + bdev->bd_read_only = get_disk_ro(disk); if (info) { err = -ENOMEM; -- cgit From cb8432d650fe3be58bb962bc8e602dc405510327 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2020 18:47:17 +0100 Subject: block: allocate struct hd_struct as part of struct bdev_inode Allocate hd_struct together with struct block_device to pre-load the lifetime rule changes in preparation of merging the two structures. Note that part0 was previously embedded into struct gendisk, but is a separate allocation now, and already points to the block_device instead of the hd_struct. The lifetime of struct gendisk is still controlled by the struct device embedded in the part0 hd_struct. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 16 ++++++------ block/blk-flush.c | 2 +- block/blk-merge.c | 2 -- block/blk.h | 21 ---------------- block/genhd.c | 50 ++++++++++++++---------------------- block/partitions/core.c | 67 +++++-------------------------------------------- 6 files changed, 33 insertions(+), 125 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index d64ffcb6f9ae..9ea70275fc1c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -714,7 +714,8 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) static noinline int should_fail_bio(struct bio *bio) { - if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) + if (should_fail_request(bio->bi_disk->part0->bd_part, + bio->bi_iter.bi_size)) return -EIO; return 0; } @@ -831,7 +832,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (unlikely(blk_partition_remap(bio))) goto end_io; } else { - if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) + if (unlikely(bio_check_ro(bio, bio->bi_disk->part0->bd_part))) goto end_io; if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) goto end_io; @@ -1203,7 +1204,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * return ret; if (rq->rq_disk && - should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) + should_fail_request(rq->rq_disk->part0->bd_part, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (blk_crypto_insert_cloned_request(rq)) @@ -1272,7 +1273,7 @@ again: __part_stat_add(part, io_ticks, end ? 
now - stamp : 1); } if (part->partno) { - part = &part_to_disk(part)->part0; + part = part_to_disk(part)->part0->bd_part; goto again; } } @@ -1309,8 +1310,6 @@ void blk_account_io_done(struct request *req, u64 now) part_stat_inc(part, ios[sgrp]); part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); part_stat_unlock(); - - hd_struct_put(part); } } @@ -1354,7 +1353,7 @@ EXPORT_SYMBOL_GPL(part_start_io_acct); unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op) { - return __part_start_io_acct(&disk->part0, sectors, op); + return __part_start_io_acct(disk->part0->bd_part, sectors, op); } EXPORT_SYMBOL(disk_start_io_acct); @@ -1376,14 +1375,13 @@ void part_end_io_acct(struct hd_struct *part, struct bio *bio, unsigned long start_time) { __part_end_io_acct(part, bio_op(bio), start_time); - hd_struct_put(part); } EXPORT_SYMBOL_GPL(part_end_io_acct); void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time) { - __part_end_io_acct(&disk->part0, op, start_time); + __part_end_io_acct(disk->part0->bd_part, op, start_time); } EXPORT_SYMBOL(disk_end_io_acct); diff --git a/block/blk-flush.c b/block/blk-flush.c index e32958f0b687..fcd0a60574df 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -139,7 +139,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front) static void blk_account_io_flush(struct request *rq) { - struct hd_struct *part = &rq->rq_disk->part0; + struct hd_struct *part = rq->rq_disk->part0->bd_part; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); diff --git a/block/blk-merge.c b/block/blk-merge.c index bcf5e4580603..cb351ab9b77d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -683,8 +683,6 @@ static void blk_account_io_merge_request(struct request *req) part_stat_lock(); part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); part_stat_unlock(); - - hd_struct_put(req->part); } } diff --git a/block/blk.h b/block/blk.h index 
0bd4b58bcbaf..32ac41f7557f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -363,27 +363,6 @@ int bdev_del_partition(struct block_device *bdev, int partno); int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); int disk_expand_part_tbl(struct gendisk *disk, int target); -int hd_ref_init(struct hd_struct *part); - -/* no need to get/put refcount of part0 */ -static inline int hd_struct_try_get(struct hd_struct *part) -{ - if (part->partno) - return percpu_ref_tryget_live(&part->ref); - return 1; -} - -static inline void hd_struct_put(struct hd_struct *part) -{ - if (part->partno) - percpu_ref_put(&part->ref); -} - -static inline void hd_free_part(struct hd_struct *part) -{ - bdput(part->bdev); - percpu_ref_exit(&part->ref); -} int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, diff --git a/block/genhd.c b/block/genhd.c index 2db1204920a9..c35b03dac5e5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -42,7 +42,7 @@ static void disk_release_events(struct gendisk *disk); void set_capacity(struct gendisk *disk, sector_t sectors) { - struct block_device *bdev = disk->part0.bdev; + struct block_device *bdev = disk->part0; spin_lock(&bdev->bd_size_lock); i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); @@ -318,9 +318,7 @@ static inline int sector_in_part(struct hd_struct *part, sector_t sector) * primarily used for stats accounting. * * CONTEXT: - * RCU read locked. The returned partition pointer is always valid - * because its refcount is grabbed except for part0, which lifetime - * is same with the disk. + * RCU read locked. 
* * RETURNS: * Found partition on success, part0 is returned if no partition matches @@ -336,26 +334,19 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) ptbl = rcu_dereference(disk->part_tbl); part = rcu_dereference(ptbl->last_lookup); - if (part && sector_in_part(part, sector) && hd_struct_try_get(part)) + if (part && sector_in_part(part, sector)) goto out_unlock; for (i = 1; i < ptbl->len; i++) { part = rcu_dereference(ptbl->part[i]); if (part && sector_in_part(part, sector)) { - /* - * only live partition can be cached for lookup, - * so use-after-free on cached & deleting partition - * can be avoided - */ - if (!hd_struct_try_get(part)) - break; rcu_assign_pointer(ptbl->last_lookup, part); goto out_unlock; } } - part = &disk->part0; + part = disk->part0->bd_part; out_unlock: rcu_read_unlock(); return part; @@ -681,8 +672,8 @@ static void register_disk(struct device *parent, struct gendisk *disk, */ pm_runtime_set_memalloc_noio(ddev, true); - disk->part0.bdev->bd_holder_dir = - kobject_create_and_add("holders", &ddev->kobj); + disk->part0->bd_holder_dir = + kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); if (disk->flags & GENHD_FL_HIDDEN) { @@ -748,7 +739,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_UP; - retval = blk_alloc_devt(&disk->part0, &devt); + retval = blk_alloc_devt(disk->part0->bd_part, &devt); if (retval) { WARN_ON(1); return; @@ -775,7 +766,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); WARN_ON(ret); bdi_set_owner(bdi, dev); - bdev_add(disk->part0.bdev, devt); + bdev_add(disk->part0, devt); } register_disk(parent, disk, groups); if (register_queue) @@ -888,11 +879,11 @@ void del_gendisk(struct gendisk *disk) blk_unregister_queue(disk); - kobject_put(disk->part0.bdev->bd_holder_dir); + 
kobject_put(disk->part0->bd_holder_dir); kobject_put(disk->slave_dir); - part_stat_set_all(&disk->part0, 0); - disk->part0.bdev->bd_stamp = 0; + part_stat_set_all(disk->part0->bd_part, 0); + disk->part0->bd_stamp = 0; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -1005,7 +996,7 @@ void __init printk_all_partitions(void) */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) { - bool is_part0 = part == &disk->part0; + bool is_part0 = part == disk->part0->bd_part; printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), @@ -1460,7 +1451,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); disk_replace_part_tbl(disk, NULL); - hd_free_part(&disk->part0); + bdput(disk->part0); if (disk->queue) blk_put_queue(disk->queue); kfree(disk); @@ -1626,8 +1617,8 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) if (!disk) return NULL; - disk->part0.bdev = bdev_alloc(disk, 0); - if (!disk->part0.bdev) + disk->part0 = bdev_alloc(disk, 0); + if (!disk->part0) goto out_free_disk; disk->node_id = node_id; @@ -1635,10 +1626,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) goto out_bdput; ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[0], &disk->part0); - - if (hd_ref_init(&disk->part0)) - goto out_bdput; + rcu_assign_pointer(ptbl->part[0], disk->part0->bd_part); disk->minors = minors; rand_initialize_disk(disk); @@ -1648,7 +1636,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) return disk; out_bdput: - bdput(disk->part0.bdev); + bdput(disk->part0); out_free_disk: kfree(disk); return NULL; @@ -1687,9 +1675,9 @@ void set_disk_ro(struct gendisk *disk, int flag) struct disk_part_iter piter; struct hd_struct *part; - if (disk->part0.bdev->bd_read_only != flag) { + if (disk->part0->bd_read_only 
!= flag) { set_disk_ro_uevent(disk, flag); - disk->part0.bdev->bd_read_only = flag; + disk->part0->bd_read_only = flag; } disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); diff --git a/block/partitions/core.c b/block/partitions/core.c index 060c1be13cd8..6d1fca193cbd 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -265,9 +265,9 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { struct hd_struct *p = dev_to_part(dev); + blk_free_devt(dev->devt); - hd_free_part(p); - kfree(p); + bdput(p->bdev); } static int part_uevent(struct device *dev, struct kobj_uevent_env *env) @@ -288,46 +288,6 @@ struct device_type part_type = { .uevent = part_uevent, }; -static void hd_struct_free_work(struct work_struct *work) -{ - struct hd_struct *part = - container_of(to_rcu_work(work), struct hd_struct, rcu_work); - struct gendisk *disk = part_to_disk(part); - - /* - * Release the disk reference acquired in delete_partition here. - * We can't release it in hd_struct_free because the final put_device - * needs process context and thus can't be run directly from a - * percpu_ref ->release handler. 
- */ - put_device(disk_to_dev(disk)); - - part->bdev->bd_start_sect = 0; - bdev_set_nr_sectors(part->bdev, 0); - part_stat_set_all(part, 0); - put_device(part_to_dev(part)); -} - -static void hd_struct_free(struct percpu_ref *ref) -{ - struct hd_struct *part = container_of(ref, struct hd_struct, ref); - struct gendisk *disk = part_to_disk(part); - struct disk_part_tbl *ptbl = - rcu_dereference_protected(disk->part_tbl, 1); - - rcu_assign_pointer(ptbl->last_lookup, NULL); - - INIT_RCU_WORK(&part->rcu_work, hd_struct_free_work); - queue_rcu_work(system_wq, &part->rcu_work); -} - -int hd_ref_init(struct hd_struct *part) -{ - if (percpu_ref_init(&part->ref, hd_struct_free, 0, GFP_KERNEL)) - return -ENOMEM; - return 0; -} - /* * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. @@ -342,8 +302,8 @@ void delete_partition(struct hd_struct *part) * ->part_tbl is referenced in this part's release handler, so * we have to hold the disk device */ - get_device(disk_to_dev(disk)); rcu_assign_pointer(ptbl->part[part->partno], NULL); + rcu_assign_pointer(ptbl->last_lookup, NULL); kobject_put(part->bdev->bd_holder_dir); device_del(part_to_dev(part)); @@ -353,7 +313,7 @@ void delete_partition(struct hd_struct *part) */ remove_inode_hash(part->bdev->bd_inode); - percpu_ref_kill(&part->ref); + put_device(part_to_dev(part)); } static ssize_t whole_disk_show(struct device *dev, @@ -406,15 +366,11 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (ptbl->part[partno]) return ERR_PTR(-EBUSY); - p = kzalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return ERR_PTR(-EBUSY); - bdev = bdev_alloc(disk, partno); if (!bdev) - goto out_free; - p->bdev = bdev; + return ERR_PTR(-ENOMEM); + p = bdev->bd_part; pdev = part_to_dev(p); bdev->bd_start_sect = start; @@ -463,13 +419,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_del; } - err = hd_ref_init(p); - if (err) { - if (flags & 
ADDPART_FLAG_WHOLEDISK) - goto out_remove_file; - goto out_del; - } - /* everything is up and running, commence */ bdev_add(bdev, devt); rcu_assign_pointer(ptbl->part[partno], p); @@ -481,11 +430,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, out_bdput: bdput(bdev); -out_free: - kfree(p); return ERR_PTR(err); -out_remove_file: - device_remove_file(pdev, &dev_attr_whole_disk); out_del: kobject_put(bdev->bd_holder_dir); device_del(pdev); -- cgit From 8446fe9255be821cb38ffd306d7e8edc4b9ea662 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:36:54 +0100 Subject: block: switch partition lookup to use struct block_device Use struct block_device to look up partitions on a disk. This removes all usage of struct hd_struct from the I/O path. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Coly Li [bcache] Acked-by: Chao Yu [f2fs] Signed-off-by: Jens Axboe --- block/bio.c | 4 +-- block/blk-core.c | 66 ++++++++++++++++++++++--------------------------- block/blk-flush.c | 2 +- block/blk-mq.c | 9 ++++--- block/blk-mq.h | 7 +++--- block/blk.h | 4 +-- block/genhd.c | 57 +++++++++++++++++++++++------------------- block/partitions/core.c | 7 ++---- 8 files changed, 78 insertions(+), 78 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 669bb47a3198..ebb18136b86f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -608,12 +608,12 @@ void bio_truncate(struct bio *bio, unsigned new_size) void guard_bio_eod(struct bio *bio) { sector_t maxsector; - struct hd_struct *part; + struct block_device *part; rcu_read_lock(); part = __disk_get_part(bio->bi_disk, bio->bi_partno); if (part) - maxsector = bdev_nr_sectors(part->bdev); + maxsector = bdev_nr_sectors(part); else maxsector = get_capacity(bio->bi_disk); rcu_read_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index 9ea70275fc1c..cee568389b7e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -666,10 
+666,9 @@ static int __init setup_fail_make_request(char *str) } __setup("fail_make_request=", setup_fail_make_request); -static bool should_fail_request(struct hd_struct *part, unsigned int bytes) +static bool should_fail_request(struct block_device *part, unsigned int bytes) { - return part->bdev->bd_make_it_fail && - should_fail(&fail_make_request, bytes); + return part->bd_make_it_fail && should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) @@ -684,7 +683,7 @@ late_initcall(fail_make_request_debugfs); #else /* CONFIG_FAIL_MAKE_REQUEST */ -static inline bool should_fail_request(struct hd_struct *part, +static inline bool should_fail_request(struct block_device *part, unsigned int bytes) { return false; @@ -692,11 +691,11 @@ static inline bool should_fail_request(struct hd_struct *part, #endif /* CONFIG_FAIL_MAKE_REQUEST */ -static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) +static inline bool bio_check_ro(struct bio *bio, struct block_device *part) { const int op = bio_op(bio); - if (part->bdev->bd_read_only && op_is_write(op)) { + if (part->bd_read_only && op_is_write(op)) { char b[BDEVNAME_SIZE]; if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) @@ -704,7 +703,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) WARN_ONCE(1, "Trying to write to read-only block-device %s (partno %d)\n", - bio_devname(bio, b), part->partno); + bio_devname(bio, b), part->bd_partno); /* Older lvm-tools actually trigger this */ return false; } @@ -714,8 +713,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) static noinline int should_fail_bio(struct bio *bio) { - if (should_fail_request(bio->bi_disk->part0->bd_part, - bio->bi_iter.bi_size)) + if (should_fail_request(bio->bi_disk->part0, bio->bi_iter.bi_size)) return -EIO; return 0; } @@ -744,7 +742,7 @@ static inline int bio_check_eod(struct bio *bio, sector_t maxsector) */ static inline int blk_partition_remap(struct 
bio *bio) { - struct hd_struct *p; + struct block_device *p; int ret = -EIO; rcu_read_lock(); @@ -757,12 +755,12 @@ static inline int blk_partition_remap(struct bio *bio) goto out; if (bio_sectors(bio)) { - if (bio_check_eod(bio, bdev_nr_sectors(p->bdev))) + if (bio_check_eod(bio, bdev_nr_sectors(p))) goto out; - bio->bi_iter.bi_sector += p->bdev->bd_start_sect; - trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), + bio->bi_iter.bi_sector += p->bd_start_sect; + trace_block_bio_remap(bio->bi_disk->queue, bio, p->bd_dev, bio->bi_iter.bi_sector - - p->bdev->bd_start_sect); + p->bd_start_sect); } bio->bi_partno = 0; ret = 0; @@ -832,7 +830,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (unlikely(blk_partition_remap(bio))) goto end_io; } else { - if (unlikely(bio_check_ro(bio, bio->bi_disk->part0->bd_part))) + if (unlikely(bio_check_ro(bio, bio->bi_disk->part0))) goto end_io; if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) goto end_io; @@ -1204,7 +1202,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * return ret; if (rq->rq_disk && - should_fail_request(rq->rq_disk->part0->bd_part, blk_rq_bytes(rq))) + should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (blk_crypto_insert_cloned_request(rq)) @@ -1263,17 +1261,18 @@ unsigned int blk_rq_err_bytes(const struct request *rq) } EXPORT_SYMBOL_GPL(blk_rq_err_bytes); -static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) +static void update_io_ticks(struct block_device *part, unsigned long now, + bool end) { unsigned long stamp; again: - stamp = READ_ONCE(part->bdev->bd_stamp); + stamp = READ_ONCE(part->bd_stamp); if (unlikely(stamp != now)) { - if (likely(cmpxchg(&part->bdev->bd_stamp, stamp, now) == stamp)) + if (likely(cmpxchg(&part->bd_stamp, stamp, now) == stamp)) __part_stat_add(part, io_ticks, end ? 
now - stamp : 1); } - if (part->partno) { - part = part_to_disk(part)->part0->bd_part; + if (part->bd_partno) { + part = bdev_whole(part); goto again; } } @@ -1282,11 +1281,9 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (req->part && blk_do_io_stat(req)) { const int sgrp = op_stat_group(req_op(req)); - struct hd_struct *part; part_stat_lock(); - part = req->part; - part_stat_add(part, sectors[sgrp], bytes >> 9); + part_stat_add(req->part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } @@ -1301,14 +1298,11 @@ void blk_account_io_done(struct request *req, u64 now) if (req->part && blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { const int sgrp = op_stat_group(req_op(req)); - struct hd_struct *part; part_stat_lock(); - part = req->part; - - update_io_ticks(part, jiffies, true); - part_stat_inc(part, ios[sgrp]); - part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); + update_io_ticks(req->part, jiffies, true); + part_stat_inc(req->part, ios[sgrp]); + part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); part_stat_unlock(); } } @@ -1325,7 +1319,7 @@ void blk_account_io_start(struct request *rq) part_stat_unlock(); } -static unsigned long __part_start_io_acct(struct hd_struct *part, +static unsigned long __part_start_io_acct(struct block_device *part, unsigned int sectors, unsigned int op) { const int sgrp = op_stat_group(op); @@ -1341,7 +1335,7 @@ static unsigned long __part_start_io_acct(struct hd_struct *part, return now; } -unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, +unsigned long part_start_io_acct(struct gendisk *disk, struct block_device **part, struct bio *bio) { *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector); @@ -1353,11 +1347,11 @@ EXPORT_SYMBOL_GPL(part_start_io_acct); unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op) { - return __part_start_io_acct(disk->part0->bd_part, sectors, op); + return 
__part_start_io_acct(disk->part0, sectors, op); } EXPORT_SYMBOL(disk_start_io_acct); -static void __part_end_io_acct(struct hd_struct *part, unsigned int op, +static void __part_end_io_acct(struct block_device *part, unsigned int op, unsigned long start_time) { const int sgrp = op_stat_group(op); @@ -1371,7 +1365,7 @@ static void __part_end_io_acct(struct hd_struct *part, unsigned int op, part_stat_unlock(); } -void part_end_io_acct(struct hd_struct *part, struct bio *bio, +void part_end_io_acct(struct block_device *part, struct bio *bio, unsigned long start_time) { __part_end_io_acct(part, bio_op(bio), start_time); @@ -1381,7 +1375,7 @@ EXPORT_SYMBOL_GPL(part_end_io_acct); void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time) { - __part_end_io_acct(disk->part0->bd_part, op, start_time); + __part_end_io_acct(disk->part0, op, start_time); } EXPORT_SYMBOL(disk_end_io_acct); diff --git a/block/blk-flush.c b/block/blk-flush.c index fcd0a60574df..9507dcdd5881 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -139,7 +139,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front) static void blk_account_io_flush(struct request *rq) { - struct hd_struct *part = rq->rq_disk->part0->bd_part; + struct block_device *part = rq->rq_disk->part0; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); diff --git a/block/blk-mq.c b/block/blk-mq.c index 55bcee5dc032..a2593748fa53 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -95,7 +95,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, } struct mq_inflight { - struct hd_struct *part; + struct block_device *part; unsigned int inflight[2]; }; @@ -111,7 +111,8 @@ static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, return true; } -unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) +unsigned int blk_mq_in_flight(struct request_queue *q, + struct block_device *part) { struct mq_inflight mi = { .part = part }; @@ -120,8 
+121,8 @@ unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) return mi.inflight[0] + mi.inflight[1]; } -void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) +void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, + unsigned int inflight[2]) { struct mq_inflight mi = { .part = part }; diff --git a/block/blk-mq.h b/block/blk-mq.h index a52703c98b77..c696515766c7 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -182,9 +182,10 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) return hctx->nr_ctx && hctx->tags; } -unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part); -void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]); +unsigned int blk_mq_in_flight(struct request_queue *q, + struct block_device *part); +void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, + unsigned int inflight[2]); static inline void blk_mq_put_dispatch_budget(struct request_queue *q) { diff --git a/block/blk.h b/block/blk.h index 32ac41f7557f..d5bf8f3a0781 100644 --- a/block/blk.h +++ b/block/blk.h @@ -215,7 +215,7 @@ static inline void elevator_exit(struct request_queue *q, __elevator_exit(q, e); } -struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); +struct block_device *__disk_get_part(struct gendisk *disk, int partno); ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); @@ -348,7 +348,7 @@ void blk_queue_free_zone_bitmaps(struct request_queue *q); static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} #endif -struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); +struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); int blk_alloc_devt(struct hd_struct *part, dev_t *devt); void blk_free_devt(dev_t devt); diff --git a/block/genhd.c b/block/genhd.c index 
c35b03dac5e5..ed06466b305d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -126,7 +126,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) } } -static unsigned int part_in_flight(struct hd_struct *part) +static unsigned int part_in_flight(struct block_device *part) { unsigned int inflight = 0; int cpu; @@ -141,7 +141,8 @@ static unsigned int part_in_flight(struct hd_struct *part) return inflight; } -static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2]) +static void part_in_flight_rw(struct block_device *part, + unsigned int inflight[2]) { int cpu; @@ -157,7 +158,7 @@ static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2]) inflight[1] = 0; } -struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) +struct block_device *__disk_get_part(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl); @@ -182,15 +183,21 @@ struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) */ struct hd_struct *disk_get_part(struct gendisk *disk, int partno) { + struct block_device *bdev; struct hd_struct *part; rcu_read_lock(); - part = __disk_get_part(disk, partno); - if (part) - get_device(part_to_dev(part)); + bdev = __disk_get_part(disk, partno); + if (!bdev) + goto fail; + part = bdev->bd_part; + if (!kobject_get_unless_zero(&part_to_dev(part)->kobj)) + goto fail; rcu_read_unlock(); - return part; +fail: + rcu_read_unlock(); + return NULL; } /** @@ -264,19 +271,19 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) /* iterate to the next partition */ for (; piter->idx != end; piter->idx += inc) { - struct hd_struct *part; + struct block_device *part; part = rcu_dereference(ptbl->part[piter->idx]); if (!part) continue; - if (!bdev_nr_sectors(part->bdev) && + if (!bdev_nr_sectors(part) && !(piter->flags & DISK_PITER_INCL_EMPTY) && !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && piter->idx == 0)) continue; - 
get_device(part_to_dev(part)); - piter->part = part; + get_device(part_to_dev(part->bd_part)); + piter->part = part->bd_part; piter->idx += inc; break; } @@ -303,10 +310,10 @@ void disk_part_iter_exit(struct disk_part_iter *piter) } EXPORT_SYMBOL_GPL(disk_part_iter_exit); -static inline int sector_in_part(struct hd_struct *part, sector_t sector) +static inline int sector_in_part(struct block_device *part, sector_t sector) { - return part->bdev->bd_start_sect <= sector && - sector < part->bdev->bd_start_sect + bdev_nr_sectors(part->bdev); + return part->bd_start_sect <= sector && + sector < part->bd_start_sect + bdev_nr_sectors(part); } /** @@ -324,10 +331,10 @@ static inline int sector_in_part(struct hd_struct *part, sector_t sector) * Found partition on success, part0 is returned if no partition matches * or the matched partition is being deleted. */ -struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) +struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) { struct disk_part_tbl *ptbl; - struct hd_struct *part; + struct block_device *part; int i; rcu_read_lock(); @@ -346,7 +353,7 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) } } - part = disk->part0->bd_part; + part = disk->part0; out_unlock: rcu_read_unlock(); return part; @@ -882,7 +889,7 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->part0->bd_holder_dir); kobject_put(disk->slave_dir); - part_stat_set_all(disk->part0->bd_part, 0); + part_stat_set_all(disk->part0, 0); disk->part0->bd_stamp = 0; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); @@ -1189,9 +1196,9 @@ ssize_t part_stat_show(struct device *dev, part_stat_read_all(p, &stat); if (queue_is_mq(q)) - inflight = blk_mq_in_flight(q, p); + inflight = blk_mq_in_flight(q, p->bdev); else - inflight = part_in_flight(p); + inflight = part_in_flight(p->bdev); return sprintf(buf, "%8lu %8lu %8llu %8u " @@ -1231,9 +1238,9 @@ ssize_t 
part_inflight_show(struct device *dev, struct device_attribute *attr, unsigned int inflight[2]; if (queue_is_mq(q)) - blk_mq_in_flight_rw(q, p, inflight); + blk_mq_in_flight_rw(q, p->bdev, inflight); else - part_in_flight_rw(p, inflight); + part_in_flight_rw(p->bdev, inflight); return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); } @@ -1506,9 +1513,9 @@ static int diskstats_show(struct seq_file *seqf, void *v) while ((hd = disk_part_iter_next(&piter))) { part_stat_read_all(hd, &stat); if (queue_is_mq(gp->queue)) - inflight = blk_mq_in_flight(gp->queue, hd); + inflight = blk_mq_in_flight(gp->queue, hd->bdev); else - inflight = part_in_flight(hd); + inflight = part_in_flight(hd->bdev); seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " @@ -1626,7 +1633,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) goto out_bdput; ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[0], disk->part0->bd_part); + rcu_assign_pointer(ptbl->part[0], disk->part0); disk->minors = minors; rand_initialize_disk(disk); diff --git a/block/partitions/core.c b/block/partitions/core.c index 6d1fca193cbd..c2f6721633b8 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -298,12 +298,9 @@ void delete_partition(struct hd_struct *part) struct disk_part_tbl *ptbl = rcu_dereference_protected(disk->part_tbl, 1); - /* - * ->part_tbl is referenced in this part's release handler, so - * we have to hold the disk device - */ rcu_assign_pointer(ptbl->part[part->partno], NULL); rcu_assign_pointer(ptbl->last_lookup, NULL); + kobject_put(part->bdev->bd_holder_dir); device_del(part_to_dev(part)); @@ -421,7 +418,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, /* everything is up and running, commence */ bdev_add(bdev, devt); - rcu_assign_pointer(ptbl->part[partno], p); + rcu_assign_pointer(ptbl->part[partno], bdev); /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) -- cgit From 
41e5c81984eac8ce87f2b4f57fec0bd90a049b2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:37:14 +0100 Subject: block: remove the partno field from struct hd_struct Just use the bd_partno field in struct block_device everywhere. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 12 ++++++------ block/partitions/core.c | 9 ++++----- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index ed06466b305d..b7e39b41a275 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -589,8 +589,8 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) int idx; /* in consecutive minor range? */ - if (part->partno < disk->minors) { - *devt = MKDEV(disk->major, disk->first_minor + part->partno); + if (part->bdev->bd_partno < disk->minors) { + *devt = MKDEV(disk->major, disk->first_minor + part->bdev->bd_partno); return 0; } @@ -864,7 +864,7 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { - invalidate_partition(disk, part->partno); + invalidate_partition(disk, part->bdev->bd_partno); delete_partition(part); } disk_part_iter_exit(&piter); @@ -1008,7 +1008,7 @@ void __init printk_all_partitions(void) printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), bdev_nr_sectors(part->bdev) >> 1, - disk_name(disk, part->partno, name_buf), + disk_name(disk, part->bdev->bd_partno, name_buf), part->bdev->bd_meta_info ? 
part->bdev->bd_meta_info->uuid : ""); if (is_part0) { @@ -1102,7 +1102,7 @@ static int show_partition(struct seq_file *seqf, void *v) seq_printf(seqf, "%4d %7d %10llu %s\n", MAJOR(part_devt(part)), MINOR(part_devt(part)), bdev_nr_sectors(part->bdev) >> 1, - disk_name(sgp, part->partno, buf)); + disk_name(sgp, part->bdev->bd_partno, buf)); disk_part_iter_exit(&piter); return 0; @@ -1525,7 +1525,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) "%lu %u" "\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), - disk_name(gp, hd->partno, buf), + disk_name(gp, hd->bdev->bd_partno, buf), stat.ios[STAT_READ], stat.merges[STAT_READ], stat.sectors[STAT_READ], diff --git a/block/partitions/core.c b/block/partitions/core.c index c2f6721633b8..6db9ca8b722d 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -184,7 +184,7 @@ static ssize_t part_partition_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->partno); + return sprintf(buf, "%d\n", p->bdev->bd_partno); } static ssize_t part_start_show(struct device *dev, @@ -274,7 +274,7 @@ static int part_uevent(struct device *dev, struct kobj_uevent_env *env) { struct hd_struct *part = dev_to_part(dev); - add_uevent_var(env, "PARTN=%u", part->partno); + add_uevent_var(env, "PARTN=%u", part->bdev->bd_partno); if (part->bdev->bd_meta_info && part->bdev->bd_meta_info->volname[0]) add_uevent_var(env, "PARTNAME=%s", part->bdev->bd_meta_info->volname); @@ -298,7 +298,7 @@ void delete_partition(struct hd_struct *part) struct disk_part_tbl *ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[part->partno], NULL); + rcu_assign_pointer(ptbl->part[part->bdev->bd_partno], NULL); rcu_assign_pointer(ptbl->last_lookup, NULL); kobject_put(part->bdev->bd_holder_dir); @@ -372,7 +372,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); - p->partno = partno; 
bdev->bd_read_only = get_disk_ro(disk); if (info) { @@ -445,7 +444,7 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { - if (part->partno == skip_partno || + if (part->bdev->bd_partno == skip_partno || start >= part->bdev->bd_start_sect + bdev_nr_sectors(part->bdev) || start + length <= part->bdev->bd_start_sect) -- cgit From 9fc995a6e08349b5c5baff2cc31544b96ee2b1c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:17:46 +0100 Subject: block: pass a block_device to blk_alloc_devt Pass the block_device actually needed instead of the hd_struct. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk.h | 2 +- block/genhd.c | 14 +++++++------- block/partitions/core.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index d5bf8f3a0781..9657c6da7c77 100644 --- a/block/blk.h +++ b/block/blk.h @@ -350,7 +350,7 @@ static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); -int blk_alloc_devt(struct hd_struct *part, dev_t *devt); +int blk_alloc_devt(struct block_device *part, dev_t *devt); void blk_free_devt(dev_t devt); char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 diff --git a/block/genhd.c b/block/genhd.c index b7e39b41a275..fd6333332ab5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -570,8 +570,8 @@ static int blk_mangle_minor(int minor) } /** - * blk_alloc_devt - allocate a dev_t for a partition - * @part: partition to allocate dev_t for + * blk_alloc_devt - allocate a dev_t for a block device + * @bdev: block device to allocate dev_t for * @devt: out parameter for resulting dev_t * * Allocate a dev_t for block device. 
@@ -583,14 +583,14 @@ static int blk_mangle_minor(int minor) * CONTEXT: * Might sleep. */ -int blk_alloc_devt(struct hd_struct *part, dev_t *devt) +int blk_alloc_devt(struct block_device *bdev, dev_t *devt) { - struct gendisk *disk = part_to_disk(part); + struct gendisk *disk = bdev->bd_disk; int idx; /* in consecutive minor range? */ - if (part->bdev->bd_partno < disk->minors) { - *devt = MKDEV(disk->major, disk->first_minor + part->bdev->bd_partno); + if (bdev->bd_partno < disk->minors) { + *devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno); return 0; } @@ -746,7 +746,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_UP; - retval = blk_alloc_devt(disk->part0->bd_part, &devt); + retval = blk_alloc_devt(disk->part0, &devt); if (retval) { WARN_ON(1); return; diff --git a/block/partitions/core.c b/block/partitions/core.c index 6db9ca8b722d..3d8243334c7c 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -392,7 +392,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev->type = &part_type; pdev->parent = ddev; - err = blk_alloc_devt(p, &devt); + err = blk_alloc_devt(bdev, &devt); if (err) goto out_bdput; pdev->devt = devt; -- cgit From 71773cf797490e1cbe4909b25a2543937e7eea82 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:20:37 +0100 Subject: block: pass a block_device to invalidate_partition Pass the block_device actually needed instead of looking it up using bdget_disk. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index fd6333332ab5..452f7c646e02 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -803,14 +803,8 @@ void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) } EXPORT_SYMBOL(device_add_disk_no_queue_reg); -static void invalidate_partition(struct gendisk *disk, int partno) +static void invalidate_partition(struct block_device *bdev) { - struct block_device *bdev; - - bdev = bdget_disk(disk, partno); - if (!bdev) - return; - fsync_bdev(bdev); __invalidate_device(bdev, true); @@ -819,7 +813,6 @@ static void invalidate_partition(struct gendisk *disk, int partno) * up any more even if openers still hold references to it. */ remove_inode_hash(bdev->bd_inode); - bdput(bdev); } /** @@ -864,12 +857,12 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { - invalidate_partition(disk, part->bdev->bd_partno); + invalidate_partition(part->bdev); delete_partition(part); } disk_part_iter_exit(&piter); - invalidate_partition(disk, 0); + invalidate_partition(disk->part0); set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; up_write(&bdev_lookup_sem); -- cgit From ad1eaa5344b293552b6ba43f5709c76a9aa14d17 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:52:59 +0100 Subject: block: switch disk_part_iter_* to use a struct block_device Switch the partition iter infrastructure to iterate over block_device references instead of hd_struct ones mostly used to get at the block_device. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 59 +++++++++++++++++++++++++------------------------ block/partitions/core.c | 13 +++++------ 2 files changed, 36 insertions(+), 36 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 452f7c646e02..2d34dd2da4e9 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -244,7 +244,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_init); * CONTEXT: * Don't care. */ -struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) +struct block_device *disk_part_iter_next(struct disk_part_iter *piter) { struct disk_part_tbl *ptbl; int inc, end; @@ -282,8 +282,9 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) piter->idx == 0)) continue; - get_device(part_to_dev(part->bd_part)); - piter->part = part->bd_part; + piter->part = bdgrab(part); + if (!piter->part) + continue; piter->idx += inc; break; } @@ -305,7 +306,8 @@ EXPORT_SYMBOL_GPL(disk_part_iter_next); */ void disk_part_iter_exit(struct disk_part_iter *piter) { - disk_put_part(piter->part); + if (piter->part) + bdput(piter->part); piter->part = NULL; } EXPORT_SYMBOL_GPL(disk_part_iter_exit); @@ -346,7 +348,6 @@ struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) for (i = 1; i < ptbl->len; i++) { part = rcu_dereference(ptbl->part[i]); - if (part && sector_in_part(part, sector)) { rcu_assign_pointer(ptbl->last_lookup, part); goto out_unlock; @@ -647,7 +648,7 @@ static void register_disk(struct device *parent, struct gendisk *disk, { struct device *ddev = disk_to_dev(disk); struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; int err; ddev->parent = parent; @@ -697,7 +698,7 @@ static void register_disk(struct device *parent, struct gendisk *disk, /* announce possible partitions */ disk_part_iter_init(&piter, disk, 0); while ((part = disk_part_iter_next(&piter))) - 
kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + kobject_uevent(bdev_kobj(part), KOBJ_ADD); disk_part_iter_exit(&piter); if (disk->queue->backing_dev_info->dev) { @@ -837,7 +838,7 @@ static void invalidate_partition(struct block_device *bdev) void del_gendisk(struct gendisk *disk) { struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; might_sleep(); @@ -857,8 +858,8 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { - invalidate_partition(part->bdev); - delete_partition(part); + invalidate_partition(part); + delete_partition(part->bd_part); } disk_part_iter_exit(&piter); @@ -977,7 +978,7 @@ void __init printk_all_partitions(void) while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; @@ -996,14 +997,14 @@ void __init printk_all_partitions(void) */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) { - bool is_part0 = part == disk->part0->bd_part; + bool is_part0 = part == disk->part0; printk("%s%s %10llu %s %s", is_part0 ? "" : " ", - bdevt_str(part_devt(part), devt_buf), - bdev_nr_sectors(part->bdev) >> 1, - disk_name(disk, part->bdev->bd_partno, name_buf), - part->bdev->bd_meta_info ? - part->bdev->bd_meta_info->uuid : ""); + bdevt_str(part->bd_dev, devt_buf), + bdev_nr_sectors(part) >> 1, + disk_name(disk, part->bd_partno, name_buf), + part->bd_meta_info ? 
+ part->bd_meta_info->uuid : ""); if (is_part0) { if (dev->parent && dev->parent->driver) printk(" driver: %s\n", @@ -1079,7 +1080,7 @@ static int show_partition(struct seq_file *seqf, void *v) { struct gendisk *sgp = v; struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ @@ -1093,9 +1094,9 @@ static int show_partition(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) seq_printf(seqf, "%4d %7d %10llu %s\n", - MAJOR(part_devt(part)), MINOR(part_devt(part)), - bdev_nr_sectors(part->bdev) >> 1, - disk_name(sgp, part->bdev->bd_partno, buf)); + MAJOR(part->bd_dev), MINOR(part->bd_dev), + bdev_nr_sectors(part) >> 1, + disk_name(sgp, part->bd_partno, buf)); disk_part_iter_exit(&piter); return 0; @@ -1489,7 +1490,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; struct disk_part_iter piter; - struct hd_struct *hd; + struct block_device *hd; char buf[BDEVNAME_SIZE]; unsigned int inflight; struct disk_stats stat; @@ -1504,11 +1505,11 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { - part_stat_read_all(hd, &stat); + part_stat_read_all(hd->bd_part, &stat); if (queue_is_mq(gp->queue)) - inflight = blk_mq_in_flight(gp->queue, hd->bdev); + inflight = blk_mq_in_flight(gp->queue, hd); else - inflight = part_in_flight(hd->bdev); + inflight = part_in_flight(hd); seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " @@ -1517,8 +1518,8 @@ static int diskstats_show(struct seq_file *seqf, void *v) "%lu %lu %lu %u " "%lu %u" "\n", - MAJOR(part_devt(hd)), MINOR(part_devt(hd)), - disk_name(gp, hd->bdev->bd_partno, buf), + MAJOR(hd->bd_dev), MINOR(hd->bd_dev), + disk_name(gp, hd->bd_partno, buf), stat.ios[STAT_READ], 
stat.merges[STAT_READ], stat.sectors[STAT_READ], @@ -1673,7 +1674,7 @@ static void set_disk_ro_uevent(struct gendisk *gd, int ro) void set_disk_ro(struct gendisk *disk, int flag) { struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; if (disk->part0->bd_read_only != flag) { set_disk_ro_uevent(disk, flag); @@ -1682,7 +1683,7 @@ void set_disk_ro(struct gendisk *disk, int flag) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - part->bdev->bd_read_only = flag; + part->bd_read_only = flag; disk_part_iter_exit(&piter); } diff --git a/block/partitions/core.c b/block/partitions/core.c index 3d8243334c7c..4cb6df175f90 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -439,15 +439,14 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, sector_t length, int skip_partno) { struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; bool overlap = false; disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { - if (part->bdev->bd_partno == skip_partno || - start >= part->bdev->bd_start_sect + - bdev_nr_sectors(part->bdev) || - start + length <= part->bdev->bd_start_sect) + if (part->bd_partno == skip_partno || + start >= part->bd_start_sect + bdev_nr_sectors(part) || + start + length <= part->bd_start_sect) continue; overlap = true; break; @@ -568,7 +567,7 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) int blk_drop_partitions(struct block_device *bdev) { struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; if (bdev->bd_part_count) return -EBUSY; @@ -578,7 +577,7 @@ int blk_drop_partitions(struct block_device *bdev) disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - delete_partition(part); + delete_partition(part->bd_part); disk_part_iter_exit(&piter); return 0; -- cgit From 
0d02129e76edf91cf04fabf1efbc3a9a1f1d729a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Nov 2020 16:43:51 +0100 Subject: block: merge struct block_device and struct hd_struct Instead of having two structures that represent each block device with different life time rules, merge them into a single one. This also greatly simplifies the reference counting rules, as we can use the inode reference count as the main reference count for the new struct block_device, with the device model reference front ending it for device model interaction. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 8 ++-- block/blk.h | 2 +- block/genhd.c | 90 +++++++++++-------------------------- block/partitions/core.c | 116 ++++++++++++++++++------------------------------ 4 files changed, 75 insertions(+), 141 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 79aa96240cec..031114d454a6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -820,9 +820,9 @@ static void blkcg_fill_root_iostats(void) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - struct hd_struct *part = disk_get_part(disk, 0); - struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue); + struct block_device *bdev = dev_to_bdev(dev); + struct blkcg_gq *blkg = + blk_queue_root_blkg(bdev->bd_disk->queue); struct blkg_iostat tmp; int cpu; @@ -830,7 +830,7 @@ static void blkcg_fill_root_iostats(void) for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; - cpu_dkstats = per_cpu_ptr(part->bdev->bd_stats, cpu); + cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += cpu_dkstats->ios[STAT_READ]; tmp.ios[BLKG_IOSTAT_WRITE] += diff --git a/block/blk.h b/block/blk.h index 9657c6da7c77..98f0b1ae2641 100644 --- a/block/blk.h +++ b/block/blk.h @@ -356,7 +356,7 @@ char 
*disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 -void delete_partition(struct hd_struct *part); +void delete_partition(struct block_device *part); int bdev_add_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); int bdev_del_partition(struct block_device *bdev, int partno); diff --git a/block/genhd.c b/block/genhd.c index 2d34dd2da4e9..0fabfc90b8e4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -106,13 +106,14 @@ const char *bdevname(struct block_device *bdev, char *buf) } EXPORT_SYMBOL(bdevname); -static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) +static void part_stat_read_all(struct block_device *part, + struct disk_stats *stat) { int cpu; memset(stat, 0, sizeof(struct disk_stats)); for_each_possible_cpu(cpu) { - struct disk_stats *ptr = per_cpu_ptr(part->bdev->bd_stats, cpu); + struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu); int group; for (group = 0; group < NR_STAT_GROUPS; group++) { @@ -167,39 +168,6 @@ struct block_device *__disk_get_part(struct gendisk *disk, int partno) return rcu_dereference(ptbl->part[partno]); } -/** - * disk_get_part - get partition - * @disk: disk to look partition from - * @partno: partition number - * - * Look for partition @partno from @disk. If found, increment - * reference count and return it. - * - * CONTEXT: - * Don't care. - * - * RETURNS: - * Pointer to the found partition on success, NULL if not found. 
- */ -struct hd_struct *disk_get_part(struct gendisk *disk, int partno) -{ - struct block_device *bdev; - struct hd_struct *part; - - rcu_read_lock(); - bdev = __disk_get_part(disk, partno); - if (!bdev) - goto fail; - part = bdev->bd_part; - if (!kobject_get_unless_zero(&part_to_dev(part)->kobj)) - goto fail; - rcu_read_unlock(); - return part; -fail: - rcu_read_unlock(); - return NULL; -} - /** * disk_part_iter_init - initialize partition iterator * @piter: iterator to initialize @@ -859,7 +827,7 @@ void del_gendisk(struct gendisk *disk) DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { invalidate_partition(part); - delete_partition(part->bd_part); + delete_partition(part); } disk_part_iter_exit(&piter); @@ -952,13 +920,13 @@ void blk_request_module(dev_t devt) */ struct block_device *bdget_disk(struct gendisk *disk, int partno) { - struct hd_struct *part; struct block_device *bdev = NULL; - part = disk_get_part(disk, partno); - if (part) - bdev = bdget_part(part); - disk_put_part(part); + rcu_read_lock(); + bdev = __disk_get_part(disk, partno); + if (bdev && !bdgrab(bdev)) + bdev = NULL; + rcu_read_unlock(); return bdev; } @@ -1175,24 +1143,22 @@ static ssize_t disk_ro_show(struct device *dev, ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%llu\n", bdev_nr_sectors(p->bdev)); + return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev))); } ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev->bd_disk->queue; struct disk_stats stat; unsigned int inflight; - part_stat_read_all(p, &stat); + part_stat_read_all(bdev, &stat); if (queue_is_mq(q)) - inflight = blk_mq_in_flight(q, p->bdev); + inflight = 
blk_mq_in_flight(q, bdev); else - inflight = part_in_flight(p->bdev); + inflight = part_in_flight(bdev); return sprintf(buf, "%8lu %8lu %8llu %8u " @@ -1227,14 +1193,14 @@ ssize_t part_stat_show(struct device *dev, ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev->bd_disk->queue; unsigned int inflight[2]; if (queue_is_mq(q)) - blk_mq_in_flight_rw(q, p->bdev, inflight); + blk_mq_in_flight_rw(q, bdev, inflight); else - part_in_flight_rw(p->bdev, inflight); + part_in_flight_rw(bdev, inflight); return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); } @@ -1282,20 +1248,17 @@ static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->bdev->bd_make_it_fail); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_make_it_fail); } ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct hd_struct *p = dev_to_part(dev); int i; if (count > 0 && sscanf(buf, "%d", &i) > 0) - p->bdev->bd_make_it_fail = i; + dev_to_bdev(dev)->bd_make_it_fail = i; return count; } @@ -1505,7 +1468,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { - part_stat_read_all(hd->bd_part, &stat); + part_stat_read_all(hd, &stat); if (queue_is_mq(gp->queue)) inflight = blk_mq_in_flight(gp->queue, hd); else @@ -1577,7 +1540,7 @@ dev_t blk_lookup_devt(const char *name, int partno) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); - struct 
hd_struct *part; + struct block_device *part; if (strcmp(dev_name(dev), name)) continue; @@ -1590,13 +1553,12 @@ dev_t blk_lookup_devt(const char *name, int partno) MINOR(dev->devt) + partno); break; } - part = disk_get_part(disk, partno); + part = bdget_disk(disk, partno); if (part) { - devt = part_devt(part); - disk_put_part(part); + devt = part->bd_dev; + bdput(part); break; } - disk_put_part(part); } class_dev_iter_exit(&iter); return devt; diff --git a/block/partitions/core.c b/block/partitions/core.c index 4cb6df175f90..deca253583bd 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -182,44 +182,39 @@ static struct parsed_partitions *check_partition(struct gendisk *hd, static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->bdev->bd_partno); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_partno); } static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%llu\n", p->bdev->bd_start_sect); + return sprintf(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->bdev->bd_read_only); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_read_only); } static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev = dev_to_bdev(dev); return sprintf(buf, "%u\n", - queue_limit_alignment_offset(&part_to_disk(p)->queue->limits, - p->bdev->bd_start_sect)); + queue_limit_alignment_offset(&bdev->bd_disk->queue->limits, + bdev->bd_start_sect)); } static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct 
hd_struct *p = dev_to_part(dev); + struct block_device *bdev = dev_to_bdev(dev); return sprintf(buf, "%u\n", - queue_limit_discard_alignment(&part_to_disk(p)->queue->limits, - p->bdev->bd_start_sect)); + queue_limit_discard_alignment(&bdev->bd_disk->queue->limits, + bdev->bd_start_sect)); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); @@ -264,20 +259,17 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { - struct hd_struct *p = dev_to_part(dev); - blk_free_devt(dev->devt); - bdput(p->bdev); + bdput(dev_to_bdev(dev)); } static int part_uevent(struct device *dev, struct kobj_uevent_env *env) { - struct hd_struct *part = dev_to_part(dev); + struct block_device *part = dev_to_bdev(dev); - add_uevent_var(env, "PARTN=%u", part->bdev->bd_partno); - if (part->bdev->bd_meta_info && part->bdev->bd_meta_info->volname[0]) - add_uevent_var(env, "PARTNAME=%s", - part->bdev->bd_meta_info->volname); + add_uevent_var(env, "PARTN=%u", part->bd_partno); + if (part->bd_meta_info && part->bd_meta_info->volname[0]) + add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname); return 0; } @@ -292,25 +284,25 @@ struct device_type part_type = { * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. 
*/ -void delete_partition(struct hd_struct *part) +void delete_partition(struct block_device *part) { - struct gendisk *disk = part_to_disk(part); + struct gendisk *disk = part->bd_disk; struct disk_part_tbl *ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[part->bdev->bd_partno], NULL); + rcu_assign_pointer(ptbl->part[part->bd_partno], NULL); rcu_assign_pointer(ptbl->last_lookup, NULL); - kobject_put(part->bdev->bd_holder_dir); - device_del(part_to_dev(part)); + kobject_put(part->bd_holder_dir); + device_del(&part->bd_device); /* * Remove the block device from the inode hash, so that it cannot be * looked up any more even when openers still hold references. */ - remove_inode_hash(part->bdev->bd_inode); + remove_inode_hash(part->bd_inode); - put_device(part_to_dev(part)); + put_device(&part->bd_device); } static ssize_t whole_disk_show(struct device *dev, @@ -324,11 +316,10 @@ static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. 
*/ -static struct hd_struct *add_partition(struct gendisk *disk, int partno, +static struct block_device *add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, struct partition_meta_info *info) { - struct hd_struct *p; dev_t devt = MKDEV(0, 0); struct device *ddev = disk_to_dev(disk); struct device *pdev; @@ -367,9 +358,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!bdev) return ERR_PTR(-ENOMEM); - p = bdev->bd_part; - pdev = part_to_dev(p); - bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); bdev->bd_read_only = get_disk_ro(disk); @@ -381,6 +369,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_bdput; } + pdev = &bdev->bd_device; dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) dev_set_name(pdev, "%sp%d", dname, partno); @@ -422,7 +411,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); - return p; + return bdev; out_bdput: bdput(bdev); @@ -459,7 +448,7 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, int bdev_add_partition(struct block_device *bdev, int partno, sector_t start, sector_t length) { - struct hd_struct *part; + struct block_device *part; mutex_lock(&bdev->bd_mutex); if (partition_overlaps(bdev->bd_disk, start, length, -1)) { @@ -475,76 +464,59 @@ int bdev_add_partition(struct block_device *bdev, int partno, int bdev_del_partition(struct block_device *bdev, int partno) { - struct block_device *bdevp; - struct hd_struct *part = NULL; + struct block_device *part; int ret; - bdevp = bdget_disk(bdev->bd_disk, partno); - if (!bdevp) + part = bdget_disk(bdev->bd_disk, partno); + if (!part) return -ENXIO; - mutex_lock(&bdevp->bd_mutex); + mutex_lock(&part->bd_mutex); mutex_lock_nested(&bdev->bd_mutex, 1); - ret = -ENXIO; - part = disk_get_part(bdev->bd_disk, 
partno); - if (!part) - goto out_unlock; - ret = -EBUSY; - if (bdevp->bd_openers) + if (part->bd_openers) goto out_unlock; - sync_blockdev(bdevp); - invalidate_bdev(bdevp); + sync_blockdev(part); + invalidate_bdev(part); delete_partition(part); ret = 0; out_unlock: mutex_unlock(&bdev->bd_mutex); - mutex_unlock(&bdevp->bd_mutex); - bdput(bdevp); - if (part) - disk_put_part(part); + mutex_unlock(&part->bd_mutex); + bdput(part); return ret; } int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length) { - struct block_device *bdevp; - struct hd_struct *part; + struct block_device *part; int ret = 0; - part = disk_get_part(bdev->bd_disk, partno); + part = bdget_disk(bdev->bd_disk, partno); if (!part) return -ENXIO; - ret = -ENOMEM; - bdevp = bdget_part(part); - if (!bdevp) - goto out_put_part; - - mutex_lock(&bdevp->bd_mutex); + mutex_lock(&part->bd_mutex); mutex_lock_nested(&bdev->bd_mutex, 1); - ret = -EINVAL; - if (start != part->bdev->bd_start_sect) + if (start != part->bd_start_sect) goto out_unlock; ret = -EBUSY; if (partition_overlaps(bdev->bd_disk, start, length, partno)) goto out_unlock; - bdev_set_nr_sectors(bdevp, length); + bdev_set_nr_sectors(part, length); ret = 0; out_unlock: - mutex_unlock(&bdevp->bd_mutex); + mutex_unlock(&part->bd_mutex); mutex_unlock(&bdev->bd_mutex); - bdput(bdevp); -out_put_part: - disk_put_part(part); + bdput(part); return ret; } @@ -577,7 +549,7 @@ int blk_drop_partitions(struct block_device *bdev) disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - delete_partition(part->bd_part); + delete_partition(part); disk_part_iter_exit(&piter); return 0; @@ -592,7 +564,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, { sector_t size = state->parts[p].size; sector_t from = state->parts[p].from; - struct hd_struct *part; + struct block_device *part; if (!size) return true; @@ -632,7 +604,7 @@ static bool 
blk_add_partition(struct gendisk *disk, struct block_device *bdev, if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && (state->parts[p].flags & ADDPART_FLAG_RAID)) - md_autodetect_dev(part_to_dev(part)->devt); + md_autodetect_dev(part->bd_dev); return true; } -- cgit From 977115c0f664e016a6b2774d4f97116ade23d732 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2020 10:41:07 +0100 Subject: block: stop using bdget_disk for partition 0 We can just dereference the pointer in struct gendisk instead. Also remove the now unused export. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 0fabfc90b8e4..b84b8671e627 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -930,7 +930,6 @@ struct block_device *bdget_disk(struct gendisk *disk, int partno) return bdev; } -EXPORT_SYMBOL(bdget_disk); /* * print a full list of all partitions - intended for places where the root -- cgit From 22b56c2964386ddced252be407150b22f85e209e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 24 Nov 2020 17:58:13 +0000 Subject: bio: optimise bvec iteration __bio_for_each_bvec(), __bio_for_each_segment() and bio_copy_data_iter() fall under conditions of bvec_iter_advance_single(), which is a faster and slimmer version of bvec_iter_advance(). Add bio_advance_iter_single() and convert them. 
Signed-off-by: Pavel Begunkov Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index ebb18136b86f..1f2cc1fbe283 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1212,8 +1212,8 @@ void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, flush_dcache_page(dst_bv.bv_page); - bio_advance_iter(src, src_iter, bytes); - bio_advance_iter(dst, dst_iter, bytes); + bio_advance_iter_single(src, src_iter, bytes); + bio_advance_iter_single(dst, dst_iter, bytes); } } EXPORT_SYMBOL(bio_copy_data_iter); -- cgit From b0d97557ebfc9d5ba5f2939339a9fdd267abafeb Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Wed, 2 Dec 2020 19:11:45 +0800 Subject: block: fix inflight statistics of part0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The inflight of partition 0 doesn't include inflight IOs to all sub-partitions, since currently mq calculates inflight of specific partition by simply comparing the value of the partition pointer. Thus the following case is possible: $ cat /sys/block/vda/inflight        0        0 $ cat /sys/block/vda/vda1/inflight        0      128 While single queue device (on a previous version, e.g. v3.10) does not have this issue: $cat /sys/block/sda/sda3/inflight 0 33 $cat /sys/block/sda/inflight 0 33 Partition 0 should be specially handled since it represents the whole disk. This issue is introduced since commit bf0ddaba65dd ("blk-mq: fix sysfs inflight counter"). Besides, this patch can also fix the inflight statistics of part 0 in /proc/diskstats. Before this patch, the inflight statistics of part 0 doesn't include that of sub partitions. (I have marked the 'inflight' field with asterisk.) 
$cat /proc/diskstats 259 0 nvme0n1 45974469 0 367814768 6445794 1 0 1 0 *0* 111062 6445794 0 0 0 0 0 0 259 2 nvme0n1p1 45974058 0 367797952 6445727 0 0 0 0 *33* 111001 6445727 0 0 0 0 0 0 This is introduced since commit f299b7c7a9de ("blk-mq: provide internal in-flight variant"). Fixes: bf0ddaba65dd ("blk-mq: fix sysfs inflight counter") Fixes: f299b7c7a9de ("blk-mq: provide internal in-flight variant") Signed-off-by: Jeffle Xu Reviewed-by: Christoph Hellwig [axboe: adapt for 5.11 partition change] Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a2593748fa53..37c682855a63 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -105,7 +105,8 @@ static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, { struct mq_inflight *mi = priv; - if (rq->part == mi->part && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) + if ((!mi->part->bd_partno || rq->part == mi->part) && + blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) mi->inflight[rq_data_dir(rq)]++; return true; -- cgit From acaf523a7bf226b28504306c1cfee194520123b3 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 26 Nov 2020 11:18:34 +0800 Subject: blk-throttle: don't check whether or not lower limit is valid if CONFIG_BLK_DEV_THROTTLING_LOW is off blk_throtl_update_limit_valid() will search for descendants to see if 'LIMIT_LOW' of bps/iops and READ/WRITE is nonzero. However, they're always zero if CONFIG_BLK_DEV_THROTTLING_LOW is not set, furthermore, a lot of time will be wasted to iterate descendants. Thus do nothing in blk_throtl_update_limit_valid() in such situation. 
Signed-off-by: Yu Kuai Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-throttle.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b771c4299982..d52cac9f3a7c 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -587,6 +587,7 @@ static void throtl_pd_online(struct blkg_policy_data *pd) tg_update_has_rules(tg); } +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW static void blk_throtl_update_limit_valid(struct throtl_data *td) { struct cgroup_subsys_state *pos_css; @@ -607,6 +608,11 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td) td->limit_valid[LIMIT_LOW] = low_valid; } +#else +static inline void blk_throtl_update_limit_valid(struct throtl_data *td) +{ +} +#endif static void throtl_upgrade_state(struct throtl_data *td); static void throtl_pd_offline(struct blkg_policy_data *pd) -- cgit From e8a676d61c07eccfcd9d6fddfe4dcb630651c29a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:36 +0100 Subject: block: simplify and extend the block_bio_merge tracepoint class The block_bio_merge tracepoint class can be reused for most bio-based tracepoints. For that it just needs to lose the superfluous q and rq parameters. 
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-merge.c | 4 ++-- block/blk-mq.c | 2 +- block/bounce.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index cee568389b7e..cb24654983e1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -907,7 +907,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) blkcg_bio_issue_init(bio); if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_queue(q, bio); + trace_block_bio_queue(bio); /* Now that enqueuing has been traced, we need to trace * completion as well. */ diff --git a/block/blk-merge.c b/block/blk-merge.c index cb351ab9b77d..1a46d5bbd399 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -922,7 +922,7 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req, if (!ll_back_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; - trace_block_bio_backmerge(req->q, req, bio); + trace_block_bio_backmerge(bio); rq_qos_merge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) @@ -946,7 +946,7 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req, if (!ll_front_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; - trace_block_bio_frontmerge(req->q, req, bio); + trace_block_bio_frontmerge(bio); rq_qos_merge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) diff --git a/block/blk-mq.c b/block/blk-mq.c index 37c682855a63..21e2b4b6b742 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2184,7 +2184,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) goto queue_exit; } - trace_block_getrq(q, bio, bio->bi_opf); + trace_block_getrq(bio); rq_qos_track(q, rq, bio); diff --git a/block/bounce.c b/block/bounce.c index 162a6eee8999..d3f51acd6e3b 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ 
-340,7 +340,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, } } - trace_block_bio_bounce(q, *bio_orig); + trace_block_bio_bounce(*bio_orig); bio->bi_flags |= (1 << BIO_BOUNCED); -- cgit From eb6f7f7cd3af0f67ce57b21fab1bc64beb643581 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:37 +0100 Subject: block: remove the request_queue argument to the block_split tracepoint The request_queue can trivially be derived from the bio. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 1a46d5bbd399..4071daa88a5e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -338,7 +338,7 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) split->bi_opf |= REQ_NOMERGE; bio_chain(split, *bio); - trace_block_split(q, split, (*bio)->bi_iter.bi_sector); + trace_block_split(split, (*bio)->bi_iter.bi_sector); submit_bio_noacct(*bio); *bio = split; } -- cgit From 1c02fca620f7273b597591065d366e2cca948d8f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:38 +0100 Subject: block: remove the request_queue argument to the block_bio_remap tracepoint The request_queue can trivially be derived from the bio. 
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index cb24654983e1..96e5fcd7f071 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -758,7 +758,7 @@ static inline int blk_partition_remap(struct bio *bio) if (bio_check_eod(bio, bdev_nr_sectors(p))) goto out; bio->bi_iter.bi_sector += p->bd_start_sect; - trace_block_bio_remap(bio->bi_disk->queue, bio, p->bd_dev, + trace_block_bio_remap(bio, p->bd_dev, bio->bi_iter.bi_sector - p->bd_start_sect); } -- cgit From a54895fa057c67700270777f7661d8d3c7fda88a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:39 +0100 Subject: block: remove the request_queue to argument request based tracepoints The request_queue can trivially be derived from the request. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- block/blk-mq-sched.c | 2 +- block/blk-mq.c | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 4071daa88a5e..7497d86fff38 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -799,7 +799,7 @@ static struct request *attempt_merge(struct request_queue *q, */ blk_account_io_merge_request(next); - trace_block_rq_merge(q, next); + trace_block_rq_merge(next); /* * ownership of bio passed from next to req, return 'next' for diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index d1eafe2c045c..deff4e826e23 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -386,7 +386,7 @@ EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); void blk_mq_sched_request_inserted(struct request *rq) { - 
trace_block_rq_insert(rq->q, rq); + trace_block_rq_insert(rq); } EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); diff --git a/block/blk-mq.c b/block/blk-mq.c index 21e2b4b6b742..cf3916e2852f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -733,7 +733,7 @@ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; - trace_block_rq_issue(q, rq); + trace_block_rq_issue(rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { rq->io_start_time_ns = ktime_get_ns(); @@ -760,7 +760,7 @@ static void __blk_mq_requeue_request(struct request *rq) blk_mq_put_driver_tag(rq); - trace_block_rq_requeue(q, rq); + trace_block_rq_requeue(rq); rq_qos_requeue(q, rq); if (blk_mq_request_started(rq)) { @@ -1821,7 +1821,7 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, lockdep_assert_held(&ctx->lock); - trace_block_rq_insert(hctx->queue, rq); + trace_block_rq_insert(rq); if (at_head) list_add(&rq->queuelist, &ctx->rq_lists[type]); @@ -1878,7 +1878,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, */ list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); - trace_block_rq_insert(hctx->queue, rq); + trace_block_rq_insert(rq); } spin_lock(&ctx->lock); -- cgit From 5ba1add216fe82289769045627d97f233bbcc645 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:11 +0800 Subject: blk-iocost: Fix some typos in comments Fix some typos in comments. Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 8e20fe4bddec..087ae215529e 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -39,7 +39,7 @@ * On top of that, a size cost proportional to the length of the IO is * added. While simple, this model captures the operational * characteristics of a wide varienty of devices well enough. 
Default - * paramters for several different classes of devices are provided and the + * parameters for several different classes of devices are provided and the * parameters can be configured from userspace via * /sys/fs/cgroup/io.cost.model. * @@ -77,7 +77,7 @@ * * This constitutes the basis of IO capacity distribution. Each cgroup's * vtime is running at a rate determined by its hweight. A cgroup tracks - * the vtime consumed by past IOs and can issue a new IO iff doing so + * the vtime consumed by past IOs and can issue a new IO if doing so * wouldn't outrun the current device vtime. Otherwise, the IO is * suspended until the vtime has progressed enough to cover it. * @@ -155,7 +155,7 @@ * Instead of debugfs or other clumsy monitoring mechanisms, this * controller uses a drgn based monitoring script - * tools/cgroup/iocost_monitor.py. For details on drgn, please see - * https://github.com/osandov/drgn. The ouput looks like the following. + * https://github.com/osandov/drgn. The output looks like the following. * * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12% * active weight hweight% inflt% dbt delay usages% @@ -492,7 +492,7 @@ struct ioc_gq { /* * `vtime` is this iocg's vtime cursor which progresses as IOs are * issued. If lagging behind device vtime, the delta represents - * the currently available IO budget. If runnning ahead, the + * the currently available IO budget. If running ahead, the * overage. * * `vtime_done` is the same but progressed on completion rather @@ -1046,7 +1046,7 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, /* * The delta between inuse and active sums indicates that - * that much of weight is being given away. Parent's inuse + * much of weight is being given away. Parent's inuse * and active should reflect the ratio. 
*/ if (parent->child_active_sum) { @@ -2400,7 +2400,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, return cost; /* - * We only increase inuse during period and do so iff the margin has + * We only increase inuse during period and do so if the margin has * deteriorated since the previous adjustment. */ if (margin >= iocg->saved_margin || margin >= margins->low || -- cgit From 647c9f03b2b66cf1f505208c313998fc833ed28b Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:12 +0800 Subject: blk-iocost: Remove unnecessary advance declaration Remove unnecessary advance declaration of struct ioc_gq. Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 087ae215529e..ec4865206353 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -370,8 +370,6 @@ enum { AUTOP_SSD_FAST, }; -struct ioc_gq; - struct ioc_params { u32 qos[NR_QOS_PARAMS]; u64 i_lcoefs[NR_I_LCOEFS]; -- cgit From c09245f61c6ac4ef253a5fcf97e5bcfc0ce25fc7 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:13 +0800 Subject: blk-iocost: Move the usage ratio calculation to the correct place We only use the hweight based usage ratio to calculate the new hweight_inuse of the iocg to decide if this iocg can donate some surplus vtime. Thus move the usage ratio calculation to the correct place to avoid unnecessary calculation for some vtime shortage iocgs. 
Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index ec4865206353..09f22f9a6ba4 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2168,8 +2168,8 @@ static void ioc_timer_fn(struct timer_list *timer) /* calc usage and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - u64 vdone, vtime, usage_us, usage_dur; - u32 usage, hw_active, hw_inuse; + u64 vdone, vtime, usage_us; + u32 hw_active, hw_inuse; /* * Collect unused and wind vtime closer to vnow to prevent @@ -2200,30 +2200,32 @@ static void ioc_timer_fn(struct timer_list *timer) usage_us = iocg->usage_delta_us; usage_us_sum += usage_us; - if (vdone != vtime) { - u64 inflight_us = DIV64_U64_ROUND_UP( - cost_to_abs_cost(vtime - vdone, hw_inuse), - ioc->vtime_base_rate); - usage_us = max(usage_us, inflight_us); - } - - /* convert to hweight based usage ratio */ - if (time_after64(iocg->activated_at, ioc->period_at)) - usage_dur = max_t(u64, now.now - iocg->activated_at, 1); - else - usage_dur = max_t(u64, now.now - ioc->period_at, 1); - - usage = clamp_t(u32, - DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, - usage_dur), - 1, WEIGHT_ONE); - /* see whether there's surplus vtime */ WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); if (hw_inuse < hw_active || (!waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow - ioc->margins.low))) { - u32 hwa, old_hwi, hwm, new_hwi; + u32 hwa, old_hwi, hwm, new_hwi, usage; + u64 usage_dur; + + if (vdone != vtime) { + u64 inflight_us = DIV64_U64_ROUND_UP( + cost_to_abs_cost(vtime - vdone, hw_inuse), + ioc->vtime_base_rate); + + usage_us = max(usage_us, inflight_us); + } + + /* convert to hweight based usage ratio */ + if (time_after64(iocg->activated_at, ioc->period_at)) + usage_dur = 
max_t(u64, now.now - iocg->activated_at, 1); + else + usage_dur = max_t(u64, now.now - ioc->period_at, 1); + + usage = clamp_t(u32, + DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, + usage_dur), + 1, WEIGHT_ONE); /* * Already donating or accumulated enough to start. -- cgit From 2474787a75b4f358e81f367653c73edecd67aa2d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:14 +0800 Subject: blk-iocost: Factor out the active iocgs' state check into a separate function Factor out the iocgs' state check into a separate function to simplify the ioc_timer_fn(). No functional change. Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 94 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 54 insertions(+), 40 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 09f22f9a6ba4..7dd1424d5833 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2069,40 +2069,21 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, } } -static void ioc_timer_fn(struct timer_list *timer) +/* + * Check the active iocgs' state to avoid oversleeping and deactive + * idle iocgs. + * + * Since waiters determine the sleep durations based on the vrate + * they saw at the time of sleep, if vrate has increased, some + * waiters could be sleeping for too long. Wake up tardy waiters + * which should have woken up in the last period and expire idle + * iocgs. 
+ */ +static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now) { - struct ioc *ioc = container_of(timer, struct ioc, timer); + int nr_debtors = 0; struct ioc_gq *iocg, *tiocg; - struct ioc_now now; - LIST_HEAD(surpluses); - int nr_debtors = 0, nr_shortages = 0, nr_lagging = 0; - u64 usage_us_sum = 0; - u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; - u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; - u32 missed_ppm[2], rq_wait_pct; - u64 period_vtime; - int prev_busy_level; - - /* how were the latencies during the period? */ - ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); - /* take care of active iocgs */ - spin_lock_irq(&ioc->lock); - - ioc_now(ioc, &now); - - period_vtime = now.vnow - ioc->period_at_vtime; - if (WARN_ON_ONCE(!period_vtime)) { - spin_unlock_irq(&ioc->lock); - return; - } - - /* - * Waiters determine the sleep durations based on the vrate they - * saw at the time of sleep. If vrate has increased, some waiters - * could be sleeping for too long. Wake up tardy waiters which - * should have woken up in the last period and expire idle iocgs. 
- */ list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && !iocg->delay && !iocg_is_idle(iocg)) @@ -2112,24 +2093,24 @@ static void ioc_timer_fn(struct timer_list *timer) /* flush wait and indebt stat deltas */ if (iocg->wait_since) { - iocg->local_stat.wait_us += now.now - iocg->wait_since; - iocg->wait_since = now.now; + iocg->local_stat.wait_us += now->now - iocg->wait_since; + iocg->wait_since = now->now; } if (iocg->indebt_since) { iocg->local_stat.indebt_us += - now.now - iocg->indebt_since; - iocg->indebt_since = now.now; + now->now - iocg->indebt_since; + iocg->indebt_since = now->now; } if (iocg->indelay_since) { iocg->local_stat.indelay_us += - now.now - iocg->indelay_since; - iocg->indelay_since = now.now; + now->now - iocg->indelay_since; + iocg->indelay_since = now->now; } if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || iocg->delay) { /* might be oversleeping vtime / hweight changes, kick */ - iocg_kick_waitq(iocg, true, &now); + iocg_kick_waitq(iocg, true, now); if (iocg->abs_vdebt || iocg->delay) nr_debtors++; } else if (iocg_is_idle(iocg)) { @@ -2143,7 +2124,7 @@ static void ioc_timer_fn(struct timer_list *timer) * error and throw away. On reactivation, it'll start * with the target budget. 
+ */ - excess = now.vnow - vtime - ioc->margins.target; + excess = now->vnow - vtime - ioc->margins.target; if (excess > 0) { u32 old_hwi; @@ -2152,13 +2133,46 @@ static void ioc_timer_fn(struct timer_list *timer) WEIGHT_ONE); } - __propagate_weights(iocg, 0, 0, false, &now); + __propagate_weights(iocg, 0, 0, false, now); list_del_init(&iocg->active_list); } spin_unlock(&iocg->waitq.lock); } + commit_weights(ioc); + return nr_debtors; +} + +static void ioc_timer_fn(struct timer_list *timer) +{ + struct ioc *ioc = container_of(timer, struct ioc, timer); + struct ioc_gq *iocg, *tiocg; + struct ioc_now now; + LIST_HEAD(surpluses); + int nr_debtors, nr_shortages = 0, nr_lagging = 0; + u64 usage_us_sum = 0; + u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; + u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; + u32 missed_ppm[2], rq_wait_pct; + u64 period_vtime; + int prev_busy_level; + + /* how were the latencies during the period? */ + ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); + + /* take care of active iocgs */ + spin_lock_irq(&ioc->lock); + + ioc_now(ioc, &now); + + period_vtime = now.vnow - ioc->period_at_vtime; + if (WARN_ON_ONCE(!period_vtime)) { + spin_unlock_irq(&ioc->lock); + return; + } + + nr_debtors = ioc_check_iocgs(ioc, &now); /* * Wait and indebt stat are flushed above and the donation calculation -- cgit From 926f75f6a9ef503d45dced061e304d0324beeba1 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:15 +0800 Subject: blk-iocost: Factor out the base vrate change into a separate function Factor out the base vrate change code into a separate function to simplify the ioc_timer_fn(). No functional change.
Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 99 +++++++++++++++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 45 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 7dd1424d5833..ffa418c0dcb1 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -971,6 +971,58 @@ done: ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); } +static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct, + int nr_lagging, int nr_shortages, + int prev_busy_level, u32 *missed_ppm) +{ + u64 vrate = ioc->vtime_base_rate; + u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; + + if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) { + if (ioc->busy_level != prev_busy_level || nr_lagging) + trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), + missed_ppm, rq_wait_pct, + nr_lagging, nr_shortages); + + return; + } + + /* rq_wait signal is always reliable, ignore user vrate_min */ + if (rq_wait_pct > RQ_WAIT_BUSY_PCT) + vrate_min = VRATE_MIN; + + /* + * If vrate is out of bounds, apply clamp gradually as the + * bounds can change abruptly. Otherwise, apply busy_level + * based adjustment. 
+ */ + if (vrate < vrate_min) { + vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100); + vrate = min(vrate, vrate_min); + } else if (vrate > vrate_max) { + vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100); + vrate = max(vrate, vrate_max); + } else { + int idx = min_t(int, abs(ioc->busy_level), + ARRAY_SIZE(vrate_adj_pct) - 1); + u32 adj_pct = vrate_adj_pct[idx]; + + if (ioc->busy_level > 0) + adj_pct = 100 - adj_pct; + else + adj_pct = 100 + adj_pct; + + vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), + vrate_min, vrate_max); + } + + trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, + nr_lagging, nr_shortages); + + ioc->vtime_base_rate = vrate; + ioc_refresh_margins(ioc); +} + /* take a snapshot of the current [v]time and vrate */ static void ioc_now(struct ioc *ioc, struct ioc_now *now) { @@ -2323,51 +2375,8 @@ static void ioc_timer_fn(struct timer_list *timer) ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); - if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) { - u64 vrate = ioc->vtime_base_rate; - u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; - - /* rq_wait signal is always reliable, ignore user vrate_min */ - if (rq_wait_pct > RQ_WAIT_BUSY_PCT) - vrate_min = VRATE_MIN; - - /* - * If vrate is out of bounds, apply clamp gradually as the - * bounds can change abruptly. Otherwise, apply busy_level - * based adjustment. 
- */ - if (vrate < vrate_min) { - vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), - 100); - vrate = min(vrate, vrate_min); - } else if (vrate > vrate_max) { - vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), - 100); - vrate = max(vrate, vrate_max); - } else { - int idx = min_t(int, abs(ioc->busy_level), - ARRAY_SIZE(vrate_adj_pct) - 1); - u32 adj_pct = vrate_adj_pct[idx]; - - if (ioc->busy_level > 0) - adj_pct = 100 - adj_pct; - else - adj_pct = 100 + adj_pct; - - vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), - vrate_min, vrate_max); - } - - trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, - nr_lagging, nr_shortages); - - ioc->vtime_base_rate = vrate; - ioc_refresh_margins(ioc); - } else if (ioc->busy_level != prev_busy_level || nr_lagging) { - trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), - missed_ppm, rq_wait_pct, nr_lagging, - nr_shortages); - } + ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages, + prev_busy_level, missed_ppm); ioc_refresh_params(ioc, false); -- cgit From f6f371f7db42917c7b2a861c4fc923cb352ce5a1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 6 Dec 2020 14:04:39 +0000 Subject: blk-mq: skip hybrid polling if iopoll doesn't spin If blk_poll() is not going to spin (i.e. @spin=false), it also must not sleep in hybrid polling, otherwise it might be pretty surprising for users trying to do a quick check and expecting no-wait behaviour. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index cf3916e2852f..2881a457de83 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3865,9 +3865,10 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) * the state. Like for the other success return cases, the * caller is responsible for checking if the IO completed.
If * the IO isn't complete, we'll get called again and will go - * straight to the busy poll loop. + * straight to the busy poll loop. If specified not to spin, + * we also should not sleep. */ - if (blk_mq_poll_hybrid(q, hctx, cookie)) + if (spin && blk_mq_poll_hybrid(q, hctx, cookie)) return 1; hctx->poll_considered++; -- cgit From 2afdeb23e4750acb4ff16fd86f566c9074708691 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Nov 2020 16:36:06 +0900 Subject: block: Improve blk_revalidate_disk_zones() checks Improves the checks on the zones of a zoned block device done in blk_revalidate_disk_zones() by making sure that the device report_zones method did report at least one zone and that the zones reported exactly cover the entire disk capacity, that is, that there are no missing zones at the end of the disk sector range. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-zoned.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 6817a673e5ce..7a68b6e4300c 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -508,15 +508,29 @@ int blk_revalidate_disk_zones(struct gendisk *disk, noio_flag = memalloc_noio_save(); ret = disk->fops->report_zones(disk, 0, UINT_MAX, blk_revalidate_zone_cb, &args); + if (!ret) { + pr_warn("%s: No zones reported\n", disk->disk_name); + ret = -ENODEV; + } memalloc_noio_restore(noio_flag); + /* + * If zones where reported, make sure that the entire disk capacity + * has been checked. + */ + if (ret > 0 && args.sector != get_capacity(disk)) { + pr_warn("%s: Missing zones from sector %llu\n", + disk->disk_name, args.sector); + ret = -ENODEV; + } + /* * Install the new bitmaps and update nr_zones only once the queue is * stopped and all I/Os are completed (i.e. a scheduler is not * referencing the bitmaps). 
*/ blk_mq_freeze_queue(q); - if (ret >= 0) { + if (ret > 0) { blk_queue_chunk_sectors(q, args.zone_sectors); q->nr_zones = args.nr_zones; swap(q->seq_zones_wlock, args.seq_zones_wlock); -- cgit From cc29e1bf0d63f728a5bd60ef22638bbf77369552 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 26 Nov 2020 17:18:52 +0800 Subject: block: disable iopoll for split bio iopoll is initially for small size, latency sensitive IO. It doesn't work well for big IO, especially when it needs to be split to multiple bios. In this case, the returned cookie of __submit_bio_noacct_mq() is indeed the cookie of the last split bio. The completion of *this* last split bio done by iopoll doesn't mean the whole original bio has completed. Callers of iopoll still need to wait for completion of other split bios. Besides bio splitting may cause more trouble for iopoll which isn't supposed to be used in case of big IO. iopoll for split bio may cause potential race if CPU migration happens during bio submission. Since the returned cookie is that of the last split bio, polling on the corresponding hardware queue doesn't help complete other split bios, if these split bios are enqueued into different hardware queues. Since interrupts are disabled for polling queues, the completion of these other split bios depends on timeout mechanism, thus causing a potential hang. iopoll for split bio may also cause hang for sync polling. Currently both the blkdev and iomap-based fs (ext4/xfs, etc) support sync polling in direct IO routine. These routines will submit bio without REQ_NOWAIT flag set, and then start sync polling in current process context. The process may hang in blk_mq_get_tag() if the submitted bio has to be split into multiple bios and can rapidly exhaust the queue depth. The process are waiting for the completion of the previously allocated requests, which should be reaped by the following polling, and thus causing a deadlock. 
To avoid the subtle troubles described above, just disable iopoll for split bio and return BLK_QC_T_NONE in this case. The side effect is that non-HIPRI IO also returns BLK_QC_T_NONE now. It should be acceptable since the returned cookie is never used for non-HIPRI IO. Suggested-by: Ming Lei Signed-off-by: Jeffle Xu Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 8 ++++++++ block/blk-mq.c | 5 +++++ 2 files changed, 13 insertions(+) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 7497d86fff38..c3399bf29e9c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -279,6 +279,14 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, return NULL; split: *segs = nsegs; + + /* + * Bio splitting may cause subtle trouble such as hang when doing sync + * iopoll in direct IO routine. Given performance gain of iopoll for + * big IO can be trival, disable iopoll when split needed. + */ + bio->bi_opf &= ~REQ_HIPRI; + return bio_split(bio, sectors, GFP_NOIO, bs); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 2881a457de83..95ecc4c69969 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2159,6 +2159,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) unsigned int nr_segs; blk_qc_t cookie; blk_status_t ret; + bool hipri; blk_queue_bounce(q, &bio); __blk_queue_split(&bio, &nr_segs); @@ -2175,6 +2176,8 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) rq_qos_throttle(q, bio); + hipri = bio->bi_opf & REQ_HIPRI; + data.cmd_flags = bio->bi_opf; rq = __blk_mq_alloc_request(&data); if (unlikely(!rq)) { @@ -2267,6 +2270,8 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) blk_mq_sched_insert_request(rq, false, true, true); } + if (!hipri) + return BLK_QC_T_NONE; return cookie; queue_exit: blk_queue_exit(q); -- cgit From fb01a2932e81a1fb2273f87ff92dc8172b8880ee Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 3 Dec 2020 09:26:36 +0800 Subject: blk-mq: add new API of
blk_mq_hctx_set_fq_lock_class flush_end_io() may be called recursively from some driver, such as nvme-loop, so lockdep may complain 'possible recursive locking'. Commit b3c6a5997541("block: Fix a lockdep complaint triggered by request queue flushing") tried to address this issue by assigning dynamically allocated per-flush-queue lock class. This solution adds synchronize_rcu() for each hctx's release handler, and causes horrible SCSI MQ probe delay(more than half an hour on megaraid sas). Add new API of blk_mq_hctx_set_fq_lock_class() for these drivers, so we just need to use driver specific lock class for avoiding the lockdep warning of 'possible recursive locking'. Tested-by: Kashyap Desai Reported-by: Qian Cai Cc: Sumit Saxena Cc: John Garry Cc: Kashyap Desai Cc: Bart Van Assche Cc: Hannes Reinecke Signed-off-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-flush.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 9507dcdd5881..bf51588762d8 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -490,3 +490,28 @@ void blk_free_flush_queue(struct blk_flush_queue *fq) kfree(fq->flush_rq); kfree(fq); } + +/* + * Allow driver to set its own lock class to fq->mq_flush_lock for + * avoiding lockdep complaint. + * + * flush_end_io() may be called recursively from some driver, such as + * nvme-loop, so lockdep may complain 'possible recursive locking' because + * all 'struct blk_flush_queue' instance share same mq_flush_lock lock class + * key. We need to assign different lock class for these driver's + * fq->mq_flush_lock for avoiding the lockdep warning. + * + * Use dynamically allocated lock class key for each 'blk_flush_queue' + * instance is over-kill, and more worse it introduces horrible boot delay + * issue because synchronize_rcu() is implied in lockdep_unregister_key which + * is called for each hctx release. 
SCSI probing may synchronously create and + * destroy lots of MQ request_queues for non-existent devices, and some robot + * test kernel always enable lockdep option. It is observed that more than half + * an hour is taken during SCSI MQ probe with per-fq lock class. + */ +void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, + struct lock_class_key *key) +{ + lockdep_set_class(&hctx->fq->mq_flush_lock, key); +} +EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class); -- cgit From 7aa390ec2d9db0cd6677d95d0b8f307f9c086770 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 3 Dec 2020 09:26:38 +0800 Subject: Revert "block: Fix a lockdep complaint triggered by request queue flushing" This reverts commit b3c6a59975415bde29cfd76ff1ab008edbf614a9. Now we can avoid nvme-loop lockdep warning of 'lockdep possible recursive locking' by nvme-loop's lock class, no need to apply dynamically allocated lock class key, so revert commit b3c6a5997541("block: Fix a lockdep complaint triggered by request queue flushing"). This way fixes horrible SCSI probe delay issue on megaraid_sas, and it is reported the whole probe may take more than half an hour. 
Tested-by: Kashyap Desai Reported-by: Qian Cai Reviewed-by: Christoph Hellwig Cc: Sumit Saxena Cc: John Garry Cc: Kashyap Desai Cc: Bart Van Assche Cc: Hannes Reinecke Signed-off-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-flush.c | 5 ----- block/blk.h | 1 - 2 files changed, 6 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index bf51588762d8..996d5d03dade 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -69,7 +69,6 @@ #include #include #include -#include #include "blk.h" #include "blk-mq.h" @@ -469,9 +468,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, INIT_LIST_HEAD(&fq->flush_queue[1]); INIT_LIST_HEAD(&fq->flush_data_in_flight); - lockdep_register_key(&fq->key); - lockdep_set_class(&fq->mq_flush_lock, &fq->key); - return fq; fail_rq: @@ -486,7 +482,6 @@ void blk_free_flush_queue(struct blk_flush_queue *fq) if (!fq) return; - lockdep_unregister_key(&fq->key); kfree(fq->flush_rq); kfree(fq); } diff --git a/block/blk.h b/block/blk.h index 98f0b1ae2641..d23d018fd2cd 100644 --- a/block/blk.h +++ b/block/blk.h @@ -25,7 +25,6 @@ struct blk_flush_queue { struct list_head flush_data_in_flight; struct request *flush_rq; - struct lock_class_key key; spinlock_t mq_flush_lock; }; -- cgit From 91cdf265b74bf63a69949d6db08a60523207400c Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sat, 5 Dec 2020 00:20:53 +0900 Subject: blk-mq: add helper allocating tagset->tags tagset->set is allocated from blk_mq_alloc_tag_set() rather than being reallocated. This patch added a helper to make its meaning explicitly which is to allocate rather than to reallocate. 
Signed-off-by: Minwoo Im Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 95ecc4c69969..e2bd9ef81d55 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3382,6 +3382,12 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, return 0; } +static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set, + int new_nr_hw_queues) +{ + return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues); +} + /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the @@ -3435,7 +3441,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; - if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0) + if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0) return -ENOMEM; ret = -ENOMEM; -- cgit From d220a21410e445324b8ae67d93f9c51406f99a29 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sat, 5 Dec 2020 00:20:54 +0900 Subject: blk-mq: update arg in comment of blk_mq_map_queue Update mis-named argument description of blk_mq_map_queue(). This patch also updates description that argument to software queue percpu context. 
Signed-off-by: Minwoo Im Reviewed-by: John Garry Signed-off-by: Jens Axboe --- block/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.h b/block/blk-mq.h index c696515766c7..c1458d9502f1 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -99,7 +99,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue * * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * @q: request queue * @flags: request command flags - * @cpu: cpu ctx + * @ctx: software queue cpu ctx */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, unsigned int flags, -- cgit From fa94ba8a7b22890e6a17b39b9359e114fe18cd59 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sat, 5 Dec 2020 00:20:55 +0900 Subject: blk-mq: fix msec comment from micro to milli seconds Delay to wait for queue running is milli second unit which is passed to delayed work via msecs_to_jiffies() which is to convert milliseconds to jiffies. Signed-off-by: Minwoo Im Reviewed-by: John Garry Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index e2bd9ef81d55..6f207ec9ef83 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1594,7 +1594,7 @@ select_cpu: * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue. * @hctx: Pointer to the hardware queue to run. * @async: If we want to run the queue asynchronously. - * @msecs: Microseconds of delay to wait before running the queue. + * @msecs: Milliseconds of delay to wait before running the queue. * * If !@async, try to run the queue now. Else, run the queue asynchronously and * with a delay of @msecs. @@ -1623,7 +1623,7 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, /** * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. * @hctx: Pointer to the hardware queue to run. 
- * @msecs: Microseconds of delay to wait before running the queue. + * @msecs: Milliseconds of delay to wait before running the queue. * * Run a hardware queue asynchronously with a delay of @msecs. */ @@ -1687,7 +1687,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); /** * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. * @q: Pointer to the request queue to run. - * @msecs: Microseconds of delay to wait before running the queues. + * @msecs: Milliseconds of delay to wait before running the queues. */ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { -- cgit