From b9aef63aca772a25ffcfaedf4bd29fb36b919a93 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 4 Jun 2019 16:23:40 +0900 Subject: block: force select mq-deadline for zoned block devices In most use cases of zoned block devices (aka SMR disks), the mq-deadline scheduler is mandatory as it implements sequential write command processing guarantees with zone write locking. So make sure that this scheduler is always enabled if CONFIG_BLK_DEV_ZONED is selected. Tested-by: Chaitanya Kulkarni Reviewed-by: Chaitanya Kulkarni Signed-off-by: Damien Le Moal Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/block/Kconfig b/block/Kconfig index 1b220101a9cb..2466dcc3ef1d 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -73,6 +73,7 @@ config BLK_DEV_INTEGRITY config BLK_DEV_ZONED bool "Zoned block device support" + select MQ_IOSCHED_DEADLINE ---help--- Block layer zoned block device support. This option enables support for ZAC/ZBC host-managed and host-aware zoned block devices. -- cgit From 355e8d26f719c207aa2e00e6f3cfab3acf21769b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 12 Jun 2019 14:58:43 -0700 Subject: io_uring: fix memory leak of UNIX domain socket inode Opening and closing an io_uring instance leaks a UNIX domain socket inode. This is because the ->file of the io_uring instance's internal UNIX domain socket is set to point to the io_uring file, but then sock_release() sees the non-NULL ->file and assumes the inode reference is held by the file so doesn't call iput(). That's not the case here, since the reference is still meant to be held by the socket; the actual inode of the io_uring file is different. Fix this leak by NULL-ing out ->file before releasing the socket. Reported-by: syzbot+111cb28d9f583693aefa@syzkaller.appspotmail.com Fixes: 2b188cc1bb85 ("Add io_uring IO interface") Cc: # v5.1+ Signed-off-by: Eric Biggers Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0fbb486a320e..86a2bd721900 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2777,8 +2777,10 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_eventfd_unregister(ctx); #if defined(CONFIG_UNIX) - if (ctx->ring_sock) + if (ctx->ring_sock) { + ctx->ring_sock->file = NULL; /* so that iput() is called */ sock_release(ctx->ring_sock); + } #endif io_mem_free(ctx->sq_ring); -- cgit From 6cfc0081b046ebf50dd38c38e688c8de143614f3 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 12 Jun 2019 14:30:19 +0200 Subject: blk-mq: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. When all of these checks are cleaned up, lots of the functions used in the blk-mq-debugfs code can now return void, as there is no need to check their return values either. Overall, this ends up cleaning up the code and making it smaller, always a nice win.
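[Editor's note: a minimal kernel-style sketch of the idiom this change relies on, with hypothetical example_* names that are not part of the patch. The debugfs creation helpers return an ERR_PTR-encoded error on failure rather than NULL, and debugfs accepts such an error pointer as a parent and quietly turns the call into a no-op, so there is nothing useful a caller can do with the return value.]

	#include <linux/debugfs.h>

	static struct dentry *example_dir;

	static void example_debugfs_init(void *data,
					 const struct file_operations *fops)
	{
		/* On failure this holds an ERR_PTR-encoded error, not NULL. */
		example_dir = debugfs_create_dir("example", NULL);

		/*
		 * Safe even if example_dir is an error pointer: debugfs
		 * detects it and this call quietly does nothing.
		 */
		debugfs_create_file("state", 0444, example_dir, data, fops);
	}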
Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 145 ++++++++++++------------------------------------- block/blk-mq-debugfs.h | 36 +++++------- 2 files changed, 49 insertions(+), 132 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 6aea0ebc3a73..2489ddbb21db 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -821,38 +821,28 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { {}, }; -static bool debugfs_create_files(struct dentry *parent, void *data, +static void debugfs_create_files(struct dentry *parent, void *data, const struct blk_mq_debugfs_attr *attr) { if (IS_ERR_OR_NULL(parent)) - return false; + return; d_inode(parent)->i_private = data; - for (; attr->name; attr++) { - if (!debugfs_create_file(attr->name, attr->mode, parent, - (void *)attr, &blk_mq_debugfs_fops)) - return false; - } - return true; + for (; attr->name; attr++) + debugfs_create_file(attr->name, attr->mode, parent, + (void *)attr, &blk_mq_debugfs_fops); } -int blk_mq_debugfs_register(struct request_queue *q) +void blk_mq_debugfs_register(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; - if (!blk_debugfs_root) - return -ENOENT; - q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), blk_debugfs_root); - if (!q->debugfs_dir) - return -ENOMEM; - if (!debugfs_create_files(q->debugfs_dir, q, - blk_mq_debugfs_queue_attrs)) - goto err; + debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); /* * blk_mq_init_sched() attempted to do this already, but q->debugfs_dir @@ -864,11 +854,10 @@ int blk_mq_debugfs_register(struct request_queue *q) /* Similarly, blk_mq_init_hctx() couldn't do this previously. 
*/ queue_for_each_hw_ctx(q, hctx, i) { - if (!hctx->debugfs_dir && blk_mq_debugfs_register_hctx(q, hctx)) - goto err; - if (q->elevator && !hctx->sched_debugfs_dir && - blk_mq_debugfs_register_sched_hctx(q, hctx)) - goto err; + if (!hctx->debugfs_dir) + blk_mq_debugfs_register_hctx(q, hctx); + if (q->elevator && !hctx->sched_debugfs_dir) + blk_mq_debugfs_register_sched_hctx(q, hctx); } if (q->rq_qos) { @@ -879,12 +868,6 @@ int blk_mq_debugfs_register(struct request_queue *q) rqos = rqos->next; } } - - return 0; - -err: - blk_mq_debugfs_unregister(q); - return -ENOMEM; } void blk_mq_debugfs_unregister(struct request_queue *q) @@ -894,52 +877,32 @@ void blk_mq_debugfs_unregister(struct request_queue *q) q->debugfs_dir = NULL; } -static int blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx) +static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) { struct dentry *ctx_dir; char name[20]; snprintf(name, sizeof(name), "cpu%u", ctx->cpu); ctx_dir = debugfs_create_dir(name, hctx->debugfs_dir); - if (!ctx_dir) - return -ENOMEM; - if (!debugfs_create_files(ctx_dir, ctx, blk_mq_debugfs_ctx_attrs)) - return -ENOMEM; - - return 0; + debugfs_create_files(ctx_dir, ctx, blk_mq_debugfs_ctx_attrs); } -int blk_mq_debugfs_register_hctx(struct request_queue *q, - struct blk_mq_hw_ctx *hctx) +void blk_mq_debugfs_register_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) { struct blk_mq_ctx *ctx; char name[20]; int i; - if (!q->debugfs_dir) - return -ENOENT; - snprintf(name, sizeof(name), "hctx%u", hctx->queue_num); hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir); - if (!hctx->debugfs_dir) - return -ENOMEM; - - if (!debugfs_create_files(hctx->debugfs_dir, hctx, - blk_mq_debugfs_hctx_attrs)) - goto err; - - hctx_for_each_ctx(hctx, ctx, i) { - if (blk_mq_debugfs_register_ctx(hctx, ctx)) - goto err; - } - return 0; + debugfs_create_files(hctx->debugfs_dir, hctx, blk_mq_debugfs_hctx_attrs); -err: - blk_mq_debugfs_unregister_hctx(hctx); - return -ENOMEM; + hctx_for_each_ctx(hctx, ctx, i) + blk_mq_debugfs_register_ctx(hctx, ctx); } void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) @@ -949,17 +912,13 @@ void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) hctx->debugfs_dir = NULL; } -int blk_mq_debugfs_register_hctxs(struct request_queue *q) +void blk_mq_debugfs_register_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; - queue_for_each_hw_ctx(q, hctx, i) { - if (blk_mq_debugfs_register_hctx(q, hctx)) - return -ENOMEM; - } - - return 0; + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_debugfs_register_hctx(q, hctx); } void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) @@ -971,29 +930,16 @@ void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) blk_mq_debugfs_unregister_hctx(hctx); } -int blk_mq_debugfs_register_sched(struct request_queue *q) +void blk_mq_debugfs_register_sched(struct request_queue *q) { struct elevator_type *e = q->elevator->type; - if (!q->debugfs_dir) - return -ENOENT; - if (!e->queue_debugfs_attrs) - return 0; + return; q->sched_debugfs_dir = debugfs_create_dir("sched", q->debugfs_dir); - if (!q->sched_debugfs_dir) - return -ENOMEM; - if (!debugfs_create_files(q->sched_debugfs_dir, q, - e->queue_debugfs_attrs)) - goto err; - - return 0; - -err: - blk_mq_debugfs_unregister_sched(q); - return -ENOMEM; + debugfs_create_files(q->sched_debugfs_dir, q, e->queue_debugfs_attrs); } void blk_mq_debugfs_unregister_sched(struct request_queue *q) @@ -1008,36 +954,22 
@@ void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) rqos->debugfs_dir = NULL; } -int blk_mq_debugfs_register_rqos(struct rq_qos *rqos) +void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) { struct request_queue *q = rqos->q; const char *dir_name = rq_qos_id_to_name(rqos->id); - if (!q->debugfs_dir) - return -ENOENT; - if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs) - return 0; + return; - if (!q->rqos_debugfs_dir) { + if (!q->rqos_debugfs_dir) q->rqos_debugfs_dir = debugfs_create_dir("rqos", q->debugfs_dir); - if (!q->rqos_debugfs_dir) - return -ENOMEM; - } rqos->debugfs_dir = debugfs_create_dir(dir_name, rqos->q->rqos_debugfs_dir); - if (!rqos->debugfs_dir) - return -ENOMEM; - if (!debugfs_create_files(rqos->debugfs_dir, rqos, - rqos->ops->debugfs_attrs)) - goto err; - return 0; - err: - blk_mq_debugfs_unregister_rqos(rqos); - return -ENOMEM; + debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs); } void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q) @@ -1046,27 +978,18 @@ void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q) q->rqos_debugfs_dir = NULL; } -int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, - struct blk_mq_hw_ctx *hctx) +void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) { struct elevator_type *e = q->elevator->type; - if (!hctx->debugfs_dir) - return -ENOENT; - if (!e->hctx_debugfs_attrs) - return 0; + return; hctx->sched_debugfs_dir = debugfs_create_dir("sched", hctx->debugfs_dir); - if (!hctx->sched_debugfs_dir) - return -ENOMEM; - - if (!debugfs_create_files(hctx->sched_debugfs_dir, hctx, - e->hctx_debugfs_attrs)) - return -ENOMEM; - - return 0; + debugfs_create_files(hctx->sched_debugfs_dir, hctx, + e->hctx_debugfs_attrs); } void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index 8c9012a578c1..a68aa6041a10 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -18,74 +18,68 @@ struct blk_mq_debugfs_attr { int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq); int blk_mq_debugfs_rq_show(struct seq_file *m, void *v); -int blk_mq_debugfs_register(struct request_queue *q); +void blk_mq_debugfs_register(struct request_queue *q); void blk_mq_debugfs_unregister(struct request_queue *q); -int blk_mq_debugfs_register_hctx(struct request_queue *q, - struct blk_mq_hw_ctx *hctx); +void blk_mq_debugfs_register_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx); void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx); -int blk_mq_debugfs_register_hctxs(struct request_queue *q); +void blk_mq_debugfs_register_hctxs(struct request_queue *q); void blk_mq_debugfs_unregister_hctxs(struct request_queue *q); -int blk_mq_debugfs_register_sched(struct request_queue *q); +void blk_mq_debugfs_register_sched(struct request_queue *q); void blk_mq_debugfs_unregister_sched(struct request_queue *q); -int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, +void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx); void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx); -int blk_mq_debugfs_register_rqos(struct rq_qos *rqos); +void blk_mq_debugfs_register_rqos(struct rq_qos *rqos); void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos); void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q); #else -static inline int blk_mq_debugfs_register(struct request_queue *q) +static inline void 
blk_mq_debugfs_register(struct request_queue *q) { - return 0; } static inline void blk_mq_debugfs_unregister(struct request_queue *q) { } -static inline int blk_mq_debugfs_register_hctx(struct request_queue *q, - struct blk_mq_hw_ctx *hctx) +static inline void blk_mq_debugfs_register_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) { - return 0; } static inline void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) { } -static inline int blk_mq_debugfs_register_hctxs(struct request_queue *q) +static inline void blk_mq_debugfs_register_hctxs(struct request_queue *q) { - return 0; } static inline void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) { } -static inline int blk_mq_debugfs_register_sched(struct request_queue *q) +static inline void blk_mq_debugfs_register_sched(struct request_queue *q) { - return 0; } static inline void blk_mq_debugfs_unregister_sched(struct request_queue *q) { } -static inline int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, - struct blk_mq_hw_ctx *hctx) +static inline void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) { - return 0; } static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) { } -static inline int blk_mq_debugfs_register_rqos(struct rq_qos *rqos) +static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) { - return 0; } static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) -- cgit From 154085ffdcc6875851b80a5d2bd759bbf687b001 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 11 Jun 2019 15:10:17 -0700 Subject: null_blk: remove duplicate check for report zone This patch removes the check in the null_blk_zoned code for the report zone command, where dev->zoned is checked before executing the report zone. The null_zone_report() function is a block_device operation callback which is initialized in null_blk_main.c and gets called as part of the blkdev report zone IOCTL (BLKREPORTZONE). blkdev_ioctl() blkdev_report_zones_ioctl() blkdev_report_zones() blk_report_zones() disk->fops->report_zones() null_zone_report(); null_zone_report() will never get executed on a non-zoned block device: on a non-zoned device blk_queue_is_zoned() always returns false, and it is the first check in blkdev_report_zones_ioctl(), before the actual low-level driver report zone callback is executed. Here is the detailed scenario:- 1. modprobe null_blk null_init null_alloc_dev dev->zoned = 0 null_add_dev dev->zoned == 0 so we don't set the q->limits.zoned = BLK_ZONED_HM 2.
blkzone report /dev/nullb0 blkdev_ioctl() blkdev_report_zones_ioctl() blk_queue_is_zoned() blk_queue_is_zoned q->limits.zoned == 0 return false if (!blk_queue_is_zoned(q)) <--- true return -ENOTTY; Reviewed-by: Damien Le Moal Reviewed-by: Bob Liu Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- drivers/block/null_blk_zoned.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 5d1c261a2cfd..fca0c97ff1aa 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -74,10 +74,6 @@ int null_zone_report(struct gendisk *disk, sector_t sector, struct nullb_device *dev = nullb->dev; unsigned int zno, nrz = 0; - if (!dev->zoned) - /* Not a zoned null device */ - return -EOPNOTSUPP; - zno = null_zone_no(dev, sector); if (zno < dev->nr_zones) { nrz = min_t(unsigned int, *nr_zones, dev->nr_zones - zno); -- cgit From 8614b0085d98482a065f0a308d715b9d4212aebf Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 12 Jun 2019 08:50:09 +0200 Subject: block/switching-sched.txt: Update to blk-mq schedulers Remove references to CFQ and legacy block layer which are gone. Update example with what's available under blk-mq. Signed-off-by: Andreas Herrmann Signed-off-by: Jens Axboe --- Documentation/block/switching-sched.txt | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/Documentation/block/switching-sched.txt b/Documentation/block/switching-sched.txt index 3b2612e342f1..7977f6fb8b20 100644 --- a/Documentation/block/switching-sched.txt +++ b/Documentation/block/switching-sched.txt @@ -13,11 +13,9 @@ you can do so by typing: # mount none /sys -t sysfs -As of the Linux 2.6.10 kernel, it is now possible to change the -IO scheduler for a given block device on the fly (thus making it possible, -for instance, to set the CFQ scheduler for the system default, but -set a specific device to use the deadline or noop schedulers - which -can improve that device's throughput). +It is possible to change the IO scheduler for a given block device on +the fly to select one of mq-deadline, none, bfq, or kyber schedulers - +which can improve that device's throughput. To set a specific scheduler, simply do this: @@ -30,8 +28,8 @@ The list of defined schedulers can be found by simply doing a "cat /sys/block/DEV/queue/scheduler" - the list of valid names will be displayed, with the currently selected scheduler in brackets: -# cat /sys/block/hda/queue/scheduler -noop deadline [cfq] -# echo deadline > /sys/block/hda/queue/scheduler -# cat /sys/block/hda/queue/scheduler -noop [deadline] cfq +# cat /sys/block/sda/queue/scheduler +[mq-deadline] kyber bfq none +# echo none >/sys/block/sda/queue/scheduler +# cat /sys/block/sda/queue/scheduler +[none] mq-deadline kyber bfq -- cgit From fb5772cbfe48575711bf789767d561582376f7f1 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 12 Jun 2019 08:17:32 +0200 Subject: blkio-controller.txt: Remove references to CFQ CFQ is gone. No need anymore to document its "proportional weight time based division of disk policy". 
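[Editor's note: referring back to the switching-sched.txt patch above, a small userspace sketch of the same sysfs interface — the device and function names are illustrative assumptions, not part of either patch. It does programmatically what the documented echo command does.]

	#include <stdio.h>

	/* Write a scheduler name to /sys/block/<dev>/queue/scheduler. */
	static int set_io_scheduler(const char *dev, const char *sched)
	{
		char path[256];
		FILE *f;
		int ret;

		snprintf(path, sizeof(path),
			 "/sys/block/%s/queue/scheduler", dev);
		f = fopen(path, "w");
		if (!f)
			return -1;

		/* The kernel matches this against the registered schedulers. */
		ret = (fputs(sched, f) == EOF) ? -1 : 0;
		if (fclose(f) == EOF)
			ret = -1;
		return ret;
	}

	/* Usage: set_io_scheduler("sda", "mq-deadline"); */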
Signed-off-by: Andreas Herrmann Signed-off-by: Jens Axboe --- Documentation/cgroup-v1/blkio-controller.txt | 96 ++-------------------------- 1 file changed, 7 insertions(+), 89 deletions(-) diff --git a/Documentation/cgroup-v1/blkio-controller.txt b/Documentation/cgroup-v1/blkio-controller.txt index 673dc34d3f78..d1a1b7bdd03a 100644 --- a/Documentation/cgroup-v1/blkio-controller.txt +++ b/Documentation/cgroup-v1/blkio-controller.txt @@ -8,61 +8,13 @@ both at leaf nodes as well as at intermediate nodes in a storage hierarchy. Plan is to use the same cgroup based management interface for blkio controller and based on user options switch IO policies in the background. -Currently two IO control policies are implemented. First one is proportional -weight time based division of disk policy. It is implemented in CFQ. Hence -this policy takes effect only on leaf nodes when CFQ is being used. The second -one is throttling policy which can be used to specify upper IO rate limits -on devices. This policy is implemented in generic block layer and can be -used on leaf nodes as well as higher level logical devices like device mapper. +One IO control policy is throttling policy which can be used to +specify upper IO rate limits on devices. This policy is implemented in +generic block layer and can be used on leaf nodes as well as higher +level logical devices like device mapper. HOWTO ===== -Proportional Weight division of bandwidth ------------------------------------------ -You can do a very simple testing of running two dd threads in two different -cgroups. Here is what you can do. - -- Enable Block IO controller - CONFIG_BLK_CGROUP=y - -- Enable group scheduling in CFQ - CONFIG_CFQ_GROUP_IOSCHED=y - -- Compile and boot into kernel and mount IO controller (blkio); see - cgroups.txt, Why are cgroups needed?. - - mount -t tmpfs cgroup_root /sys/fs/cgroup - mkdir /sys/fs/cgroup/blkio - mount -t cgroup -o blkio none /sys/fs/cgroup/blkio - -- Create two cgroups - mkdir -p /sys/fs/cgroup/blkio/test1/ /sys/fs/cgroup/blkio/test2 - -- Set weights of group test1 and test2 - echo 1000 > /sys/fs/cgroup/blkio/test1/blkio.weight - echo 500 > /sys/fs/cgroup/blkio/test2/blkio.weight - -- Create two same size files (say 512MB each) on same disk (file1, file2) and - launch two dd threads in different cgroup to read those files. - - sync - echo 3 > /proc/sys/vm/drop_caches - - dd if=/mnt/sdb/zerofile1 of=/dev/null & - echo $! > /sys/fs/cgroup/blkio/test1/tasks - cat /sys/fs/cgroup/blkio/test1/tasks - - dd if=/mnt/sdb/zerofile2 of=/dev/null & - echo $! > /sys/fs/cgroup/blkio/test2/tasks - cat /sys/fs/cgroup/blkio/test2/tasks - -- At macro level, first dd should finish first. To get more precise data, keep - on looking at (with the help of script), at blkio.disk_time and - blkio.disk_sectors files of both test1 and test2 groups. This will tell how - much disk time (in milliseconds), each group got and how many sectors each - group dispatched to the disk. We provide fairness in terms of disk time, so - ideally io.disk_time of cgroups should be in proportion to the weight. - Throttling/Upper Limit policy ----------------------------- - Enable Block IO controller @@ -94,7 +46,7 @@ Throttling/Upper Limit policy Hierarchical Cgroups ==================== -Both CFQ and throttling implement hierarchy support; however, +Throttling implements hierarchy support; however, throttling's hierarchy support is enabled iff "sane_behavior" is enabled from cgroup side, which currently is a development option and not publicly available. 
@@ -107,9 +59,8 @@ If somebody created a hierarchy like as follows. | test3 -CFQ by default and throttling with "sane_behavior" will handle the -hierarchy correctly. For details on CFQ hierarchy support, refer to -Documentation/block/cfq-iosched.txt. For throttling, all limits apply +Throttling with "sane_behavior" will handle the +hierarchy correctly. For throttling, all limits apply to the whole subtree while all statistics are local to the IOs directly generated by tasks in that cgroup. @@ -130,10 +81,6 @@ CONFIG_DEBUG_BLK_CGROUP - Debug help. Right now some additional stats file show up in cgroup if this option is enabled. -CONFIG_CFQ_GROUP_IOSCHED - - Enables group scheduling in CFQ. Currently only 1 level of group - creation is allowed. - CONFIG_BLK_DEV_THROTTLING - Enable block device throttling support in block layer. @@ -344,32 +291,3 @@ Common files among various policies - blkio.reset_stats - Writing an int to this file will result in resetting all the stats for that cgroup. - -CFQ sysfs tunable -================= -/sys/block//queue/iosched/slice_idle ------------------------------------------- -On a faster hardware CFQ can be slow, especially with sequential workload. -This happens because CFQ idles on a single queue and single queue might not -drive deeper request queue depths to keep the storage busy. In such scenarios -one can try setting slice_idle=0 and that would switch CFQ to IOPS -(IO operations per second) mode on NCQ supporting hardware. - -That means CFQ will not idle between cfq queues of a cfq group and hence be -able to driver higher queue depth and achieve better throughput. That also -means that cfq provides fairness among groups in terms of IOPS and not in -terms of disk time. - -/sys/block//queue/iosched/group_idle ------------------------------------------- -If one disables idling on individual cfq queues and cfq service trees by -setting slice_idle=0, group_idle kicks in. That means CFQ will still idle -on the group in an attempt to provide fairness among groups. - -By default group_idle is same as slice_idle and does not do anything if -slice_idle is enabled. - -One can experience an overall throughput drop if you have created multiple -groups and put applications in that group which are not driving enough -IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle -on individual groups and throughput should improve. -- cgit From c326f846ebc2a30eca386b85dffba96e23803d81 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 11 Jun 2019 17:31:53 +0800 Subject: blk-mq: remove WARN_ON(!q->elevator) from blk_mq_sched_free_requests blk_mq_sched_free_requests() may be called in a failure path in which q->elevator may not be set up yet, so remove WARN_ON(!q->elevator) from blk_mq_sched_free_requests() to avoid the false positive. This function is safe to call even in the !q->elevator case because hctx->sched_tags is checked.
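[Editor's note: for reference, the resulting function, reconstructed from the one-line removal in the diff below — a sketch of the upstream code rather than a verbatim copy. It shows why the WARN_ON() was redundant: when q->elevator is NULL, no scheduler tags were ever allocated, so each hctx->sched_tags check fails and the loop body is skipped.]

	void blk_mq_sched_free_requests(struct request_queue *q)
	{
		struct blk_mq_hw_ctx *hctx;
		int i;

		lockdep_assert_held(&q->sysfs_lock);

		queue_for_each_hw_ctx(q, hctx, i) {
			/* NULL whenever no elevator was set up for the queue */
			if (hctx->sched_tags)
				blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
		}
	}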
Cc: Bart Van Assche Cc: Christoph Hellwig Cc: Yi Zhang Fixes: c3e2219216c9 ("block: free sched's request pool in blk_cleanup_queue") Reported-by: syzbot+b9d0d56867048c7bcfde@syzkaller.appspotmail.com Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 500cb04901cc..2766066a15db 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -555,7 +555,6 @@ void blk_mq_sched_free_requests(struct request_queue *q) int i; lockdep_assert_held(&q->sysfs_lock); - WARN_ON(!q->elevator); queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags -- cgit From 31b90956b124240aa8c63250243ae1a53585c5e2 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 10 Jun 2019 06:13:34 +0800 Subject: bcache: fix stack corruption by PRECEDING_KEY() Recently people have reported that bcache code compiled with gcc9 is broken; one of the buggy behaviors observed is that two adjacent 4KB I/Os that should merge into one don't. It finally turns out to be a stack corruption caused by the macro PRECEDING_KEY(). See how PRECEDING_KEY() is defined in bset.h, 437 #define PRECEDING_KEY(_k) \ 438 ({ \ 439 struct bkey *_ret = NULL; \ 440 \ 441 if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ 442 _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ 443 \ 444 if (!_ret->low) \ 445 _ret->high--; \ 446 _ret->low--; \ 447 } \ 448 \ 449 _ret; \ 450 }) At line 442, _ret points to the address of an on-stack variable constructed by KEY(); the live range of this on-stack variable is lines 442-446. Once _ret is returned to bch_btree_insert_key(), the returned address points to an invalid stack location, and that location is overwritten by the subsequently called bch_btree_iter_init(). Then the 'search' argument of bch_btree_iter_init() points to some address inside the stack frame of bch_btree_iter_init(); the exact address depends on how the compiler allocates stack space. Now the stack is corrupted. Fixes: 0eacac22034c ("bcache: PRECEDING_KEY()") Signed-off-by: Coly Li Reviewed-by: Rolf Fokkens Reviewed-by: Pierre JUHEN Tested-by: Shenghui Wang Tested-by: Pierre JUHEN Cc: Kent Overstreet Cc: Nix Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/bset.c | 16 +++++++++++++--- drivers/md/bcache/bset.h | 34 ++++++++++++++++++++-------------- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 8f07fa6e1739..268f1b685084 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -887,12 +887,22 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, struct bset *i = bset_tree_last(b)->data; struct bkey *m, *prev = NULL; struct btree_iter iter; + struct bkey preceding_key_on_stack = ZERO_KEY; + struct bkey *preceding_key_p = &preceding_key_on_stack; BUG_ON(b->ops->is_extents && !KEY_SIZE(k)); - m = bch_btree_iter_init(b, &iter, b->ops->is_extents - ? PRECEDING_KEY(&START_KEY(k)) - : PRECEDING_KEY(k)); + /* + * If k has preceding key, preceding_key_p will be set to address + * of k's preceding key; otherwise preceding_key_p will be set + * to NULL inside preceding_key().
+ */ if (b->ops->is_extents) + preceding_key(&START_KEY(k), &preceding_key_p); + else + preceding_key(k, &preceding_key_p); + + m = bch_btree_iter_init(b, &iter, preceding_key_p); if (b->ops->insert_fixup(b, k, &iter, replace_key)) return status; diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index bac76aabca6d..c71365e7c1fa 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -434,20 +434,26 @@ static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) return __bch_cut_back(where, k); } -#define PRECEDING_KEY(_k) \ -({ \ - struct bkey *_ret = NULL; \ - \ - if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ - _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ - \ - if (!_ret->low) \ - _ret->high--; \ - _ret->low--; \ - } \ - \ - _ret; \ -}) +/* + * Pointer '*preceding_key_p' points to a memory object to store preceding + * key of k. If the preceding key does not exist, set '*preceding_key_p' to + * NULL. So the caller of preceding_key() needs to take care of memory + * which '*preceding_key_p' pointed to before calling preceding_key(). + * Currently the only caller of preceding_key() is bch_btree_insert_key(), + * and it points to an on-stack variable, so the memory release is handled + * by stackframe itself. + */ +static inline void preceding_key(struct bkey *k, struct bkey **preceding_key_p) { + if (KEY_INODE(k) || KEY_OFFSET(k)) { + (**preceding_key_p) = KEY(KEY_INODE(k), KEY_OFFSET(k), 0); + if (!(*preceding_key_p)->low) + (*preceding_key_p)->high--; + (*preceding_key_p)->low--; + } else { + (*preceding_key_p) = NULL; + } +} static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k) { -- cgit From 1f0ffa67349c56ea54c03ccfd1e073c990e7411e Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 10 Jun 2019 06:13:35 +0800 Subject: bcache: only set BCACHE_DEV_WB_RUNNING when cached device attached MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When people set a writeback percent via the sysfs file, /sys/block/bcache/bcache/writeback_percent the current code directly sets BCACHE_DEV_WB_RUNNING in dc->disk.flags and schedules the kworker dc->writeback_rate_update. If there is no cache set attached, the writeback kernel thread is not actually running, so running dc->writeback_rate_update does not make sense and may cause a NULL pointer dereference when the cache set pointer is referenced inside update_writeback_rate(). This patch checks whether the cache set pointer (dc->disk.c) is NULL in the sysfs interface handler, and only sets BCACHE_DEV_WB_RUNNING and schedules dc->writeback_rate_update when dc->disk.c is not NULL (that is, when the cache device is attached to a cache set). This problem might have been introduced by the initial bcache commit, but commit 3fd47bfe55b0 ("bcache: stop dc->writeback_rate_update properly") changed part of the original code, so I add 'Fixes: 3fd47bfe55b0' to indicate from which commit this patch can be applied.
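[Editor's note: a minimal sketch of the bug class being fixed, using hypothetical *_ex names rather than bcache's real types: deferred work must not be armed while the pointer its handler dereferences is still NULL. The hunk below applies exactly this guard via dc->disk.c.]

	#include <linux/workqueue.h>
	#include <linux/jiffies.h>

	struct cache_set_ex;			/* stand-in for struct cache_set */

	struct cached_dev_ex {
		struct cache_set_ex *c;		/* NULL until attached to a set */
		struct delayed_work rate_update;
		unsigned int update_seconds;
	};

	static void ex_store_writeback_percent(struct cached_dev_ex *dc)
	{
		/*
		 * Arming the work while dc->c is NULL would let the handler
		 * dereference an unattached cache set; test the pointer first.
		 */
		if (dc->c != NULL)
			schedule_delayed_work(&dc->rate_update,
					      dc->update_seconds * HZ);
	}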
Fixes: 3fd47bfe55b0 ("bcache: stop dc->writeback_rate_update properly") Reported-by: Bjørn Forsman Signed-off-by: Coly Li Reviewed-by: Bjørn Forsman Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 6cd44d3cf906..bfb437ffb13c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -431,8 +431,13 @@ STORE(bch_cached_dev) bch_writeback_queue(dc); } + /* + * Only set BCACHE_DEV_WB_RUNNING when cached device attached to + * a cache set, otherwise it doesn't make sense. + */ if (attr == &sysfs_writeback_percent) - if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) + if ((dc->disk.c != NULL) && + (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))) schedule_delayed_work(&dc->writeback_rate_update, dc->writeback_rate_update_seconds * HZ); -- cgit From 31f6264e225fb92cf6f4b63031424f20797c297d Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 11 Jun 2019 16:32:59 +0200 Subject: libata: Extend quirks for the ST1000LM024 drives with NOLPM quirk We've received a bugreport that using LPM with ST1000LM024 drives leads to system lockups. So it seems that these models are buggy in more than one way. Add NOLPM quirk to the existing quirks entry for BROKEN_FPDMA_AA. BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1571330 Cc: stable@vger.kernel.org Reviewed-by: Martin K. Petersen Signed-off-by: Hans de Goede Signed-off-by: Jens Axboe --- drivers/ata/libata-core.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index aaa57e0c809d..4a2dff303865 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4460,9 +4460,12 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "ST3320[68]13AS", "SD1[5-9]", ATA_HORKAGE_NONCQ | ATA_HORKAGE_FIRMWARE_WARN }, - /* drives which fail FPDMA_AA activation (some may freeze afterwards) */ - { "ST1000LM024 HN-M101MBB", "2AR10001", ATA_HORKAGE_BROKEN_FPDMA_AA }, - { "ST1000LM024 HN-M101MBB", "2BA30001", ATA_HORKAGE_BROKEN_FPDMA_AA }, + /* drives which fail FPDMA_AA activation (some may freeze afterwards) + the ST disks also have LPM issues */ + { "ST1000LM024 HN-M101MBB", "2AR10001", ATA_HORKAGE_BROKEN_FPDMA_AA | + ATA_HORKAGE_NOLPM, }, + { "ST1000LM024 HN-M101MBB", "2BA30001", ATA_HORKAGE_BROKEN_FPDMA_AA | + ATA_HORKAGE_NOLPM, }, { "VB0250EAVER", "HPG7", ATA_HORKAGE_BROKEN_FPDMA_AA }, /* Blacklist entries taken from Silicon Image 3124/3132 -- cgit From 1d0c06513bd44e724f572ef9c932d0c889d183c6 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 13 Jun 2019 09:30:06 +0200 Subject: block/ps3vram: Use %llu to format sector_t after LBDAF removal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The removal of CONFIG_LBDAF changed the type of sector_t from "unsigned long" to "u64" aka "unsigned long long" on 64-bit platforms, leading to a compiler warning regression: drivers/block/ps3vram.c: In function ‘ps3vram_probe’: drivers/block/ps3vram.c:770:23: warning: format ‘%lu’ expects argument of type ‘long unsigned int’, but argument 4 has type ‘sector_t {aka long long unsigned int}’ [-Wformat=] Fix this by using "%llu" instead.
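[Editor's note: a standalone userspace illustration of the warning class, with hypothetical names: on 64-bit Linux, unsigned long and unsigned long long are both 64 bits wide, yet printf format checking still ties %lu to unsigned long, so a u64 (unsigned long long) argument needs %llu.]

	#include <stdio.h>

	typedef unsigned long long u64_example;	/* stands in for sector_t */

	int main(void)
	{
		/* Device size in 512-byte sectors: 1024 MiB worth. */
		u64_example capacity = 1024ULL << 11;

		/* printf("%lu", capacity) would trigger -Wformat here. */
		printf("Using %llu MiB\n", capacity >> 11);
		return 0;
	}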
Fixes: 72deb455b5ec619f ("block: remove CONFIG_LBDAF") Signed-off-by: Geert Uytterhoeven Signed-off-by: Jens Axboe --- drivers/block/ps3vram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 4c7f51b1eda9..4628e1a27a2b 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -767,7 +767,7 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); set_capacity(gendisk, priv->size >> 9); - dev_info(&dev->core, "%s: Using %lu MiB of GPU memory\n", + dev_info(&dev->core, "%s: Using %llu MiB of GPU memory\n", gendisk->disk_name, get_capacity(gendisk) >> 11); device_add_disk(&dev->core, gendisk, NULL); -- cgit