Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig          |   12
-rw-r--r-- | block/bio.c            |   19
-rw-r--r-- | block/blk-cgroup.c     |  123
-rw-r--r-- | block/blk-core.c       |   39
-rw-r--r-- | block/blk-flush.c      |    3
-rw-r--r-- | block/blk-integrity.c  |    2
-rw-r--r-- | block/blk-lib.c        |    2
-rw-r--r-- | block/blk-mq-debugfs.c |   99
-rw-r--r-- | block/blk-mq-pci.c     |    2
-rw-r--r-- | block/blk-mq.c         |  370
-rw-r--r-- | block/blk-mq.h         |    1
-rw-r--r-- | block/blk-stat.c       |  327
-rw-r--r-- | block/blk-stat.h       |  214
-rw-r--r-- | block/blk-sysfs.c      |   55
-rw-r--r-- | block/blk-throttle.c   |  977
-rw-r--r-- | block/blk-wbt.c        |   61
-rw-r--r-- | block/blk-wbt.h        |   12
-rw-r--r-- | block/blk.h            |   12
-rw-r--r-- | block/cfq-iosched.c    |   17
-rw-r--r-- | block/genhd.c          |    2
-rw-r--r-- | block/scsi_ioctl.c     |    8
-rw-r--r-- | block/sed-opal.c       |    2
-rw-r--r-- | block/t10-pi.c         |    8
23 files changed, 1693 insertions, 674 deletions
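For orientation before the patch body: the new blk-stat.c/blk-stat.h code below replaces the old fixed per-ctx read/write windows with caller-allocated statistics callbacks. The following is a minimal sketch of how a block-layer user could drive that interface, modeled on the blk-mq poll-stat hunks in this diff; the example_* names are hypothetical, while the blk_stat_* calls and struct fields are the ones introduced by this series.

/* Hypothetical illustration of the blk_stat callback API added below. */
#include <linux/blkdev.h>
#include "blk-stat.h"

/* Invoked when the callback's timer window expires. */
static void example_timer_fn(struct blk_stat_callback *cb)
{
	unsigned int bucket;

	/* cb->stat[] holds the per-bucket stats flushed from the per-cpu buffers. */
	for (bucket = 0; bucket < cb->buckets; bucket++)
		pr_debug("bucket %u: samples=%d mean=%lld\n", bucket,
			 cb->stat[bucket].nr_samples, cb->stat[bucket].mean);
}

static int example_enable(struct request_queue *q)
{
	struct blk_stat_callback *cb;

	/* Two buckets, split by data direction via blk_stat_rq_ddir(). */
	cb = blk_stat_alloc_callback(example_timer_fn, blk_stat_rq_ddir, 2, q);
	if (!cb)
		return -ENOMEM;

	blk_stat_add_callback(q, cb);
	/* Gather completion latencies for one ~100ms window; the owner
	 * re-arms the timer (as blk-mq does from its completion path) if
	 * continuous sampling is wanted. */
	blk_stat_activate_msecs(cb, 100);
	return 0;
}

Teardown is blk_stat_remove_callback() followed by blk_stat_free_callback(), as blk-sysfs.c does for the poll-stat callback in this patch.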
diff --git a/block/Kconfig b/block/Kconfig index e9f780f815f5..89cd28f8d051 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -115,6 +115,18 @@ config BLK_DEV_THROTTLING See Documentation/cgroups/blkio-controller.txt for more information. +config BLK_DEV_THROTTLING_LOW + bool "Block throttling .low limit interface support (EXPERIMENTAL)" + depends on BLK_DEV_THROTTLING + default n + ---help--- + Add .low limit interface for block throttling. The low limit is a best + effort limit to prioritize cgroups. Depending on the setting, the limit + can be used to protect cgroups in terms of bandwidth/iops and better + utilize disk resource. + + Note, this is an experimental interface and could be changed someday. + config BLK_CMDLINE_PARSER bool "Block device command line partition parser" default n diff --git a/block/bio.c b/block/bio.c index e75878f8b14a..f4d207180266 100644 --- a/block/bio.c +++ b/block/bio.c @@ -30,6 +30,7 @@ #include <linux/cgroup.h> #include <trace/events/block.h> +#include "blk.h" /* * Test patch to inline a certain number of bi_io_vec's inside the bio @@ -427,7 +428,8 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * RETURNS: * Pointer to new bio on success, NULL on failure. */ -struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) +struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, + struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; unsigned front_pad; @@ -1824,6 +1826,11 @@ static inline bool bio_remaining_done(struct bio *bio) * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred * way to end I/O on a bio. No one should call bi_end_io() directly on a * bio unless they own it and thus know that it has an end_io function. + * + * bio_endio() can be called several times on a bio that has been chained + * using bio_chain(). The ->bi_end_io() function will only be called the + * last time. At this point the BLK_TA_COMPLETE tracing event will be + * generated if BIO_TRACE_COMPLETION is set. **/ void bio_endio(struct bio *bio) { @@ -1844,6 +1851,13 @@ again: goto again; } + if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), + bio, bio->bi_error); + bio_clear_flag(bio, BIO_TRACE_COMPLETION); + } + + blk_throtl_bio_endio(bio); if (bio->bi_end_io) bio->bi_end_io(bio); } @@ -1882,6 +1896,9 @@ struct bio *bio_split(struct bio *bio, int sectors, bio_advance(bio, split->bi_iter.bi_size); + if (bio_flagged(bio, BIO_TRACE_COMPLETION)) + bio_set_flag(bio, BIO_TRACE_COMPLETION); + return split; } EXPORT_SYMBOL(bio_split); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bbe7ee00bd3d..7c2947128f58 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -772,6 +772,27 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, } EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); +/* Performs queue bypass and policy enabled checks then looks up blkg. */ +static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, + const struct blkcg_policy *pol, + struct request_queue *q) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert_held(q->queue_lock); + + if (!blkcg_policy_enabled(q, pol)) + return ERR_PTR(-EOPNOTSUPP); + + /* + * This could be the first entry point of blkcg implementation and + * we shouldn't allow anything to go through for a bypassing queue. + */ + if (unlikely(blk_queue_bypass(q))) + return ERR_PTR(blk_queue_dying(q) ? 
-ENODEV : -EBUSY); + + return __blkg_lookup(blkcg, q, true /* update_hint */); +} + /** * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup @@ -789,6 +810,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, __acquires(rcu) __acquires(disk->queue->queue_lock) { struct gendisk *disk; + struct request_queue *q; struct blkcg_gq *blkg; struct module *owner; unsigned int major, minor; @@ -807,44 +829,95 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (!disk) return -ENODEV; if (part) { - owner = disk->fops->owner; - put_disk(disk); - module_put(owner); - return -ENODEV; + ret = -ENODEV; + goto fail; } - rcu_read_lock(); - spin_lock_irq(disk->queue->queue_lock); + q = disk->queue; - if (blkcg_policy_enabled(disk->queue, pol)) - blkg = blkg_lookup_create(blkcg, disk->queue); - else - blkg = ERR_PTR(-EOPNOTSUPP); + rcu_read_lock(); + spin_lock_irq(q->queue_lock); + blkg = blkg_lookup_check(blkcg, pol, q); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); + goto fail_unlock; + } + + if (blkg) + goto success; + + /* + * Create blkgs walking down from blkcg_root to @blkcg, so that all + * non-root blkgs have access to their parents. + */ + while (true) { + struct blkcg *pos = blkcg; + struct blkcg *parent; + struct blkcg_gq *new_blkg; + + parent = blkcg_parent(blkcg); + while (parent && !__blkg_lookup(parent, q, false)) { + pos = parent; + parent = blkcg_parent(parent); + } + + /* Drop locks to do new blkg allocation with GFP_KERNEL. */ + spin_unlock_irq(q->queue_lock); rcu_read_unlock(); - spin_unlock_irq(disk->queue->queue_lock); - owner = disk->fops->owner; - put_disk(disk); - module_put(owner); - /* - * If queue was bypassing, we should retry. Do so after a - * short msleep(). It isn't strictly necessary but queue - * can be bypassing for some time and it's always nice to - * avoid busy looping. - */ - if (ret == -EBUSY) { - msleep(10); - ret = restart_syscall(); + + new_blkg = blkg_alloc(pos, q, GFP_KERNEL); + if (unlikely(!new_blkg)) { + ret = -ENOMEM; + goto fail; } - return ret; - } + rcu_read_lock(); + spin_lock_irq(q->queue_lock); + + blkg = blkg_lookup_check(pos, pol, q); + if (IS_ERR(blkg)) { + ret = PTR_ERR(blkg); + goto fail_unlock; + } + + if (blkg) { + blkg_free(new_blkg); + } else { + blkg = blkg_create(pos, q, new_blkg); + if (unlikely(IS_ERR(blkg))) { + ret = PTR_ERR(blkg); + goto fail_unlock; + } + } + + if (pos == blkcg) + goto success; + } +success: ctx->disk = disk; ctx->blkg = blkg; ctx->body = body; return 0; + +fail_unlock: + spin_unlock_irq(q->queue_lock); + rcu_read_unlock(); +fail: + owner = disk->fops->owner; + put_disk(disk); + module_put(owner); + /* + * If queue was bypassing, we should retry. Do so after a + * short msleep(). It isn't strictly necessary but queue + * can be bypassing for some time and it's always nice to + * avoid busy looping. + */ + if (ret == -EBUSY) { + msleep(10); + ret = restart_syscall(); + } + return ret; } EXPORT_SYMBOL_GPL(blkg_conf_prep); diff --git a/block/blk-core.c b/block/blk-core.c index d772c221cc17..8654aa0cef6d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -500,6 +500,13 @@ void blk_set_queue_dying(struct request_queue *q) queue_flag_set(QUEUE_FLAG_DYING, q); spin_unlock_irq(q->queue_lock); + /* + * When queue DYING flag is set, we need to block new req + * entering queue, so we call blk_freeze_queue_start() to + * prevent I/O from crossing blk_queue_enter(). 
+ */ + blk_freeze_queue_start(q); + if (q->mq_ops) blk_mq_wake_waiters(q); else { @@ -669,6 +676,15 @@ int blk_queue_enter(struct request_queue *q, bool nowait) if (nowait) return -EBUSY; + /* + * read pair of barrier in blk_freeze_queue_start(), + * we need to order reading __PERCPU_REF_DEAD flag of + * .q_usage_counter and reading .mq_freeze_depth or + * queue dying flag, otherwise the following wait may + * never return if the two reads are reordered. + */ + smp_rmb(); + ret = wait_event_interruptible(q->mq_freeze_wq, !atomic_read(&q->mq_freeze_depth) || blk_queue_dying(q)); @@ -720,6 +736,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q->backing_dev_info) goto fail_split; + q->stats = blk_alloc_queue_stats(); + if (!q->stats) + goto fail_stats; + q->backing_dev_info->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK; @@ -776,6 +796,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) fail_ref: percpu_ref_exit(&q->q_usage_counter); fail_bdi: + blk_free_queue_stats(q->stats); +fail_stats: bdi_put(q->backing_dev_info); fail_split: bioset_free(q->bio_split); @@ -889,7 +911,6 @@ out_exit_flush_rq: q->exit_rq_fn(q, q->fq->flush_rq); out_free_flush_queue: blk_free_flush_queue(q->fq); - wbt_exit(q); return -ENOMEM; } EXPORT_SYMBOL(blk_init_allocated_queue); @@ -1128,7 +1149,6 @@ static struct request *__get_request(struct request_list *rl, unsigned int op, blk_rq_init(q, rq); blk_rq_set_rl(rq, rl); - blk_rq_set_prio(rq, ioc); rq->cmd_flags = op; rq->rq_flags = rq_flags; @@ -1615,6 +1635,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->errors = 0; req->__sector = bio->bi_iter.bi_sector; + blk_rq_set_prio(req, rq_ioc(bio)); if (ioprio_valid(bio_prio(bio))) req->ioprio = bio_prio(bio); blk_rq_bio_prep(req->q, req, bio); @@ -1936,7 +1957,13 @@ generic_make_request_checks(struct bio *bio) if (!blkcg_bio_issue_check(q, bio)) return false; - trace_block_bio_queue(q, bio); + if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_queue(q, bio); + /* Now that enqueuing has been traced, we need to trace + * completion as well. + */ + bio_set_flag(bio, BIO_TRACE_COMPLETION); + } return true; not_supported: @@ -2478,7 +2505,7 @@ void blk_start_request(struct request *req) blk_dequeue_request(req); if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { - blk_stat_set_issue_time(&req->issue_stat); + blk_stat_set_issue(&req->issue_stat, blk_rq_sectors(req)); req->rq_flags |= RQF_STATS; wbt_issue(req->q->rq_wb, &req->issue_stat); } @@ -2601,6 +2628,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) if (bio_bytes == bio->bi_iter.bi_size) req->bio = bio->bi_next; + /* Completion has already been traced */ + bio_clear_flag(bio, BIO_TRACE_COMPLETION); req_bio_endio(req, bio, bio_bytes, error); total_bytes += bio_bytes; @@ -2699,7 +2728,7 @@ void blk_finish_request(struct request *req, int error) struct request_queue *q = req->q; if (req->rq_flags & RQF_STATS) - blk_stat_add(&q->rq_stats[rq_data_dir(req)], req); + blk_stat_add(req); if (req->rq_flags & RQF_QUEUED) blk_queue_end_tag(q, req); diff --git a/block/blk-flush.c b/block/blk-flush.c index 0d5a9c1da1fc..4e951d3bf548 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -497,8 +497,7 @@ void blk_insert_flush(struct request *rq) * Description: * Issue a flush for the block device in question. 
Caller can supply * room for storing the error offset in case of a flush error, if they - * wish to. If WAIT flag is not passed then caller may check only what - * request was pushed in some internal queue for later handling. + * wish to. */ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, sector_t *error_sector) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 9f0ff5ba4f84..b3622cb00fc2 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -389,7 +389,7 @@ static int blk_integrity_nop_fn(struct blk_integrity_iter *iter) return 0; } -static struct blk_integrity_profile nop_profile = { +static const struct blk_integrity_profile nop_profile = { .name = "nop", .generate_fn = blk_integrity_nop_fn, .verify_fn = blk_integrity_nop_fn, diff --git a/block/blk-lib.c b/block/blk-lib.c index ed1e78e24db0..e5b853f2b8a2 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -109,7 +109,7 @@ EXPORT_SYMBOL(__blkdev_issue_discard); * @sector: start sector * @nr_sects: number of sectors to discard * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: BLKDEV_IFL_* flags to control behaviour + * @flags: BLKDEV_DISCARD_* flags to control behaviour * * Description: * Issue a discard request for the sectors in question. diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index f6d917977b33..4b3f962a9c7a 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -43,6 +43,42 @@ static int blk_mq_debugfs_seq_open(struct inode *inode, struct file *file, return ret; } +static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) +{ + if (stat->nr_samples) { + seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu", + stat->nr_samples, stat->mean, stat->min, stat->max); + } else { + seq_puts(m, "samples=0"); + } +} + +static int queue_poll_stat_show(struct seq_file *m, void *v) +{ + struct request_queue *q = m->private; + + seq_puts(m, "read: "); + print_stat(m, &q->poll_stat[READ]); + seq_puts(m, "\n"); + + seq_puts(m, "write: "); + print_stat(m, &q->poll_stat[WRITE]); + seq_puts(m, "\n"); + return 0; +} + +static int queue_poll_stat_open(struct inode *inode, struct file *file) +{ + return single_open(file, queue_poll_stat_show, inode->i_private); +} + +static const struct file_operations queue_poll_stat_fops = { + .open = queue_poll_stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int hctx_state_show(struct seq_file *m, void *v) { struct blk_mq_hw_ctx *hctx = m->private; @@ -322,60 +358,6 @@ static const struct file_operations hctx_io_poll_fops = { .release = single_release, }; -static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) -{ - seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu", - stat->nr_samples, stat->mean, stat->min, stat->max); -} - -static int hctx_stats_show(struct seq_file *m, void *v) -{ - struct blk_mq_hw_ctx *hctx = m->private; - struct blk_rq_stat stat[2]; - - blk_stat_init(&stat[BLK_STAT_READ]); - blk_stat_init(&stat[BLK_STAT_WRITE]); - - blk_hctx_stat_get(hctx, stat); - - seq_puts(m, "read: "); - print_stat(m, &stat[BLK_STAT_READ]); - seq_puts(m, "\n"); - - seq_puts(m, "write: "); - print_stat(m, &stat[BLK_STAT_WRITE]); - seq_puts(m, "\n"); - return 0; -} - -static int hctx_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, hctx_stats_show, inode->i_private); -} - -static ssize_t hctx_stats_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct seq_file *m = file->private_data; 
- struct blk_mq_hw_ctx *hctx = m->private; - struct blk_mq_ctx *ctx; - int i; - - hctx_for_each_ctx(hctx, ctx, i) { - blk_stat_init(&ctx->stat[BLK_STAT_READ]); - blk_stat_init(&ctx->stat[BLK_STAT_WRITE]); - } - return count; -} - -static const struct file_operations hctx_stats_fops = { - .open = hctx_stats_open, - .read = seq_read, - .write = hctx_stats_write, - .llseek = seq_lseek, - .release = single_release, -}; - static int hctx_dispatched_show(struct seq_file *m, void *v) { struct blk_mq_hw_ctx *hctx = m->private; @@ -636,6 +618,11 @@ static const struct file_operations ctx_completed_fops = { .release = single_release, }; +static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { + {"poll_stat", 0400, &queue_poll_stat_fops}, + {}, +}; + static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"state", 0400, &hctx_state_fops}, {"flags", 0400, &hctx_flags_fops}, @@ -646,7 +633,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"sched_tags", 0400, &hctx_sched_tags_fops}, {"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops}, {"io_poll", 0600, &hctx_io_poll_fops}, - {"stats", 0600, &hctx_stats_fops}, {"dispatched", 0600, &hctx_dispatched_fops}, {"queued", 0600, &hctx_queued_fops}, {"run", 0600, &hctx_run_fops}, @@ -753,6 +739,9 @@ int blk_mq_debugfs_register_hctxs(struct request_queue *q) if (!q->mq_debugfs_dir) goto err; + if (!debugfs_create_files(q->mq_debugfs_dir, q, blk_mq_debugfs_queue_attrs)) + goto err; + queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_debugfs_register_hctx(q, hctx)) goto err; diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index 966c2169762e..0c3354cf3552 100644 --- a/block/blk-mq-pci.c +++ b/block/blk-mq-pci.c @@ -23,7 +23,7 @@ * @pdev: PCI device associated with @set. * * This function assumes the PCI device @pdev has at least as many available - * interrupt vetors as @set has queues. It will then queuery the vector + * interrupt vectors as @set has queues. It will then query the vector * corresponding to each queue for it's affinity mask and built queue mapping * that maps a queue to the CPUs that have irq affinity for the corresponding * vector. diff --git a/block/blk-mq.c b/block/blk-mq.c index 572966f49596..724bcec0ca4f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -39,6 +39,9 @@ static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); +static void blk_mq_poll_stats_start(struct request_queue *q); +static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); + /* * Check if any of the ctx's have pending work in this hardware queue */ @@ -65,7 +68,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); } -void blk_mq_freeze_queue_start(struct request_queue *q) +void blk_freeze_queue_start(struct request_queue *q) { int freeze_depth; @@ -75,7 +78,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q) blk_mq_run_hw_queues(q, false); } } -EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); +EXPORT_SYMBOL_GPL(blk_freeze_queue_start); void blk_mq_freeze_queue_wait(struct request_queue *q) { @@ -105,7 +108,7 @@ void blk_freeze_queue(struct request_queue *q) * no blk_unfreeze_queue(), and blk_freeze_queue() is not * exported to drivers as the only user for unfreeze is blk_mq. 
*/ - blk_mq_freeze_queue_start(q); + blk_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } @@ -431,15 +434,8 @@ static void blk_mq_ipi_complete_request(struct request *rq) static void blk_mq_stat_add(struct request *rq) { if (rq->rq_flags & RQF_STATS) { - /* - * We could rq->mq_ctx here, but there's less of a risk - * of races if we have the completion event add the stats - * to the local software queue. - */ - struct blk_mq_ctx *ctx; - - ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id()); - blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq); + blk_mq_poll_stats_start(rq->q); + blk_stat_add(rq); } } @@ -491,7 +487,7 @@ void blk_mq_start_request(struct request *rq) trace_block_rq_issue(q, rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { - blk_stat_set_issue_time(&rq->issue_stat); + blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq)); rq->rq_flags |= RQF_STATS; wbt_issue(q->rq_wb, &rq->issue_stat); } @@ -526,6 +522,15 @@ void blk_mq_start_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_start_request); +/* + * When we reach here because queue is busy, REQ_ATOM_COMPLETE + * flag isn't set yet, so there may be race with timeout handler, + * but given rq->deadline is just set in .queue_rq() under + * this situation, the race won't be possible in reality because + * rq->timeout should be set as big enough to cover the window + * between blk_mq_start_request() called from .queue_rq() and + * clearing REQ_ATOM_STARTED here. + */ static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; @@ -666,7 +671,7 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved) * just be ignored. This can happen due to the bitflag ordering. * Timeout first checks if STARTED is set, and if it is, assumes * the request is active. But if we race with completion, then - * we both flags will get cleared. So check here again, and ignore + * both flags will get cleared. So check here again, and ignore * a timeout event with a request that isn't active. */ if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) @@ -699,6 +704,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) return; + /* + * The rq being checked may have been freed and reallocated + * out already here, we avoid this race by checking rq->deadline + * and REQ_ATOM_COMPLETE flag together: + * + * - if rq->deadline is observed as new value because of + * reusing, the rq won't be timed out because of timing. + * - if rq->deadline is observed as previous value, + * REQ_ATOM_COMPLETE flag won't be cleared in reuse path + * because we put a barrier between setting rq->deadline + * and clearing the flag in blk_mq_start_request(), so + * this rq won't be timed out too. + */ if (time_after_eq(jiffies, rq->deadline)) { if (!blk_mark_rq_complete(rq)) blk_mq_rq_timed_out(rq, reserved); @@ -727,7 +745,7 @@ static void blk_mq_timeout_work(struct work_struct *work) * percpu_ref_tryget directly, because we need to be able to * obtain a reference even in the short window between the queue * starting to freeze, by dropping the first reference in - * blk_mq_freeze_queue_start, and the moment the last request is + * blk_freeze_queue_start, and the moment the last request is * consumed, marked by the instant q_usage_counter reaches * zero. 
*/ @@ -964,20 +982,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) { struct blk_mq_hw_ctx *hctx; struct request *rq; - LIST_HEAD(driver_list); - struct list_head *dptr; int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK; if (list_empty(list)) return false; /* - * Start off with dptr being NULL, so we start the first request - * immediately, even if we have more pending. - */ - dptr = NULL; - - /* * Now process all the entries, sending them to the driver. */ errors = queued = 0; @@ -1009,7 +1019,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) list_del_init(&rq->queuelist); bd.rq = rq; - bd.list = dptr; /* * Flag last if we have no more requests, or if we have more @@ -1045,13 +1054,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) if (ret == BLK_MQ_RQ_QUEUE_BUSY) break; - - /* - * We've done the first request. If we have more than 1 - * left in the list, set dptr to defer issue. - */ - if (!dptr && list->next != list->prev) - dptr = &driver_list; } while (!list_empty(list)); hctx->dispatched[queued_to_index(queued)]++; @@ -1104,6 +1106,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) blk_mq_sched_dispatch_requests(hctx); rcu_read_unlock(); } else { + might_sleep(); + srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); blk_mq_sched_dispatch_requests(hctx); srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); @@ -1453,13 +1457,12 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); } -static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, +static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, bool may_sleep) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, - .list = NULL, .last = 1 }; struct blk_mq_hw_ctx *hctx; @@ -1485,8 +1488,6 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, return; } - __blk_mq_requeue_request(rq); - if (ret == BLK_MQ_RQ_QUEUE_ERROR) { *cookie = BLK_QC_T_NONE; rq->errors = -EIO; @@ -1494,22 +1495,36 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, return; } + __blk_mq_requeue_request(rq); insert: blk_mq_sched_insert_request(rq, false, true, false, may_sleep); } -/* - * Multiple hardware queue variant. This will not use per-process plugs, - * but will attempt to bypass the hctx queueing if we can go straight to - * hardware for SYNC IO. 
- */ +static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, blk_qc_t *cookie) +{ + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { + rcu_read_lock(); + __blk_mq_try_issue_directly(rq, cookie, false); + rcu_read_unlock(); + } else { + unsigned int srcu_idx; + + might_sleep(); + + srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); + __blk_mq_try_issue_directly(rq, cookie, true); + srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); + } +} + static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); struct blk_mq_alloc_data data = { .flags = 0 }; struct request *rq; - unsigned int request_count = 0, srcu_idx; + unsigned int request_count = 0; struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; @@ -1545,145 +1560,17 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) cookie = request_to_qc_t(data.hctx, rq); - if (unlikely(is_flush_fua)) { - if (q->elevator) - goto elv_insert; - blk_mq_bio_to_request(rq, bio); - blk_insert_flush(rq); - goto run_queue; - } - plug = current->plug; - /* - * If the driver supports defer issued based on 'last', then - * queue it up like normal since we can potentially save some - * CPU this way. - */ - if (((plug && !blk_queue_nomerges(q)) || is_sync) && - !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { - struct request *old_rq = NULL; - + if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); - - /* - * We do limited plugging. If the bio can be merged, do that. - * Otherwise the existing request in the plug list will be - * issued. So the plug list will have one request at most - */ - if (plug) { - /* - * The plug list might get flushed before this. If that - * happens, same_queue_rq is invalid and plug list is - * empty - */ - if (same_queue_rq && !list_empty(&plug->mq_list)) { - old_rq = same_queue_rq; - list_del_init(&old_rq->queuelist); - } - list_add_tail(&rq->queuelist, &plug->mq_list); - } else /* is_sync */ - old_rq = rq; - blk_mq_put_ctx(data.ctx); - if (!old_rq) - goto done; - - if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) { - rcu_read_lock(); - blk_mq_try_issue_directly(old_rq, &cookie, false); - rcu_read_unlock(); + if (q->elevator) { + blk_mq_sched_insert_request(rq, false, true, true, + true); } else { - srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu); - blk_mq_try_issue_directly(old_rq, &cookie, true); - srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx); + blk_insert_flush(rq); + blk_mq_run_hw_queue(data.hctx, true); } - goto done; - } - - if (q->elevator) { -elv_insert: - blk_mq_put_ctx(data.ctx); - blk_mq_bio_to_request(rq, bio); - blk_mq_sched_insert_request(rq, false, true, - !is_sync || is_flush_fua, true); - goto done; - } - if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { - /* - * For a SYNC request, send it to the hardware immediately. For - * an ASYNC request, just ensure that we run it later on. The - * latter allows for merging opportunities and more efficient - * dispatching. - */ -run_queue: - blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); - } - blk_mq_put_ctx(data.ctx); -done: - return cookie; -} - -/* - * Single hardware queue variant. This will attempt to use any per-process - * plug for merging and IO deferral. 
- */ -static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) -{ - const int is_sync = op_is_sync(bio->bi_opf); - const int is_flush_fua = op_is_flush(bio->bi_opf); - struct blk_plug *plug; - unsigned int request_count = 0; - struct blk_mq_alloc_data data = { .flags = 0 }; - struct request *rq; - blk_qc_t cookie; - unsigned int wb_acct; - - blk_queue_bounce(q, &bio); - - if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio_io_error(bio); - return BLK_QC_T_NONE; - } - - blk_queue_split(q, &bio, q->bio_split); - - if (!is_flush_fua && !blk_queue_nomerges(q)) { - if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) - return BLK_QC_T_NONE; - } else - request_count = blk_plug_queued_count(q); - - if (blk_mq_sched_bio_merge(q, bio)) - return BLK_QC_T_NONE; - - wb_acct = wbt_wait(q->rq_wb, bio, NULL); - - trace_block_getrq(q, bio, bio->bi_opf); - - rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data); - if (unlikely(!rq)) { - __wbt_done(q->rq_wb, wb_acct); - return BLK_QC_T_NONE; - } - - wbt_track(&rq->issue_stat, wb_acct); - - cookie = request_to_qc_t(data.hctx, rq); - - if (unlikely(is_flush_fua)) { - if (q->elevator) - goto elv_insert; - blk_mq_bio_to_request(rq, bio); - blk_insert_flush(rq); - goto run_queue; - } - - /* - * A task plug currently exists. Since this is completely lockless, - * utilize that to temporarily store requests until the task is - * either done or scheduled away. - */ - plug = current->plug; - if (plug) { + } else if (plug && q->nr_hw_queues == 1) { struct request *last = NULL; blk_mq_bio_to_request(rq, bio); @@ -1694,13 +1581,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) */ if (list_empty(&plug->mq_list)) request_count = 0; + else if (blk_queue_nomerges(q)) + request_count = blk_plug_queued_count(q); + if (!request_count) trace_block_plug(q); else last = list_entry_rq(plug->mq_list.prev); - blk_mq_put_ctx(data.ctx); - if (request_count >= BLK_MAX_REQUEST_COUNT || (last && blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { blk_flush_plug_list(plug, false); @@ -1708,30 +1596,41 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) } list_add_tail(&rq->queuelist, &plug->mq_list); - return cookie; - } - - if (q->elevator) { -elv_insert: - blk_mq_put_ctx(data.ctx); + } else if (plug && !blk_queue_nomerges(q)) { blk_mq_bio_to_request(rq, bio); - blk_mq_sched_insert_request(rq, false, true, - !is_sync || is_flush_fua, true); - goto done; - } - if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { + /* - * For a SYNC request, send it to the hardware immediately. For - * an ASYNC request, just ensure that we run it later on. The - * latter allows for merging opportunities and more efficient - * dispatching. + * We do limited plugging. If the bio can be merged, do that. + * Otherwise the existing request in the plug list will be + * issued. So the plug list will have one request at most + * The plug list might get flushed before this. If that happens, + * the plug list is empty, and same_queue_rq is invalid. 
*/ -run_queue: - blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); - } + if (list_empty(&plug->mq_list)) + same_queue_rq = NULL; + if (same_queue_rq) + list_del_init(&same_queue_rq->queuelist); + list_add_tail(&rq->queuelist, &plug->mq_list); + + blk_mq_put_ctx(data.ctx); + + if (same_queue_rq) + blk_mq_try_issue_directly(data.hctx, same_queue_rq, + &cookie); + + return cookie; + } else if (q->nr_hw_queues > 1 && is_sync) { + blk_mq_put_ctx(data.ctx); + blk_mq_bio_to_request(rq, bio); + blk_mq_try_issue_directly(data.hctx, rq, &cookie); + return cookie; + } else if (q->elevator) { + blk_mq_bio_to_request(rq, bio); + blk_mq_sched_insert_request(rq, false, true, true, true); + } else if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) + blk_mq_run_hw_queue(data.hctx, true); blk_mq_put_ctx(data.ctx); -done: return cookie; } @@ -2067,8 +1966,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, spin_lock_init(&__ctx->lock); INIT_LIST_HEAD(&__ctx->rq_list); __ctx->queue = q; - blk_stat_init(&__ctx->stat[BLK_STAT_READ]); - blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]); /* If the cpu isn't online, the cpu is mapped to first hctx */ if (!cpu_online(i)) @@ -2364,6 +2261,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* mark the queue as mq asap */ q->mq_ops = set->ops; + q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn, + blk_stat_rq_ddir, 2, q); + if (!q->poll_cb) + goto err_exit; + q->queue_ctx = alloc_percpu(struct blk_mq_ctx); if (!q->queue_ctx) goto err_exit; @@ -2398,10 +2300,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); - if (q->nr_hw_queues > 1) - blk_queue_make_request(q, blk_mq_make_request); - else - blk_queue_make_request(q, blk_sq_make_request); + blk_queue_make_request(q, blk_mq_make_request); /* * Do this after blk_queue_make_request() overrides it... @@ -2456,8 +2355,6 @@ void blk_mq_free_queue(struct request_queue *q) list_del_init(&q->all_q_node); mutex_unlock(&all_q_mutex); - wbt_exit(q); - blk_mq_del_queue_tag_set(q); blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); @@ -2502,7 +2399,7 @@ static void blk_mq_queue_reinit_work(void) * take place in parallel. */ list_for_each_entry(q, &all_q_list, all_q_node) - blk_mq_freeze_queue_start(q); + blk_freeze_queue_start(q); list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_freeze_queue_wait(q); @@ -2755,16 +2652,6 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); - - /* - * Manually set the make_request_fn as blk_queue_make_request - * resets a lot of the queue settings. - */ - if (q->nr_hw_queues > 1) - q->make_request_fn = blk_mq_make_request; - else - q->make_request_fn = blk_sq_make_request; - blk_mq_queue_reinit(q, cpu_online_mask); } @@ -2773,28 +2660,53 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); +/* Enable polling stats and return whether they were already enabled. 
*/ +static bool blk_poll_stats_enable(struct request_queue *q) +{ + if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || + test_and_set_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) + return true; + blk_stat_add_callback(q, q->poll_cb); + return false; +} + +static void blk_mq_poll_stats_start(struct request_queue *q) +{ + /* + * We don't arm the callback if polling stats are not enabled or the + * callback is already active. + */ + if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || + blk_stat_is_active(q->poll_cb)) + return; + + blk_stat_activate_msecs(q->poll_cb, 100); +} + +static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb) +{ + struct request_queue *q = cb->data; + + if (cb->stat[READ].nr_samples) + q->poll_stat[READ] = cb->stat[READ]; + if (cb->stat[WRITE].nr_samples) + q->poll_stat[WRITE] = cb->stat[WRITE]; +} + static unsigned long blk_mq_poll_nsecs(struct request_queue *q, struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct blk_rq_stat stat[2]; unsigned long ret = 0; /* * If stats collection isn't on, don't sleep but turn it on for * future users */ - if (!blk_stat_enable(q)) + if (!blk_poll_stats_enable(q)) return 0; /* - * We don't have to do this once per IO, should optimize this - * to just use the current window of stats until it changes - */ - memset(&stat, 0, sizeof(stat)); - blk_hctx_stat_get(hctx, stat); - - /* * As an optimistic guess, use half of the mean service time * for this type of request. We can (and should) make this smarter. * For instance, if the completion latencies are tight, we can @@ -2802,10 +2714,10 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q, * important on devices where the completion latencies are longer * than ~10 usec. */ - if (req_op(rq) == REQ_OP_READ && stat[BLK_STAT_READ].nr_samples) - ret = (stat[BLK_STAT_READ].mean + 1) / 2; - else if (req_op(rq) == REQ_OP_WRITE && stat[BLK_STAT_WRITE].nr_samples) - ret = (stat[BLK_STAT_WRITE].mean + 1) / 2; + if (req_op(rq) == REQ_OP_READ && q->poll_stat[READ].nr_samples) + ret = (q->poll_stat[READ].mean + 1) / 2; + else if (req_op(rq) == REQ_OP_WRITE && q->poll_stat[WRITE].nr_samples) + ret = (q->poll_stat[WRITE].mean + 1) / 2; return ret; } diff --git a/block/blk-mq.h b/block/blk-mq.h index 660a17e1d033..7e6f2e467696 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -20,7 +20,6 @@ struct blk_mq_ctx { /* incremented at completion time */ unsigned long ____cacheline_aligned_in_smp rq_completed[2]; - struct blk_rq_stat stat[2]; struct request_queue *queue; struct kobject kobj; diff --git a/block/blk-stat.c b/block/blk-stat.c index 186fcb981e9b..e77ec52f5bb5 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -4,10 +4,33 @@ * Copyright (C) 2016 Jens Axboe */ #include <linux/kernel.h> +#include <linux/rculist.h> #include <linux/blk-mq.h> #include "blk-stat.h" #include "blk-mq.h" +#include "blk.h" + +#define BLK_RQ_STAT_BATCH 64 + +struct blk_queue_stats { + struct list_head callbacks; + spinlock_t lock; + bool enable_accounting; +}; + +unsigned int blk_stat_rq_ddir(const struct request *rq) +{ + return rq_data_dir(rq); +} +EXPORT_SYMBOL_GPL(blk_stat_rq_ddir); + +static void blk_stat_init(struct blk_rq_stat *stat) +{ + stat->min = -1ULL; + stat->max = stat->nr_samples = stat->mean = 0; + stat->batch = stat->nr_batch = 0; +} static void blk_stat_flush_batch(struct blk_rq_stat *stat) { @@ -48,209 +71,183 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) dst->nr_samples += src->nr_samples; } -static void blk_mq_stat_get(struct 
request_queue *q, struct blk_rq_stat *dst) +static void __blk_stat_add(struct blk_rq_stat *stat, u64 value) { - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; - uint64_t latest = 0; - int i, j, nr; - - blk_stat_init(&dst[BLK_STAT_READ]); - blk_stat_init(&dst[BLK_STAT_WRITE]); - - nr = 0; - do { - uint64_t newest = 0; - - queue_for_each_hw_ctx(q, hctx, i) { - hctx_for_each_ctx(hctx, ctx, j) { - blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]); - blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]); - - if (!ctx->stat[BLK_STAT_READ].nr_samples && - !ctx->stat[BLK_STAT_WRITE].nr_samples) - continue; - if (ctx->stat[BLK_STAT_READ].time > newest) - newest = ctx->stat[BLK_STAT_READ].time; - if (ctx->stat[BLK_STAT_WRITE].time > newest) - newest = ctx->stat[BLK_STAT_WRITE].time; - } - } + stat->min = min(stat->min, value); + stat->max = max(stat->max, value); - /* - * No samples - */ - if (!newest) - break; - - if (newest > latest) - latest = newest; - - queue_for_each_hw_ctx(q, hctx, i) { - hctx_for_each_ctx(hctx, ctx, j) { - if (ctx->stat[BLK_STAT_READ].time == newest) { - blk_stat_sum(&dst[BLK_STAT_READ], - &ctx->stat[BLK_STAT_READ]); - nr++; - } - if (ctx->stat[BLK_STAT_WRITE].time == newest) { - blk_stat_sum(&dst[BLK_STAT_WRITE], - &ctx->stat[BLK_STAT_WRITE]); - nr++; - } - } - } - /* - * If we race on finding an entry, just loop back again. - * Should be very rare. - */ - } while (!nr); + if (stat->batch + value < stat->batch || + stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) + blk_stat_flush_batch(stat); - dst[BLK_STAT_READ].time = dst[BLK_STAT_WRITE].time = latest; + stat->batch += value; + stat->nr_batch++; } -void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +void blk_stat_add(struct request *rq) { - if (q->mq_ops) - blk_mq_stat_get(q, dst); - else { - blk_stat_flush_batch(&q->rq_stats[BLK_STAT_READ]); - blk_stat_flush_batch(&q->rq_stats[BLK_STAT_WRITE]); - memcpy(&dst[BLK_STAT_READ], &q->rq_stats[BLK_STAT_READ], - sizeof(struct blk_rq_stat)); - memcpy(&dst[BLK_STAT_WRITE], &q->rq_stats[BLK_STAT_WRITE], - sizeof(struct blk_rq_stat)); + struct request_queue *q = rq->q; + struct blk_stat_callback *cb; + struct blk_rq_stat *stat; + int bucket; + s64 now, value; + + now = __blk_stat_time(ktime_to_ns(ktime_get())); + if (now < blk_stat_time(&rq->issue_stat)) + return; + + value = now - blk_stat_time(&rq->issue_stat); + + blk_throtl_stat_add(rq, value); + + rcu_read_lock(); + list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { + if (blk_stat_is_active(cb)) { + bucket = cb->bucket_fn(rq); + stat = &this_cpu_ptr(cb->cpu_stat)[bucket]; + __blk_stat_add(stat, value); + } } + rcu_read_unlock(); } -void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst) +static void blk_stat_timer_fn(unsigned long data) { - struct blk_mq_ctx *ctx; - unsigned int i, nr; - - nr = 0; - do { - uint64_t newest = 0; + struct blk_stat_callback *cb = (void *)data; + unsigned int bucket; + int cpu; - hctx_for_each_ctx(hctx, ctx, i) { - blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]); - blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]); + for (bucket = 0; bucket < cb->buckets; bucket++) + blk_stat_init(&cb->stat[bucket]); - if (!ctx->stat[BLK_STAT_READ].nr_samples && - !ctx->stat[BLK_STAT_WRITE].nr_samples) - continue; + for_each_online_cpu(cpu) { + struct blk_rq_stat *cpu_stat; - if (ctx->stat[BLK_STAT_READ].time > newest) - newest = ctx->stat[BLK_STAT_READ].time; - if (ctx->stat[BLK_STAT_WRITE].time > newest) - newest = ctx->stat[BLK_STAT_WRITE].time; + cpu_stat = 
per_cpu_ptr(cb->cpu_stat, cpu); + for (bucket = 0; bucket < cb->buckets; bucket++) { + blk_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]); + blk_stat_init(&cpu_stat[bucket]); } + } - if (!newest) - break; - - hctx_for_each_ctx(hctx, ctx, i) { - if (ctx->stat[BLK_STAT_READ].time == newest) { - blk_stat_sum(&dst[BLK_STAT_READ], - &ctx->stat[BLK_STAT_READ]); - nr++; - } - if (ctx->stat[BLK_STAT_WRITE].time == newest) { - blk_stat_sum(&dst[BLK_STAT_WRITE], - &ctx->stat[BLK_STAT_WRITE]); - nr++; - } - } - /* - * If we race on finding an entry, just loop back again. - * Should be very rare, as the window is only updated - * occasionally - */ - } while (!nr); + cb->timer_fn(cb); } -static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now) +struct blk_stat_callback * +blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *), + unsigned int (*bucket_fn)(const struct request *), + unsigned int buckets, void *data) { - stat->min = -1ULL; - stat->max = stat->nr_samples = stat->mean = 0; - stat->batch = stat->nr_batch = 0; - stat->time = time_now & BLK_STAT_NSEC_MASK; -} + struct blk_stat_callback *cb; -void blk_stat_init(struct blk_rq_stat *stat) -{ - __blk_stat_init(stat, ktime_to_ns(ktime_get())); -} + cb = kmalloc(sizeof(*cb), GFP_KERNEL); + if (!cb) + return NULL; -static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now) -{ - return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK); + cb->stat = kmalloc_array(buckets, sizeof(struct blk_rq_stat), + GFP_KERNEL); + if (!cb->stat) { + kfree(cb); + return NULL; + } + cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat), + __alignof__(struct blk_rq_stat)); + if (!cb->cpu_stat) { + kfree(cb->stat); + kfree(cb); + return NULL; + } + + cb->timer_fn = timer_fn; + cb->bucket_fn = bucket_fn; + cb->data = data; + cb->buckets = buckets; + setup_timer(&cb->timer, blk_stat_timer_fn, (unsigned long)cb); + + return cb; } +EXPORT_SYMBOL_GPL(blk_stat_alloc_callback); -bool blk_stat_is_current(struct blk_rq_stat *stat) +void blk_stat_add_callback(struct request_queue *q, + struct blk_stat_callback *cb) { - return __blk_stat_is_current(stat, ktime_to_ns(ktime_get())); + unsigned int bucket; + int cpu; + + for_each_possible_cpu(cpu) { + struct blk_rq_stat *cpu_stat; + + cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); + for (bucket = 0; bucket < cb->buckets; bucket++) + blk_stat_init(&cpu_stat[bucket]); + } + + spin_lock(&q->stats->lock); + list_add_tail_rcu(&cb->list, &q->stats->callbacks); + set_bit(QUEUE_FLAG_STATS, &q->queue_flags); + spin_unlock(&q->stats->lock); } +EXPORT_SYMBOL_GPL(blk_stat_add_callback); -void blk_stat_add(struct blk_rq_stat *stat, struct request *rq) +void blk_stat_remove_callback(struct request_queue *q, + struct blk_stat_callback *cb) { - s64 now, value; + spin_lock(&q->stats->lock); + list_del_rcu(&cb->list); + if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting) + clear_bit(QUEUE_FLAG_STATS, &q->queue_flags); + spin_unlock(&q->stats->lock); - now = __blk_stat_time(ktime_to_ns(ktime_get())); - if (now < blk_stat_time(&rq->issue_stat)) - return; - - if (!__blk_stat_is_current(stat, now)) - __blk_stat_init(stat, now); + del_timer_sync(&cb->timer); +} +EXPORT_SYMBOL_GPL(blk_stat_remove_callback); - value = now - blk_stat_time(&rq->issue_stat); - if (value > stat->max) - stat->max = value; - if (value < stat->min) - stat->min = value; +static void blk_stat_free_callback_rcu(struct rcu_head *head) +{ + struct blk_stat_callback *cb; - if (stat->batch + value < stat->batch || 
- stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) - blk_stat_flush_batch(stat); + cb = container_of(head, struct blk_stat_callback, rcu); + free_percpu(cb->cpu_stat); + kfree(cb->stat); + kfree(cb); +} - stat->batch += value; - stat->nr_batch++; +void blk_stat_free_callback(struct blk_stat_callback *cb) +{ + if (cb) + call_rcu(&cb->rcu, blk_stat_free_callback_rcu); } +EXPORT_SYMBOL_GPL(blk_stat_free_callback); -void blk_stat_clear(struct request_queue *q) +void blk_stat_enable_accounting(struct request_queue *q) { - if (q->mq_ops) { - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; - int i, j; - - queue_for_each_hw_ctx(q, hctx, i) { - hctx_for_each_ctx(hctx, ctx, j) { - blk_stat_init(&ctx->stat[BLK_STAT_READ]); - blk_stat_init(&ctx->stat[BLK_STAT_WRITE]); - } - } - } else { - blk_stat_init(&q->rq_stats[BLK_STAT_READ]); - blk_stat_init(&q->rq_stats[BLK_STAT_WRITE]); - } + spin_lock(&q->stats->lock); + q->stats->enable_accounting = true; + set_bit(QUEUE_FLAG_STATS, &q->queue_flags); + spin_unlock(&q->stats->lock); } -void blk_stat_set_issue_time(struct blk_issue_stat *stat) +struct blk_queue_stats *blk_alloc_queue_stats(void) { - stat->time = (stat->time & BLK_STAT_MASK) | - (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK); + struct blk_queue_stats *stats; + + stats = kmalloc(sizeof(*stats), GFP_KERNEL); + if (!stats) + return NULL; + + INIT_LIST_HEAD(&stats->callbacks); + spin_lock_init(&stats->lock); + stats->enable_accounting = false; + + return stats; } -/* - * Enable stat tracking, return whether it was enabled - */ -bool blk_stat_enable(struct request_queue *q) +void blk_free_queue_stats(struct blk_queue_stats *stats) { - if (!test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { - set_bit(QUEUE_FLAG_STATS, &q->queue_flags); - return false; - } + if (!stats) + return; + + WARN_ON(!list_empty(&stats->callbacks)); - return true; + kfree(stats); } diff --git a/block/blk-stat.h b/block/blk-stat.h index a2050a0a5314..53f08a63bf15 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -1,33 +1,84 @@ #ifndef BLK_STAT_H #define BLK_STAT_H -/* - * ~0.13s window as a power-of-2 (2^27 nsecs) - */ -#define BLK_STAT_NSEC 134217728ULL -#define BLK_STAT_NSEC_MASK ~(BLK_STAT_NSEC - 1) +#include <linux/kernel.h> +#include <linux/blkdev.h> +#include <linux/ktime.h> +#include <linux/rcupdate.h> +#include <linux/timer.h> /* - * Upper 3 bits can be used elsewhere + * from upper: + * 3 bits: reserved for other usage + * 12 bits: size + * 49 bits: time */ #define BLK_STAT_RES_BITS 3 -#define BLK_STAT_SHIFT (64 - BLK_STAT_RES_BITS) -#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SHIFT) - 1) -#define BLK_STAT_MASK ~BLK_STAT_TIME_MASK +#define BLK_STAT_SIZE_BITS 12 +#define BLK_STAT_RES_SHIFT (64 - BLK_STAT_RES_BITS) +#define BLK_STAT_SIZE_SHIFT (BLK_STAT_RES_SHIFT - BLK_STAT_SIZE_BITS) +#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SIZE_SHIFT) - 1) +#define BLK_STAT_SIZE_MASK \ + (((1ULL << BLK_STAT_SIZE_BITS) - 1) << BLK_STAT_SIZE_SHIFT) +#define BLK_STAT_RES_MASK (~((1ULL << BLK_STAT_RES_SHIFT) - 1)) + +/** + * struct blk_stat_callback - Block statistics callback. + * + * A &struct blk_stat_callback is associated with a &struct request_queue. While + * @timer is active, that queue's request completion latencies are sorted into + * buckets by @bucket_fn and added to a per-cpu buffer, @cpu_stat. When the + * timer fires, @cpu_stat is flushed to @stat and @timer_fn is invoked. + */ +struct blk_stat_callback { + /* + * @list: RCU list of callbacks for a &struct request_queue. 
+ */ + struct list_head list; + + /** + * @timer: Timer for the next callback invocation. + */ + struct timer_list timer; + + /** + * @cpu_stat: Per-cpu statistics buckets. + */ + struct blk_rq_stat __percpu *cpu_stat; -enum { - BLK_STAT_READ = 0, - BLK_STAT_WRITE, + /** + * @bucket_fn: Given a request, returns which statistics bucket it + * should be accounted under. + */ + unsigned int (*bucket_fn)(const struct request *); + + /** + * @buckets: Number of statistics buckets. + */ + unsigned int buckets; + + /** + * @stat: Array of statistics buckets. + */ + struct blk_rq_stat *stat; + + /** + * @fn: Callback function. + */ + void (*timer_fn)(struct blk_stat_callback *); + + /** + * @data: Private pointer for the user. + */ + void *data; + + struct rcu_head rcu; }; -void blk_stat_add(struct blk_rq_stat *, struct request *); -void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *); -void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *); -void blk_stat_clear(struct request_queue *); -void blk_stat_init(struct blk_rq_stat *); -bool blk_stat_is_current(struct blk_rq_stat *); -void blk_stat_set_issue_time(struct blk_issue_stat *); -bool blk_stat_enable(struct request_queue *); +struct blk_queue_stats *blk_alloc_queue_stats(void); +void blk_free_queue_stats(struct blk_queue_stats *); + +void blk_stat_add(struct request *); static inline u64 __blk_stat_time(u64 time) { @@ -36,7 +87,128 @@ static inline u64 __blk_stat_time(u64 time) static inline u64 blk_stat_time(struct blk_issue_stat *stat) { - return __blk_stat_time(stat->time); + return __blk_stat_time(stat->stat); +} + +static inline sector_t blk_capped_size(sector_t size) +{ + return size & ((1ULL << BLK_STAT_SIZE_BITS) - 1); +} + +static inline sector_t blk_stat_size(struct blk_issue_stat *stat) +{ + return (stat->stat & BLK_STAT_SIZE_MASK) >> BLK_STAT_SIZE_SHIFT; +} + +static inline void blk_stat_set_issue(struct blk_issue_stat *stat, + sector_t size) +{ + stat->stat = (stat->stat & BLK_STAT_RES_MASK) | + (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK) | + (((u64)blk_capped_size(size)) << BLK_STAT_SIZE_SHIFT); +} + +/* record time/size info in request but not add a callback */ +void blk_stat_enable_accounting(struct request_queue *q); + +/* + * blk_stat_rq_ddir() - Bucket callback function for the request data direction. + * @rq: Request. + * + * This is the same as rq_data_dir() but as a function so it can be used as + * @bucket_fn for blk_stat_alloc_callback(). + * + * Return: Data direction of the request, either READ or WRITE. + */ +unsigned int blk_stat_rq_ddir(const struct request *rq); + +/** + * blk_stat_alloc_callback() - Allocate a block statistics callback. + * @timer_fn: Timer callback function. + * @bucket_fn: Bucket callback function. + * @buckets: Number of statistics buckets. + * @data: Value for the @data field of the &struct blk_stat_callback. + * + * See &struct blk_stat_callback for details on the callback functions. + * + * Return: &struct blk_stat_callback on success or NULL on ENOMEM. + */ +struct blk_stat_callback * +blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *), + unsigned int (*bucket_fn)(const struct request *), + unsigned int buckets, void *data); + +/** + * blk_stat_add_callback() - Add a block statistics callback to be run on a + * request queue. + * @q: The request queue. + * @cb: The callback. + * + * Note that a single &struct blk_stat_callback can only be added to a single + * &struct request_queue. 
+ */ +void blk_stat_add_callback(struct request_queue *q, + struct blk_stat_callback *cb); + +/** + * blk_stat_remove_callback() - Remove a block statistics callback from a + * request queue. + * @q: The request queue. + * @cb: The callback. + * + * When this returns, the callback is not running on any CPUs and will not be + * called again unless readded. + */ +void blk_stat_remove_callback(struct request_queue *q, + struct blk_stat_callback *cb); + +/** + * blk_stat_free_callback() - Free a block statistics callback. + * @cb: The callback. + * + * @cb may be NULL, in which case this does nothing. If it is not NULL, @cb must + * not be associated with a request queue. I.e., if it was previously added with + * blk_stat_add_callback(), it must also have been removed since then with + * blk_stat_remove_callback(). + */ +void blk_stat_free_callback(struct blk_stat_callback *cb); + +/** + * blk_stat_is_active() - Check if a block statistics callback is currently + * gathering statistics. + * @cb: The callback. + */ +static inline bool blk_stat_is_active(struct blk_stat_callback *cb) +{ + return timer_pending(&cb->timer); +} + +/** + * blk_stat_activate_nsecs() - Gather block statistics during a time window in + * nanoseconds. + * @cb: The callback. + * @nsecs: Number of nanoseconds to gather statistics for. + * + * The timer callback will be called when the window expires. + */ +static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb, + u64 nsecs) +{ + mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs)); +} + +/** + * blk_stat_activate_msecs() - Gather block statistics during a time window in + * milliseconds. + * @cb: The callback. + * @msecs: Number of milliseconds to gather statistics for. + * + * The timer callback will be called when the window expires. 
+ */ +static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb, + unsigned int msecs) +{ + mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs)); } #endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 37f0b3ad635e..c47db43a40cc 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -503,26 +503,6 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page) return queue_var_show(blk_queue_dax(q), page); } -static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) -{ - return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", - pre, (long long) stat->nr_samples, - (long long) stat->mean, (long long) stat->min, - (long long) stat->max); -} - -static ssize_t queue_stats_show(struct request_queue *q, char *page) -{ - struct blk_rq_stat stat[2]; - ssize_t ret; - - blk_queue_stat_get(q, stat); - - ret = print_stat(page, &stat[BLK_STAT_READ], "read :"); - ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:"); - return ret; -} - static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -691,17 +671,20 @@ static struct queue_sysfs_entry queue_dax_entry = { .show = queue_dax_show, }; -static struct queue_sysfs_entry queue_stats_entry = { - .attr = {.name = "stats", .mode = S_IRUGO }, - .show = queue_stats_show, -}; - static struct queue_sysfs_entry queue_wb_lat_entry = { .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR }, .show = queue_wb_lat_show, .store = queue_wb_lat_store, }; +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +static struct queue_sysfs_entry throtl_sample_time_entry = { + .attr = {.name = "throttle_sample_time", .mode = S_IRUGO | S_IWUSR }, + .show = blk_throtl_sample_time_show, + .store = blk_throtl_sample_time_store, +}; +#endif + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -733,9 +716,11 @@ static struct attribute *default_attrs[] = { &queue_poll_entry.attr, &queue_wc_entry.attr, &queue_dax_entry.attr, - &queue_stats_entry.attr, &queue_wb_lat_entry.attr, &queue_poll_delay_entry.attr, +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + &throtl_sample_time_entry.attr, +#endif NULL, }; @@ -810,7 +795,9 @@ static void blk_release_queue(struct kobject *kobj) struct request_queue *q = container_of(kobj, struct request_queue, kobj); - wbt_exit(q); + if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) + blk_stat_remove_callback(q, q->poll_cb); + blk_stat_free_callback(q->poll_cb); bdi_put(q->backing_dev_info); blkcg_exit_queue(q); @@ -819,6 +806,8 @@ static void blk_release_queue(struct kobject *kobj) elevator_exit(q, q->elevator); } + blk_free_queue_stats(q->stats); + blk_exit_rl(&q->root_rl); if (q->queue_tags) @@ -881,6 +870,11 @@ int blk_register_queue(struct gendisk *disk) if (WARN_ON(!q)) return -ENXIO; + WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags), + "%s is registering an already registered queue\n", + kobject_name(&dev->kobj)); + queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q); + /* * SCSI probing may synchronously create and destroy a lot of * request_queues for non-existent devices. 
Shutting down a fully @@ -916,6 +910,8 @@ int blk_register_queue(struct gendisk *disk) blk_wb_init(q); + blk_throtl_register_queue(q); + if (q->request_fn || (q->mq_ops && q->elevator)) { ret = elv_register_queue(q); if (ret) { @@ -939,6 +935,11 @@ void blk_unregister_queue(struct gendisk *disk) if (WARN_ON(!q)) return; + queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q); + + wbt_exit(q); + + if (q->mq_ops) blk_mq_unregister_dev(disk_to_dev(disk), q); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8fab716e4059..c82bf9b1fe72 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -18,8 +18,17 @@ static int throtl_grp_quantum = 8; /* Total max dispatch from all groups in one round */ static int throtl_quantum = 32; -/* Throttling is performed over 100ms slice and after that slice is renewed */ -static unsigned long throtl_slice = HZ/10; /* 100 ms */ +/* Throttling is performed over a slice and after that slice is renewed */ +#define DFL_THROTL_SLICE_HD (HZ / 10) +#define DFL_THROTL_SLICE_SSD (HZ / 50) +#define MAX_THROTL_SLICE (HZ) +#define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */ +#define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */ +#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */ +/* default latency target is 0, eg, guarantee IO latency by default */ +#define DFL_LATENCY_TARGET (0) + +#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT) static struct blkcg_policy blkcg_policy_throtl; @@ -83,6 +92,12 @@ enum tg_state_flags { #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) +enum { + LIMIT_LOW, + LIMIT_MAX, + LIMIT_CNT, +}; + struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; @@ -119,20 +134,54 @@ struct throtl_grp { /* are there any throtl rules between this group and td? 
*/ bool has_rules[2]; - /* bytes per second rate limits */ - uint64_t bps[2]; + /* internally used bytes per second rate limits */ + uint64_t bps[2][LIMIT_CNT]; + /* user configured bps limits */ + uint64_t bps_conf[2][LIMIT_CNT]; - /* IOPS limits */ - unsigned int iops[2]; + /* internally used IOPS limits */ + unsigned int iops[2][LIMIT_CNT]; + /* user configured IOPS limits */ + unsigned int iops_conf[2][LIMIT_CNT]; /* Number of bytes disptached in current slice */ uint64_t bytes_disp[2]; /* Number of bio's dispatched in current slice */ unsigned int io_disp[2]; + unsigned long last_low_overflow_time[2]; + + uint64_t last_bytes_disp[2]; + unsigned int last_io_disp[2]; + + unsigned long last_check_time; + + unsigned long latency_target; /* us */ /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; + + unsigned long last_finish_time; /* ns / 1024 */ + unsigned long checked_last_finish_time; /* ns / 1024 */ + unsigned long avg_idletime; /* ns / 1024 */ + unsigned long idletime_threshold; /* us */ + + unsigned int bio_cnt; /* total bios */ + unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ + unsigned long bio_cnt_reset_time; +}; + +/* We measure latency for request size from <= 4k to >= 1M */ +#define LATENCY_BUCKET_SIZE 9 + +struct latency_bucket { + unsigned long total_latency; /* ns / 1024 */ + int samples; +}; + +struct avg_latency_bucket { + unsigned long latency; /* ns / 1024 */ + bool valid; }; struct throtl_data @@ -145,8 +194,26 @@ struct throtl_data /* Total Number of queued bios on READ and WRITE lists */ unsigned int nr_queued[2]; + unsigned int throtl_slice; + /* Work for dispatching throttled bios */ struct work_struct dispatch_work; + unsigned int limit_index; + bool limit_valid[LIMIT_CNT]; + + unsigned long dft_idletime_threshold; /* us */ + + unsigned long low_upgrade_time; + unsigned long low_downgrade_time; + + unsigned int scale; + + struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE]; + struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE]; + struct latency_bucket __percpu *latency_buckets; + unsigned long last_calculate_time; + + bool track_bio_latency; }; static void throtl_pending_timer_fn(unsigned long arg); @@ -198,6 +265,76 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) return container_of(sq, struct throtl_data, service_queue); } +/* + * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to + * make the IO dispatch more smooth. + * Scale up: linearly scale up according to lapsed time since upgrade. 
For + * every throtl_slice, the limit scales up 1/2 .low limit till the + * limit hits .max limit + * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit + */ +static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td) +{ + /* arbitrary value to avoid too big scale */ + if (td->scale < 4096 && time_after_eq(jiffies, + td->low_upgrade_time + td->scale * td->throtl_slice)) + td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice; + + return low + (low >> 1) * td->scale; +} + +static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) +{ + struct blkcg_gq *blkg = tg_to_blkg(tg); + struct throtl_data *td; + uint64_t ret; + + if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) + return U64_MAX; + + td = tg->td; + ret = tg->bps[rw][td->limit_index]; + if (ret == 0 && td->limit_index == LIMIT_LOW) + return tg->bps[rw][LIMIT_MAX]; + + if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] && + tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) { + uint64_t adjusted; + + adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td); + ret = min(tg->bps[rw][LIMIT_MAX], adjusted); + } + return ret; +} + +static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) +{ + struct blkcg_gq *blkg = tg_to_blkg(tg); + struct throtl_data *td; + unsigned int ret; + + if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) + return UINT_MAX; + td = tg->td; + ret = tg->iops[rw][td->limit_index]; + if (ret == 0 && tg->td->limit_index == LIMIT_LOW) + return tg->iops[rw][LIMIT_MAX]; + + if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] && + tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) { + uint64_t adjusted; + + adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td); + if (adjusted > UINT_MAX) + adjusted = UINT_MAX; + ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted); + } + return ret; +} + +#define request_bucket_index(sectors) \ + clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1) + /** * throtl_log - log debug message via blktrace * @sq: the service_queue being reported @@ -334,10 +471,17 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) } RB_CLEAR_NODE(&tg->rb_node); - tg->bps[READ] = -1; - tg->bps[WRITE] = -1; - tg->iops[READ] = -1; - tg->iops[WRITE] = -1; + tg->bps[READ][LIMIT_MAX] = U64_MAX; + tg->bps[WRITE][LIMIT_MAX] = U64_MAX; + tg->iops[READ][LIMIT_MAX] = UINT_MAX; + tg->iops[WRITE][LIMIT_MAX] = UINT_MAX; + tg->bps_conf[READ][LIMIT_MAX] = U64_MAX; + tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX; + tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX; + tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX; + /* LIMIT_LOW will have default value 0 */ + + tg->latency_target = DFL_LATENCY_TARGET; return &tg->pd; } @@ -366,6 +510,8 @@ static void throtl_pd_init(struct blkg_policy_data *pd) if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; + + tg->idletime_threshold = td->dft_idletime_threshold; } /* @@ -376,20 +522,59 @@ static void throtl_pd_init(struct blkg_policy_data *pd) static void tg_update_has_rules(struct throtl_grp *tg) { struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); + struct throtl_data *td = tg->td; int rw; for (rw = READ; rw <= WRITE; rw++) tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || - (tg->bps[rw] != -1 || tg->iops[rw] != -1); + (td->limit_valid[td->limit_index] && + (tg_bps_limit(tg, rw) != U64_MAX || + tg_iops_limit(tg, rw) != UINT_MAX)); } static void 
throtl_pd_online(struct blkg_policy_data *pd) { + struct throtl_grp *tg = pd_to_tg(pd); /* * We don't want new groups to escape the limits of its ancestors. * Update has_rules[] after a new group is brought online. */ - tg_update_has_rules(pd_to_tg(pd)); + tg_update_has_rules(tg); +} + +static void blk_throtl_update_limit_valid(struct throtl_data *td) +{ + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + bool low_valid = false; + + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + + if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] || + tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) + low_valid = true; + } + rcu_read_unlock(); + + td->limit_valid[LIMIT_LOW] = low_valid; +} + +static void throtl_upgrade_state(struct throtl_data *td); +static void throtl_pd_offline(struct blkg_policy_data *pd) +{ + struct throtl_grp *tg = pd_to_tg(pd); + + tg->bps[READ][LIMIT_LOW] = 0; + tg->bps[WRITE][LIMIT_LOW] = 0; + tg->iops[READ][LIMIT_LOW] = 0; + tg->iops[WRITE][LIMIT_LOW] = 0; + + blk_throtl_update_limit_valid(tg->td); + + if (!tg->td->limit_valid[tg->td->limit_index]) + throtl_upgrade_state(tg->td); } static void throtl_pd_free(struct blkg_policy_data *pd) @@ -499,6 +684,17 @@ static void throtl_dequeue_tg(struct throtl_grp *tg) static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, unsigned long expires) { + unsigned long max_expire = jiffies + 8 * sq_to_tg(sq)->td->throtl_slice; + + /* + * Since we are adjusting the throttle limit dynamically, the sleep + * time calculated according to previous limit might be invalid. It's + * possible the cgroup sleep time is very long and no other cgroups + * have IO running so notify the limit changes. Make sure the cgroup + * doesn't sleep too long to avoid the missed notification. + */ + if (time_after(expires, max_expire)) + expires = max_expire; mod_timer(&sq->pending_timer, expires); throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", expires - jiffies, jiffies); @@ -556,7 +752,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, if (time_after_eq(start, tg->slice_start[rw])) tg->slice_start[rw] = start; - tg->slice_end[rw] = jiffies + throtl_slice; + tg->slice_end[rw] = jiffies + tg->td->throtl_slice; throtl_log(&tg->service_queue, "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], @@ -568,7 +764,7 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; - tg->slice_end[rw] = jiffies + throtl_slice; + tg->slice_end[rw] = jiffies + tg->td->throtl_slice; throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], @@ -578,13 +774,13 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { - tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); + tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice); } static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { - tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); + tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice); throtl_log(&tg->service_queue, "[%c] extend slice start=%lu end=%lu jiffies=%lu", rw == READ ? 
'R' : 'W', tg->slice_start[rw], @@ -624,19 +820,20 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) * is bad because it does not allow new slice to start. */ - throtl_set_slice_end(tg, rw, jiffies + throtl_slice); + throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); time_elapsed = jiffies - tg->slice_start[rw]; - nr_slices = time_elapsed / throtl_slice; + nr_slices = time_elapsed / tg->td->throtl_slice; if (!nr_slices) return; - tmp = tg->bps[rw] * throtl_slice * nr_slices; + tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices; do_div(tmp, HZ); bytes_trim = tmp; - io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; + io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) / + HZ; if (!bytes_trim && !io_trim) return; @@ -651,7 +848,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) else tg->io_disp[rw] = 0; - tg->slice_start[rw] += nr_slices * throtl_slice; + tg->slice_start[rw] += nr_slices * tg->td->throtl_slice; throtl_log(&tg->service_queue, "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", @@ -671,9 +868,9 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, /* Slice has just started. Consider one slice interval */ if (!jiffy_elapsed) - jiffy_elapsed_rnd = throtl_slice; + jiffy_elapsed_rnd = tg->td->throtl_slice; - jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); /* * jiffy_elapsed_rnd should not be a big value as minimum iops can be @@ -682,7 +879,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, * have been trimmed. */ - tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; + tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd; do_div(tmp, HZ); if (tmp > UINT_MAX) @@ -697,7 +894,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, } /* Calc approx time to dispatch */ - jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; + jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1; if (jiffy_wait > jiffy_elapsed) jiffy_wait = jiffy_wait - jiffy_elapsed; @@ -720,11 +917,11 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, /* Slice has just started. 
Consider one slice interval */ if (!jiffy_elapsed) - jiffy_elapsed_rnd = throtl_slice; + jiffy_elapsed_rnd = tg->td->throtl_slice; - jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); - tmp = tg->bps[rw] * jiffy_elapsed_rnd; + tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd; do_div(tmp, HZ); bytes_allowed = tmp; @@ -736,7 +933,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, /* Calc approx time to dispatch */ extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed; - jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); + jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw)); if (!jiffy_wait) jiffy_wait = 1; @@ -771,7 +968,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, bio != throtl_peek_queued(&tg->service_queue.queued[rw])); /* If tg->bps = -1, then BW is unlimited */ - if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { + if (tg_bps_limit(tg, rw) == U64_MAX && + tg_iops_limit(tg, rw) == UINT_MAX) { if (wait) *wait = 0; return true; @@ -787,8 +985,10 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) throtl_start_new_slice(tg, rw); else { - if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) - throtl_extend_slice(tg, rw, jiffies + throtl_slice); + if (time_before(tg->slice_end[rw], + jiffies + tg->td->throtl_slice)) + throtl_extend_slice(tg, rw, + jiffies + tg->td->throtl_slice); } if (tg_with_in_bps_limit(tg, bio, &bps_wait) && @@ -816,6 +1016,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) /* Charge the bio to the group */ tg->bytes_disp[rw] += bio->bi_iter.bi_size; tg->io_disp[rw]++; + tg->last_bytes_disp[rw] += bio->bi_iter.bi_size; + tg->last_io_disp[rw]++; /* * BIO_THROTTLED is used to prevent the same bio to be throttled @@ -999,6 +1201,8 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) return nr_disp; } +static bool throtl_can_upgrade(struct throtl_data *td, + struct throtl_grp *this_tg); /** * throtl_pending_timer_fn - timer function for service_queue->pending_timer * @arg: the throtl_service_queue being serviced @@ -1025,6 +1229,9 @@ static void throtl_pending_timer_fn(unsigned long arg) int ret; spin_lock_irq(q->queue_lock); + if (throtl_can_upgrade(td, NULL)) + throtl_upgrade_state(td); + again: parent_sq = sq->parent_sq; dispatched = false; @@ -1112,7 +1319,7 @@ static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, struct throtl_grp *tg = pd_to_tg(pd); u64 v = *(u64 *)((void *)tg + off); - if (v == -1) + if (v == U64_MAX) return 0; return __blkg_prfill_u64(sf, pd, v); } @@ -1123,7 +1330,7 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, struct throtl_grp *tg = pd_to_tg(pd); unsigned int v = *(unsigned int *)((void *)tg + off); - if (v == -1) + if (v == UINT_MAX) return 0; return __blkg_prfill_u64(sf, pd, v); } @@ -1150,8 +1357,8 @@ static void tg_conf_updated(struct throtl_grp *tg) throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", - tg->bps[READ], tg->bps[WRITE], - tg->iops[READ], tg->iops[WRITE]); + tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE), + tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE)); /* * Update has_rules[] flags for the updated tg's subtree. 
A tg is @@ -1197,7 +1404,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, if (sscanf(ctx.body, "%llu", &v) != 1) goto out_finish; if (!v) - v = -1; + v = U64_MAX; tg = blkg_to_tg(ctx.blkg); @@ -1228,25 +1435,25 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", - .private = offsetof(struct throtl_grp, bps[READ]), + .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.write_bps_device", - .private = offsetof(struct throtl_grp, bps[WRITE]), + .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.read_iops_device", - .private = offsetof(struct throtl_grp, iops[READ]), + .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, { .name = "throttle.write_iops_device", - .private = offsetof(struct throtl_grp, iops[WRITE]), + .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, @@ -1263,48 +1470,87 @@ static struct cftype throtl_legacy_files[] = { { } /* terminate */ }; -static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd, +static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct throtl_grp *tg = pd_to_tg(pd); const char *dname = blkg_dev_name(pd->blkg); char bufs[4][21] = { "max", "max", "max", "max" }; + u64 bps_dft; + unsigned int iops_dft; + char idle_time[26] = ""; + char latency_time[26] = ""; if (!dname) return 0; - if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 && - tg->iops[READ] == -1 && tg->iops[WRITE] == -1) + + if (off == LIMIT_LOW) { + bps_dft = 0; + iops_dft = 0; + } else { + bps_dft = U64_MAX; + iops_dft = UINT_MAX; + } + + if (tg->bps_conf[READ][off] == bps_dft && + tg->bps_conf[WRITE][off] == bps_dft && + tg->iops_conf[READ][off] == iops_dft && + tg->iops_conf[WRITE][off] == iops_dft && + (off != LIMIT_LOW || + (tg->idletime_threshold == tg->td->dft_idletime_threshold && + tg->latency_target == DFL_LATENCY_TARGET))) return 0; - if (tg->bps[READ] != -1) - snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]); - if (tg->bps[WRITE] != -1) - snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]); - if (tg->iops[READ] != -1) - snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]); - if (tg->iops[WRITE] != -1) - snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]); - - seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n", - dname, bufs[0], bufs[1], bufs[2], bufs[3]); + if (tg->bps_conf[READ][off] != bps_dft) + snprintf(bufs[0], sizeof(bufs[0]), "%llu", + tg->bps_conf[READ][off]); + if (tg->bps_conf[WRITE][off] != bps_dft) + snprintf(bufs[1], sizeof(bufs[1]), "%llu", + tg->bps_conf[WRITE][off]); + if (tg->iops_conf[READ][off] != iops_dft) + snprintf(bufs[2], sizeof(bufs[2]), "%u", + tg->iops_conf[READ][off]); + if (tg->iops_conf[WRITE][off] != iops_dft) + snprintf(bufs[3], sizeof(bufs[3]), "%u", + tg->iops_conf[WRITE][off]); + if (off == LIMIT_LOW) { + if (tg->idletime_threshold == ULONG_MAX) + strcpy(idle_time, " idle=max"); + else + snprintf(idle_time, sizeof(idle_time), " idle=%lu", + tg->idletime_threshold); + + if (tg->latency_target == ULONG_MAX) + strcpy(latency_time, " latency=max"); + else + snprintf(latency_time, sizeof(latency_time), + " latency=%lu", tg->latency_target); + } 
+ + seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n", + dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time, + latency_time); return 0; } -static int tg_print_max(struct seq_file *sf, void *v) +static int tg_print_limit(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max, + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit, &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } -static ssize_t tg_set_max(struct kernfs_open_file *of, +static ssize_t tg_set_limit(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct throtl_grp *tg; u64 v[4]; + unsigned long idle_time; + unsigned long latency_time; int ret; + int index = of_cft(of)->private; ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); if (ret) @@ -1312,15 +1558,17 @@ static ssize_t tg_set_max(struct kernfs_open_file *of, tg = blkg_to_tg(ctx.blkg); - v[0] = tg->bps[READ]; - v[1] = tg->bps[WRITE]; - v[2] = tg->iops[READ]; - v[3] = tg->iops[WRITE]; + v[0] = tg->bps_conf[READ][index]; + v[1] = tg->bps_conf[WRITE][index]; + v[2] = tg->iops_conf[READ][index]; + v[3] = tg->iops_conf[WRITE][index]; + idle_time = tg->idletime_threshold; + latency_time = tg->latency_target; while (true) { char tok[27]; /* wiops=18446744073709551616 */ char *p; - u64 val = -1; + u64 val = U64_MAX; int len; if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) @@ -1348,15 +1596,43 @@ static ssize_t tg_set_max(struct kernfs_open_file *of, v[2] = min_t(u64, val, UINT_MAX); else if (!strcmp(tok, "wiops")) v[3] = min_t(u64, val, UINT_MAX); + else if (off == LIMIT_LOW && !strcmp(tok, "idle")) + idle_time = val; + else if (off == LIMIT_LOW && !strcmp(tok, "latency")) + latency_time = val; else goto out_finish; } - tg->bps[READ] = v[0]; - tg->bps[WRITE] = v[1]; - tg->iops[READ] = v[2]; - tg->iops[WRITE] = v[3]; + tg->bps_conf[READ][index] = v[0]; + tg->bps_conf[WRITE][index] = v[1]; + tg->iops_conf[READ][index] = v[2]; + tg->iops_conf[WRITE][index] = v[3]; + if (index == LIMIT_MAX) { + tg->bps[READ][index] = v[0]; + tg->bps[WRITE][index] = v[1]; + tg->iops[READ][index] = v[2]; + tg->iops[WRITE][index] = v[3]; + } + tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW], + tg->bps_conf[READ][LIMIT_MAX]); + tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW], + tg->bps_conf[WRITE][LIMIT_MAX]); + tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW], + tg->iops_conf[READ][LIMIT_MAX]); + tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW], + tg->iops_conf[WRITE][LIMIT_MAX]); + + if (index == LIMIT_LOW) { + blk_throtl_update_limit_valid(tg->td); + if (tg->td->limit_valid[LIMIT_LOW]) + tg->td->limit_index = LIMIT_LOW; + tg->idletime_threshold = (idle_time == ULONG_MAX) ? + ULONG_MAX : idle_time; + tg->latency_target = (latency_time == ULONG_MAX) ? 
+ ULONG_MAX : latency_time; + } tg_conf_updated(tg); ret = 0; out_finish: @@ -1365,11 +1641,21 @@ out_finish: } static struct cftype throtl_files[] = { +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + { + .name = "low", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = tg_print_limit, + .write = tg_set_limit, + .private = LIMIT_LOW, + }, +#endif { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = tg_print_max, - .write = tg_set_max, + .seq_show = tg_print_limit, + .write = tg_set_limit, + .private = LIMIT_MAX, }, { } /* terminate */ }; @@ -1388,9 +1674,362 @@ static struct blkcg_policy blkcg_policy_throtl = { .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, + .pd_offline_fn = throtl_pd_offline, .pd_free_fn = throtl_pd_free, }; +static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg) +{ + unsigned long rtime = jiffies, wtime = jiffies; + + if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW]) + rtime = tg->last_low_overflow_time[READ]; + if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) + wtime = tg->last_low_overflow_time[WRITE]; + return min(rtime, wtime); +} + +/* tg should not be an intermediate node */ +static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg) +{ + struct throtl_service_queue *parent_sq; + struct throtl_grp *parent = tg; + unsigned long ret = __tg_last_low_overflow_time(tg); + + while (true) { + parent_sq = parent->service_queue.parent_sq; + parent = sq_to_tg(parent_sq); + if (!parent) + break; + + /* + * The parent doesn't have low limit, it always reaches low + * limit. Its overflow time is useless for children + */ + if (!parent->bps[READ][LIMIT_LOW] && + !parent->iops[READ][LIMIT_LOW] && + !parent->bps[WRITE][LIMIT_LOW] && + !parent->iops[WRITE][LIMIT_LOW]) + continue; + if (time_after(__tg_last_low_overflow_time(parent), ret)) + ret = __tg_last_low_overflow_time(parent); + } + return ret; +} + +static bool throtl_tg_is_idle(struct throtl_grp *tg) +{ + /* + * cgroup is idle if: + * - single idle is too long, longer than a fixed value (in case user + * configure a too big threshold) or 4 times of slice + * - average think time is more than threshold + * - IO latency is largely below threshold + */ + unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice); + + time = min_t(unsigned long, MAX_IDLE_TIME, time); + return (ktime_get_ns() >> 10) - tg->last_finish_time > time || + tg->avg_idletime > tg->idletime_threshold || + (tg->latency_target && tg->bio_cnt && + tg->bad_bio_cnt * 5 < tg->bio_cnt); +} + +static bool throtl_tg_can_upgrade(struct throtl_grp *tg) +{ + struct throtl_service_queue *sq = &tg->service_queue; + bool read_limit, write_limit; + + /* + * if cgroup reaches low limit (if low limit is 0, the cgroup always + * reaches), it's ok to upgrade to next limit + */ + read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW]; + write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]; + if (!read_limit && !write_limit) + return true; + if (read_limit && sq->nr_queued[READ] && + (!write_limit || sq->nr_queued[WRITE])) + return true; + if (write_limit && sq->nr_queued[WRITE] && + (!read_limit || sq->nr_queued[READ])) + return true; + + if (time_after_eq(jiffies, + tg_last_low_overflow_time(tg) + tg->td->throtl_slice) && + throtl_tg_is_idle(tg)) + return true; + return false; +} + +static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg) +{ + while (true) { + if (throtl_tg_can_upgrade(tg)) + return true; + tg = 
sq_to_tg(tg->service_queue.parent_sq); + if (!tg || !tg_to_blkg(tg)->parent) + return false; + } + return false; +} + +static bool throtl_can_upgrade(struct throtl_data *td, + struct throtl_grp *this_tg) +{ + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + + if (td->limit_index != LIMIT_LOW) + return false; + + if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice)) + return false; + + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + + if (tg == this_tg) + continue; + if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) + continue; + if (!throtl_hierarchy_can_upgrade(tg)) { + rcu_read_unlock(); + return false; + } + } + rcu_read_unlock(); + return true; +} + +static void throtl_upgrade_check(struct throtl_grp *tg) +{ + unsigned long now = jiffies; + + if (tg->td->limit_index != LIMIT_LOW) + return; + + if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) + return; + + tg->last_check_time = now; + + if (!time_after_eq(now, + __tg_last_low_overflow_time(tg) + tg->td->throtl_slice)) + return; + + if (throtl_can_upgrade(tg->td, NULL)) + throtl_upgrade_state(tg->td); +} + +static void throtl_upgrade_state(struct throtl_data *td) +{ + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + + td->limit_index = LIMIT_MAX; + td->low_upgrade_time = jiffies; + td->scale = 0; + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_service_queue *sq = &tg->service_queue; + + tg->disptime = jiffies - 1; + throtl_select_dispatch(sq); + throtl_schedule_next_dispatch(sq, false); + } + rcu_read_unlock(); + throtl_select_dispatch(&td->service_queue); + throtl_schedule_next_dispatch(&td->service_queue, false); + queue_work(kthrotld_workqueue, &td->dispatch_work); +} + +static void throtl_downgrade_state(struct throtl_data *td, int new) +{ + td->scale /= 2; + + if (td->scale) { + td->low_upgrade_time = jiffies - td->scale * td->throtl_slice; + return; + } + + td->limit_index = new; + td->low_downgrade_time = jiffies; +} + +static bool throtl_tg_can_downgrade(struct throtl_grp *tg) +{ + struct throtl_data *td = tg->td; + unsigned long now = jiffies; + + /* + * If cgroup is below low limit, consider downgrade and throttle other + * cgroups + */ + if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) && + time_after_eq(now, tg_last_low_overflow_time(tg) + + td->throtl_slice) && + (!throtl_tg_is_idle(tg) || + !list_empty(&tg_to_blkg(tg)->blkcg->css.children))) + return true; + return false; +} + +static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg) +{ + while (true) { + if (!throtl_tg_can_downgrade(tg)) + return false; + tg = sq_to_tg(tg->service_queue.parent_sq); + if (!tg || !tg_to_blkg(tg)->parent) + break; + } + return true; +} + +static void throtl_downgrade_check(struct throtl_grp *tg) +{ + uint64_t bps; + unsigned int iops; + unsigned long elapsed_time; + unsigned long now = jiffies; + + if (tg->td->limit_index != LIMIT_MAX || + !tg->td->limit_valid[LIMIT_LOW]) + return; + if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) + return; + if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) + return; + + elapsed_time = now - tg->last_check_time; + tg->last_check_time = now; + + if (time_before(now, tg_last_low_overflow_time(tg) + + tg->td->throtl_slice)) + return; + + if (tg->bps[READ][LIMIT_LOW]) { + bps = tg->last_bytes_disp[READ] * HZ; + 
do_div(bps, elapsed_time); + if (bps >= tg->bps[READ][LIMIT_LOW]) + tg->last_low_overflow_time[READ] = now; + } + + if (tg->bps[WRITE][LIMIT_LOW]) { + bps = tg->last_bytes_disp[WRITE] * HZ; + do_div(bps, elapsed_time); + if (bps >= tg->bps[WRITE][LIMIT_LOW]) + tg->last_low_overflow_time[WRITE] = now; + } + + if (tg->iops[READ][LIMIT_LOW]) { + iops = tg->last_io_disp[READ] * HZ / elapsed_time; + if (iops >= tg->iops[READ][LIMIT_LOW]) + tg->last_low_overflow_time[READ] = now; + } + + if (tg->iops[WRITE][LIMIT_LOW]) { + iops = tg->last_io_disp[WRITE] * HZ / elapsed_time; + if (iops >= tg->iops[WRITE][LIMIT_LOW]) + tg->last_low_overflow_time[WRITE] = now; + } + + /* + * If cgroup is below low limit, consider downgrade and throttle other + * cgroups + */ + if (throtl_hierarchy_can_downgrade(tg)) + throtl_downgrade_state(tg->td, LIMIT_LOW); + + tg->last_bytes_disp[READ] = 0; + tg->last_bytes_disp[WRITE] = 0; + tg->last_io_disp[READ] = 0; + tg->last_io_disp[WRITE] = 0; +} + +static void blk_throtl_update_idletime(struct throtl_grp *tg) +{ + unsigned long now = ktime_get_ns() >> 10; + unsigned long last_finish_time = tg->last_finish_time; + + if (now <= last_finish_time || last_finish_time == 0 || + last_finish_time == tg->checked_last_finish_time) + return; + + tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3; + tg->checked_last_finish_time = last_finish_time; +} + +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +static void throtl_update_latency_buckets(struct throtl_data *td) +{ + struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE]; + int i, cpu; + unsigned long last_latency = 0; + unsigned long latency; + + if (!blk_queue_nonrot(td->queue)) + return; + if (time_before(jiffies, td->last_calculate_time + HZ)) + return; + td->last_calculate_time = jiffies; + + memset(avg_latency, 0, sizeof(avg_latency)); + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + struct latency_bucket *tmp = &td->tmp_buckets[i]; + + for_each_possible_cpu(cpu) { + struct latency_bucket *bucket; + + /* this isn't race free, but ok in practice */ + bucket = per_cpu_ptr(td->latency_buckets, cpu); + tmp->total_latency += bucket[i].total_latency; + tmp->samples += bucket[i].samples; + bucket[i].total_latency = 0; + bucket[i].samples = 0; + } + + if (tmp->samples >= 32) { + int samples = tmp->samples; + + latency = tmp->total_latency; + + tmp->total_latency = 0; + tmp->samples = 0; + latency /= samples; + if (latency == 0) + continue; + avg_latency[i].latency = latency; + } + } + + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + if (!avg_latency[i].latency) { + if (td->avg_buckets[i].latency < last_latency) + td->avg_buckets[i].latency = last_latency; + continue; + } + + if (!td->avg_buckets[i].valid) + latency = avg_latency[i].latency; + else + latency = (td->avg_buckets[i].latency * 7 + + avg_latency[i].latency) >> 3; + + td->avg_buckets[i].latency = max(latency, last_latency); + td->avg_buckets[i].valid = true; + last_latency = td->avg_buckets[i].latency; + } +} +#else +static inline void throtl_update_latency_buckets(struct throtl_data *td) +{ +} +#endif + bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, struct bio *bio) { @@ -1399,6 +2038,8 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); bool throttled = false; + struct throtl_data *td = tg->td; + int ret; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -1408,19 +2049,40 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, 
spin_lock_irq(q->queue_lock); + throtl_update_latency_buckets(td); + if (unlikely(blk_queue_bypass(q))) goto out_unlock; + ret = bio_associate_current(bio); +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + if (ret == 0 || ret == -EBUSY) + bio->bi_cg_private = tg; + blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio)); +#endif + blk_throtl_update_idletime(tg); + sq = &tg->service_queue; +again: while (true) { + if (tg->last_low_overflow_time[rw] == 0) + tg->last_low_overflow_time[rw] = jiffies; + throtl_downgrade_check(tg); + throtl_upgrade_check(tg); /* throtl is FIFO - if bios are already queued, should queue */ if (sq->nr_queued[rw]) break; /* if above limits, break to queue */ - if (!tg_may_dispatch(tg, bio, NULL)) + if (!tg_may_dispatch(tg, bio, NULL)) { + tg->last_low_overflow_time[rw] = jiffies; + if (throtl_can_upgrade(td, tg)) { + throtl_upgrade_state(td); + goto again; + } break; + } /* within limits, let's charge and dispatch directly */ throtl_charge_bio(tg, bio); @@ -1453,12 +2115,14 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, /* out-of-limit, queue to @tg */ throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", rw == READ ? 'R' : 'W', - tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw], - tg->io_disp[rw], tg->iops[rw], + tg->bytes_disp[rw], bio->bi_iter.bi_size, + tg_bps_limit(tg, rw), + tg->io_disp[rw], tg_iops_limit(tg, rw), sq->nr_queued[READ], sq->nr_queued[WRITE]); - bio_associate_current(bio); - tg->td->nr_queued[rw]++; + tg->last_low_overflow_time[rw] = jiffies; + + td->nr_queued[rw]++; throtl_add_bio_tg(bio, qn, tg); throttled = true; @@ -1483,9 +2147,94 @@ out: */ if (!throttled) bio_clear_flag(bio, BIO_THROTTLED); + +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + if (throttled || !td->track_bio_latency) + bio->bi_issue_stat.stat |= SKIP_LATENCY; +#endif return throttled; } +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +static void throtl_track_latency(struct throtl_data *td, sector_t size, + int op, unsigned long time) +{ + struct latency_bucket *latency; + int index; + + if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ || + !blk_queue_nonrot(td->queue)) + return; + + index = request_bucket_index(size); + + latency = get_cpu_ptr(td->latency_buckets); + latency[index].total_latency += time; + latency[index].samples++; + put_cpu_ptr(td->latency_buckets); +} + +void blk_throtl_stat_add(struct request *rq, u64 time_ns) +{ + struct request_queue *q = rq->q; + struct throtl_data *td = q->td; + + throtl_track_latency(td, blk_stat_size(&rq->issue_stat), + req_op(rq), time_ns >> 10); +} + +void blk_throtl_bio_endio(struct bio *bio) +{ + struct throtl_grp *tg; + u64 finish_time_ns; + unsigned long finish_time; + unsigned long start_time; + unsigned long lat; + + tg = bio->bi_cg_private; + if (!tg) + return; + bio->bi_cg_private = NULL; + + finish_time_ns = ktime_get_ns(); + tg->last_finish_time = finish_time_ns >> 10; + + start_time = blk_stat_time(&bio->bi_issue_stat) >> 10; + finish_time = __blk_stat_time(finish_time_ns) >> 10; + if (!start_time || finish_time <= start_time) + return; + + lat = finish_time - start_time; + /* this is only for bio based driver */ + if (!(bio->bi_issue_stat.stat & SKIP_LATENCY)) + throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat), + bio_op(bio), lat); + + if (tg->latency_target) { + int bucket; + unsigned int threshold; + + bucket = request_bucket_index( + blk_stat_size(&bio->bi_issue_stat)); + threshold = tg->td->avg_buckets[bucket].latency + + tg->latency_target; + if (lat > 
threshold) + tg->bad_bio_cnt++; + /* + * Not race free, could get wrong count, which means cgroups + * will be throttled + */ + tg->bio_cnt++; + } + + if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) { + tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies; + tg->bio_cnt /= 2; + tg->bad_bio_cnt /= 2; + } +} +#endif + /* * Dispatch all bios from all children tg's queued on @parent_sq. On * return, @parent_sq is guaranteed to not have any active children tg's @@ -1558,6 +2307,12 @@ int blk_throtl_init(struct request_queue *q) td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); if (!td) return -ENOMEM; + td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) * + LATENCY_BUCKET_SIZE, __alignof__(u64)); + if (!td->latency_buckets) { + kfree(td); + return -ENOMEM; + } INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); throtl_service_queue_init(&td->service_queue); @@ -1565,10 +2320,17 @@ int blk_throtl_init(struct request_queue *q) q->td = td; td->queue = q; + td->limit_valid[LIMIT_MAX] = true; + td->limit_index = LIMIT_MAX; + td->low_upgrade_time = jiffies; + td->low_downgrade_time = jiffies; + /* activate policy */ ret = blkcg_activate_policy(q, &blkcg_policy_throtl); - if (ret) + if (ret) { + free_percpu(td->latency_buckets); kfree(td); + } return ret; } @@ -1577,9 +2339,74 @@ void blk_throtl_exit(struct request_queue *q) BUG_ON(!q->td); throtl_shutdown_wq(q); blkcg_deactivate_policy(q, &blkcg_policy_throtl); + free_percpu(q->td->latency_buckets); kfree(q->td); } +void blk_throtl_register_queue(struct request_queue *q) +{ + struct throtl_data *td; + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + + td = q->td; + BUG_ON(!td); + + if (blk_queue_nonrot(q)) { + td->throtl_slice = DFL_THROTL_SLICE_SSD; + td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_SSD; + } else { + td->throtl_slice = DFL_THROTL_SLICE_HD; + td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_HD; + } +#ifndef CONFIG_BLK_DEV_THROTTLING_LOW + /* if no low limit, use previous default */ + td->throtl_slice = DFL_THROTL_SLICE_HD; +#endif + + td->track_bio_latency = !q->mq_ops && !q->request_fn; + if (!td->track_bio_latency) + blk_stat_enable_accounting(q); + + /* + * some tg are created before queue is fully initialized, eg, nonrot + * isn't initialized yet + */ + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + + tg->idletime_threshold = td->dft_idletime_threshold; + } + rcu_read_unlock(); +} + +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page) +{ + if (!q->td) + return -EINVAL; + return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice)); +} + +ssize_t blk_throtl_sample_time_store(struct request_queue *q, + const char *page, size_t count) +{ + unsigned long v; + unsigned long t; + + if (!q->td) + return -EINVAL; + if (kstrtoul(page, 10, &v)) + return -EINVAL; + t = msecs_to_jiffies(v); + if (t == 0 || t > MAX_THROTL_SLICE) + return -EINVAL; + q->td->throtl_slice = t; + return count; +} +#endif + static int __init throtl_init(void) { kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 1aedb1f7ee0c..ffa80e11cf14 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -255,8 +255,8 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat) * that it's writes impacting us, and not just some sole read on * a device that is in a lower power state. 
*/ - return stat[BLK_STAT_READ].nr_samples >= 1 && - stat[BLK_STAT_WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES; + return (stat[READ].nr_samples >= 1 && + stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES); } static u64 rwb_sync_issue_lat(struct rq_wb *rwb) @@ -277,7 +277,7 @@ enum { LAT_EXCEEDED, }; -static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) +static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { struct backing_dev_info *bdi = rwb->queue->backing_dev_info; u64 thislat; @@ -293,7 +293,7 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) */ thislat = rwb_sync_issue_lat(rwb); if (thislat > rwb->cur_win_nsec || - (thislat > rwb->min_lat_nsec && !stat[BLK_STAT_READ].nr_samples)) { + (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) { trace_wbt_lat(bdi, thislat); return LAT_EXCEEDED; } @@ -308,8 +308,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) * waited or still has writes in flights, consider us doing * just writes as well. */ - if ((stat[BLK_STAT_WRITE].nr_samples && blk_stat_is_current(stat)) || - wb_recent_wait(rwb) || wbt_inflight(rwb)) + if (stat[WRITE].nr_samples || wb_recent_wait(rwb) || + wbt_inflight(rwb)) return LAT_UNKNOWN_WRITES; return LAT_UNKNOWN; } @@ -317,8 +317,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) /* * If the 'min' latency exceeds our target, step down. */ - if (stat[BLK_STAT_READ].min > rwb->min_lat_nsec) { - trace_wbt_lat(bdi, stat[BLK_STAT_READ].min); + if (stat[READ].min > rwb->min_lat_nsec) { + trace_wbt_lat(bdi, stat[READ].min); trace_wbt_stat(bdi, stat); return LAT_EXCEEDED; } @@ -329,14 +329,6 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) return LAT_OK; } -static int latency_exceeded(struct rq_wb *rwb) -{ - struct blk_rq_stat stat[2]; - - blk_queue_stat_get(rwb->queue, stat); - return __latency_exceeded(rwb, stat); -} - static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { struct backing_dev_info *bdi = rwb->queue->backing_dev_info; @@ -355,7 +347,6 @@ static void scale_up(struct rq_wb *rwb) rwb->scale_step--; rwb->unknown_cnt = 0; - blk_stat_clear(rwb->queue); rwb->scaled_max = calc_wb_limits(rwb); @@ -385,15 +376,12 @@ static void scale_down(struct rq_wb *rwb, bool hard_throttle) rwb->scaled_max = false; rwb->unknown_cnt = 0; - blk_stat_clear(rwb->queue); calc_wb_limits(rwb); rwb_trace_step(rwb, "step down"); } static void rwb_arm_timer(struct rq_wb *rwb) { - unsigned long expires; - if (rwb->scale_step > 0) { /* * We should speed this up, using some variant of a fast @@ -411,17 +399,16 @@ static void rwb_arm_timer(struct rq_wb *rwb) rwb->cur_win_nsec = rwb->win_nsec; } - expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec); - mod_timer(&rwb->window_timer, expires); + blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec); } -static void wb_timer_fn(unsigned long data) +static void wb_timer_fn(struct blk_stat_callback *cb) { - struct rq_wb *rwb = (struct rq_wb *) data; + struct rq_wb *rwb = cb->data; unsigned int inflight = wbt_inflight(rwb); int status; - status = latency_exceeded(rwb); + status = latency_exceeded(rwb, cb->stat); trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step, inflight); @@ -614,7 +601,7 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) __wbt_wait(rwb, bio->bi_opf, lock); - if (!timer_pending(&rwb->window_timer)) + if (!blk_stat_is_active(rwb->cb)) rwb_arm_timer(rwb); if (current_is_kswapd()) @@ 
-675,7 +662,7 @@ void wbt_disable_default(struct request_queue *q) struct rq_wb *rwb = q->rq_wb; if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) { - del_timer_sync(&rwb->window_timer); + blk_stat_remove_callback(q, rwb->cb); rwb->win_nsec = rwb->min_lat_nsec = 0; wbt_update_limits(rwb); } @@ -699,24 +686,23 @@ int wbt_init(struct request_queue *q) struct rq_wb *rwb; int i; - /* - * For now, we depend on the stats window being larger than - * our monitoring window. Ensure that this isn't inadvertently - * violated. - */ - BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC); BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS); rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); if (!rwb) return -ENOMEM; + rwb->cb = blk_stat_alloc_callback(wb_timer_fn, blk_stat_rq_ddir, 2, rwb); + if (!rwb->cb) { + kfree(rwb); + return -ENOMEM; + } + for (i = 0; i < WBT_NUM_RWQ; i++) { atomic_set(&rwb->rq_wait[i].inflight, 0); init_waitqueue_head(&rwb->rq_wait[i].wait); } - setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb); rwb->wc = 1; rwb->queue_depth = RWB_DEF_DEPTH; rwb->last_comp = rwb->last_issue = jiffies; @@ -726,10 +712,10 @@ int wbt_init(struct request_queue *q) wbt_update_limits(rwb); /* - * Assign rwb, and turn on stats tracking for this queue + * Assign rwb and add the stats callback. */ q->rq_wb = rwb; - blk_stat_enable(q); + blk_stat_add_callback(q, rwb->cb); rwb->min_lat_nsec = wbt_default_latency_nsec(q); @@ -744,7 +730,8 @@ void wbt_exit(struct request_queue *q) struct rq_wb *rwb = q->rq_wb; if (rwb) { - del_timer_sync(&rwb->window_timer); + blk_stat_remove_callback(q, rwb->cb); + blk_stat_free_callback(rwb->cb); q->rq_wb = NULL; kfree(rwb); } diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 65f1de519f67..ad6c78507c3a 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -32,27 +32,27 @@ enum { static inline void wbt_clear_state(struct blk_issue_stat *stat) { - stat->time &= BLK_STAT_TIME_MASK; + stat->stat &= ~BLK_STAT_RES_MASK; } static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat) { - return (stat->time & BLK_STAT_MASK) >> BLK_STAT_SHIFT; + return (stat->stat & BLK_STAT_RES_MASK) >> BLK_STAT_RES_SHIFT; } static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct) { - stat->time |= ((u64) wb_acct) << BLK_STAT_SHIFT; + stat->stat |= ((u64) wb_acct) << BLK_STAT_RES_SHIFT; } static inline bool wbt_is_tracked(struct blk_issue_stat *stat) { - return (stat->time >> BLK_STAT_SHIFT) & WBT_TRACKED; + return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_TRACKED; } static inline bool wbt_is_read(struct blk_issue_stat *stat) { - return (stat->time >> BLK_STAT_SHIFT) & WBT_READ; + return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_READ; } struct rq_wait { @@ -81,7 +81,7 @@ struct rq_wb { u64 win_nsec; /* default window size */ u64 cur_win_nsec; /* current window size */ - struct timer_list window_timer; + struct blk_stat_callback *cb; s64 sync_issue; void *sync_cookie; diff --git a/block/blk.h b/block/blk.h index d1ea4bd9b9a3..07d375183f31 100644 --- a/block/blk.h +++ b/block/blk.h @@ -319,10 +319,22 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) extern void blk_throtl_drain(struct request_queue *q); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); +extern void blk_throtl_register_queue(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ static inline void blk_throtl_drain(struct request_queue *q) { } static inline int blk_throtl_init(struct request_queue *q) { 
return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } +static inline void blk_throtl_register_queue(struct request_queue *q) { } #endif /* CONFIG_BLK_DEV_THROTTLING */ +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); +extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, + const char *page, size_t count); +extern void blk_throtl_bio_endio(struct bio *bio); +extern void blk_throtl_stat_add(struct request *rq, u64 time); +#else +static inline void blk_throtl_bio_endio(struct bio *bio) { } +static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } +#endif #endif /* BLK_INTERNAL_H */ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 440b95ee593c..da69b079725f 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3761,16 +3761,14 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, } #ifdef CONFIG_CFQ_GROUP_IOSCHED -static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) +static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { struct cfq_data *cfqd = cic_to_cfqd(cic); struct cfq_queue *cfqq; uint64_t serial_nr; - bool nonroot_cg; rcu_read_lock(); serial_nr = bio_blkcg(bio)->css.serial_nr; - nonroot_cg = bio_blkcg(bio) != &blkcg_root; rcu_read_unlock(); /* @@ -3778,7 +3776,7 @@ static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) * spuriously on a newly created cic but there's no harm. */ if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) - return nonroot_cg; + return; /* * Drop reference to queues. New queues will be assigned in new @@ -3799,12 +3797,10 @@ static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) } cic->blkcg_serial_nr = serial_nr; - return nonroot_cg; } #else -static inline bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) +static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { - return false; } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -4449,12 +4445,11 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, const int rw = rq_data_dir(rq); const bool is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; - bool disable_wbt; spin_lock_irq(q->queue_lock); check_ioprio_changed(cic, bio); - disable_wbt = check_blkcg_changed(cic, bio); + check_blkcg_changed(cic, bio); new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { @@ -4491,9 +4486,6 @@ new_queue: rq->elv.priv[1] = cfqq->cfqg; spin_unlock_irq(q->queue_lock); - if (disable_wbt) - wbt_disable_default(q); - return 0; } @@ -4706,6 +4698,7 @@ static void cfq_registered_queue(struct request_queue *q) */ if (blk_queue_nonrot(q)) cfqd->cfq_slice_idle = 0; + wbt_disable_default(q); } /* diff --git a/block/genhd.c b/block/genhd.c index a9c516a8b37d..510aac1486cb 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1352,7 +1352,7 @@ struct kobject *get_disk(struct gendisk *disk) owner = disk->fops->owner; if (owner && !try_module_get(owner)) return NULL; - kobj = kobject_get(&disk_to_dev(disk)->kobj); + kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj); if (kobj == NULL) { module_put(owner); return NULL; diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 2a2fc768b27a..82a43bb19967 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -362,7 +362,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, goto out_free_cdb; bio = rq->bio; - rq->retries = 0; + req->retries = 
0; start_time = jiffies; @@ -476,13 +476,13 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, goto error; /* default. possible overriden later */ - rq->retries = 5; + req->retries = 5; switch (opcode) { case SEND_DIAGNOSTIC: case FORMAT_UNIT: rq->timeout = FORMAT_UNIT_TIMEOUT; - rq->retries = 1; + req->retries = 1; break; case START_STOP: rq->timeout = START_STOP_TIMEOUT; @@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, break; case READ_DEFECT_DATA: rq->timeout = READ_DEFECT_DATA_TIMEOUT; - rq->retries = 1; + req->retries = 1; break; default: rq->timeout = BLK_DEFAULT_SG_TIMEOUT; diff --git a/block/sed-opal.c b/block/sed-opal.c index 14035f826b5e..6736c7873d4a 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1831,7 +1831,7 @@ static int get_lsp_lifecycle_cont(struct opal_dev *dev) /* 0x08 is Manufacured Inactive */ /* 0x09 is Manufactured */ if (lc_status != OPAL_MANUFACTURED_INACTIVE) { - pr_err("Couldn't determine the status of the Lifcycle state\n"); + pr_err("Couldn't determine the status of the Lifecycle state\n"); return -ENODEV; } diff --git a/block/t10-pi.c b/block/t10-pi.c index 2c97912335a9..680c6d636298 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -160,28 +160,28 @@ static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) return t10_pi_verify(iter, t10_pi_ip_fn, 3); } -struct blk_integrity_profile t10_pi_type1_crc = { +const struct blk_integrity_profile t10_pi_type1_crc = { .name = "T10-DIF-TYPE1-CRC", .generate_fn = t10_pi_type1_generate_crc, .verify_fn = t10_pi_type1_verify_crc, }; EXPORT_SYMBOL(t10_pi_type1_crc); -struct blk_integrity_profile t10_pi_type1_ip = { +const struct blk_integrity_profile t10_pi_type1_ip = { .name = "T10-DIF-TYPE1-IP", .generate_fn = t10_pi_type1_generate_ip, .verify_fn = t10_pi_type1_verify_ip, }; EXPORT_SYMBOL(t10_pi_type1_ip); -struct blk_integrity_profile t10_pi_type3_crc = { +const struct blk_integrity_profile t10_pi_type3_crc = { .name = "T10-DIF-TYPE3-CRC", .generate_fn = t10_pi_type3_generate_crc, .verify_fn = t10_pi_type3_verify_crc, }; EXPORT_SYMBOL(t10_pi_type3_crc); -struct blk_integrity_profile t10_pi_type3_ip = { +const struct blk_integrity_profile t10_pi_type3_ip = { .name = "T10-DIF-TYPE3-IP", .generate_fn = t10_pi_type3_generate_ip, .verify_fn = t10_pi_type3_verify_ip, |
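
For readers of the tg_set_limit()/throtl_files[] hunks above, here is a minimal user-space sketch of how the new per-cgroup "low" file (io.low on the unified hierarchy) could be driven. It is not part of the commit itself; the cgroup mount point, cgroup name, device numbers and limit values are illustrative assumptions. The key=value tokens (rbps, wbps, riops, wiops, plus the new idle= and latency= keys; values in bytes/s, IOs/s and microseconds, or "max") follow the parser in tg_set_limit(), and the leading MAJ:MIN device prefix is the usual blkcg config form consumed by blkg_conf_prep().

	/*
	 * Sketch only: write a .low configuration for one device into a
	 * cgroup's io.low file.  Paths, device numbers and limits are
	 * assumptions, not values taken from the patch.
	 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* "8:0" stands in for the MAJ:MIN of the throttled disk. */
		const char *cfg =
			"8:0 rbps=10485760 wbps=max riops=1000 wiops=max "
			"idle=1000 latency=100\n";
		int fd = open("/sys/fs/cgroup/test/io.low", O_WRONLY);

		if (fd < 0) {
			perror("open io.low");
			return 1;
		}
		if (write(fd, cfg, strlen(cfg)) < 0)
			perror("write io.low");
		close(fd);
		return 0;
	}

Reading the file back should show only the keys that differ from their defaults, as produced by tg_prfill_limit(). The sampling window used by the upgrade/downgrade and idle-time heuristics can likewise be tuned at run time, in milliseconds, through the new throttle_sample_time queue attribute (present only with CONFIG_BLK_DEV_THROTTLING_LOW), e.g. /sys/block/<dev>/queue/throttle_sample_time.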