Diffstat (limited to 'block/blk-cgroup.c')
-rw-r--r-- | block/blk-cgroup.c | 216
1 file changed, 158 insertions, 58 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 059467086b13..5936db7f8475 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -218,8 +218,7 @@ static void blkg_async_bio_workfn(struct work_struct *work)
 
 	/* as long as there are pending bios, @blkg can't go away */
 	spin_lock(&blkg->async_bio_lock);
-	bio_list_merge(&bios, &blkg->async_bios);
-	bio_list_init(&blkg->async_bios);
+	bio_list_merge_init(&bios, &blkg->async_bios);
 	spin_unlock(&blkg->async_bio_lock);
 
 	/* start plug only when bio_list contains at least 2 bios */
@@ -323,6 +322,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 	blkg->q = disk->queue;
 	INIT_LIST_HEAD(&blkg->q_node);
 	blkg->blkcg = blkcg;
+	blkg->iostat.blkg = blkg;
 #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
 	spin_lock_init(&blkg->async_bio_lock);
 	bio_list_init(&blkg->async_bios);
@@ -619,13 +619,47 @@ restart:
 	spin_unlock_irq(&q->queue_lock);
 }
 
+static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+	int i;
+
+	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+		dst->bytes[i] = src->bytes[i];
+		dst->ios[i] = src->ios[i];
+	}
+}
+
+static void __blkg_clear_stat(struct blkg_iostat_set *bis)
+{
+	struct blkg_iostat cur = {0};
+	unsigned long flags;
+
+	flags = u64_stats_update_begin_irqsave(&bis->sync);
+	blkg_iostat_set(&bis->cur, &cur);
+	blkg_iostat_set(&bis->last, &cur);
+	u64_stats_update_end_irqrestore(&bis->sync, flags);
+}
+
+static void blkg_clear_stat(struct blkcg_gq *blkg)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct blkg_iostat_set *s = per_cpu_ptr(blkg->iostat_cpu, cpu);
+
+		__blkg_clear_stat(s);
+	}
+	__blkg_clear_stat(&blkg->iostat);
+}
+
 static int blkcg_reset_stats(struct cgroup_subsys_state *css,
 			     struct cftype *cftype, u64 val)
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
 	struct blkcg_gq *blkg;
-	int i, cpu;
+	int i;
 
+	pr_info_once("blkio.%s is deprecated\n", cftype->name);
 	mutex_lock(&blkcg_pol_mutex);
 	spin_lock_irq(&blkcg->lock);
 
@@ -635,18 +669,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
 	 * anyway. If you get hit by a race, retry.
 	 */
 	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
-		for_each_possible_cpu(cpu) {
-			struct blkg_iostat_set *bis =
-				per_cpu_ptr(blkg->iostat_cpu, cpu);
-			memset(bis, 0, sizeof(*bis));
-
-			/* Re-initialize the cleared blkg_iostat_set */
-			u64_stats_init(&bis->sync);
-			bis->blkg = blkg;
-		}
-		memset(&blkg->iostat, 0, sizeof(blkg->iostat));
-		u64_stats_init(&blkg->iostat.sync);
-
+		blkg_clear_stat(blkg);
 		for (i = 0; i < BLKCG_MAX_POLS; i++) {
 			struct blkcg_policy *pol = blkcg_policy[i];
 
@@ -774,7 +797,7 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
 		return -EINVAL;
 	input = skip_spaces(input);
 
-	bdev = blkdev_get_no_open(MKDEV(major, minor));
+	bdev = blkdev_get_no_open(MKDEV(major, minor), false);
 	if (!bdev)
 		return -ENODEV;
 	if (bdev_is_partition(bdev)) {
@@ -793,6 +816,41 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
 	ctx->bdev = bdev;
 	return 0;
 }
+
+/*
+ * Similar to blkg_conf_open_bdev, but additionally freezes the queue,
+ * acquires q->elevator_lock, and ensures the correct locking order
+ * between q->elevator_lock and q->rq_qos_mutex.
+ *
+ * This function returns negative error on failure. On success it returns
+ * memflags which must be saved and later passed to blkg_conf_exit_frozen
+ * for restoring the memalloc scope.
+ */
+unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
+{
+	int ret;
+	unsigned long memflags;
+
+	if (ctx->bdev)
+		return -EINVAL;
+
+	ret = blkg_conf_open_bdev(ctx);
+	if (ret < 0)
+		return ret;
+	/*
+	 * At this point, we haven’t started protecting anything related to QoS,
+	 * so we release q->rq_qos_mutex here, which was first acquired in blkg_
+	 * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing
+	 * the queue and acquiring q->elevator_lock to maintain the correct
+	 * locking order.
+	 */
+	mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
+
+	memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue);
+	mutex_lock(&ctx->bdev->bd_queue->elevator_lock);
+	mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex);
+
+	return memflags;
+}
 
 /**
  * blkg_conf_prep - parse and prepare for per-blkg config update
@@ -949,13 +1007,19 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx)
 }
 EXPORT_SYMBOL_GPL(blkg_conf_exit);
 
-static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
+/*
+ * Similar to blkg_conf_exit, but also unfreezes the queue and releases
+ * q->elevator_lock. Should be used when blkg_conf_open_bdev_frozen
+ * is used to open the bdev.
+ */
+void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags)
 {
-	int i;
+	if (ctx->bdev) {
+		struct request_queue *q = ctx->bdev->bd_queue;
 
-	for (i = 0; i < BLKG_IOSTAT_NR; i++) {
-		dst->bytes[i] = src->bytes[i];
-		dst->ios[i] = src->ios[i];
+		blkg_conf_exit(ctx);
+		mutex_unlock(&q->elevator_lock);
+		blk_mq_unfreeze_queue(q, memflags);
 	}
 }
 
@@ -1010,8 +1074,8 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
 	/*
 	 * For covering concurrent parent blkg update from blkg_release().
 	 *
-	 * When flushing from cgroup, cgroup_rstat_lock is always held, so
-	 * this lock won't cause contention most of time.
+	 * When flushing from cgroup, the subsystem rstat lock is always held,
+	 * so this lock won't cause contention most of time.
 	 */
 	raw_spin_lock_irqsave(&blkg_stat_lock, flags);
 
@@ -1024,7 +1088,19 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
 		struct blkg_iostat cur;
 		unsigned int seq;
 
+		/*
+		 * Order assignment of `next_bisc` from `bisc->lnode.next` in
+		 * llist_for_each_entry_safe and clearing `bisc->lqueued` for
+		 * avoiding to assign `next_bisc` with new next pointer added
+		 * in blk_cgroup_bio_start() in case of re-ordering.
+		 *
+		 * The pair barrier is implied in llist_add() in blk_cgroup_bio_start().
+		 */
+		smp_mb();
+
 		WRITE_ONCE(bisc->lqueued, false);
+		if (bisc == &blkg->iostat)
+			goto propagate_up; /* propagate up to parent only */
 
 		/* fetch the current per-cpu values */
 		do {
@@ -1034,10 +1110,24 @@ static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
 
 		blkcg_iostat_update(blkg, &cur, &bisc->last);
 
+propagate_up:
 		/* propagate global delta to parent (unless that's root) */
-		if (parent && parent->parent)
+		if (parent && parent->parent) {
 			blkcg_iostat_update(parent, &blkg->iostat.cur,
 					    &blkg->iostat.last);
+			/*
+			 * Queue parent->iostat to its blkcg's lockless
+			 * list to propagate up to the grandparent if the
+			 * iostat hasn't been queued yet.
+			 */
+			if (!parent->iostat.lqueued) {
+				struct llist_head *plhead;
+
+				plhead = per_cpu_ptr(parent->blkcg->lhead, cpu);
+				llist_add(&parent->iostat.lnode, plhead);
+				parent->iostat.lqueued = true;
+			}
+		}
 	}
 	raw_spin_unlock_irqrestore(&blkg_stat_lock, flags);
 out:
@@ -1054,7 +1144,7 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 	/*
 	 * We source root cgroup stats from the system-wide stats to avoid
 	 * tracking the same information twice and incurring overhead when no
-	 * cgroups are defined. For that reason, cgroup_rstat_flush in
+	 * cgroups are defined. For that reason, css_rstat_flush in
 	 * blkcg_print_stat does not actually fill out the iostat in the root
 	 * cgroup's blkcg_gq.
 	 *
@@ -1100,6 +1190,7 @@ static void blkcg_fill_root_iostats(void)
 		blkg_iostat_set(&blkg->iostat.cur, &tmp);
 		u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
 	}
+	class_dev_iter_exit(&iter);
 }
 
 static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
@@ -1162,7 +1253,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 	if (!seq_css(sf)->parent)
 		blkcg_fill_root_iostats();
 	else
-		cgroup_rstat_flush(blkcg->css.cgroup);
+		css_rstat_flush(&blkcg->css);
 
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
@@ -1286,10 +1377,14 @@ void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css)
 	struct blkcg *blkcg = css_to_blkcg(blkcg_css);
 
 	do {
+		struct blkcg *parent;
+
 		if (!refcount_dec_and_test(&blkcg->online_pin))
 			break;
+
+		parent = blkcg_parent(blkcg);
 		blkcg_destroy_blkgs(blkcg);
-		blkcg = blkcg_parent(blkcg);
+		blkcg = parent;
 	} while (blkcg);
 }
 
@@ -1420,7 +1515,6 @@ int blkcg_init_disk(struct gendisk *disk)
 	struct request_queue *q = disk->queue;
 	struct blkcg_gq *new_blkg, *blkg;
 	bool preloaded;
-	int ret;
 
 	new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
 	if (!new_blkg)
@@ -1440,21 +1534,8 @@ int blkcg_init_disk(struct gendisk *disk)
 	if (preloaded)
 		radix_tree_preload_end();
 
-	ret = blk_ioprio_init(disk);
-	if (ret)
-		goto err_destroy_all;
-
-	ret = blk_throtl_init(disk);
-	if (ret)
-		goto err_ioprio_exit;
-
 	return 0;
 
-err_ioprio_exit:
-	blk_ioprio_exit(disk);
-err_destroy_all:
-	blkg_destroy_all(disk);
-	return ret;
 err_unlock:
 	spin_unlock_irq(&q->queue_lock);
 	if (preloaded)
@@ -1517,13 +1598,22 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 	struct request_queue *q = disk->queue;
 	struct blkg_policy_data *pd_prealloc = NULL;
 	struct blkcg_gq *blkg, *pinned_blkg = NULL;
+	unsigned int memflags;
 	int ret;
 
 	if (blkcg_policy_enabled(q, pol))
 		return 0;
 
+	/*
+	 * Policy is allowed to be registered without pd_alloc_fn/pd_free_fn,
+	 * for example, ioprio. Such policy will work on blkcg level, not disk
+	 * level, and don't need to be activated.
+	 */
+	if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
+		return -EINVAL;
+
 	if (queue_is_mq(q))
-		blk_mq_freeze_queue(q);
+		memflags = blk_mq_freeze_queue(q);
retry:
 	spin_lock_irq(&q->queue_lock);
 
@@ -1587,7 +1677,7 @@ retry:
 	spin_unlock_irq(&q->queue_lock);
out:
 	if (queue_is_mq(q))
-		blk_mq_unfreeze_queue(q);
+		blk_mq_unfreeze_queue(q, memflags);
 	if (pinned_blkg)
 		blkg_put(pinned_blkg);
 	if (pd_prealloc)
@@ -1631,12 +1721,13 @@ void blkcg_deactivate_policy(struct gendisk *disk,
 {
 	struct request_queue *q = disk->queue;
 	struct blkcg_gq *blkg;
+	unsigned int memflags;
 
 	if (!blkcg_policy_enabled(q, pol))
 		return;
 
 	if (queue_is_mq(q))
-		blk_mq_freeze_queue(q);
+		memflags = blk_mq_freeze_queue(q);
 
 	mutex_lock(&q->blkcg_mutex);
 	spin_lock_irq(&q->queue_lock);
@@ -1660,7 +1751,7 @@ void blkcg_deactivate_policy(struct gendisk *disk,
 	mutex_unlock(&q->blkcg_mutex);
 
 	if (queue_is_mq(q))
-		blk_mq_unfreeze_queue(q);
+		blk_mq_unfreeze_queue(q, memflags);
 }
 EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
 
@@ -1688,24 +1779,27 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 	struct blkcg *blkcg;
 	int i, ret;
 
+	/*
+	 * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy
+	 * without pd_alloc_fn/pd_free_fn can't be activated.
+	 */
+	if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
+	    (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
+		return -EINVAL;
+
 	mutex_lock(&blkcg_pol_register_mutex);
 	mutex_lock(&blkcg_pol_mutex);
 
 	/* find an empty slot */
-	ret = -ENOSPC;
 	for (i = 0; i < BLKCG_MAX_POLS; i++)
 		if (!blkcg_policy[i])
 			break;
 	if (i >= BLKCG_MAX_POLS) {
 		pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
+		ret = -ENOSPC;
 		goto err_unlock;
 	}
 
-	/* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
-	if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
-	    (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
-		goto err_unlock;
-
 	/* register @pol */
 	pol->plid = i;
 	blkcg_policy[pol->plid] = pol;
@@ -1716,8 +1810,10 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 			struct blkcg_policy_data *cpd;
 
 			cpd = pol->cpd_alloc_fn(GFP_KERNEL);
-			if (!cpd)
+			if (!cpd) {
+				ret = -ENOMEM;
 				goto err_free_cpds;
+			}
 
 			blkcg->cpd[pol->plid] = cpd;
 			cpd->blkcg = blkcg;
@@ -1728,12 +1824,15 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 	mutex_unlock(&blkcg_pol_mutex);
 
 	/* everything is in place, add intf files for the new policy */
-	if (pol->dfl_cftypes)
+	if (pol->dfl_cftypes == pol->legacy_cftypes) {
+		WARN_ON(cgroup_add_cftypes(&io_cgrp_subsys,
+					   pol->dfl_cftypes));
+	} else {
 		WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
 					       pol->dfl_cftypes));
-	if (pol->legacy_cftypes)
 		WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
 						  pol->legacy_cftypes));
+	}
 	mutex_unlock(&blkcg_pol_register_mutex);
 	return 0;
 
@@ -2144,18 +2243,19 @@ void blk_cgroup_bio_start(struct bio *bio)
 	}
 	u64_stats_update_end_irqrestore(&bis->sync, flags);
 
-	cgroup_rstat_updated(blkcg->css.cgroup, cpu);
+	css_rstat_updated(&blkcg->css, cpu);
 	put_cpu();
 }
 
 bool blk_cgroup_congested(void)
 {
-	struct cgroup_subsys_state *css;
+	struct blkcg *blkcg;
 	bool ret = false;
 
 	rcu_read_lock();
-	for (css = blkcg_css(); css; css = css->parent) {
-		if (atomic_read(&css->cgroup->congestion_count)) {
+	for (blkcg = css_to_blkcg(blkcg_css()); blkcg;
+	     blkcg = blkcg_parent(blkcg)) {
+		if (atomic_read(&blkcg->congestion_count)) {
 			ret = true;
 			break;
 		}
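
The first hunk above replaces the open-coded pair bio_list_merge() + bio_list_init() on blkg->async_bios with the single helper bio_list_merge_init(). The userspace miniature below shows the same splice-then-empty operation on a made-up head/tail list type (not the kernel's struct bio_list); it is only meant to illustrate why the two forms are equivalent.

/* Userspace analog of bio_list_merge() + bio_list_init() folded into one
 * merge_init() helper; the node/list types here are invented for the demo. */
#include <stdio.h>

struct node { struct node *next; int id; };
struct list { struct node *head, *tail; };

static void list_init(struct list *l)
{
	l->head = l->tail = NULL;
}

/* splice src onto the tail of dst, leaving src untouched */
static void list_merge(struct list *dst, struct list *src)
{
	if (!src->head)
		return;
	if (dst->tail)
		dst->tail->next = src->head;
	else
		dst->head = src->head;
	dst->tail = src->tail;
}

/* equivalent of the old two-step sequence: merge, then reinitialize src */
static void list_merge_init(struct list *dst, struct list *src)
{
	list_merge(dst, src);
	list_init(src);
}

int main(void)
{
	struct node a = { NULL, 1 };
	struct list pending = { &a, &a };
	struct list local;

	list_init(&local);
	list_merge_init(&local, &pending);	/* local takes the entries, pending ends up empty */

	printf("local head id=%d, pending empty=%d\n",
	       local.head ? local.head->id : -1, pending.head == NULL);
	return 0;
}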
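blkg_conf_open_bdev_frozen() and blkg_conf_exit_frozen() are meant to be used as a pair: the open side returns the memalloc flags produced by blk_mq_freeze_queue(), and the exit side needs them back when it unfreezes. The caller sketch below is hypothetical (the function name and the "reconfigure" step are not from this patch); it only illustrates the intended call order, assuming the declarations from the in-tree header block/blk-cgroup.h.

#include <linux/err.h>
#include "blk-cgroup.h"		/* assumed: declares blkg_conf_*() and struct blkg_conf_ctx */

/* Hypothetical configuration writer; only the blkg_conf_*_frozen() usage
 * pattern is taken from the hunks above. */
static int example_qos_update(char *input)
{
	struct blkg_conf_ctx ctx;
	unsigned long memflags;

	blkg_conf_init(&ctx, input);

	/* freezes the queue, then takes q->elevator_lock and q->rq_qos_mutex */
	memflags = blkg_conf_open_bdev_frozen(&ctx);
	if (IS_ERR_VALUE(memflags))
		return memflags;

	/* ... reconfigure rq_qos state for ctx.bdev here (hypothetical) ... */

	/* drops both locks, unfreezes the queue, restores the memalloc scope */
	blkg_conf_exit_frozen(&ctx, memflags);
	return 0;
}

The memflags token is also why every plain blk_mq_freeze_queue()/blk_mq_unfreeze_queue() call site in this diff gains a memflags variable: freeze now returns the memalloc-scope flags and unfreeze takes them back.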
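The blkcg_unpin_online() hunk is a use-after-free style fix: blkcg_destroy_blkgs() can end up dropping the last reference to the blkcg, so the parent pointer has to be read before that call rather than after. The standalone miniature below (made-up node type, plain free() standing in for the reference drop) shows the same "fetch parent before the destructive call" shape.

/* Miniature of the pattern used in blkcg_unpin_online(); node/refs/free()
 * are stand-ins, not kernel objects. */
#include <stdlib.h>

struct node {
	struct node *parent;
	int refs;
};

static void node_destroy(struct node *n)
{
	free(n);	/* stands in for blkcg_destroy_blkgs() releasing the object */
}

static void unpin_chain(struct node *n)
{
	while (n) {
		struct node *parent;

		if (--n->refs > 0)
			break;

		parent = n->parent;	/* read before n can be freed */
		node_destroy(n);
		n = parent;		/* never dereference n after node_destroy() */
	}
}

int main(void)
{
	struct node *root = calloc(1, sizeof(*root));
	struct node *child = calloc(1, sizeof(*child));

	root->refs = 1;
	child->refs = 1;
	child->parent = root;

	unpin_chain(child);	/* frees child, then root */
	return 0;
}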
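blkcg_policy_register() now rejects half-defined callback pairs up front: !fn normalizes each pointer to 0 or 1, so the XOR is true exactly when one of cpd_alloc_fn/cpd_free_fn (or pd_alloc_fn/pd_free_fn) is set without the other. The standalone program below (hypothetical fake_policy struct) reproduces just that predicate and its truth table.

/* Standalone illustration of the alloc/free pairing check; only the
 * predicate mirrors blkcg_policy_register(), everything else is made up. */
#include <stdio.h>

struct fake_policy {
	void *(*cpd_alloc_fn)(void);
	void (*cpd_free_fn)(void *);
	void *(*pd_alloc_fn)(void);
	void (*pd_free_fn)(void *);
};

static int check_pairs(const struct fake_policy *pol)
{
	/* !fn is 1 when the callback is missing; XOR flags exactly the case
	 * where one half of a pair is missing, which is invalid. */
	if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
	    (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
		return -1;	/* -EINVAL in the kernel code */
	return 0;
}

static char token;
static void *dummy_alloc(void) { return &token; }
static void dummy_free(void *p) { (void)p; }

int main(void)
{
	struct fake_policy both = { dummy_alloc, dummy_free, dummy_alloc, dummy_free };
	struct fake_policy half = { dummy_alloc, NULL, dummy_alloc, dummy_free };
	struct fake_policy none = { NULL, NULL, NULL, NULL };

	printf("both defined: %d\n", check_pairs(&both));	/* 0 */
	printf("half defined: %d\n", check_pairs(&half));	/* -1 */
	printf("none defined: %d\n", check_pairs(&none));	/* 0 */
	return 0;
}

A policy with neither pd_alloc_fn nor pd_free_fn still registers (it works at the blkcg level only), which is why blkcg_activate_policy() separately warns and returns -EINVAL if such a policy is ever passed to it.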