Diffstat (limited to 'block/blk-cgroup.c')
-rw-r--r--   block/blk-cgroup.c | 1334
1 file changed, 822 insertions, 512 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 575d7a2e7203..3cffb68ba5d8 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -23,15 +23,18 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/slab.h> -#include <linux/genhd.h> #include <linux/delay.h> #include <linux/atomic.h> #include <linux/ctype.h> -#include <linux/blk-cgroup.h> -#include <linux/tracehook.h> +#include <linux/resume_user_mode.h> #include <linux/psi.h> +#include <linux/part_stat.h> #include "blk.h" +#include "blk-cgroup.h" #include "blk-ioprio.h" +#include "blk-throttle.h" + +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu); /* * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. @@ -54,48 +57,129 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ bool blkcg_debug_stats = false; -static struct workqueue_struct *blkcg_punt_bio_wq; + +static DEFINE_RAW_SPINLOCK(blkg_stat_lock); #define BLKG_DESTROY_BATCH_SIZE 64 -static bool blkcg_policy_enabled(struct request_queue *q, - const struct blkcg_policy *pol) +/* + * Lockless lists for tracking IO stats update + * + * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg). + * There are multiple blkg's (one for each block device) attached to each + * blkcg. The rstat code keeps track of which cpu has IO stats updated, + * but it doesn't know which blkg has the updated stats. If there are many + * block devices in a system, the cost of iterating all the blkg's to flush + * out the IO stats can be high. To reduce such overhead, a set of percpu + * lockless lists (lhead) per blkcg are used to track the set of recently + * updated iostat_cpu's since the last flush. An iostat_cpu will be put + * onto the lockless list on the update side [blk_cgroup_bio_start()] if + * not there yet and then removed when being flushed [blkcg_rstat_flush()]. + * References to blkg are gotten and then put back in the process to + * protect against blkg removal. + * + * Return: 0 if successful or -ENOMEM if allocation fails. + */ +static int init_blkcg_llists(struct blkcg *blkcg) { - return pol && test_bit(pol->plid, q->blkcg_pols); + int cpu; + + blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL); + if (!blkcg->lhead) + return -ENOMEM; + + for_each_possible_cpu(cpu) + init_llist_head(per_cpu_ptr(blkcg->lhead, cpu)); + return 0; } /** - * blkg_free - free a blkg - * @blkg: blkg to free + * blkcg_css - find the current css * - * Free @blkg which may be partially allocated. + * Find the css associated with either the kthread or the current task. + * This may return a dying css, so it is up to the caller to use tryget logic + * to confirm it is alive and well. */ -static void blkg_free(struct blkcg_gq *blkg) +static struct cgroup_subsys_state *blkcg_css(void) { - int i; + struct cgroup_subsys_state *css; - if (!blkg) - return; + css = kthread_blkcg(); + if (css) + return css; + return task_css(current, io_cgrp_id); +} + +static void blkg_free_workfn(struct work_struct *work) +{ + struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, + free_work); + struct request_queue *q = blkg->q; + int i; + /* + * pd_free_fn() can also be called from blkcg_deactivate_policy(), + * in order to make sure pd_free_fn() is called in order, the deletion + * of the list blkg->q_node is delayed to here from blkg_destroy(), and + * blkcg_mutex is used to synchronize blkg_free_workfn() and + * blkcg_deactivate_policy(). 
+ */ + mutex_lock(&q->blkcg_mutex); for (i = 0; i < BLKCG_MAX_POLS; i++) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); + if (blkg->parent) + blkg_put(blkg->parent); + spin_lock_irq(&q->queue_lock); + list_del_init(&blkg->q_node); + spin_unlock_irq(&q->queue_lock); + mutex_unlock(&q->blkcg_mutex); + blk_put_queue(q); free_percpu(blkg->iostat_cpu); percpu_ref_exit(&blkg->refcnt); kfree(blkg); } +/** + * blkg_free - free a blkg + * @blkg: blkg to free + * + * Free @blkg which may be partially allocated. + */ +static void blkg_free(struct blkcg_gq *blkg) +{ + if (!blkg) + return; + + /* + * Both ->pd_free_fn() and request queue's release handler may + * sleep, so free us by scheduling one work func + */ + INIT_WORK(&blkg->free_work, blkg_free_workfn); + schedule_work(&blkg->free_work); +} + static void __blkg_release(struct rcu_head *rcu) { struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); + struct blkcg *blkcg = blkg->blkcg; + int cpu; +#ifdef CONFIG_BLK_CGROUP_PUNT_BIO WARN_ON(!bio_list_empty(&blkg->async_bios)); +#endif + /* + * Flush all the non-empty percpu lockless lists before releasing + * us, given these stat belongs to us. + * + * blkg_stat_lock is for serializing blkg stat update + */ + for_each_possible_cpu(cpu) + __blkcg_rstat_flush(blkcg, cpu); /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); - if (blkg->parent) - blkg_put(blkg->parent); blkg_free(blkg); } @@ -114,6 +198,9 @@ static void blkg_release(struct percpu_ref *ref) call_rcu(&blkg->rcu_head, __blkg_release); } +#ifdef CONFIG_BLK_CGROUP_PUNT_BIO +static struct workqueue_struct *blkcg_punt_bio_wq; + static void blkg_async_bio_workfn(struct work_struct *work) { struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, @@ -124,10 +211,9 @@ static void blkg_async_bio_workfn(struct work_struct *work) bool need_plug = false; /* as long as there are pending bios, @blkg can't go away */ - spin_lock_bh(&blkg->async_bio_lock); - bio_list_merge(&bios, &blkg->async_bios); - bio_list_init(&blkg->async_bios); - spin_unlock_bh(&blkg->async_bio_lock); + spin_lock(&blkg->async_bio_lock); + bio_list_merge_init(&bios, &blkg->async_bios); + spin_unlock(&blkg->async_bio_lock); /* start plug only when bio_list contains at least 2 bios */ if (bios.head && bios.head->bi_next) { @@ -140,107 +226,156 @@ static void blkg_async_bio_workfn(struct work_struct *work) blk_finish_plug(&plug); } +/* + * When a shared kthread issues a bio for a cgroup, doing so synchronously can + * lead to priority inversions as the kthread can be trapped waiting for that + * cgroup. Use this helper instead of submit_bio to punt the actual issuing to + * a dedicated per-blkcg work item to avoid such priority inversions. 
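
The punt path described in the comment above is easiest to see outside the kernel. Below is a minimal userspace analogue, with all names hypothetical: the submitter only appends to a list and wakes a dedicated worker, mirroring blkcg_punt_bio_submit() queueing onto blkg->async_bios and blkg_async_bio_workfn() draining the whole list in one go.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item { struct item *next; int id; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;  /* ~async_bio_lock   */
static pthread_cond_t  kick = PTHREAD_COND_INITIALIZER;        /* ~queue_work()     */
static struct item *pending;                                   /* ~blkg->async_bios */
static int shutting_down;

static void punt_submit(int id)          /* ~blkcg_punt_bio_submit() */
{
        struct item *it = malloc(sizeof(*it));

        it->id = id;
        pthread_mutex_lock(&list_lock);
        it->next = pending;              /* add under the list lock ...      */
        pending = it;
        pthread_cond_signal(&kick);      /* ... then wake the worker         */
        pthread_mutex_unlock(&list_lock);
}

static void *worker(void *unused)        /* ~blkg_async_bio_workfn() */
{
        for (;;) {
                struct item *batch;

                pthread_mutex_lock(&list_lock);
                while (!pending && !shutting_down)
                        pthread_cond_wait(&kick, &list_lock);
                batch = pending;         /* grab the whole list at once,     */
                pending = NULL;          /* like bio_list_merge_init()       */
                pthread_mutex_unlock(&list_lock);

                while (batch) {          /* issue outside the lock           */
                        struct item *next = batch->next;

                        printf("submitting item %d\n", batch->id);
                        free(batch);
                        batch = next;
                }
                if (shutting_down)
                        return NULL;
        }
}

int main(void)
{
        pthread_t t;
        int i;

        pthread_create(&t, NULL, worker, NULL);
        for (i = 0; i < 4; i++)
                punt_submit(i);

        pthread_mutex_lock(&list_lock);
        shutting_down = 1;
        pthread_cond_signal(&kick);
        pthread_mutex_unlock(&list_lock);
        pthread_join(t, NULL);
        return 0;
}
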
+ */ +void blkcg_punt_bio_submit(struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + + if (blkg->parent) { + spin_lock(&blkg->async_bio_lock); + bio_list_add(&blkg->async_bios, bio); + spin_unlock(&blkg->async_bio_lock); + queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); + } else { + /* never bounce for the root cgroup */ + submit_bio(bio); + } +} +EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit); + +static int __init blkcg_punt_bio_init(void) +{ + blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", + WQ_MEM_RECLAIM | WQ_FREEZABLE | + WQ_UNBOUND | WQ_SYSFS, 0); + if (!blkcg_punt_bio_wq) + return -ENOMEM; + return 0; +} +subsys_initcall(blkcg_punt_bio_init); +#endif /* CONFIG_BLK_CGROUP_PUNT_BIO */ + +/** + * bio_blkcg_css - return the blkcg CSS associated with a bio + * @bio: target bio + * + * This returns the CSS for the blkcg associated with a bio, or %NULL if not + * associated. Callers are expected to either handle %NULL or know association + * has been done prior to calling this. + */ +struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) +{ + if (!bio || !bio->bi_blkg) + return NULL; + return &bio->bi_blkg->blkcg->css; +} +EXPORT_SYMBOL_GPL(bio_blkcg_css); + +/** + * blkcg_parent - get the parent of a blkcg + * @blkcg: blkcg of interest + * + * Return the parent blkcg of @blkcg. Can be called anytime. + */ +static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) +{ + return css_to_blkcg(blkcg->css.parent); +} + /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with - * @q: request_queue the new blkg is associated with + * @disk: gendisk the new blkg is associated with * @gfp_mask: allocation mask to use * - * Allocate a new blkg assocating @blkcg and @q. + * Allocate a new blkg associating @blkcg and @disk. 
*/ -static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, +static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, gfp_t gfp_mask) { struct blkcg_gq *blkg; int i, cpu; /* alloc and init base part */ - blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); + blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node); if (!blkg) return NULL; - if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask)) - goto err_free; - + goto out_free_blkg; blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask); if (!blkg->iostat_cpu) - goto err_free; + goto out_exit_refcnt; + if (!blk_get_queue(disk->queue)) + goto out_free_iostat; - blkg->q = q; + blkg->q = disk->queue; INIT_LIST_HEAD(&blkg->q_node); + blkg->blkcg = blkcg; + blkg->iostat.blkg = blkg; +#ifdef CONFIG_BLK_CGROUP_PUNT_BIO spin_lock_init(&blkg->async_bio_lock); bio_list_init(&blkg->async_bios); INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); - blkg->blkcg = blkcg; +#endif u64_stats_init(&blkg->iostat.sync); - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync); + per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg; + } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; - if (!blkcg_policy_enabled(q, pol)) + if (!blkcg_policy_enabled(disk->queue, pol)) continue; /* alloc per-policy data and attach it to blkg */ - pd = pol->pd_alloc_fn(gfp_mask, q, blkcg); + pd = pol->pd_alloc_fn(disk, blkcg, gfp_mask); if (!pd) - goto err_free; - + goto out_free_pds; blkg->pd[i] = pd; pd->blkg = blkg; pd->plid = i; + pd->online = false; } return blkg; -err_free: - blkg_free(blkg); - return NULL; -} - -struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, - struct request_queue *q, bool update_hint) -{ - struct blkcg_gq *blkg; - - /* - * Hint didn't match. Look up from the radix tree. Note that the - * hint can only be updated under queue_lock as otherwise @blkg - * could have already been removed from blkg_tree. The caller is - * responsible for grabbing queue_lock if @update_hint. - */ - blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); - if (blkg && blkg->q == q) { - if (update_hint) { - lockdep_assert_held(&q->queue_lock); - rcu_assign_pointer(blkcg->blkg_hint, blkg); - } - return blkg; - } - +out_free_pds: + while (--i >= 0) + if (blkg->pd[i]) + blkcg_policy[i]->pd_free_fn(blkg->pd[i]); + blk_put_queue(disk->queue); +out_free_iostat: + free_percpu(blkg->iostat_cpu); +out_exit_refcnt: + percpu_ref_exit(&blkg->refcnt); +out_free_blkg: + kfree(blkg); return NULL; } -EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); /* * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. 
*/ -static struct blkcg_gq *blkg_create(struct blkcg *blkcg, - struct request_queue *q, +static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; int i, ret; - WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(&q->queue_lock); + lockdep_assert_held(&disk->queue->queue_lock); /* request_queue is dying, do not create/recreate a blkg */ - if (blk_queue_dying(q)) { + if (blk_queue_dying(disk->queue)) { ret = -ENODEV; goto err_free_blkg; } @@ -253,7 +388,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); + new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_css; @@ -263,7 +398,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* link parent */ if (blkcg_parent(blkcg)) { - blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); + blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; goto err_put_css; @@ -281,16 +416,19 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* insert */ spin_lock(&blkcg->lock); - ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); + ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg); if (likely(!ret)) { hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); - list_add(&blkg->q_node, &q->blkg_list); + list_add(&blkg->q_node, &disk->queue->blkg_list); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - if (blkg->pd[i] && pol->pd_online_fn) - pol->pd_online_fn(blkg->pd[i]); + if (blkg->pd[i]) { + if (pol->pd_online_fn) + pol->pd_online_fn(blkg->pd[i]); + blkg->pd[i]->online = true; + } } } blkg->online = true; @@ -306,26 +444,28 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, err_put_css: css_put(&blkcg->css); err_free_blkg: - blkg_free(new_blkg); + if (new_blkg) + blkg_free(new_blkg); return ERR_PTR(ret); } /** * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest - * @q: request_queue of interest + * @disk: gendisk of interest * - * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to + * Lookup blkg for the @blkcg - @disk pair. If it doesn't exist, try to * create one. blkg creation is performed recursively from blkcg_root such * that all non-root blkg's have access to the parent blkg. This function - * should be called under RCU read lock and takes @q->queue_lock. + * should be called under RCU read lock and takes @disk->queue->queue_lock. * * Returns the blkg or the closest blkg if blkg_create() fails as it walks * down from root. 
*/ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) + struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blkcg_gq *blkg; unsigned long flags; @@ -336,9 +476,13 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, return blkg; spin_lock_irqsave(&q->queue_lock, flags); - blkg = __blkg_lookup(blkcg, q, true); - if (blkg) + blkg = blkg_lookup(blkcg, q); + if (blkg) { + if (blkcg != &blkcg_root && + blkg != rcu_dereference(blkcg->blkg_hint)) + rcu_assign_pointer(blkcg->blkg_hint, blkg); goto found; + } /* * Create blkgs walking down from blkcg_root to @blkcg, so that all @@ -351,7 +495,7 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct blkcg_gq *ret_blkg = q->root_blkg; while (parent) { - blkg = __blkg_lookup(parent, q, false); + blkg = blkg_lookup(parent, q); if (blkg) { /* remember closest blkg */ ret_blkg = blkg; @@ -361,7 +505,7 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, parent = blkcg_parent(parent); } - blkg = blkg_create(pos, q, NULL); + blkg = blkg_create(pos, disk, NULL); if (IS_ERR(blkg)) { blkg = ret_blkg; break; @@ -383,21 +527,28 @@ static void blkg_destroy(struct blkcg_gq *blkg) lockdep_assert_held(&blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock); - /* Something wrong if we are trying to remove same group twice */ - WARN_ON_ONCE(list_empty(&blkg->q_node)); - WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); + /* + * blkg stays on the queue list until blkg_free_workfn(), see details in + * blkg_free_workfn(), hence this function can be called from + * blkcg_destroy_blkgs() first and again from blkg_destroy_all() before + * blkg_free_workfn(). + */ + if (hlist_unhashed(&blkg->blkcg_node)) + return; for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - if (blkg->pd[i] && pol->pd_offline_fn) - pol->pd_offline_fn(blkg->pd[i]); + if (blkg->pd[i] && blkg->pd[i]->online) { + blkg->pd[i]->online = false; + if (pol->pd_offline_fn) + pol->pd_offline_fn(blkg->pd[i]); + } } blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); - list_del_init(&blkg->q_node); hlist_del_init_rcu(&blkg->blkcg_node); /* @@ -415,22 +566,21 @@ static void blkg_destroy(struct blkcg_gq *blkg) percpu_ref_kill(&blkg->refcnt); } -/** - * blkg_destroy_all - destroy all blkgs associated with a request_queue - * @q: request_queue of interest - * - * Destroy all blkgs associated with @q. 
- */ -static void blkg_destroy_all(struct request_queue *q) +static void blkg_destroy_all(struct gendisk *disk) { - struct blkcg_gq *blkg, *n; + struct request_queue *q = disk->queue; + struct blkcg_gq *blkg; int count = BLKG_DESTROY_BATCH_SIZE; + int i; restart: spin_lock_irq(&q->queue_lock); - list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { + list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; + if (hlist_unhashed(&blkg->blkcg_node)) + continue; + spin_lock(&blkcg->lock); blkg_destroy(blkg); spin_unlock(&blkcg->lock); @@ -447,17 +597,63 @@ restart: } } + /* + * Mark policy deactivated since policy offline has been done, and + * the free is scheduled, so future blkcg_deactivate_policy() can + * be bypassed + */ + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (pol) + __clear_bit(pol->plid, q->blkcg_pols); + } + q->root_blkg = NULL; spin_unlock_irq(&q->queue_lock); } +static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] = src->bytes[i]; + dst->ios[i] = src->ios[i]; + } +} + +static void __blkg_clear_stat(struct blkg_iostat_set *bis) +{ + struct blkg_iostat cur = {0}; + unsigned long flags; + + flags = u64_stats_update_begin_irqsave(&bis->sync); + blkg_iostat_set(&bis->cur, &cur); + blkg_iostat_set(&bis->last, &cur); + u64_stats_update_end_irqrestore(&bis->sync, flags); +} + +static void blkg_clear_stat(struct blkcg_gq *blkg) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct blkg_iostat_set *s = per_cpu_ptr(blkg->iostat_cpu, cpu); + + __blkg_clear_stat(s); + } + __blkg_clear_stat(&blkg->iostat); +} + static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct cftype *cftype, u64 val) { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; - int i, cpu; + int i; + pr_info_once("blkio.%s is deprecated\n", cftype->name); mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); @@ -467,13 +663,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { - for_each_possible_cpu(cpu) { - struct blkg_iostat_set *bis = - per_cpu_ptr(blkg->iostat_cpu, cpu); - memset(bis, 0, sizeof(*bis)); - } - memset(&blkg->iostat, 0, sizeof(blkg->iostat)); - + blkg_clear_stat(blkg); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -489,10 +679,9 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, const char *blkg_dev_name(struct blkcg_gq *blkg) { - /* some drivers (floppy) instantiate a queue w/o disk registered */ - if (blkg->q->backing_dev_info->dev) - return bdi_dev_name(blkg->q->backing_dev_info); - return NULL; + if (!blkg->q->disk) + return NULL; + return bdi_dev_name(blkg->q->disk->bdi); } /** @@ -542,7 +731,7 @@ EXPORT_SYMBOL_GPL(blkcg_print_blkgs); * @pd: policy private data of interest * @v: value to print * - * Print @v to @sf for the device assocaited with @pd. + * Print @v to @sf for the device associated with @pd. */ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) { @@ -556,93 +745,145 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) } EXPORT_SYMBOL_GPL(__blkg_prfill_u64); -/* Performs queue bypass and policy enabled checks then looks up blkg. 
*/ -static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, - const struct blkcg_policy *pol, - struct request_queue *q) +/** + * blkg_conf_init - initialize a blkg_conf_ctx + * @ctx: blkg_conf_ctx to initialize + * @input: input string + * + * Initialize @ctx which can be used to parse blkg config input string @input. + * Once initialized, @ctx can be used with blkg_conf_open_bdev() and + * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit(). + */ +void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input) { - WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(&q->queue_lock); - - if (!blkcg_policy_enabled(q, pol)) - return ERR_PTR(-EOPNOTSUPP); - return __blkg_lookup(blkcg, q, true /* update_hint */); + *ctx = (struct blkg_conf_ctx){ .input = input }; } +EXPORT_SYMBOL_GPL(blkg_conf_init); /** - * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update - * @inputp: input string pointer + * blkg_conf_open_bdev - parse and open bdev for per-blkg config update + * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * - * Parse the device node prefix part, MAJ:MIN, of per-blkg config update - * from @input and get and return the matching bdev. *@inputp is - * updated to point past the device node prefix. Returns an ERR_PTR() - * value on error. + * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from + * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is + * set to point past the device node prefix. * - * Use this function iff blkg_conf_prep() can't be used for some reason. + * This function may be called multiple times on @ctx and the extra calls become + * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function + * explicitly if bdev access is needed without resolving the blkcg / policy part + * of @ctx->input. Returns -errno on error. */ -struct block_device *blkcg_conf_open_bdev(char **inputp) +int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) { - char *input = *inputp; + char *input = ctx->input; unsigned int major, minor; struct block_device *bdev; int key_len; + if (ctx->bdev) + return 0; + if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) - return ERR_PTR(-EINVAL); + return -EINVAL; input += key_len; if (!isspace(*input)) - return ERR_PTR(-EINVAL); + return -EINVAL; input = skip_spaces(input); - bdev = blkdev_get_no_open(MKDEV(major, minor)); + bdev = blkdev_get_no_open(MKDEV(major, minor), false); if (!bdev) - return ERR_PTR(-ENODEV); + return -ENODEV; if (bdev_is_partition(bdev)) { blkdev_put_no_open(bdev); - return ERR_PTR(-ENODEV); + return -ENODEV; + } + + mutex_lock(&bdev->bd_queue->rq_qos_mutex); + if (!disk_live(bdev->bd_disk)) { + blkdev_put_no_open(bdev); + mutex_unlock(&bdev->bd_queue->rq_qos_mutex); + return -ENODEV; } - *inputp = input; - return bdev; + ctx->body = input; + ctx->bdev = bdev; + return 0; +} +/* + * Similar to blkg_conf_open_bdev, but additionally freezes the queue, + * ensures the correct locking order between freeze queue and q->rq_qos_mutex. + * + * This function returns negative error on failure. On success it returns + * memflags which must be saved and later passed to blkg_conf_exit_frozen + * for restoring the memalloc scope. 
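
For context, the reworked blkg_conf_* lifecycle is meant to be driven from a policy's cftype write handler roughly as follows. This is only a hedged sketch: example_policy and the body parsing are hypothetical, while blkg_conf_init(), blkg_conf_prep() and blkg_conf_exit() are the helpers defined in this file (prep returns with the queue lock held, exit always unwinds whatever init/prep set up).

static ssize_t example_conf_write(struct kernfs_open_file *of,
                                  char *buf, size_t nbytes, loff_t off)
{
        struct blkcg *blkcg = css_to_blkcg(of_css(of));
        struct blkg_conf_ctx ctx;
        u64 val;
        int ret;

        blkg_conf_init(&ctx, buf);

        /* resolves "MAJ:MIN <body>", opens the bdev and looks up/creates
         * the blkg; returns with the queue lock held on success */
        ret = blkg_conf_prep(blkcg, &example_policy, &ctx);
        if (ret)
                goto out;

        ret = kstrtoull(ctx.body, 0, &val);     /* policy-specific body */
        if (ret)
                goto out;

        /* ctx.blkg (and its per-policy pd) may be updated here while the
         * queue lock is still held */

out:
        blkg_conf_exit(&ctx);                   /* always paired with _init */
        return ret ?: nbytes;
}
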
+ */ +unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx) +{ + int ret; + unsigned long memflags; + + if (ctx->bdev) + return -EINVAL; + + ret = blkg_conf_open_bdev(ctx); + if (ret < 0) + return ret; + /* + * At this point, we haven’t started protecting anything related to QoS, + * so we release q->rq_qos_mutex here, which was first acquired in blkg_ + * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing + * the queue to maintain the correct locking order. + */ + mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex); + + memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue); + mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex); + + return memflags; } /** * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup * @pol: target policy - * @input: input string - * @ctx: blkg_conf_ctx to be filled + * @ctx: blkg_conf_ctx initialized with blkg_conf_init() + * + * Parse per-blkg config update from @ctx->input and initialize @ctx + * accordingly. On success, @ctx->body points to the part of @ctx->input + * following MAJ:MIN, @ctx->bdev points to the target block device and + * @ctx->blkg to the blkg being configured. * - * Parse per-blkg config update from @input and initialize @ctx with the - * result. @ctx->blkg points to the blkg to be updated and @ctx->body the - * part of @input following MAJ:MIN. This function returns with RCU read - * lock and queue lock held and must be paired with blkg_conf_finish(). + * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this + * function returns with queue lock held and must be followed by + * blkg_conf_exit(). */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - char *input, struct blkg_conf_ctx *ctx) - __acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock) + struct blkg_conf_ctx *ctx) + __acquires(&bdev->bd_queue->queue_lock) { - struct block_device *bdev; + struct gendisk *disk; struct request_queue *q; struct blkcg_gq *blkg; int ret; - bdev = blkcg_conf_open_bdev(&input); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); + ret = blkg_conf_open_bdev(ctx); + if (ret) + return ret; - q = bdev->bd_disk->queue; + disk = ctx->bdev->bd_disk; + q = disk->queue; - rcu_read_lock(); + /* Prevent concurrent with blkcg_deactivate_policy() */ + mutex_lock(&q->blkcg_mutex); spin_lock_irq(&q->queue_lock); - blkg = blkg_lookup_check(blkcg, pol, q); - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); + if (!blkcg_policy_enabled(q, pol)) { + ret = -EOPNOTSUPP; goto fail_unlock; } + blkg = blkg_lookup(blkcg, q); if (blkg) goto success; @@ -656,41 +897,39 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkcg_gq *new_blkg; parent = blkcg_parent(blkcg); - while (parent && !__blkg_lookup(parent, q, false)) { + while (parent && !blkg_lookup(parent, q)) { pos = parent; parent = blkcg_parent(parent); } /* Drop locks to do new blkg allocation with GFP_KERNEL. 
*/ spin_unlock_irq(&q->queue_lock); - rcu_read_unlock(); - new_blkg = blkg_alloc(pos, q, GFP_KERNEL); + new_blkg = blkg_alloc(pos, disk, GFP_NOIO); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto fail; + goto fail_exit; } if (radix_tree_preload(GFP_KERNEL)) { blkg_free(new_blkg); ret = -ENOMEM; - goto fail; + goto fail_exit; } - rcu_read_lock(); spin_lock_irq(&q->queue_lock); - blkg = blkg_lookup_check(pos, pol, q); - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); + if (!blkcg_policy_enabled(q, pol)) { blkg_free(new_blkg); + ret = -EOPNOTSUPP; goto fail_preloaded; } + blkg = blkg_lookup(pos, q); if (blkg) { blkg_free(new_blkg); } else { - blkg = blkg_create(pos, q, new_blkg); + blkg = blkg_create(pos, disk, new_blkg); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); goto fail_preloaded; @@ -703,18 +942,16 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, goto success; } success: - ctx->bdev = bdev; + mutex_unlock(&q->blkcg_mutex); ctx->blkg = blkg; - ctx->body = input; return 0; fail_preloaded: radix_tree_preload_end(); fail_unlock: spin_unlock_irq(&q->queue_lock); - rcu_read_unlock(); -fail: - blkdev_put_no_open(bdev); +fail_exit: + mutex_unlock(&q->blkcg_mutex); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue @@ -730,28 +967,41 @@ fail: EXPORT_SYMBOL_GPL(blkg_conf_prep); /** - * blkg_conf_finish - finish up per-blkg config update - * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep() + * blkg_conf_exit - clean up per-blkg config update + * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * - * Finish up after per-blkg config update. This function must be paired - * with blkg_conf_prep(). + * Clean up after per-blkg config update. This function must be called on all + * blkg_conf_ctx's initialized with blkg_conf_init(). */ -void blkg_conf_finish(struct blkg_conf_ctx *ctx) - __releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu) +void blkg_conf_exit(struct blkg_conf_ctx *ctx) + __releases(&ctx->bdev->bd_queue->queue_lock) + __releases(&ctx->bdev->bd_queue->rq_qos_mutex) { - spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock); - rcu_read_unlock(); - blkdev_put_no_open(ctx->bdev); + if (ctx->blkg) { + spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); + ctx->blkg = NULL; + } + + if (ctx->bdev) { + mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex); + blkdev_put_no_open(ctx->bdev); + ctx->body = NULL; + ctx->bdev = NULL; + } } -EXPORT_SYMBOL_GPL(blkg_conf_finish); +EXPORT_SYMBOL_GPL(blkg_conf_exit); -static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) +/* + * Similar to blkg_conf_exit, but also unfreezes the queue. Should be used + * when blkg_conf_open_bdev_frozen is used to open the bdev. 
+ */ +void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags) { - int i; + if (ctx->bdev) { + struct request_queue *q = ctx->bdev->bd_queue; - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] = src->bytes[i]; - dst->ios[i] = src->ios[i]; + blkg_conf_exit(ctx); + blk_mq_unfreeze_queue(q, memflags); } } @@ -775,55 +1025,108 @@ static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) } } -static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) +static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur, + struct blkg_iostat *last) { - struct blkcg *blkcg = css_to_blkcg(css); - struct blkcg_gq *blkg; + struct blkg_iostat delta; + unsigned long flags; - /* Root-level stats are sourced from system-wide IO stats */ - if (!cgroup_parent(css->cgroup)) - return; + /* propagate percpu delta to global */ + flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); + blkg_iostat_set(&delta, cur); + blkg_iostat_sub(&delta, last); + blkg_iostat_add(&blkg->iostat.cur, &delta); + blkg_iostat_add(last, &delta); + u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); +} + +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) +{ + struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu); + struct llist_node *lnode; + struct blkg_iostat_set *bisc, *next_bisc; + unsigned long flags; rcu_read_lock(); - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + lnode = llist_del_all(lhead); + if (!lnode) + goto out; + + /* + * For covering concurrent parent blkg update from blkg_release(). + * + * When flushing from cgroup, the subsystem rstat lock is always held, + * so this lock won't cause contention most of time. + */ + raw_spin_lock_irqsave(&blkg_stat_lock, flags); + + /* + * Iterate only the iostat_cpu's queued in the lockless list. + */ + llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) { + struct blkcg_gq *blkg = bisc->blkg; struct blkcg_gq *parent = blkg->parent; - struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); - struct blkg_iostat cur, delta; + struct blkg_iostat cur; unsigned int seq; + /* + * Order assignment of `next_bisc` from `bisc->lnode.next` in + * llist_for_each_entry_safe and clearing `bisc->lqueued` for + * avoiding to assign `next_bisc` with new next pointer added + * in blk_cgroup_bio_start() in case of re-ordering. + * + * The pair barrier is implied in llist_add() in blk_cgroup_bio_start(). 
+ */ + smp_mb(); + + WRITE_ONCE(bisc->lqueued, false); + if (bisc == &blkg->iostat) + goto propagate_up; /* propagate up to parent only */ + /* fetch the current per-cpu values */ do { seq = u64_stats_fetch_begin(&bisc->sync); blkg_iostat_set(&cur, &bisc->cur); } while (u64_stats_fetch_retry(&bisc->sync, seq)); - /* propagate percpu delta to global */ - u64_stats_update_begin(&blkg->iostat.sync); - blkg_iostat_set(&delta, &cur); - blkg_iostat_sub(&delta, &bisc->last); - blkg_iostat_add(&blkg->iostat.cur, &delta); - blkg_iostat_add(&bisc->last, &delta); - u64_stats_update_end(&blkg->iostat.sync); + blkcg_iostat_update(blkg, &cur, &bisc->last); +propagate_up: /* propagate global delta to parent (unless that's root) */ if (parent && parent->parent) { - u64_stats_update_begin(&parent->iostat.sync); - blkg_iostat_set(&delta, &blkg->iostat.cur); - blkg_iostat_sub(&delta, &blkg->iostat.last); - blkg_iostat_add(&parent->iostat.cur, &delta); - blkg_iostat_add(&blkg->iostat.last, &delta); - u64_stats_update_end(&parent->iostat.sync); + blkcg_iostat_update(parent, &blkg->iostat.cur, + &blkg->iostat.last); + /* + * Queue parent->iostat to its blkcg's lockless + * list to propagate up to the grandparent if the + * iostat hasn't been queued yet. + */ + if (!parent->iostat.lqueued) { + struct llist_head *plhead; + + plhead = per_cpu_ptr(parent->blkcg->lhead, cpu); + llist_add(&parent->iostat.lnode, plhead); + parent->iostat.lqueued = true; + } } } - + raw_spin_unlock_irqrestore(&blkg_stat_lock, flags); +out: rcu_read_unlock(); } +static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) +{ + /* Root-level stats are sourced from system-wide IO stats */ + if (cgroup_parent(css->cgroup)) + __blkcg_rstat_flush(css_to_blkcg(css), cpu); +} + /* * We source root cgroup stats from the system-wide stats to avoid * tracking the same information twice and incurring overhead when no - * cgroups are defined. For that reason, cgroup_rstat_flush in + * cgroups are defined. For that reason, css_rstat_flush in * blkcg_print_stat does not actually fill out the iostat in the root * cgroup's blkcg_gq. 
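
The update/flush protocol around the percpu lockless lists (see the comment above init_blkcg_llists()) boils down to: push a stat set onto a dirty list only the first time it changes, then have the flusher take the whole list with one exchange and fold the deltas. A self-contained userspace analogue with hypothetical names and a single "CPU":

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct stat_node {
        struct stat_node *next;
        atomic_bool lqueued;          /* already on the dirty list?           */
        long cur, last;               /* running counter / last flushed value */
        long *global;                 /* where flushed deltas are folded      */
};

static _Atomic(struct stat_node *) dirty_list;    /* ~blkcg->lhead */

static void stat_update(struct stat_node *n, long delta)   /* ~blk_cgroup_bio_start() */
{
        n->cur += delta;
        if (!atomic_load(&n->lqueued)) {
                struct stat_node *head = atomic_load(&dirty_list);

                do {                                  /* lockless push, ~llist_add() */
                        n->next = head;
                } while (!atomic_compare_exchange_weak(&dirty_list, &head, n));
                atomic_store(&n->lqueued, true);
        }
}

static void stat_flush(void)                       /* ~__blkcg_rstat_flush() */
{
        struct stat_node *n = atomic_exchange(&dirty_list, NULL);  /* ~llist_del_all() */

        while (n) {
                struct stat_node *next = n->next;  /* read ->next before clearing
                                                      lqueued, mirroring the smp_mb()
                                                      ordering in the code above */
                long delta = n->cur - n->last;

                atomic_store(&n->lqueued, false);
                *n->global += delta;               /* propagate the delta upward */
                n->last = n->cur;
                n = next;
        }
}

int main(void)
{
        long parent = 0;
        struct stat_node a = { .global = &parent };

        stat_update(&a, 3);
        stat_update(&a, 4);     /* already queued: counter bumped, no second push */
        stat_flush();
        printf("parent=%ld\n", parent);    /* prints 7 */
        return 0;
}
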
* @@ -840,10 +1143,10 @@ static void blkcg_fill_root_iostats(void) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct block_device *bdev = dev_to_bdev(dev); - struct blkcg_gq *blkg = - blk_queue_root_blkg(bdev->bd_disk->queue); + struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg; struct blkg_iostat tmp; int cpu; + unsigned long flags; memset(&tmp, 0, sizeof(tmp)); for_each_possible_cpu(cpu) { @@ -863,105 +1166,83 @@ static void blkcg_fill_root_iostats(void) cpu_dkstats->sectors[STAT_WRITE] << 9; tmp.bytes[BLKG_IOSTAT_DISCARD] += cpu_dkstats->sectors[STAT_DISCARD] << 9; - - u64_stats_update_begin(&blkg->iostat.sync); - blkg_iostat_set(&blkg->iostat.cur, &tmp); - u64_stats_update_end(&blkg->iostat.sync); } + + flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); + blkg_iostat_set(&blkg->iostat.cur, &tmp); + u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } + class_dev_iter_exit(&iter); } -static int blkcg_print_stat(struct seq_file *sf, void *v) +static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) { - struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); - struct blkcg_gq *blkg; - - if (!seq_css(sf)->parent) - blkcg_fill_root_iostats(); - else - cgroup_rstat_flush(blkcg->css.cgroup); + struct blkg_iostat_set *bis = &blkg->iostat; + u64 rbytes, wbytes, rios, wios, dbytes, dios; + const char *dname; + unsigned seq; + int i; - rcu_read_lock(); + if (!blkg->online) + return; - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - struct blkg_iostat_set *bis = &blkg->iostat; - const char *dname; - char *buf; - u64 rbytes, wbytes, rios, wios, dbytes, dios; - size_t size = seq_get_buf(sf, &buf), off = 0; - int i; - bool has_stats = false; - unsigned seq; + dname = blkg_dev_name(blkg); + if (!dname) + return; - spin_lock_irq(&blkg->q->queue_lock); + seq_printf(s, "%s ", dname); - if (!blkg->online) - goto skip; + do { + seq = u64_stats_fetch_begin(&bis->sync); + + rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; + wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; + dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; + rios = bis->cur.ios[BLKG_IOSTAT_READ]; + wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; + dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; + } while (u64_stats_fetch_retry(&bis->sync, seq)); + + if (rbytes || wbytes || rios || wios) { + seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", + rbytes, wbytes, rios, wios, + dbytes, dios); + } - dname = blkg_dev_name(blkg); - if (!dname) - goto skip; + if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { + seq_printf(s, " use_delay=%d delay_nsec=%llu", + atomic_read(&blkg->use_delay), + atomic64_read(&blkg->delay_nsec)); + } - /* - * Hooray string manipulation, count is the size written NOT - * INCLUDING THE \0, so size is now count+1 less than what we - * had before, but we want to start writing the next bit from - * the \0 so we only add count to buf. 
- */ - off += scnprintf(buf+off, size-off, "%s ", dname); + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; - do { - seq = u64_stats_fetch_begin(&bis->sync); - - rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; - wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; - dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; - rios = bis->cur.ios[BLKG_IOSTAT_READ]; - wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; - dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; - } while (u64_stats_fetch_retry(&bis->sync, seq)); - - if (rbytes || wbytes || rios || wios) { - has_stats = true; - off += scnprintf(buf+off, size-off, - "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", - rbytes, wbytes, rios, wios, - dbytes, dios); - } + if (!blkg->pd[i] || !pol->pd_stat_fn) + continue; - if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { - has_stats = true; - off += scnprintf(buf+off, size-off, - " use_delay=%d delay_nsec=%llu", - atomic_read(&blkg->use_delay), - (unsigned long long)atomic64_read(&blkg->delay_nsec)); - } + pol->pd_stat_fn(blkg->pd[i], s); + } - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - size_t written; + seq_puts(s, "\n"); +} - if (!blkg->pd[i] || !pol->pd_stat_fn) - continue; +static int blkcg_print_stat(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct blkcg_gq *blkg; - written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off); - if (written) - has_stats = true; - off += written; - } + if (!seq_css(sf)->parent) + blkcg_fill_root_iostats(); + else + css_rstat_flush(&blkcg->css); - if (has_stats) { - if (off < size - 1) { - off += scnprintf(buf+off, size-off, "\n"); - seq_commit(sf, off); - } else { - seq_commit(sf, -1); - } - } - skip: + rcu_read_lock(); + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + spin_lock_irq(&blkg->q->queue_lock); + blkcg_print_one_stat(blkg, sf); spin_unlock_irq(&blkg->q->queue_lock); } - rcu_read_unlock(); return 0; } @@ -982,6 +1263,13 @@ static struct cftype blkcg_legacy_files[] = { { } /* terminate */ }; +#ifdef CONFIG_CGROUP_WRITEBACK +struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css) +{ + return &css_to_blkcg(css)->cgwb_list; +} +#endif + /* * blkcg destruction is a three-stage process. * @@ -1004,25 +1292,6 @@ static struct cftype blkcg_legacy_files[] = { */ /** - * blkcg_css_offline - cgroup css_offline callback - * @css: css of interest - * - * This function is called when @css is about to go away. Here the cgwbs are - * offlined first and only once writeback associated with the blkcg has - * finished do we start step 2 (see above). - */ -static void blkcg_css_offline(struct cgroup_subsys_state *css) -{ - struct blkcg *blkcg = css_to_blkcg(css); - - /* this prevents anyone from attaching or migrating to this blkcg */ - wb_blkcg_offline(blkcg); - - /* put the base online pin allowing step 2 to be triggered */ - blkcg_unpin_online(blkcg); -} - -/** * blkcg_destroy_blkgs - responsible for shooting down blkgs * @blkcg: blkcg of interest * @@ -1033,7 +1302,7 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) * * This is the blkcg counterpart of ioc_release_fn(). 
*/ -void blkcg_destroy_blkgs(struct blkcg *blkcg) +static void blkcg_destroy_blkgs(struct blkcg *blkcg) { might_sleep(); @@ -1063,6 +1332,61 @@ void blkcg_destroy_blkgs(struct blkcg *blkcg) spin_unlock_irq(&blkcg->lock); } +/** + * blkcg_pin_online - pin online state + * @blkcg_css: blkcg of interest + * + * While pinned, a blkcg is kept online. This is primarily used to + * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline + * while an associated cgwb is still active. + */ +void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css) +{ + refcount_inc(&css_to_blkcg(blkcg_css)->online_pin); +} + +/** + * blkcg_unpin_online - unpin online state + * @blkcg_css: blkcg of interest + * + * This is primarily used to impedance-match blkg and cgwb lifetimes so + * that blkg doesn't go offline while an associated cgwb is still active. + * When this count goes to zero, all active cgwbs have finished so the + * blkcg can continue destruction by calling blkcg_destroy_blkgs(). + */ +void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css) +{ + struct blkcg *blkcg = css_to_blkcg(blkcg_css); + + do { + struct blkcg *parent; + + if (!refcount_dec_and_test(&blkcg->online_pin)) + break; + + parent = blkcg_parent(blkcg); + blkcg_destroy_blkgs(blkcg); + blkcg = parent; + } while (blkcg); +} + +/** + * blkcg_css_offline - cgroup css_offline callback + * @css: css of interest + * + * This function is called when @css is about to go away. Here the cgwbs are + * offlined first and only once writeback associated with the blkcg has + * finished do we start step 2 (see above). + */ +static void blkcg_css_offline(struct cgroup_subsys_state *css) +{ + /* this prevents anyone from attaching or migrating to this blkcg */ + wb_blkcg_offline(css); + + /* put the base online pin allowing step 2 to be triggered */ + blkcg_unpin_online(css); +} + static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); @@ -1078,6 +1402,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css) mutex_unlock(&blkcg_pol_mutex); + free_percpu(blkcg->lhead); kfree(blkcg); } @@ -1085,7 +1410,6 @@ static struct cgroup_subsys_state * blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { struct blkcg *blkcg; - struct cgroup_subsys_state *ret; int i; mutex_lock(&blkcg_pol_mutex); @@ -1094,12 +1418,13 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) blkcg = &blkcg_root; } else { blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); - if (!blkcg) { - ret = ERR_PTR(-ENOMEM); + if (!blkcg) goto unlock; - } } + if (init_blkcg_llists(blkcg)) + goto free_blkcg; + for (i = 0; i < BLKCG_MAX_POLS ; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkcg_policy_data *cpd; @@ -1114,20 +1439,17 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) continue; cpd = pol->cpd_alloc_fn(GFP_KERNEL); - if (!cpd) { - ret = ERR_PTR(-ENOMEM); + if (!cpd) goto free_pd_blkcg; - } + blkcg->cpd[i] = cpd; cpd->blkcg = blkcg; cpd->plid = i; - if (pol->cpd_init_fn) - pol->cpd_init_fn(cpd); } spin_lock_init(&blkcg->lock); refcount_set(&blkcg->online_pin, 1); - INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); @@ -1141,18 +1463,18 @@ free_pd_blkcg: for (i--; i >= 0; i--) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); - + free_percpu(blkcg->lhead); +free_blkcg: if (blkcg != &blkcg_root) kfree(blkcg); unlock: 
mutex_unlock(&blkcg_pol_mutex); - return ret; + return ERR_PTR(-ENOMEM); } static int blkcg_css_online(struct cgroup_subsys_state *css) { - struct blkcg *blkcg = css_to_blkcg(css); - struct blkcg *parent = blkcg_parent(blkcg); + struct blkcg *parent = blkcg_parent(css_to_blkcg(css)); /* * blkcg_pin_online() is used to delay blkcg offline so that blkgs @@ -1160,107 +1482,60 @@ static int blkcg_css_online(struct cgroup_subsys_state *css) * parent so that offline always happens towards the root. */ if (parent) - blkcg_pin_online(parent); + blkcg_pin_online(&parent->css); return 0; } -/** - * blkcg_init_queue - initialize blkcg part of request queue - * @q: request_queue to initialize - * - * Called from blk_alloc_queue(). Responsible for initializing blkcg - * part of new request_queue @q. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int blkcg_init_queue(struct request_queue *q) +void blkg_init_queue(struct request_queue *q) { + INIT_LIST_HEAD(&q->blkg_list); + mutex_init(&q->blkcg_mutex); +} + +int blkcg_init_disk(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; struct blkcg_gq *new_blkg, *blkg; bool preloaded; - int ret; - new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; preloaded = !radix_tree_preload(GFP_KERNEL); /* Make sure the root blkg exists. */ - rcu_read_lock(); + /* spin_lock_irq can serve as RCU read-side critical section. */ spin_lock_irq(&q->queue_lock); - blkg = blkg_create(&blkcg_root, q, new_blkg); + blkg = blkg_create(&blkcg_root, disk, new_blkg); if (IS_ERR(blkg)) goto err_unlock; q->root_blkg = blkg; spin_unlock_irq(&q->queue_lock); - rcu_read_unlock(); if (preloaded) radix_tree_preload_end(); - ret = blk_iolatency_init(q); - if (ret) - goto err_destroy_all; - - ret = blk_ioprio_init(q); - if (ret) - goto err_destroy_all; - - ret = blk_throtl_init(q); - if (ret) - goto err_destroy_all; - return 0; -err_destroy_all: - blkg_destroy_all(q); - return ret; err_unlock: spin_unlock_irq(&q->queue_lock); - rcu_read_unlock(); if (preloaded) radix_tree_preload_end(); return PTR_ERR(blkg); } -/** - * blkcg_exit_queue - exit and release blkcg part of request_queue - * @q: request_queue being released - * - * Called from blk_exit_queue(). Responsible for exiting blkcg part. 
- */ -void blkcg_exit_queue(struct request_queue *q) +void blkcg_exit_disk(struct gendisk *disk) { - blkg_destroy_all(q); - blk_throtl_exit(q); -} - -static void blkcg_bind(struct cgroup_subsys_state *root_css) -{ - int i; - - mutex_lock(&blkcg_pol_mutex); - - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - struct blkcg *blkcg; - - if (!pol || !pol->cpd_bind_fn) - continue; - - list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) - if (blkcg->cpd[pol->plid]) - pol->cpd_bind_fn(blkcg->cpd[pol->plid]); - } - mutex_unlock(&blkcg_pol_mutex); + blkg_destroy_all(disk); + blk_throtl_exit(disk); } static void blkcg_exit(struct task_struct *tsk) { - if (tsk->throttle_queue) - blk_put_queue(tsk->throttle_queue); - tsk->throttle_queue = NULL; + if (tsk->throttle_disk) + put_disk(tsk->throttle_disk); + tsk->throttle_disk = NULL; } struct cgroup_subsys io_cgrp_subsys = { @@ -1269,7 +1544,6 @@ struct cgroup_subsys io_cgrp_subsys = { .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .css_rstat_flush = blkcg_rstat_flush, - .bind = blkcg_bind, .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", @@ -1286,14 +1560,14 @@ struct cgroup_subsys io_cgrp_subsys = { EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** - * blkcg_activate_policy - activate a blkcg policy on a request_queue - * @q: request_queue of interest + * blkcg_activate_policy - activate a blkcg policy on a gendisk + * @disk: gendisk of interest * @pol: blkcg policy to activate * - * Activate @pol on @q. Requires %GFP_KERNEL context. @q goes through + * Activate @pol on @disk. Requires %GFP_KERNEL context. @disk goes through * bypass mode to populate its blkgs with policy_data for @pol. * - * Activation happens with @q bypassed, so nobody would be accessing blkgs + * Activation happens with @disk bypassed, so nobody would be accessing blkgs * from IO path. Update of each blkg is protected by both queue and blkcg * locks so that holding either lock and testing blkcg_policy_enabled() is * always enough for dereferencing policy data. @@ -1301,22 +1575,31 @@ EXPORT_SYMBOL_GPL(io_cgrp_subsys); * The caller is responsible for synchronizing [de]activations and policy * [un]registerations. Returns 0 on success, -errno on failure. */ -int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol) +int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { + struct request_queue *q = disk->queue; struct blkg_policy_data *pd_prealloc = NULL; struct blkcg_gq *blkg, *pinned_blkg = NULL; + unsigned int memflags; int ret; if (blkcg_policy_enabled(q, pol)) return 0; + /* + * Policy is allowed to be registered without pd_alloc_fn/pd_free_fn, + * for example, ioprio. Such policy will work on blkcg level, not disk + * level, and don't need to be activated. 
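
As the comment in blkcg_activate_policy() below notes, only policies that provide a pd_alloc_fn/pd_free_fn pair carry per-disk state and get activated; blkcg-level policies such as ioprio skip activation entirely. A hedged sketch of the minimal disk-level shape, where all example_* names are hypothetical and only the blkcg_policy fields and helpers used in this file are assumed:

struct example_pd {
        struct blkg_policy_data pd;     /* must embed the generic pd */
        u64 limit;
};

static struct blkg_policy_data *example_pd_alloc(struct gendisk *disk,
                                                 struct blkcg *blkcg, gfp_t gfp)
{
        struct example_pd *epd = kzalloc_node(sizeof(*epd), gfp,
                                              disk->queue->node);

        return epd ? &epd->pd : NULL;
}

static void example_pd_free(struct blkg_policy_data *pd)
{
        kfree(container_of(pd, struct example_pd, pd));
}

static struct blkcg_policy blkcg_policy_example = {
        .pd_alloc_fn    = example_pd_alloc,
        .pd_free_fn     = example_pd_free,
};

/* registration (blkcg_policy_register()) is global; activation is per disk */
static int example_init_disk(struct gendisk *disk)
{
        return blkcg_activate_policy(disk, &blkcg_policy_example);
}
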
+ */ + if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn)) + return -EINVAL; + if (queue_is_mq(q)) - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); retry: spin_lock_irq(&q->queue_lock); - /* blkg_list is pushed at the head, reverse walk to allocate parents first */ + /* blkg_list is pushed at the head, reverse walk to initialize parents first */ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; @@ -1328,8 +1611,8 @@ retry: pd = pd_prealloc; pd_prealloc = NULL; } else { - pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q, - blkg->blkcg); + pd = pol->pd_alloc_fn(disk, blkg->blkcg, + GFP_NOWAIT); } if (!pd) { @@ -1346,23 +1629,29 @@ retry: if (pd_prealloc) pol->pd_free_fn(pd_prealloc); - pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q, - blkg->blkcg); + pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg, + GFP_KERNEL); if (pd_prealloc) goto retry; else goto enomem; } - blkg->pd[pol->plid] = pd; + spin_lock(&blkg->blkcg->lock); + pd->blkg = blkg; pd->plid = pol->plid; - } + blkg->pd[pol->plid] = pd; + + if (pol->pd_init_fn) + pol->pd_init_fn(pd); - /* all allocated, init in the same order */ - if (pol->pd_init_fn) - list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) - pol->pd_init_fn(blkg->pd[pol->plid]); + if (pol->pd_online_fn) + pol->pd_online_fn(pd); + pd->online = true; + + spin_unlock(&blkg->blkcg->lock); + } __set_bit(pol->plid, q->blkcg_pols); ret = 0; @@ -1370,7 +1659,7 @@ retry: spin_unlock_irq(&q->queue_lock); out: if (queue_is_mq(q)) - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); if (pinned_blkg) blkg_put(pinned_blkg); if (pd_prealloc) @@ -1378,13 +1667,22 @@ out: return ret; enomem: - /* alloc failed, nothing's initialized yet, free everything */ + /* alloc failed, take down everything */ spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { - if (blkg->pd[pol->plid]) { - pol->pd_free_fn(blkg->pd[pol->plid]); + struct blkcg *blkcg = blkg->blkcg; + struct blkg_policy_data *pd; + + spin_lock(&blkcg->lock); + pd = blkg->pd[pol->plid]; + if (pd) { + if (pd->online && pol->pd_offline_fn) + pol->pd_offline_fn(pd); + pd->online = false; + pol->pd_free_fn(pd); blkg->pd[pol->plid] = NULL; } + spin_unlock(&blkcg->lock); } spin_unlock_irq(&q->queue_lock); ret = -ENOMEM; @@ -1393,44 +1691,64 @@ enomem: EXPORT_SYMBOL_GPL(blkcg_activate_policy); /** - * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue - * @q: request_queue of interest + * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk + * @disk: gendisk of interest * @pol: blkcg policy to deactivate * - * Deactivate @pol on @q. Follows the same synchronization rules as + * Deactivate @pol on @disk. Follows the same synchronization rules as * blkcg_activate_policy(). 
*/ -void blkcg_deactivate_policy(struct request_queue *q, +void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { + struct request_queue *q = disk->queue; struct blkcg_gq *blkg; + unsigned int memflags; if (!blkcg_policy_enabled(q, pol)) return; if (queue_is_mq(q)) - blk_mq_freeze_queue(q); + memflags = blk_mq_freeze_queue(q); + mutex_lock(&q->blkcg_mutex); spin_lock_irq(&q->queue_lock); __clear_bit(pol->plid, q->blkcg_pols); list_for_each_entry(blkg, &q->blkg_list, q_node) { + struct blkcg *blkcg = blkg->blkcg; + + spin_lock(&blkcg->lock); if (blkg->pd[pol->plid]) { - if (pol->pd_offline_fn) + if (blkg->pd[pol->plid]->online && pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[pol->plid]); pol->pd_free_fn(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; } + spin_unlock(&blkcg->lock); } spin_unlock_irq(&q->queue_lock); + mutex_unlock(&q->blkcg_mutex); if (queue_is_mq(q)) - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q, memflags); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); +static void blkcg_free_all_cpd(struct blkcg_policy *pol) +{ + struct blkcg *blkcg; + + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } + } +} + /** * blkcg_policy_register - register a blkcg policy * @pol: blkcg policy to register @@ -1443,24 +1761,27 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg *blkcg; int i, ret; + /* + * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy + * without pd_alloc_fn/pd_free_fn can't be activated. + */ + if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || + (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) + return -EINVAL; + mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ - ret = -ENOSPC; for (i = 0; i < BLKCG_MAX_POLS; i++) if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) { pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n"); + ret = -ENOSPC; goto err_unlock; } - /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */ - if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || - (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) - goto err_unlock; - /* register @pol */ pol->plid = i; blkcg_policy[pol->plid] = pol; @@ -1471,38 +1792,36 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg_policy_data *cpd; cpd = pol->cpd_alloc_fn(GFP_KERNEL); - if (!cpd) + if (!cpd) { + ret = -ENOMEM; goto err_free_cpds; + } blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; cpd->plid = pol->plid; - if (pol->cpd_init_fn) - pol->cpd_init_fn(cpd); } } mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ - if (pol->dfl_cftypes) + if (pol->dfl_cftypes == pol->legacy_cftypes) { + WARN_ON(cgroup_add_cftypes(&io_cgrp_subsys, + pol->dfl_cftypes)); + } else { WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, pol->dfl_cftypes)); - if (pol->legacy_cftypes) WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->legacy_cftypes)); + } mutex_unlock(&blkcg_pol_register_mutex); return 0; err_free_cpds: - if (pol->cpd_free_fn) { - list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - if (blkcg->cpd[pol->plid]) { - pol->cpd_free_fn(blkcg->cpd[pol->plid]); - blkcg->cpd[pol->plid] = NULL; - } - } - } + if (pol->cpd_free_fn) + blkcg_free_all_cpd(pol); + blkcg_policy[pol->plid] = NULL; err_unlock: mutex_unlock(&blkcg_pol_mutex); @@ -1519,8 +1838,6 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register); */ void blkcg_policy_unregister(struct blkcg_policy 
*pol) { - struct blkcg *blkcg; - mutex_lock(&blkcg_pol_register_mutex); if (WARN_ON(blkcg_policy[pol->plid] != pol)) @@ -1535,14 +1852,9 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); - if (pol->cpd_free_fn) { - list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - if (blkcg->cpd[pol->plid]) { - pol->cpd_free_fn(blkcg->cpd[pol->plid]); - blkcg->cpd[pol->plid] = NULL; - } - } - } + if (pol->cpd_free_fn) + blkcg_free_all_cpd(pol); + blkcg_policy[pol->plid] = NULL; mutex_unlock(&blkcg_pol_mutex); @@ -1551,25 +1863,6 @@ out_unlock: } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); -bool __blkcg_punt_bio_submit(struct bio *bio) -{ - struct blkcg_gq *blkg = bio->bi_blkg; - - /* consume the flag first */ - bio->bi_opf &= ~REQ_CGROUP_PUNT; - - /* never bounce for the root cgroup */ - if (!blkg->parent) - return false; - - spin_lock_bh(&blkg->async_bio_lock); - bio_list_add(&blkg->async_bios, bio); - spin_unlock_bh(&blkg->async_bio_lock); - - queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); - return true; -} - /* * Scale the accumulated delay based on how long it has been since we updated * the delay. We only call this when we are adding delay, in case it's been a @@ -1598,7 +1891,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) * everybody is happy with their IO latencies. */ if (time_before64(old + NSEC_PER_SEC, now) && - atomic64_cmpxchg(&blkg->delay_start, old, now) == old) { + atomic64_try_cmpxchg(&blkg->delay_start, &old, now)) { u64 cur = atomic64_read(&blkg->delay_nsec); u64 sub = min_t(u64, blkg->last_delay, now - old); int cur_use = atomic_read(&blkg->use_delay); @@ -1637,7 +1930,7 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { unsigned long pflags; bool clamp; - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); u64 exp; u64 delay_nsec = 0; int tok; @@ -1694,35 +1987,29 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) * * This is only called if we've been marked with set_notify_resume(). Obviously * we can be set_notify_resume() for reasons other than blkcg throttling, so we - * check to see if current->throttle_queue is set and if not this doesn't do + * check to see if current->throttle_disk is set and if not this doesn't do * anything. This should only ever be called by the resume code, it's not meant * to be called by people willy-nilly as it will actually do the work to * throttle the task if it is setup for throttling. 
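
The consumer side of the throttling machinery is small: once a controller has charged delay to a blkg (via the delay helpers in blk-cgroup.h), it only needs to flag the current task, and the actual sleep happens in blkcg_maybe_throttle_current() below on the way back to userspace. A hedged sketch, with example_* hypothetical:

static void example_arm_throttle(struct blkcg_gq *blkg, struct gendisk *disk,
                                 bool memory_stall)
{
        /* delay has already been charged to @blkg by the controller; just
         * mark the task so it throttles itself before returning to
         * userspace (see blkcg_maybe_throttle_current() below) */
        if (atomic_read(&blkg->use_delay))
                blkcg_schedule_throttle(disk, memory_stall);
}
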
*/ void blkcg_maybe_throttle_current(void) { - struct request_queue *q = current->throttle_queue; - struct cgroup_subsys_state *css; + struct gendisk *disk = current->throttle_disk; struct blkcg *blkcg; struct blkcg_gq *blkg; bool use_memdelay = current->use_memdelay; - if (!q) + if (!disk) return; - current->throttle_queue = NULL; + current->throttle_disk = NULL; current->use_memdelay = false; rcu_read_lock(); - css = kthread_blkcg(); - if (css) - blkcg = css_to_blkcg(css); - else - blkcg = css_to_blkcg(task_css(current, io_cgrp_id)); - + blkcg = css_to_blkcg(blkcg_css()); if (!blkcg) goto out; - blkg = blkg_lookup(blkcg, q); + blkg = blkg_lookup(blkcg, disk->queue); if (!blkg) goto out; if (!blkg_tryget(blkg)) @@ -1731,22 +2018,21 @@ void blkcg_maybe_throttle_current(void) blkcg_maybe_throttle_blkg(blkg, use_memdelay); blkg_put(blkg); - blk_put_queue(q); + put_disk(disk); return; out: rcu_read_unlock(); - blk_put_queue(q); } /** * blkcg_schedule_throttle - this task needs to check for throttling - * @q: the request queue IO was submitted on + * @disk: disk to throttle * @use_memdelay: do we charge this to memory delay for PSI * * This is called by the IO controller when we know there's delay accumulated * for the blkg for this task. We do not pass the blkg because there are places * we call this that may not have that information, the swapping code for - * instance will only have a request_queue at that point. This set's the + * instance will only have a block_device at that point. This set's the * notify_resume for the task to check and see if it requires throttling before * returning to user space. * @@ -1755,18 +2041,19 @@ out: * throttle once. If the task needs to be throttled again it'll need to be * re-set at the next time we see the task. */ -void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) +void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay) { if (unlikely(current->flags & PF_KTHREAD)) return; - if (current->throttle_queue != q) { - if (!blk_get_queue(q)) + if (current->throttle_disk != disk) { + if (test_bit(GD_DEAD, &disk->state)) return; + get_device(disk_to_dev(disk)); - if (current->throttle_queue) - blk_put_queue(current->throttle_queue); - current->throttle_queue = q; + if (current->throttle_disk) + put_disk(current->throttle_disk); + current->throttle_disk = disk; } if (use_memdelay) @@ -1806,8 +2093,7 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, struct blkcg_gq *blkg, *ret_blkg = NULL; rcu_read_lock(); - blkg = blkg_lookup_create(css_to_blkcg(css), - bio->bi_bdev->bd_disk->queue); + blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk); while (blkg) { if (blkg_tryget(blkg)) { ret_blkg = blkg; @@ -1843,8 +2129,8 @@ void bio_associate_blkg_from_css(struct bio *bio, if (css && css->parent) { bio->bi_blkg = blkg_tryget_closest(bio, css); } else { - blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg); - bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg; + blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg); + bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg; } } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); @@ -1862,10 +2148,13 @@ void bio_associate_blkg(struct bio *bio) { struct cgroup_subsys_state *css; + if (blk_op_is_passthrough(bio->bi_opf)) + return; + rcu_read_lock(); if (bio->bi_blkg) - css = &bio_blkcg(bio)->css; + css = bio_blkcg_css(bio); else css = blkcg_css(); @@ -1882,12 +2171,8 @@ EXPORT_SYMBOL_GPL(bio_associate_blkg); */ void bio_clone_blkg_association(struct bio *dst, 
struct bio *src) { - if (src->bi_blkg) { - if (dst->bi_blkg) - blkg_put(dst->bi_blkg); - blkg_get(src->bi_blkg); - dst->bi_blkg = src->bi_blkg; - } + if (src->bi_blkg) + bio_associate_blkg_from_css(dst, bio_blkcg_css(src)); } EXPORT_SYMBOL_GPL(bio_clone_blkg_association); @@ -1902,12 +2187,21 @@ static int blk_cgroup_io_type(struct bio *bio) void blk_cgroup_bio_start(struct bio *bio) { + struct blkcg *blkcg = bio->bi_blkg->blkcg; int rwd = blk_cgroup_io_type(bio), cpu; struct blkg_iostat_set *bis; + unsigned long flags; + + if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) + return; + + /* Root-level stats are sourced from system-wide IO stats */ + if (!cgroup_parent(blkcg->css.cgroup)) + return; cpu = get_cpu(); bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); - u64_stats_update_begin(&bis->sync); + flags = u64_stats_update_begin_irqsave(&bis->sync); /* * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split @@ -1919,22 +2213,38 @@ void blk_cgroup_bio_start(struct bio *bio) } bis->cur.ios[rwd]++; - u64_stats_update_end(&bis->sync); - if (cgroup_subsys_on_dfl(io_cgrp_subsys)) - cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu); + /* + * If the iostat_cpu isn't in a lockless list, put it into the + * list to indicate that a stat update is pending. + */ + if (!READ_ONCE(bis->lqueued)) { + struct llist_head *lhead = this_cpu_ptr(blkcg->lhead); + + llist_add(&bis->lnode, lhead); + WRITE_ONCE(bis->lqueued, true); + } + + u64_stats_update_end_irqrestore(&bis->sync, flags); + css_rstat_updated(&blkcg->css, cpu); put_cpu(); } -static int __init blkcg_init(void) +bool blk_cgroup_congested(void) { - blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", - WQ_MEM_RECLAIM | WQ_FREEZABLE | - WQ_UNBOUND | WQ_SYSFS, 0); - if (!blkcg_punt_bio_wq) - return -ENOMEM; - return 0; + struct blkcg *blkcg; + bool ret = false; + + rcu_read_lock(); + for (blkcg = css_to_blkcg(blkcg_css()); blkcg; + blkcg = blkcg_parent(blkcg)) { + if (atomic_read(&blkcg->congestion_count)) { + ret = true; + break; + } + } + rcu_read_unlock(); + return ret; } -subsys_initcall(blkcg_init); module_param(blkcg_debug_stats, bool, 0644); MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); |
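
blk_cgroup_congested() above gives paths outside the block layer, swap and reclaim for instance, a cheap way to ask whether any ancestor blkcg is currently marked congested. A hypothetical backoff loop (the caller and the polling interval are illustrative only):

static void example_backoff_while_congested(void)
{
        /* poll the ancestry walk in blk_cgroup_congested() and yield until
         * no ancestor blkcg reports a non-zero congestion_count */
        while (blk_cgroup_congested())
                io_schedule_timeout(HZ / 10);
}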
