Diffstat (limited to 'block')
-rw-r--r--   block/bfq-iosched.c   |  66
-rw-r--r--   block/bfq-iosched.h   |  13
-rw-r--r--   block/blk-ioc.c       |  16
-rw-r--r--   block/blk-mq-sched.c  | 223
-rw-r--r--   block/blk-mq-sched.h  |  12
-rw-r--r--   block/blk-mq.c        |  16
-rw-r--r--   block/blk-settings.c  |  33
-rw-r--r--   block/blk.h           |   4
-rw-r--r--   block/elevator.c      |  38
-rw-r--r--   block/elevator.h      |  16
-rw-r--r--   block/kyber-iosched.c |  20
-rw-r--r--   block/mq-deadline.c   |  30
12 files changed, 287 insertions, 200 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 0cb1e9873aab..3bf76902f07f 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -454,17 +454,10 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) */ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) { - struct bfq_io_cq *icq; - unsigned long flags; - if (!current->io_context) return NULL; - spin_lock_irqsave(&q->queue_lock, flags); - icq = icq_to_bic(ioc_lookup_icq(q)); - spin_unlock_irqrestore(&q->queue_lock, flags); - - return icq; + return icq_to_bic(ioc_lookup_icq(q)); } /* @@ -701,17 +694,13 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { struct bfq_data *bfqd = data->q->elevator->elevator_data; struct bfq_io_cq *bic = bfq_bic_lookup(data->q); - int depth; - unsigned limit = data->q->nr_requests; - unsigned int act_idx; + unsigned int limit, act_idx; /* Sync reads have full depth available */ - if (op_is_sync(opf) && !op_is_write(opf)) { - depth = 0; - } else { - depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)]; - limit = (limit * depth) >> bfqd->full_depth_shift; - } + if (op_is_sync(opf) && !op_is_write(opf)) + limit = data->q->nr_requests; + else + limit = bfqd->async_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)]; for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) { /* Fast path to check if bfqq is already allocated. */ @@ -725,14 +714,16 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) * available requests and thus starve other entities. */ if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) { - depth = 1; + limit = 1; break; } } + bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", - __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth); - if (depth) - data->shallow_depth = depth; + __func__, bfqd->wr_busy_queues, op_is_sync(opf), limit); + + if (limit < data->q->nr_requests) + data->shallow_depth = limit; } static struct bfq_queue * @@ -2457,15 +2448,8 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct bfq_data *bfqd = q->elevator->elevator_data; - struct request *free = NULL; - /* - * bfq_bic_lookup grabs the queue_lock: invoke it now and - * store its return value for later use, to avoid nesting - * queue_lock inside the bfqd->lock. We assume that the bic - * returned by bfq_bic_lookup does not go away before - * bfqd->lock is taken. - */ struct bfq_io_cq *bic = bfq_bic_lookup(q); + struct request *free = NULL; bool ret; spin_lock_irq(&bfqd->lock); @@ -7128,9 +7112,8 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) */ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) { - unsigned int depth = 1U << bt->sb.shift; + unsigned int nr_requests = bfqd->queue->nr_requests; - bfqd->full_depth_shift = bt->sb.shift; /* * In-word depths if no bfq_queue is being weight-raised: * leaving 25% of tags only for sync reads. @@ -7142,13 +7125,13 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) * limit 'something'. */ /* no more than 50% of tags for async I/O */ - bfqd->word_depths[0][0] = max(depth >> 1, 1U); + bfqd->async_depths[0][0] = max(nr_requests >> 1, 1U); /* * no more than 75% of tags for sync writes (25% extra tags * w.r.t. 
async I/O, to prevent async I/O from starving sync * writes) */ - bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U); + bfqd->async_depths[0][1] = max((nr_requests * 3) >> 2, 1U); /* * In-word depths in case some bfq_queue is being weight- @@ -7158,9 +7141,9 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) * shortage. */ /* no more than ~18% of tags for async I/O */ - bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U); + bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U); /* no more than ~37% of tags for sync writes (~20% extra tags) */ - bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U); + bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U); } static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) @@ -7232,22 +7215,16 @@ static void bfq_init_root_group(struct bfq_group *root_group, root_group->sched_data.bfq_class_idle_last_service = jiffies; } -static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) +static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) { struct bfq_data *bfqd; - struct elevator_queue *eq; unsigned int i; struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges; - eq = elevator_alloc(q, e); - if (!eq) - return -ENOMEM; - bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); - if (!bfqd) { - kobject_put(&eq->kobj); + if (!bfqd) return -ENOMEM; - } + eq->elevator_data = bfqd; spin_lock_irq(&q->queue_lock); @@ -7405,7 +7382,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) out_free: kfree(bfqd); - kobject_put(&eq->kobj); return -ENOMEM; } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 687a3a7ba784..34a498e6b2a5 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -427,9 +427,6 @@ struct bfq_iocq_bfqq_data { */ bool saved_IO_bound; - u64 saved_io_start_time; - u64 saved_tot_idle_time; - /* * Same purpose as the previous fields for the values of the * field keeping the queue's belonging to a large burst @@ -450,6 +447,9 @@ struct bfq_iocq_bfqq_data { */ unsigned int saved_weight; + u64 saved_io_start_time; + u64 saved_tot_idle_time; + /* * Similar to previous fields: save wr information. */ @@ -457,13 +457,13 @@ struct bfq_iocq_bfqq_data { unsigned long saved_last_wr_start_finish; unsigned long saved_service_from_wr; unsigned long saved_wr_start_at_switch_to_srt; - unsigned int saved_wr_cur_max_time; struct bfq_ttime saved_ttime; + unsigned int saved_wr_cur_max_time; /* Save also injection state */ - u64 saved_last_serv_time_ns; unsigned int saved_inject_limit; unsigned long saved_decrease_time_jif; + u64 saved_last_serv_time_ns; /* candidate queue for a stable merge (due to close creation time) */ struct bfq_queue *stable_merge_bfqq; @@ -813,8 +813,7 @@ struct bfq_data { * Depth limits used in bfq_limit_depth (see comments on the * function) */ - unsigned int word_depths[2][2]; - unsigned int full_depth_shift; + unsigned int async_depths[2][2]; /* * Number of independent actuators. This is equal to 1 in diff --git a/block/blk-ioc.c b/block/blk-ioc.c index ce82770c72ab..9fda3906e5f5 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -308,24 +308,23 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk) #ifdef CONFIG_BLK_ICQ /** - * ioc_lookup_icq - lookup io_cq from ioc + * ioc_lookup_icq - lookup io_cq from ioc in io issue path * @q: the associated request_queue * * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called - * with @q->queue_lock held. 
+ * from io issue path, either return NULL if current issue io to @q for the + * first time, or return a valid icq. */ struct io_cq *ioc_lookup_icq(struct request_queue *q) { struct io_context *ioc = current->io_context; struct io_cq *icq; - lockdep_assert_held(&q->queue_lock); - /* * icq's are indexed from @ioc using radix tree and hint pointer, - * both of which are protected with RCU. All removals are done - * holding both q and ioc locks, and we're holding q lock - if we - * find a icq which points to us, it's guaranteed to be valid. + * both of which are protected with RCU, io issue path ensures that + * both request_queue and current task are valid, the found icq + * is guaranteed to be valid until the io is done. */ rcu_read_lock(); icq = rcu_dereference(ioc->icq_hint); @@ -419,10 +418,7 @@ struct io_cq *ioc_find_get_icq(struct request_queue *q) task_unlock(current); } else { get_io_context(ioc); - - spin_lock_irq(&q->queue_lock); icq = ioc_lookup_icq(q); - spin_unlock_irq(&q->queue_lock); } if (!icq) { diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 55a0fd105147..e2ce4a28e6c9 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -374,64 +374,17 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); -static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q, - struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx) -{ - if (blk_mq_is_shared_tags(q->tag_set->flags)) { - hctx->sched_tags = q->sched_shared_tags; - return 0; - } - - hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx, - q->nr_requests); - - if (!hctx->sched_tags) - return -ENOMEM; - return 0; -} - -static void blk_mq_exit_sched_shared_tags(struct request_queue *queue) -{ - blk_mq_free_rq_map(queue->sched_shared_tags); - queue->sched_shared_tags = NULL; -} - /* called in queue's release handler, tagset has gone away */ static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags) { struct blk_mq_hw_ctx *hctx; unsigned long i; - queue_for_each_hw_ctx(q, hctx, i) { - if (hctx->sched_tags) { - if (!blk_mq_is_shared_tags(flags)) - blk_mq_free_rq_map(hctx->sched_tags); - hctx->sched_tags = NULL; - } - } + queue_for_each_hw_ctx(q, hctx, i) + hctx->sched_tags = NULL; if (blk_mq_is_shared_tags(flags)) - blk_mq_exit_sched_shared_tags(q); -} - -static int blk_mq_init_sched_shared_tags(struct request_queue *queue) -{ - struct blk_mq_tag_set *set = queue->tag_set; - - /* - * Set initial depth at max so that we don't need to reallocate for - * updating nr_requests. - */ - queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set, - BLK_MQ_NO_HCTX_IDX, - MAX_SCHED_RQ); - if (!queue->sched_shared_tags) - return -ENOMEM; - - blk_mq_tag_update_sched_shared_tags(queue); - - return 0; + q->sched_shared_tags = NULL; } void blk_mq_sched_reg_debugfs(struct request_queue *q) @@ -458,8 +411,140 @@ void blk_mq_sched_unreg_debugfs(struct request_queue *q) mutex_unlock(&q->debugfs_mutex); } +void blk_mq_free_sched_tags(struct elevator_tags *et, + struct blk_mq_tag_set *set) +{ + unsigned long i; + + /* Shared tags are stored at index 0 in @tags. 
*/ + if (blk_mq_is_shared_tags(set->flags)) + blk_mq_free_map_and_rqs(set, et->tags[0], BLK_MQ_NO_HCTX_IDX); + else { + for (i = 0; i < et->nr_hw_queues; i++) + blk_mq_free_map_and_rqs(set, et->tags[i], i); + } + + kfree(et); +} + +void blk_mq_free_sched_tags_batch(struct xarray *et_table, + struct blk_mq_tag_set *set) +{ + struct request_queue *q; + struct elevator_tags *et; + + lockdep_assert_held_write(&set->update_nr_hwq_lock); + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + /* + * Accessing q->elevator without holding q->elevator_lock is + * safe because we're holding here set->update_nr_hwq_lock in + * the writer context. So, scheduler update/switch code (which + * acquires the same lock but in the reader context) can't run + * concurrently. + */ + if (q->elevator) { + et = xa_load(et_table, q->id); + if (unlikely(!et)) + WARN_ON_ONCE(1); + else + blk_mq_free_sched_tags(et, set); + } + } +} + +struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, + unsigned int nr_hw_queues) +{ + unsigned int nr_tags; + int i; + struct elevator_tags *et; + gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; + + if (blk_mq_is_shared_tags(set->flags)) + nr_tags = 1; + else + nr_tags = nr_hw_queues; + + et = kmalloc(sizeof(struct elevator_tags) + + nr_tags * sizeof(struct blk_mq_tags *), gfp); + if (!et) + return NULL; + /* + * Default to double of smaller one between hw queue_depth and + * 128, since we don't split into sync/async like the old code + * did. Additionally, this is a per-hw queue depth. + */ + et->nr_requests = 2 * min_t(unsigned int, set->queue_depth, + BLKDEV_DEFAULT_RQ); + et->nr_hw_queues = nr_hw_queues; + + if (blk_mq_is_shared_tags(set->flags)) { + /* Shared tags are stored at index 0 in @tags. */ + et->tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX, + MAX_SCHED_RQ); + if (!et->tags[0]) + goto out; + } else { + for (i = 0; i < et->nr_hw_queues; i++) { + et->tags[i] = blk_mq_alloc_map_and_rqs(set, i, + et->nr_requests); + if (!et->tags[i]) + goto out_unwind; + } + } + + return et; +out_unwind: + while (--i >= 0) + blk_mq_free_map_and_rqs(set, et->tags[i], i); +out: + kfree(et); + return NULL; +} + +int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, + struct blk_mq_tag_set *set, unsigned int nr_hw_queues) +{ + struct request_queue *q; + struct elevator_tags *et; + gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; + + lockdep_assert_held_write(&set->update_nr_hwq_lock); + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + /* + * Accessing q->elevator without holding q->elevator_lock is + * safe because we're holding here set->update_nr_hwq_lock in + * the writer context. So, scheduler update/switch code (which + * acquires the same lock but in the reader context) can't run + * concurrently. 
+ */ + if (q->elevator) { + et = blk_mq_alloc_sched_tags(set, nr_hw_queues); + if (!et) + goto out_unwind; + if (xa_insert(et_table, q->id, et, gfp)) + goto out_free_tags; + } + } + return 0; +out_free_tags: + blk_mq_free_sched_tags(et, set); +out_unwind: + list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) { + if (q->elevator) { + et = xa_load(et_table, q->id); + if (et) + blk_mq_free_sched_tags(et, set); + } + } + return -ENOMEM; +} + /* caller must have a reference to @e, will grab another one if successful */ -int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) +int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, + struct elevator_tags *et) { unsigned int flags = q->tag_set->flags; struct blk_mq_hw_ctx *hctx; @@ -467,36 +552,33 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) unsigned long i; int ret; - /* - * Default to double of smaller one between hw queue_depth and 128, - * since we don't split into sync/async like the old code did. - * Additionally, this is a per-hw queue depth. - */ - q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, - BLKDEV_DEFAULT_RQ); + eq = elevator_alloc(q, e, et); + if (!eq) + return -ENOMEM; + + q->nr_requests = et->nr_requests; if (blk_mq_is_shared_tags(flags)) { - ret = blk_mq_init_sched_shared_tags(q); - if (ret) - return ret; + /* Shared tags are stored at index 0 in @et->tags. */ + q->sched_shared_tags = et->tags[0]; + blk_mq_tag_update_sched_shared_tags(q); } queue_for_each_hw_ctx(q, hctx, i) { - ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i); - if (ret) - goto err_free_map_and_rqs; + if (blk_mq_is_shared_tags(flags)) + hctx->sched_tags = q->sched_shared_tags; + else + hctx->sched_tags = et->tags[i]; } - ret = e->ops.init_sched(q, e); + ret = e->ops.init_sched(q, eq); if (ret) - goto err_free_map_and_rqs; + goto out; queue_for_each_hw_ctx(q, hctx, i) { if (e->ops.init_hctx) { ret = e->ops.init_hctx(hctx, i); if (ret) { - eq = q->elevator; - blk_mq_sched_free_rqs(q); blk_mq_exit_sched(q, eq); kobject_put(&eq->kobj); return ret; @@ -505,10 +587,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) } return 0; -err_free_map_and_rqs: - blk_mq_sched_free_rqs(q); +out: blk_mq_sched_tags_teardown(q, flags); - + kobject_put(&eq->kobj); q->elevator = NULL; return ret; } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 1326526bb733..b554e1d55950 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -18,10 +18,20 @@ void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); -int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); +int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, + struct elevator_tags *et); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_sched_free_rqs(struct request_queue *q); +struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, + unsigned int nr_hw_queues); +int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, + struct blk_mq_tag_set *set, unsigned int nr_hw_queues); +void blk_mq_free_sched_tags(struct elevator_tags *et, + struct blk_mq_tag_set *set); +void blk_mq_free_sched_tags_batch(struct xarray *et_table, + struct blk_mq_tag_set *set); + static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) diff --git a/block/blk-mq.c b/block/blk-mq.c index 
9692fa4c3ef2..b67d6c02eceb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4974,12 +4974,13 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) * Switch back to the elevator type stored in the xarray. */ static void blk_mq_elv_switch_back(struct request_queue *q, - struct xarray *elv_tbl) + struct xarray *elv_tbl, struct xarray *et_tbl) { struct elevator_type *e = xa_load(elv_tbl, q->id); + struct elevator_tags *t = xa_load(et_tbl, q->id); /* The elv_update_nr_hw_queues unfreezes the queue. */ - elv_update_nr_hw_queues(q, e); + elv_update_nr_hw_queues(q, e, t); /* Drop the reference acquired in blk_mq_elv_switch_none. */ if (e) @@ -5031,7 +5032,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int prev_nr_hw_queues = set->nr_hw_queues; unsigned int memflags; int i; - struct xarray elv_tbl; + struct xarray elv_tbl, et_tbl; lockdep_assert_held(&set->tag_list_lock); @@ -5044,6 +5045,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, memflags = memalloc_noio_save(); + xa_init(&et_tbl); + if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0) + goto out_memalloc_restore; + xa_init(&elv_tbl); list_for_each_entry(q, &set->tag_list, tag_set_list) { @@ -5087,7 +5092,7 @@ fallback: switch_back: /* The blk_mq_elv_switch_back unfreezes queue for us. */ list_for_each_entry(q, &set->tag_list, tag_set_list) - blk_mq_elv_switch_back(q, &elv_tbl); + blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register_hctxs(q); @@ -5098,7 +5103,8 @@ switch_back: } xa_destroy(&elv_tbl); - + xa_destroy(&et_tbl); +out_memalloc_restore: memalloc_noio_restore(memflags); /* Free the excess tags when nr_hw_queues shrink. */ diff --git a/block/blk-settings.c b/block/blk-settings.c index 91449147bae9..07874e9b609f 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -62,16 +62,24 @@ EXPORT_SYMBOL(blk_set_stacking_limits); void blk_apply_bdi_limits(struct backing_dev_info *bdi, struct queue_limits *lim) { + u64 io_opt = lim->io_opt; + /* * For read-ahead of large files to be effective, we need to read ahead - * at least twice the optimal I/O size. + * at least twice the optimal I/O size. For rotational devices that do + * not report an optimal I/O size (e.g. ATA HDDs), use the maximum I/O + * size to avoid falling back to the (rather inefficient) small default + * read-ahead size. * * There is no hardware limitation for the read-ahead size and the user * might have increased the read-ahead size through sysfs, so don't ever * decrease it. 
*/ + if (!io_opt && (lim->features & BLK_FEAT_ROTATIONAL)) + io_opt = (u64)lim->max_sectors << SECTOR_SHIFT; + bdi->ra_pages = max3(bdi->ra_pages, - lim->io_opt * 2 / PAGE_SIZE, + io_opt * 2 >> PAGE_SHIFT, VM_READAHEAD_PAGES); bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT; } @@ -312,8 +320,12 @@ int blk_validate_limits(struct queue_limits *lim) pr_warn("Invalid logical block size (%d)\n", lim->logical_block_size); return -EINVAL; } - if (lim->physical_block_size < lim->logical_block_size) + if (lim->physical_block_size < lim->logical_block_size) { lim->physical_block_size = lim->logical_block_size; + } else if (!is_power_of_2(lim->physical_block_size)) { + pr_warn("Invalid physical block size (%d)\n", lim->physical_block_size); + return -EINVAL; + } /* * The minimum I/O size defaults to the physical block size unless @@ -388,12 +400,19 @@ int blk_validate_limits(struct queue_limits *lim) lim->max_discard_sectors = min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors); + /* + * When discard is not supported, discard_granularity should be reported + * as 0 to userspace. + */ + if (lim->max_discard_sectors) + lim->discard_granularity = + max(lim->discard_granularity, lim->physical_block_size); + else + lim->discard_granularity = 0; + if (!lim->max_discard_segments) lim->max_discard_segments = 1; - if (lim->discard_granularity < lim->physical_block_size) - lim->discard_granularity = lim->physical_block_size; - /* * By default there is no limit on the segment boundary alignment, * but if there is one it can't be smaller than the page size as @@ -849,7 +868,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, } /* chunk_sectors a multiple of the physical block size? */ - if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) { + if (t->chunk_sectors % (t->physical_block_size >> SECTOR_SHIFT)) { t->chunk_sectors = 0; t->flags |= BLK_FLAG_MISALIGNED; ret = -1; diff --git a/block/blk.h b/block/blk.h index 76901a39997f..0a2eccf28ca4 100644 --- a/block/blk.h +++ b/block/blk.h @@ -12,6 +12,7 @@ #include "blk-crypto-internal.h" struct elevator_type; +struct elevator_tags; /* * Default upper limit for the software max_sectors limit used for regular I/Os. 
@@ -330,7 +331,8 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, bool blk_insert_flush(struct request *rq); -void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e); +void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e, + struct elevator_tags *t); void elevator_set_default(struct request_queue *q); void elevator_set_none(struct request_queue *q); diff --git a/block/elevator.c b/block/elevator.c index 88f8f36bed98..fe96c6f4753c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -54,6 +54,8 @@ struct elv_change_ctx { struct elevator_queue *old; /* for registering new elevator */ struct elevator_queue *new; + /* holds sched tags data */ + struct elevator_tags *et; }; static DEFINE_SPINLOCK(elv_list_lock); @@ -132,7 +134,7 @@ static struct elevator_type *elevator_find_get(const char *name) static const struct kobj_type elv_ktype; struct elevator_queue *elevator_alloc(struct request_queue *q, - struct elevator_type *e) + struct elevator_type *e, struct elevator_tags *et) { struct elevator_queue *eq; @@ -145,10 +147,10 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, kobject_init(&eq->kobj, &elv_ktype); mutex_init(&eq->sysfs_lock); hash_init(eq->hash); + eq->et = et; return eq; } -EXPORT_SYMBOL(elevator_alloc); static void elevator_release(struct kobject *kobj) { @@ -166,7 +168,6 @@ static void elevator_exit(struct request_queue *q) lockdep_assert_held(&q->elevator_lock); ioc_clear_queue(q); - blk_mq_sched_free_rqs(q); mutex_lock(&e->sysfs_lock); blk_mq_exit_sched(q, e); @@ -592,7 +593,7 @@ static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx) } if (new_e) { - ret = blk_mq_init_sched(q, new_e); + ret = blk_mq_init_sched(q, new_e, ctx->et); if (ret) goto out_unfreeze; ctx->new = q->elevator; @@ -627,8 +628,10 @@ static void elv_exit_and_release(struct request_queue *q) elevator_exit(q); mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); - if (e) + if (e) { + blk_mq_free_sched_tags(e->et, q->tag_set); kobject_put(&e->kobj); + } } static int elevator_change_done(struct request_queue *q, @@ -641,6 +644,7 @@ static int elevator_change_done(struct request_queue *q, &ctx->old->flags); elv_unregister_queue(q, ctx->old); + blk_mq_free_sched_tags(ctx->old->et, q->tag_set); kobject_put(&ctx->old->kobj); if (enable_wbt) wbt_enable_default(q->disk); @@ -659,9 +663,16 @@ static int elevator_change_done(struct request_queue *q, static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) { unsigned int memflags; + struct blk_mq_tag_set *set = q->tag_set; int ret = 0; - lockdep_assert_held(&q->tag_set->update_nr_hwq_lock); + lockdep_assert_held(&set->update_nr_hwq_lock); + + if (strncmp(ctx->name, "none", 4)) { + ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues); + if (!ctx->et) + return -ENOMEM; + } memflags = blk_mq_freeze_queue(q); /* @@ -681,6 +692,11 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) blk_mq_unfreeze_queue(q, memflags); if (!ret) ret = elevator_change_done(q, ctx); + /* + * Free sched tags if it's allocated but we couldn't switch elevator. + */ + if (ctx->et && !ctx->new) + blk_mq_free_sched_tags(ctx->et, set); return ret; } @@ -689,8 +705,10 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) * The I/O scheduler depends on the number of hardware queues, this forces a * reattachment when nr_hw_queues changes. 
*/ -void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e) +void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e, + struct elevator_tags *t) { + struct blk_mq_tag_set *set = q->tag_set; struct elv_change_ctx ctx = {}; int ret = -ENODEV; @@ -698,6 +716,7 @@ void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e) if (e && !blk_queue_dying(q) && blk_queue_registered(q)) { ctx.name = e->elevator_name; + ctx.et = t; mutex_lock(&q->elevator_lock); /* force to reattach elevator after nr_hw_queue is updated */ @@ -707,6 +726,11 @@ void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e) blk_mq_unfreeze_queue_nomemrestore(q); if (!ret) WARN_ON_ONCE(elevator_change_done(q, &ctx)); + /* + * Free sched tags if it's allocated but we couldn't switch elevator. + */ + if (t && !ctx.new) + blk_mq_free_sched_tags(t, set); } /* diff --git a/block/elevator.h b/block/elevator.h index a07ce773a38f..adc5c157e17e 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -23,8 +23,17 @@ enum elv_merge { struct blk_mq_alloc_data; struct blk_mq_hw_ctx; +struct elevator_tags { + /* num. of hardware queues for which tags are allocated */ + unsigned int nr_hw_queues; + /* depth used while allocating tags */ + unsigned int nr_requests; + /* shared tag is stored at index 0 */ + struct blk_mq_tags *tags[]; +}; + struct elevator_mq_ops { - int (*init_sched)(struct request_queue *, struct elevator_type *); + int (*init_sched)(struct request_queue *, struct elevator_queue *); void (*exit_sched)(struct elevator_queue *); int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); @@ -113,6 +122,7 @@ struct request *elv_rqhash_find(struct request_queue *q, sector_t offset); struct elevator_queue { struct elevator_type *type; + struct elevator_tags *et; void *elevator_data; struct kobject kobj; struct mutex sysfs_lock; @@ -152,8 +162,8 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *page); ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); extern bool elv_bio_merge_ok(struct request *, struct bio *); -extern struct elevator_queue *elevator_alloc(struct request_queue *, - struct elevator_type *); +struct elevator_queue *elevator_alloc(struct request_queue *, + struct elevator_type *, struct elevator_tags *); /* * Helper functions. diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 4dba8405bd01..70cbc7b2deb4 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -157,10 +157,7 @@ struct kyber_queue_data { */ struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS]; - /* - * Async request percentage, converted to per-word depth for - * sbitmap_get_shallow(). - */ + /* Number of allowed async requests. 
*/ unsigned int async_depth; struct kyber_cpu_latency __percpu *cpu_latency; @@ -402,20 +399,13 @@ err: return ERR_PTR(ret); } -static int kyber_init_sched(struct request_queue *q, struct elevator_type *e) +static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq) { struct kyber_queue_data *kqd; - struct elevator_queue *eq; - - eq = elevator_alloc(q, e); - if (!eq) - return -ENOMEM; kqd = kyber_queue_data_alloc(q); - if (IS_ERR(kqd)) { - kobject_put(&eq->kobj); + if (IS_ERR(kqd)) return PTR_ERR(kqd); - } blk_stat_enable_accounting(q); @@ -454,10 +444,8 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx) { struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; - unsigned int shift = tags->bitmap_tags.sb.shift; - - kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; + kqd->async_depth = hctx->queue->nr_requests * KYBER_ASYNC_PERCENT / 100U; sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth); } diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 2edf1cac06d5..b9b7cdf1d3c9 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -488,20 +488,6 @@ unlock: } /* - * 'depth' is a number in the range 1..INT_MAX representing a number of - * requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since - * 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow(). - * Values larger than q->nr_requests have the same effect as q->nr_requests. - */ -static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth) -{ - struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags; - const unsigned int nrr = hctx->queue->nr_requests; - - return ((qdepth << bt->sb.shift) + nrr - 1) / nrr; -} - -/* * Called by __blk_mq_alloc_request(). The shallow_depth value set by this * function is used by __blk_mq_get_tag(). */ @@ -517,7 +503,7 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) * Throttle asynchronous requests and writes such that these requests * do not block the allocation of synchronous requests. */ - data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth); + data->shallow_depth = dd->async_depth; } /* Called by blk_mq_update_nr_requests(). */ @@ -568,20 +554,14 @@ static void dd_exit_sched(struct elevator_queue *e) /* * initialize elevator private data (deadline_data). */ -static int dd_init_sched(struct request_queue *q, struct elevator_type *e) +static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq) { struct deadline_data *dd; - struct elevator_queue *eq; enum dd_prio prio; - int ret = -ENOMEM; - - eq = elevator_alloc(q, e); - if (!eq) - return ret; dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); if (!dd) - goto put_eq; + return -ENOMEM; eq->elevator_data = dd; @@ -608,10 +588,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) q->elevator = eq; return 0; - -put_eq: - kobject_put(&eq->kobj); - return ret; } /* |
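
The series above drops the sbitmap word-depth arithmetic from bfq and derives the
async depth limits directly from q->nr_requests. The following is a small
standalone sketch of that arithmetic: the async_depths[2][2] layout and the shift
factors are taken from bfq_update_depths() in the diff, while the struct name,
the helper, main() and the example value nr_requests = 256 are illustrative only
and not part of the kernel code.

/*
 * Sketch of the per-queue depth limits computed by the new
 * bfq_update_depths(). Indexing is [wr_busy][is_sync], mirroring
 * bfqd->async_depths[2][2] in the patch; everything else here is
 * a hypothetical userspace harness for illustration.
 */
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

struct depths {
	unsigned int async_depths[2][2];
};

static void update_depths(struct depths *d, unsigned int nr_requests)
{
	/* no bfq_queue is weight-raised: 50% async, 75% sync writes */
	d->async_depths[0][0] = MAX(nr_requests >> 1, 1U);
	d->async_depths[0][1] = MAX((nr_requests * 3) >> 2, 1U);
	/* some bfq_queue is weight-raised: ~18% async, ~37% sync writes */
	d->async_depths[1][0] = MAX((nr_requests * 3) >> 4, 1U);
	d->async_depths[1][1] = MAX((nr_requests * 6) >> 4, 1U);
}

int main(void)
{
	struct depths d;
	unsigned int nr_requests = 256;	/* example scheduler depth */

	update_depths(&d, nr_requests);
	/* prints "128 192 48 96" for nr_requests = 256 */
	printf("%u %u %u %u\n",
	       d.async_depths[0][0], d.async_depths[0][1],
	       d.async_depths[1][0], d.async_depths[1][1]);
	return 0;
}

For nr_requests = 256 this yields 128/192 tags without weight-raising and 48/96
with a weight-raised queue, which matches the 50%/75% and ~18%/~37% figures in
the patch comments; sync reads bypass the limit and keep the full nr_requests.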