summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig3
-rw-r--r--block/Makefile1
-rw-r--r--block/bfq-cgroup.c4
-rw-r--r--block/bfq-iosched.c80
-rw-r--r--block/bfq-iosched.h26
-rw-r--r--block/bfq-wf2q.c13
-rw-r--r--block/bio-integrity.c12
-rw-r--r--block/bio.c202
-rw-r--r--block/blk-cgroup.c123
-rw-r--r--block/blk-core.c270
-rw-r--r--block/blk-integrity.c12
-rw-r--r--block/blk-iolatency.c230
-rw-r--r--block/blk-merge.c88
-rw-r--r--block/blk-mq-debugfs.c10
-rw-r--r--block/blk-mq-sched.h4
-rw-r--r--block/blk-mq-tag.c69
-rw-r--r--block/blk-mq.c9
-rw-r--r--block/blk-pm.c216
-rw-r--r--block/blk-pm.h69
-rw-r--r--block/blk-stat.c1
-rw-r--r--block/blk-throttle.c54
-rw-r--r--block/blk.h71
-rw-r--r--block/bounce.c4
-rw-r--r--block/cfq-iosched.c4
-rw-r--r--block/elevator.c22
-rw-r--r--block/genhd.c19
-rw-r--r--block/kyber-iosched.c547
27 files changed, 1368 insertions, 795 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 1f2469a0123c..85263e7bded6 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -228,4 +228,7 @@ config BLK_MQ_RDMA
depends on BLOCK && INFINIBAND
default y
+config BLK_PM
+ def_bool BLOCK && PM
+
source block/Kconfig.iosched
diff --git a/block/Makefile b/block/Makefile
index 572b33f32c07..27eac600474f 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -37,3 +37,4 @@ obj-$(CONFIG_BLK_WBT) += blk-wbt.o
obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
+obj-$(CONFIG_BLK_PM) += blk-pm.o
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 9fe5952d117d..d9a7916ff0ab 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -642,7 +642,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
uint64_t serial_nr;
rcu_read_lock();
- serial_nr = bio_blkcg(bio)->css.serial_nr;
+ serial_nr = __bio_blkcg(bio)->css.serial_nr;
/*
* Check whether blkcg has changed. The condition may trigger
@@ -651,7 +651,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
goto out;
- bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio));
/*
* Update blkg_path for bfq_log_* functions. We cache this
* path, and update it here, for the following
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 653100fb719e..1a1b80dfd69d 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -3182,6 +3182,13 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
}
+static bool bfq_bfqq_injectable(struct bfq_queue *bfqq)
+{
+ return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
+ blk_queue_nonrot(bfqq->bfqd->queue) &&
+ bfqq->bfqd->hw_tag;
+}
+
/**
* bfq_bfqq_expire - expire a queue.
* @bfqd: device owning the queue.
@@ -3291,6 +3298,8 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
if (ref == 1) /* bfqq is gone, no more actions on it */
return;
+ bfqq->injected_service = 0;
+
/* mark bfqq as waiting a request only if a bic still points to it */
if (!bfq_bfqq_busy(bfqq) &&
reason != BFQQE_BUDGET_TIMEOUT &&
@@ -3571,7 +3580,12 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
* whether bfqq is being weight-raised, because
* bfq_symmetric_scenario() does not take into account also
* weight-raised queues (see comments on
- * bfq_weights_tree_add()).
+ * bfq_weights_tree_add()). In particular, if bfqq is being
+ * weight-raised, it is important to idle only if there are
+ * other, non-weight-raised queues that may steal throughput
+ * to bfqq. Actually, we should be even more precise, and
+ * differentiate between interactive weight raising and
+ * soft real-time weight raising.
*
* As a side note, it is worth considering that the above
* device-idling countermeasures may however fail in the
@@ -3583,7 +3597,8 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
* to let requests be served in the desired order until all
* the requests already queued in the device have been served.
*/
- asymmetric_scenario = bfqq->wr_coeff > 1 ||
+ asymmetric_scenario = (bfqq->wr_coeff > 1 &&
+ bfqd->wr_busy_queues < bfqd->busy_queues) ||
!bfq_symmetric_scenario(bfqd);
/*
@@ -3629,6 +3644,30 @@ static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq);
}
+static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
+{
+ struct bfq_queue *bfqq;
+
+ /*
+ * A linear search; but, with a high probability, very few
+ * steps are needed to find a candidate queue, i.e., a queue
+ * with enough budget left for its next request. In fact:
+ * - BFQ dynamically updates the budget of every queue so as
+ * to accommodate the expected backlog of the queue;
+ * - if a queue gets all its requests dispatched as injected
+ * service, then the queue is removed from the active list
+ * (and re-added only if it gets new requests, but with
+ * enough budget for its new backlog).
+ */
+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
+ if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
+ bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
+ bfq_bfqq_budget_left(bfqq))
+ return bfqq;
+
+ return NULL;
+}
+
/*
* Select a queue for service. If we have a current queue in service,
* check whether to continue servicing it, or retrieve and set a new one.
@@ -3710,10 +3749,19 @@ check_queue:
* No requests pending. However, if the in-service queue is idling
* for a new request, or has requests waiting for a completion and
* may idle after their completion, then keep it anyway.
+ *
+ * Yet, to boost throughput, inject service from other queues if
+ * possible.
*/
if (bfq_bfqq_wait_request(bfqq) ||
(bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
- bfqq = NULL;
+ if (bfq_bfqq_injectable(bfqq) &&
+ bfqq->injected_service * bfqq->inject_coeff <
+ bfqq->entity.service * 10)
+ bfqq = bfq_choose_bfqq_for_injection(bfqd);
+ else
+ bfqq = NULL;
+
goto keep_queue;
}
@@ -3803,6 +3851,14 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
bfq_dispatch_remove(bfqd->queue, rq);
+ if (bfqq != bfqd->in_service_queue) {
+ if (likely(bfqd->in_service_queue))
+ bfqd->in_service_queue->injected_service +=
+ bfq_serv_to_charge(rq, bfqq);
+
+ goto return_rq;
+ }
+
/*
* If weight raising has to terminate for bfqq, then next
* function causes an immediate update of bfqq's weight,
@@ -3821,13 +3877,12 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
* belongs to CLASS_IDLE and other queues are waiting for
* service.
*/
- if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
- goto expire;
-
- return rq;
+ if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq)))
+ goto return_rq;
-expire:
bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
+
+return_rq:
return rq;
}
@@ -4232,6 +4287,13 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_mark_bfqq_has_short_ttime(bfqq);
bfq_mark_bfqq_sync(bfqq);
bfq_mark_bfqq_just_created(bfqq);
+ /*
+ * Aggressively inject a lot of service: up to 90%.
+ * This coefficient remains constant during bfqq life,
+ * but this behavior might be changed, after enough
+ * testing and tuning.
+ */
+ bfqq->inject_coeff = 1;
} else
bfq_clear_bfqq_sync(bfqq);
@@ -4297,7 +4359,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
rcu_read_lock();
- bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
+ bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio));
if (!bfqg) {
bfqq = &bfqd->oom_bfqq;
goto out;
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index a8a2e5aca4d4..37d627afdc2e 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -351,6 +351,32 @@ struct bfq_queue {
unsigned long split_time; /* time of last split */
unsigned long first_IO_time; /* time of first I/O for this queue */
+
+ /* max service rate measured so far */
+ u32 max_service_rate;
+ /*
+ * Ratio between the service received by bfqq while it is in
+ * service, and the cumulative service (of requests of other
+ * queues) that may be injected while bfqq is empty but still
+ * in service. To increase precision, the coefficient is
+ * measured in tenths of unit. Here are some example of (1)
+ * ratios, (2) resulting percentages of service injected
+ * w.r.t. to the total service dispatched while bfqq is in
+ * service, and (3) corresponding values of the coefficient:
+ * 1 (50%) -> 10
+ * 2 (33%) -> 20
+ * 10 (9%) -> 100
+ * 9.9 (9%) -> 99
+ * 1.5 (40%) -> 15
+ * 0.5 (66%) -> 5
+ * 0.1 (90%) -> 1
+ *
+ * So, if the coefficient is lower than 10, then
+ * injected service is more than bfqq service.
+ */
+ unsigned int inject_coeff;
+ /* amount of service injected in current service slot */
+ unsigned int injected_service;
};
/**
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index ae52bff43ce4..ff7c2d470bb8 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -1181,10 +1181,17 @@ bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree)
st = bfq_entity_service_tree(entity);
is_in_service = entity == sd->in_service_entity;
- if (is_in_service) {
- bfq_calc_finish(entity, entity->service);
+ bfq_calc_finish(entity, entity->service);
+
+ if (is_in_service)
sd->in_service_entity = NULL;
- }
+ else
+ /*
+ * Non in-service entity: nobody will take care of
+ * resetting its service counter on expiration. Do it
+ * now.
+ */
+ entity->service = 0;
if (entity->tree == &st->active)
bfq_active_extract(st, entity);
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 67b5fb861a51..290af497997b 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -306,6 +306,8 @@ bool bio_integrity_prep(struct bio *bio)
if (bio_data_dir(bio) == WRITE) {
bio_integrity_process(bio, &bio->bi_iter,
bi->profile->generate_fn);
+ } else {
+ bip->bio_iter = bio->bi_iter;
}
return true;
@@ -331,20 +333,14 @@ static void bio_integrity_verify_fn(struct work_struct *work)
container_of(work, struct bio_integrity_payload, bip_work);
struct bio *bio = bip->bip_bio;
struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
- struct bvec_iter iter = bio->bi_iter;
/*
* At the moment verify is called bio's iterator was advanced
* during split and completion, we need to rewind iterator to
* it's original position.
*/
- if (bio_rewind_iter(bio, &iter, iter.bi_done)) {
- bio->bi_status = bio_integrity_process(bio, &iter,
- bi->profile->verify_fn);
- } else {
- bio->bi_status = BLK_STS_IOERR;
- }
-
+ bio->bi_status = bio_integrity_process(bio, &bip->bio_iter,
+ bi->profile->verify_fn);
bio_integrity_free(bio);
bio_endio(bio);
}
diff --git a/block/bio.c b/block/bio.c
index 0093bed81c0e..17a8b0aa7050 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -609,7 +609,9 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
bio->bi_iter = bio_src->bi_iter;
bio->bi_io_vec = bio_src->bi_io_vec;
- bio_clone_blkcg_association(bio, bio_src);
+ bio_clone_blkg_association(bio, bio_src);
+
+ blkcg_bio_issue_init(bio);
}
EXPORT_SYMBOL(__bio_clone_fast);
@@ -729,7 +731,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
}
/* If we may be able to merge these biovecs, force a recount */
- if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
+ if (bio->bi_vcnt > 1 && biovec_phys_mergeable(q, bvec - 1, bvec))
bio_clear_flag(bio, BIO_SEG_VALID);
done:
@@ -827,6 +829,8 @@ int bio_add_page(struct bio *bio, struct page *page,
}
EXPORT_SYMBOL(bio_add_page);
+#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))
+
/**
* __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
* @bio: bio to add pages to
@@ -839,38 +843,35 @@ EXPORT_SYMBOL(bio_add_page);
*/
static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
- unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt, idx;
+ unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
+ unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
+ ssize_t size, left;
+ unsigned len, i;
size_t offset;
- ssize_t size;
+
+ /*
+ * Move page array up in the allocated memory for the bio vecs as far as
+ * possible so that we can start filling biovecs from the beginning
+ * without overwriting the temporary page array.
+ */
+ BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
+ pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
if (unlikely(size <= 0))
return size ? size : -EFAULT;
- idx = nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;
- /*
- * Deep magic below: We need to walk the pinned pages backwards
- * because we are abusing the space allocated for the bio_vecs
- * for the page array. Because the bio_vecs are larger than the
- * page pointers by definition this will always work. But it also
- * means we can't use bio_add_page, so any changes to it's semantics
- * need to be reflected here as well.
- */
- bio->bi_iter.bi_size += size;
- bio->bi_vcnt += nr_pages;
+ for (left = size, i = 0; left > 0; left -= len, i++) {
+ struct page *page = pages[i];
- while (idx--) {
- bv[idx].bv_page = pages[idx];
- bv[idx].bv_len = PAGE_SIZE;
- bv[idx].bv_offset = 0;
+ len = min_t(size_t, PAGE_SIZE - offset, left);
+ if (WARN_ON_ONCE(bio_add_page(bio, page, len, offset) != len))
+ return -EINVAL;
+ offset = 0;
}
- bv[0].bv_offset += offset;
- bv[0].bv_len -= offset;
- bv[nr_pages - 1].bv_len -= nr_pages * PAGE_SIZE - offset - size;
-
iov_iter_advance(iter, size);
return 0;
}
@@ -1807,7 +1808,6 @@ struct bio *bio_split(struct bio *bio, int sectors,
bio_integrity_trim(split);
bio_advance(bio, split->bi_iter.bi_size);
- bio->bi_iter.bi_done = 0;
if (bio_flagged(bio, BIO_TRACE_COMPLETION))
bio_set_flag(split, BIO_TRACE_COMPLETION);
@@ -1956,69 +1956,131 @@ EXPORT_SYMBOL(bioset_init_from_src);
#ifdef CONFIG_BLK_CGROUP
-#ifdef CONFIG_MEMCG
/**
- * bio_associate_blkcg_from_page - associate a bio with the page's blkcg
+ * bio_associate_blkg - associate a bio with the a blkg
* @bio: target bio
- * @page: the page to lookup the blkcg from
+ * @blkg: the blkg to associate
+ *
+ * This tries to associate @bio with the specified blkg. Association failure
+ * is handled by walking up the blkg tree. Therefore, the blkg associated can
+ * be anything between @blkg and the root_blkg. This situation only happens
+ * when a cgroup is dying and then the remaining bios will spill to the closest
+ * alive blkg.
*
- * Associate @bio with the blkcg from @page's owning memcg. This works like
- * every other associate function wrt references.
+ * A reference will be taken on the @blkg and will be released when @bio is
+ * freed.
*/
-int bio_associate_blkcg_from_page(struct bio *bio, struct page *page)
+int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
{
- struct cgroup_subsys_state *blkcg_css;
-
- if (unlikely(bio->bi_css))
+ if (unlikely(bio->bi_blkg))
return -EBUSY;
- if (!page->mem_cgroup)
- return 0;
- blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup,
- &io_cgrp_subsys);
- bio->bi_css = blkcg_css;
+ bio->bi_blkg = blkg_tryget_closest(blkg);
return 0;
}
-#endif /* CONFIG_MEMCG */
/**
- * bio_associate_blkcg - associate a bio with the specified blkcg
- * @bio: target bio
- * @blkcg_css: css of the blkcg to associate
+ * __bio_associate_blkg_from_css - internal blkg association function
*
- * Associate @bio with the blkcg specified by @blkcg_css. Block layer will
- * treat @bio as if it were issued by a task which belongs to the blkcg.
+ * This in the core association function that all association paths rely on.
+ * A blkg reference is taken which is released upon freeing of the bio.
+ */
+static int __bio_associate_blkg_from_css(struct bio *bio,
+ struct cgroup_subsys_state *css)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+ struct blkcg_gq *blkg;
+ int ret;
+
+ rcu_read_lock();
+
+ if (!css || !css->parent)
+ blkg = q->root_blkg;
+ else
+ blkg = blkg_lookup_create(css_to_blkcg(css), q);
+
+ ret = bio_associate_blkg(bio, blkg);
+
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * bio_associate_blkg_from_css - associate a bio with a specified css
+ * @bio: target bio
+ * @css: target css
*
- * This function takes an extra reference of @blkcg_css which will be put
- * when @bio is released. The caller must own @bio and is responsible for
- * synchronizing calls to this function.
+ * Associate @bio with the blkg found by combining the css's blkg and the
+ * request_queue of the @bio. This falls back to the queue's root_blkg if
+ * the association fails with the css.
*/
-int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
+int bio_associate_blkg_from_css(struct bio *bio,
+ struct cgroup_subsys_state *css)
{
- if (unlikely(bio->bi_css))
+ if (unlikely(bio->bi_blkg))
return -EBUSY;
- css_get(blkcg_css);
- bio->bi_css = blkcg_css;
- return 0;
+ return __bio_associate_blkg_from_css(bio, css);
}
-EXPORT_SYMBOL_GPL(bio_associate_blkcg);
+EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
+#ifdef CONFIG_MEMCG
/**
- * bio_associate_blkg - associate a bio with the specified blkg
+ * bio_associate_blkg_from_page - associate a bio with the page's blkg
* @bio: target bio
- * @blkg: the blkg to associate
+ * @page: the page to lookup the blkcg from
*
- * Associate @bio with the blkg specified by @blkg. This is the queue specific
- * blkcg information associated with the @bio, a reference will be taken on the
- * @blkg and will be freed when the bio is freed.
+ * Associate @bio with the blkg from @page's owning memcg and the respective
+ * request_queue. If cgroup_e_css returns NULL, fall back to the queue's
+ * root_blkg.
+ *
+ * Note: this must be called after bio has an associated device.
*/
-int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
+int bio_associate_blkg_from_page(struct bio *bio, struct page *page)
{
+ struct cgroup_subsys_state *css;
+ int ret;
+
if (unlikely(bio->bi_blkg))
return -EBUSY;
- if (!blkg_try_get(blkg))
- return -ENODEV;
- bio->bi_blkg = blkg;
- return 0;
+ if (!page->mem_cgroup)
+ return 0;
+
+ rcu_read_lock();
+
+ css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
+
+ ret = __bio_associate_blkg_from_css(bio, css);
+
+ rcu_read_unlock();
+ return ret;
+}
+#endif /* CONFIG_MEMCG */
+
+/**
+ * bio_associate_create_blkg - associate a bio with a blkg from q
+ * @q: request_queue where bio is going
+ * @bio: target bio
+ *
+ * Associate @bio with the blkg found from the bio's css and the request_queue.
+ * If one is not found, bio_lookup_blkg creates the blkg. This falls back to
+ * the queue's root_blkg if association fails.
+ */
+int bio_associate_create_blkg(struct request_queue *q, struct bio *bio)
+{
+ struct cgroup_subsys_state *css;
+ int ret = 0;
+
+ /* someone has already associated this bio with a blkg */
+ if (bio->bi_blkg)
+ return ret;
+
+ rcu_read_lock();
+
+ css = blkcg_css();
+
+ ret = __bio_associate_blkg_from_css(bio, css);
+
+ rcu_read_unlock();
+ return ret;
}
/**
@@ -2031,10 +2093,6 @@ void bio_disassociate_task(struct bio *bio)
put_io_context(bio->bi_ioc);
bio->bi_ioc = NULL;
}
- if (bio->bi_css) {
- css_put(bio->bi_css);
- bio->bi_css = NULL;
- }
if (bio->bi_blkg) {
blkg_put(bio->bi_blkg);
bio->bi_blkg = NULL;
@@ -2042,16 +2100,16 @@ void bio_disassociate_task(struct bio *bio)
}
/**
- * bio_clone_blkcg_association - clone blkcg association from src to dst bio
+ * bio_clone_blkg_association - clone blkg association from src to dst bio
* @dst: destination bio
* @src: source bio
*/
-void bio_clone_blkcg_association(struct bio *dst, struct bio *src)
+void bio_clone_blkg_association(struct bio *dst, struct bio *src)
{
- if (src->bi_css)
- WARN_ON(bio_associate_blkcg(dst, src->bi_css));
+ if (src->bi_blkg)
+ bio_associate_blkg(dst, src->bi_blkg);
}
-EXPORT_SYMBOL_GPL(bio_clone_blkcg_association);
+EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
#endif /* CONFIG_BLK_CGROUP */
static void __init biovec_init_slabs(void)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c630e02836a8..992da5592c6e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -84,6 +84,37 @@ static void blkg_free(struct blkcg_gq *blkg)
kfree(blkg);
}
+static void __blkg_release(struct rcu_head *rcu)
+{
+ struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
+
+ percpu_ref_exit(&blkg->refcnt);
+
+ /* release the blkcg and parent blkg refs this blkg has been holding */
+ css_put(&blkg->blkcg->css);
+ if (blkg->parent)
+ blkg_put(blkg->parent);
+
+ wb_congested_put(blkg->wb_congested);
+
+ blkg_free(blkg);
+}
+
+/*
+ * A group is RCU protected, but having an rcu lock does not mean that one
+ * can access all the fields of blkg and assume these are valid. For
+ * example, don't try to follow throtl_data and request queue links.
+ *
+ * Having a reference to blkg under an rcu allows accesses to only values
+ * local to groups like group stats and group rate limits.
+ */
+static void blkg_release(struct percpu_ref *ref)
+{
+ struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);
+
+ call_rcu(&blkg->rcu_head, __blkg_release);
+}
+
/**
* blkg_alloc - allocate a blkg
* @blkcg: block cgroup the new blkg is associated with
@@ -110,7 +141,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
blkg->q = q;
INIT_LIST_HEAD(&blkg->q_node);
blkg->blkcg = blkcg;
- atomic_set(&blkg->refcnt, 1);
/* root blkg uses @q->root_rl, init rl only for !root blkgs */
if (blkcg != &blkcg_root) {
@@ -217,6 +247,11 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
blkg_get(blkg->parent);
}
+ ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0,
+ GFP_NOWAIT | __GFP_NOWARN);
+ if (ret)
+ goto err_cancel_ref;
+
/* invoke per-policy init */
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
@@ -249,6 +284,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
blkg_put(blkg);
return ERR_PTR(ret);
+err_cancel_ref:
+ percpu_ref_exit(&blkg->refcnt);
err_put_congested:
wb_congested_put(wb_congested);
err_put_css:
@@ -259,7 +296,7 @@ err_free_blkg:
}
/**
- * blkg_lookup_create - lookup blkg, try to create one if not there
+ * __blkg_lookup_create - lookup blkg, try to create one if not there
* @blkcg: blkcg of interest
* @q: request_queue of interest
*
@@ -268,12 +305,11 @@ err_free_blkg:
* that all non-root blkg's have access to the parent blkg. This function
* should be called under RCU read lock and @q->queue_lock.
*
- * Returns pointer to the looked up or created blkg on success, ERR_PTR()
- * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not
- * dead and bypassing, returns ERR_PTR(-EBUSY).
+ * Returns the blkg or the closest blkg if blkg_create fails as it walks
+ * down from root.
*/
-struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
- struct request_queue *q)
+struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
+ struct request_queue *q)
{
struct blkcg_gq *blkg;
@@ -285,7 +321,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
* we shouldn't allow anything to go through for a bypassing queue.
*/
if (unlikely(blk_queue_bypass(q)))
- return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
+ return q->root_blkg;
blkg = __blkg_lookup(blkcg, q, true);
if (blkg)
@@ -293,23 +329,58 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
/*
* Create blkgs walking down from blkcg_root to @blkcg, so that all
- * non-root blkgs have access to their parents.
+ * non-root blkgs have access to their parents. Returns the closest
+ * blkg to the intended blkg should blkg_create() fail.
*/
while (true) {
struct blkcg *pos = blkcg;
struct blkcg *parent = blkcg_parent(blkcg);
-
- while (parent && !__blkg_lookup(parent, q, false)) {
+ struct blkcg_gq *ret_blkg = q->root_blkg;
+
+ while (parent) {
+ blkg = __blkg_lookup(parent, q, false);
+ if (blkg) {
+ /* remember closest blkg */
+ ret_blkg = blkg;
+ break;
+ }
pos = parent;
parent = blkcg_parent(parent);
}
blkg = blkg_create(pos, q, NULL);
- if (pos == blkcg || IS_ERR(blkg))
+ if (IS_ERR(blkg))
+ return ret_blkg;
+ if (pos == blkcg)
return blkg;
}
}
+/**
+ * blkg_lookup_create - find or create a blkg
+ * @blkcg: target block cgroup
+ * @q: target request_queue
+ *
+ * This looks up or creates the blkg representing the unique pair
+ * of the blkcg and the request_queue.
+ */
+struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
+ struct request_queue *q)
+{
+ struct blkcg_gq *blkg = blkg_lookup(blkcg, q);
+ unsigned long flags;
+
+ if (unlikely(!blkg)) {
+ spin_lock_irqsave(q->queue_lock, flags);
+
+ blkg = __blkg_lookup_create(blkcg, q);
+
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ }
+
+ return blkg;
+}
+
static void blkg_destroy(struct blkcg_gq *blkg)
{
struct blkcg *blkcg = blkg->blkcg;
@@ -353,7 +424,7 @@ static void blkg_destroy(struct blkcg_gq *blkg)
* Put the reference taken at the time of creation so that when all
* queues are gone, group can be destroyed.
*/
- blkg_put(blkg);
+ percpu_ref_kill(&blkg->refcnt);
}
/**
@@ -381,29 +452,6 @@ static void blkg_destroy_all(struct request_queue *q)
}
/*
- * A group is RCU protected, but having an rcu lock does not mean that one
- * can access all the fields of blkg and assume these are valid. For
- * example, don't try to follow throtl_data and request queue links.
- *
- * Having a reference to blkg under an rcu allows accesses to only values
- * local to groups like group stats and group rate limits.
- */
-void __blkg_release_rcu(struct rcu_head *rcu_head)
-{
- struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
-
- /* release the blkcg and parent blkg refs this blkg has been holding */
- css_put(&blkg->blkcg->css);
- if (blkg->parent)
- blkg_put(blkg->parent);
-
- wb_congested_put(blkg->wb_congested);
-
- blkg_free(blkg);
-}
-EXPORT_SYMBOL_GPL(__blkg_release_rcu);
-
-/*
* The next function used by blk_queue_for_each_rl(). It's a bit tricky
* because the root blkg uses @q->root_rl instead of its own rl.
*/
@@ -1748,8 +1796,7 @@ void blkcg_maybe_throttle_current(void)
blkg = blkg_lookup(blkcg, q);
if (!blkg)
goto out;
- blkg = blkg_try_get(blkg);
- if (!blkg)
+ if (!blkg_tryget(blkg))
goto out;
rcu_read_unlock();
diff --git a/block/blk-core.c b/block/blk-core.c
index cff0a60ee200..f12d2b65e5a5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -42,6 +42,7 @@
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
+#include "blk-pm.h"
#include "blk-rq-qos.h"
#ifdef CONFIG_DEBUG_FS
@@ -421,24 +422,25 @@ void blk_sync_queue(struct request_queue *q)
EXPORT_SYMBOL(blk_sync_queue);
/**
- * blk_set_preempt_only - set QUEUE_FLAG_PREEMPT_ONLY
+ * blk_set_pm_only - increment pm_only counter
* @q: request queue pointer
- *
- * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not
- * set and 1 if the flag was already set.
*/
-int blk_set_preempt_only(struct request_queue *q)
+void blk_set_pm_only(struct request_queue *q)
{
- return blk_queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q);
+ atomic_inc(&q->pm_only);
}
-EXPORT_SYMBOL_GPL(blk_set_preempt_only);
+EXPORT_SYMBOL_GPL(blk_set_pm_only);
-void blk_clear_preempt_only(struct request_queue *q)
+void blk_clear_pm_only(struct request_queue *q)
{
- blk_queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
- wake_up_all(&q->mq_freeze_wq);
+ int pm_only;
+
+ pm_only = atomic_dec_return(&q->pm_only);
+ WARN_ON_ONCE(pm_only < 0);
+ if (pm_only == 0)
+ wake_up_all(&q->mq_freeze_wq);
}
-EXPORT_SYMBOL_GPL(blk_clear_preempt_only);
+EXPORT_SYMBOL_GPL(blk_clear_pm_only);
/**
* __blk_run_queue_uncond - run a queue whether or not it has been stopped
@@ -917,7 +919,7 @@ EXPORT_SYMBOL(blk_alloc_queue);
*/
int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
{
- const bool preempt = flags & BLK_MQ_REQ_PREEMPT;
+ const bool pm = flags & BLK_MQ_REQ_PREEMPT;
while (true) {
bool success = false;
@@ -925,11 +927,11 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
rcu_read_lock();
if (percpu_ref_tryget_live(&q->q_usage_counter)) {
/*
- * The code that sets the PREEMPT_ONLY flag is
- * responsible for ensuring that that flag is globally
- * visible before the queue is unfrozen.
+ * The code that increments the pm_only counter is
+ * responsible for ensuring that that counter is
+ * globally visible before the queue is unfrozen.
*/
- if (preempt || !blk_queue_preempt_only(q)) {
+ if (pm || !blk_queue_pm_only(q)) {
success = true;
} else {
percpu_ref_put(&q->q_usage_counter);
@@ -954,7 +956,8 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
wait_event(q->mq_freeze_wq,
(atomic_read(&q->mq_freeze_depth) == 0 &&
- (preempt || !blk_queue_preempt_only(q))) ||
+ (pm || (blk_pm_request_resume(q),
+ !blk_queue_pm_only(q)))) ||
blk_queue_dying(q));
if (blk_queue_dying(q))
return -ENODEV;
@@ -1726,16 +1729,6 @@ void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part)
}
EXPORT_SYMBOL_GPL(part_round_stats);
-#ifdef CONFIG_PM
-static void blk_pm_put_request(struct request *rq)
-{
- if (rq->q->dev && !(rq->rq_flags & RQF_PM) && !--rq->q->nr_pending)
- pm_runtime_mark_last_busy(rq->q->dev);
-}
-#else
-static inline void blk_pm_put_request(struct request *rq) {}
-#endif
-
void __blk_put_request(struct request_queue *q, struct request *req)
{
req_flags_t rq_flags = req->rq_flags;
@@ -1752,6 +1745,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
blk_req_zone_write_unlock(req);
blk_pm_put_request(req);
+ blk_pm_mark_last_busy(req);
elv_completed_request(q, req);
@@ -2750,30 +2744,6 @@ void blk_account_io_done(struct request *req, u64 now)
}
}
-#ifdef CONFIG_PM
-/*
- * Don't process normal requests when queue is suspended
- * or in the process of suspending/resuming
- */
-static bool blk_pm_allow_request(struct request *rq)
-{
- switch (rq->q->rpm_status) {
- case RPM_RESUMING:
- case RPM_SUSPENDING:
- return rq->rq_flags & RQF_PM;
- case RPM_SUSPENDED:
- return false;
- default:
- return true;
- }
-}
-#else
-static bool blk_pm_allow_request(struct request *rq)
-{
- return true;
-}
-#endif
-
void blk_account_io_start(struct request *rq, bool new_io)
{
struct hd_struct *part;
@@ -2819,11 +2789,14 @@ static struct request *elv_next_request(struct request_queue *q)
while (1) {
list_for_each_entry(rq, &q->queue_head, queuelist) {
- if (blk_pm_allow_request(rq))
- return rq;
-
- if (rq->rq_flags & RQF_SOFTBARRIER)
- break;
+#ifdef CONFIG_PM
+ /*
+ * If a request gets queued in state RPM_SUSPENDED
+ * then that's a kernel bug.
+ */
+ WARN_ON_ONCE(q->rpm_status == RPM_SUSPENDED);
+#endif
+ return rq;
}
/*
@@ -3755,191 +3728,6 @@ void blk_finish_plug(struct blk_plug *plug)
}
EXPORT_SYMBOL(blk_finish_plug);
-#ifdef CONFIG_PM
-/**
- * blk_pm_runtime_init - Block layer runtime PM initialization routine
- * @q: the queue of the device
- * @dev: the device the queue belongs to
- *
- * Description:
- * Initialize runtime-PM-related fields for @q and start auto suspend for
- * @dev. Drivers that want to take advantage of request-based runtime PM
- * should call this function after @dev has been initialized, and its
- * request queue @q has been allocated, and runtime PM for it can not happen
- * yet(either due to disabled/forbidden or its usage_count > 0). In most
- * cases, driver should call this function before any I/O has taken place.
- *
- * This function takes care of setting up using auto suspend for the device,
- * the autosuspend delay is set to -1 to make runtime suspend impossible
- * until an updated value is either set by user or by driver. Drivers do
- * not need to touch other autosuspend settings.
- *
- * The block layer runtime PM is request based, so only works for drivers
- * that use request as their IO unit instead of those directly use bio's.
- */
-void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
-{
- /* Don't enable runtime PM for blk-mq until it is ready */
- if (q->mq_ops) {
- pm_runtime_disable(dev);
- return;
- }
-
- q->dev = dev;
- q->rpm_status = RPM_ACTIVE;
- pm_runtime_set_autosuspend_delay(q->dev, -1);
- pm_runtime_use_autosuspend(q->dev);
-}
-EXPORT_SYMBOL(blk_pm_runtime_init);
-
-/**
- * blk_pre_runtime_suspend - Pre runtime suspend check
- * @q: the queue of the device
- *
- * Description:
- * This function will check if runtime suspend is allowed for the device
- * by examining if there are any requests pending in the queue. If there
- * are requests pending, the device can not be runtime suspended; otherwise,
- * the queue's status will be updated to SUSPENDING and the driver can
- * proceed to suspend the device.
- *
- * For the not allowed case, we mark last busy for the device so that
- * runtime PM core will try to autosuspend it some time later.
- *
- * This function should be called near the start of the device's
- * runtime_suspend callback.
- *
- * Return:
- * 0 - OK to runtime suspend the device
- * -EBUSY - Device should not be runtime suspended
- */
-int blk_pre_runtime_suspend(struct request_queue *q)
-{
- int ret = 0;
-
- if (!q->dev)
- return ret;
-
- spin_lock_irq(q->queue_lock);
- if (q->nr_pending) {
- ret = -EBUSY;
- pm_runtime_mark_last_busy(q->dev);
- } else {
- q->rpm_status = RPM_SUSPENDING;
- }
- spin_unlock_irq(q->queue_lock);
- return ret;
-}
-EXPORT_SYMBOL(blk_pre_runtime_suspend);
-
-/**
- * blk_post_runtime_suspend - Post runtime suspend processing
- * @q: the queue of the device
- * @err: return value of the device's runtime_suspend function
- *
- * Description:
- * Update the queue's runtime status according to the return value of the
- * device's runtime suspend function and mark last busy for the device so
- * that PM core will try to auto suspend the device at a later time.
- *
- * This function should be called near the end of the device's
- * runtime_suspend callback.
- */
-void blk_post_runtime_suspend(struct request_queue *q, int err)
-{
- if (!q->dev)
- return;
-
- spin_lock_irq(q->queue_lock);
- if (!err) {
- q->rpm_status = RPM_SUSPENDED;
- } else {
- q->rpm_status = RPM_ACTIVE;
- pm_runtime_mark_last_busy(q->dev);
- }
- spin_unlock_irq(q->queue_lock);
-}
-EXPORT_SYMBOL(blk_post_runtime_suspend);
-
-/**
- * blk_pre_runtime_resume - Pre runtime resume processing
- * @q: the queue of the device
- *
- * Description:
- * Update the queue's runtime status to RESUMING in preparation for the
- * runtime resume of the device.
- *
- * This function should be called near the start of the device's
- * runtime_resume callback.
- */
-void blk_pre_runtime_resume(struct request_queue *q)
-{
- if (!q->dev)
- return;
-
- spin_lock_irq(q->queue_lock);
- q->rpm_status = RPM_RESUMING;
- spin_unlock_irq(q->queue_lock);
-}
-EXPORT_SYMBOL(blk_pre_runtime_resume);
-
-/**
- * blk_post_runtime_resume - Post runtime resume processing
- * @q: the queue of the device
- * @err: return value of the device's runtime_resume function
- *
- * Description:
- * Update the queue's runtime status according to the return value of the
- * device's runtime_resume function. If it is successfully resumed, process
- * the requests that are queued into the device's queue when it is resuming
- * and then mark last busy and initiate autosuspend for it.
- *
- * This function should be called near the end of the device's
- * runtime_resume callback.
- */
-void blk_post_runtime_resume(struct request_queue *q, int err)
-{
- if (!q->dev)
- return;
-
- spin_lock_irq(q->queue_lock);
- if (!err) {
- q->rpm_status = RPM_ACTIVE;
- __blk_run_queue(q);
- pm_runtime_mark_last_busy(q->dev);
- pm_request_autosuspend(q->dev);
- } else {
- q->rpm_status = RPM_SUSPENDED;
- }
- spin_unlock_irq(q->queue_lock);
-}
-EXPORT_SYMBOL(blk_post_runtime_resume);
-
-/**
- * blk_set_runtime_active - Force runtime status of the queue to be active
- * @q: the queue of the device
- *
- * If the device is left runtime suspended during system suspend the resume
- * hook typically resumes the device and corrects runtime status
- * accordingly. However, that does not affect the queue runtime PM status
- * which is still "suspended". This prevents processing requests from the
- * queue.
- *
- * This function can be used in driver's resume hook to correct queue
- * runtime PM status and re-enable peeking requests from the queue. It
- * should be called before first request is added to the queue.
- */
-void blk_set_runtime_active(struct request_queue *q)
-{
- spin_lock_irq(q->queue_lock);
- q->rpm_status = RPM_ACTIVE;
- pm_runtime_mark_last_busy(q->dev);
- pm_request_autosuspend(q->dev);
- spin_unlock_irq(q->queue_lock);
-}
-EXPORT_SYMBOL(blk_set_runtime_active);
-#endif
-
int __init blk_dev_init(void)
{
BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 6121611e1316..d1ab089e0919 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -49,12 +49,8 @@ int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio)
bio_for_each_integrity_vec(iv, bio, iter) {
if (prev) {
- if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv))
+ if (!biovec_phys_mergeable(q, &ivprv, &iv))
goto new_segment;
-
- if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv))
- goto new_segment;
-
if (seg_size + iv.bv_len > queue_max_segment_size(q))
goto new_segment;
@@ -95,12 +91,8 @@ int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio,
bio_for_each_integrity_vec(iv, bio, iter) {
if (prev) {
- if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv))
+ if (!biovec_phys_mergeable(q, &ivprv, &iv))
goto new_segment;
-
- if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv))
- goto new_segment;
-
if (sg->length + iv.bv_len > queue_max_segment_size(q))
goto new_segment;
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 19923f8a029d..35c48d7b8f78 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -115,9 +115,22 @@ struct child_latency_info {
atomic_t scale_cookie;
};
+struct percentile_stats {
+ u64 total;
+ u64 missed;
+};
+
+struct latency_stat {
+ union {
+ struct percentile_stats ps;
+ struct blk_rq_stat rqs;
+ };
+};
+
struct iolatency_grp {
struct blkg_policy_data pd;
- struct blk_rq_stat __percpu *stats;
+ struct latency_stat __percpu *stats;
+ struct latency_stat cur_stat;
struct blk_iolatency *blkiolat;
struct rq_depth rq_depth;
struct rq_wait rq_wait;
@@ -132,6 +145,7 @@ struct iolatency_grp {
/* Our current number of IO's for the last summation. */
u64 nr_samples;
+ bool ssd;
struct child_latency_info child_lat;
};
@@ -172,6 +186,80 @@ static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
return pd_to_blkg(&iolat->pd);
}
+static inline void latency_stat_init(struct iolatency_grp *iolat,
+ struct latency_stat *stat)
+{
+ if (iolat->ssd) {
+ stat->ps.total = 0;
+ stat->ps.missed = 0;
+ } else
+ blk_rq_stat_init(&stat->rqs);
+}
+
+static inline void latency_stat_sum(struct iolatency_grp *iolat,
+ struct latency_stat *sum,
+ struct latency_stat *stat)
+{
+ if (iolat->ssd) {
+ sum->ps.total += stat->ps.total;
+ sum->ps.missed += stat->ps.missed;
+ } else
+ blk_rq_stat_sum(&sum->rqs, &stat->rqs);
+}
+
+static inline void latency_stat_record_time(struct iolatency_grp *iolat,
+ u64 req_time)
+{
+ struct latency_stat *stat = get_cpu_ptr(iolat->stats);
+ if (iolat->ssd) {
+ if (req_time >= iolat->min_lat_nsec)
+ stat->ps.missed++;
+ stat->ps.total++;
+ } else
+ blk_rq_stat_add(&stat->rqs, req_time);
+ put_cpu_ptr(stat);
+}
+
+static inline bool latency_sum_ok(struct iolatency_grp *iolat,
+ struct latency_stat *stat)
+{
+ if (iolat->ssd) {
+ u64 thresh = div64_u64(stat->ps.total, 10);
+ thresh = max(thresh, 1ULL);
+ return stat->ps.missed < thresh;
+ }
+ return stat->rqs.mean <= iolat->min_lat_nsec;
+}
+
+static inline u64 latency_stat_samples(struct iolatency_grp *iolat,
+ struct latency_stat *stat)
+{
+ if (iolat->ssd)
+ return stat->ps.total;
+ return stat->rqs.nr_samples;
+}
+
+static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
+ struct latency_stat *stat)
+{
+ int exp_idx;
+
+ if (iolat->ssd)
+ return;
+
+ /*
+ * CALC_LOAD takes in a number stored in fixed point representation.
+ * Because we are using this for IO time in ns, the values stored
+ * are significantly larger than the FIXED_1 denominator (2048).
+ * Therefore, rounding errors in the calculation are negligible and
+ * can be ignored.
+ */
+ exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
+ div64_u64(iolat->cur_win_nsec,
+ BLKIOLATENCY_EXP_BUCKET_SIZE));
+ CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat->rqs.mean);
+}
+
static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
wait_queue_entry_t *wait,
bool first_block)
@@ -255,7 +343,7 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat,
struct child_latency_info *lat_info,
bool up)
{
- unsigned long qd = blk_queue_depth(blkiolat->rqos.q);
+ unsigned long qd = blkiolat->rqos.q->nr_requests;
unsigned long scale = scale_amount(qd, up);
unsigned long old = atomic_read(&lat_info->scale_cookie);
unsigned long max_scale = qd << 1;
@@ -295,10 +383,9 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat,
*/
static void scale_change(struct iolatency_grp *iolat, bool up)
{
- unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q);
+ unsigned long qd = iolat->blkiolat->rqos.q->nr_requests;
unsigned long scale = scale_amount(qd, up);
unsigned long old = iolat->rq_depth.max_depth;
- bool changed = false;
if (old > qd)
old = qd;
@@ -308,15 +395,13 @@ static void scale_change(struct iolatency_grp *iolat, bool up)
return;
if (old < qd) {
- changed = true;
old += scale;
old = min(old, qd);
iolat->rq_depth.max_depth = old;
wake_up_all(&iolat->rq_wait.wait);
}
- } else if (old > 1) {
+ } else {
old >>= 1;
- changed = true;
iolat->rq_depth.max_depth = max(old, 1UL);
}
}
@@ -369,7 +454,7 @@ static void check_scale_change(struct iolatency_grp *iolat)
* scale down event.
*/
samples_thresh = lat_info->nr_samples * 5;
- samples_thresh = div64_u64(samples_thresh, 100);
+ samples_thresh = max(1ULL, div64_u64(samples_thresh, 100));
if (iolat->nr_samples <= samples_thresh)
return;
}
@@ -395,34 +480,12 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio,
spinlock_t *lock)
{
struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
- struct blkcg *blkcg;
- struct blkcg_gq *blkg;
- struct request_queue *q = rqos->q;
+ struct blkcg_gq *blkg = bio->bi_blkg;
bool issue_as_root = bio_issue_as_root_blkg(bio);
if (!blk_iolatency_enabled(blkiolat))
return;
- rcu_read_lock();
- blkcg = bio_blkcg(bio);
- bio_associate_blkcg(bio, &blkcg->css);
- blkg = blkg_lookup(blkcg, q);
- if (unlikely(!blkg)) {
- if (!lock)
- spin_lock_irq(q->queue_lock);
- blkg = blkg_lookup_create(blkcg, q);
- if (IS_ERR(blkg))
- blkg = NULL;
- if (!lock)
- spin_unlock_irq(q->queue_lock);
- }
- if (!blkg)
- goto out;
-
- bio_issue_init(&bio->bi_issue, bio_sectors(bio));
- bio_associate_blkg(bio, blkg);
-out:
- rcu_read_unlock();
while (blkg && blkg->parent) {
struct iolatency_grp *iolat = blkg_to_lat(blkg);
if (!iolat) {
@@ -443,7 +506,6 @@ static void iolatency_record_time(struct iolatency_grp *iolat,
struct bio_issue *issue, u64 now,
bool issue_as_root)
{
- struct blk_rq_stat *rq_stat;
u64 start = bio_issue_time(issue);
u64 req_time;
@@ -469,9 +531,7 @@ static void iolatency_record_time(struct iolatency_grp *iolat,
return;
}
- rq_stat = get_cpu_ptr(iolat->stats);
- blk_rq_stat_add(rq_stat, req_time);
- put_cpu_ptr(rq_stat);
+ latency_stat_record_time(iolat, req_time);
}
#define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
@@ -482,17 +542,17 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
struct blkcg_gq *blkg = lat_to_blkg(iolat);
struct iolatency_grp *parent;
struct child_latency_info *lat_info;
- struct blk_rq_stat stat;
+ struct latency_stat stat;
unsigned long flags;
- int cpu, exp_idx;
+ int cpu;
- blk_rq_stat_init(&stat);
+ latency_stat_init(iolat, &stat);
preempt_disable();
for_each_online_cpu(cpu) {
- struct blk_rq_stat *s;
+ struct latency_stat *s;
s = per_cpu_ptr(iolat->stats, cpu);
- blk_rq_stat_sum(&stat, s);
- blk_rq_stat_init(s);
+ latency_stat_sum(iolat, &stat, s);
+ latency_stat_init(iolat, s);
}
preempt_enable();
@@ -502,41 +562,36 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
lat_info = &parent->child_lat;
- /*
- * CALC_LOAD takes in a number stored in fixed point representation.
- * Because we are using this for IO time in ns, the values stored
- * are significantly larger than the FIXED_1 denominator (2048).
- * Therefore, rounding errors in the calculation are negligible and
- * can be ignored.
- */
- exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
- div64_u64(iolat->cur_win_nsec,
- BLKIOLATENCY_EXP_BUCKET_SIZE));
- CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean);
+ iolat_update_total_lat_avg(iolat, &stat);
/* Everything is ok and we don't need to adjust the scale. */
- if (stat.mean <= iolat->min_lat_nsec &&
+ if (latency_sum_ok(iolat, &stat) &&
atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
return;
/* Somebody beat us to the punch, just bail. */
spin_lock_irqsave(&lat_info->lock, flags);
+
+ latency_stat_sum(iolat, &iolat->cur_stat, &stat);
lat_info->nr_samples -= iolat->nr_samples;
- lat_info->nr_samples += stat.nr_samples;
- iolat->nr_samples = stat.nr_samples;
+ lat_info->nr_samples += latency_stat_samples(iolat, &iolat->cur_stat);
+ iolat->nr_samples = latency_stat_samples(iolat, &iolat->cur_stat);
if ((lat_info->last_scale_event >= now ||
- now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) &&
- lat_info->scale_lat <= iolat->min_lat_nsec)
+ now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME))
goto out;
- if (stat.mean <= iolat->min_lat_nsec &&
- stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) {
+ if (latency_sum_ok(iolat, &iolat->cur_stat) &&
+ latency_sum_ok(iolat, &stat)) {
+ if (latency_stat_samples(iolat, &iolat->cur_stat) <
+ BLKIOLATENCY_MIN_GOOD_SAMPLES)
+ goto out;
if (lat_info->scale_grp == iolat) {
lat_info->last_scale_event = now;
scale_cookie_change(iolat->blkiolat, lat_info, true);
}
- } else if (stat.mean > iolat->min_lat_nsec) {
+ } else if (lat_info->scale_lat == 0 ||
+ lat_info->scale_lat >= iolat->min_lat_nsec) {
lat_info->last_scale_event = now;
if (!lat_info->scale_grp ||
lat_info->scale_lat > iolat->min_lat_nsec) {
@@ -545,6 +600,7 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
}
scale_cookie_change(iolat->blkiolat, lat_info, false);
}
+ latency_stat_init(iolat, &iolat->cur_stat);
out:
spin_unlock_irqrestore(&lat_info->lock, flags);
}
@@ -650,7 +706,7 @@ static void blkiolatency_timer_fn(struct timer_list *t)
* We could be exiting, don't access the pd unless we have a
* ref on the blkg.
*/
- if (!blkg_try_get(blkg))
+ if (!blkg_tryget(blkg))
continue;
iolat = blkg_to_lat(blkg);
@@ -761,7 +817,6 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
{
struct blkcg *blkcg = css_to_blkcg(of_css(of));
struct blkcg_gq *blkg;
- struct blk_iolatency *blkiolat;
struct blkg_conf_ctx ctx;
struct iolatency_grp *iolat;
char *p, *tok;
@@ -774,7 +829,6 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
return ret;
iolat = blkg_to_lat(ctx.blkg);
- blkiolat = iolat->blkiolat;
p = ctx.body;
ret = -EINVAL;
@@ -835,13 +889,43 @@ static int iolatency_print_limit(struct seq_file *sf, void *v)
return 0;
}
+static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf,
+ size_t size)
+{
+ struct latency_stat stat;
+ int cpu;
+
+ latency_stat_init(iolat, &stat);
+ preempt_disable();
+ for_each_online_cpu(cpu) {
+ struct latency_stat *s;
+ s = per_cpu_ptr(iolat->stats, cpu);
+ latency_stat_sum(iolat, &stat, s);
+ }
+ preempt_enable();
+
+ if (iolat->rq_depth.max_depth == UINT_MAX)
+ return scnprintf(buf, size, " missed=%llu total=%llu depth=max",
+ (unsigned long long)stat.ps.missed,
+ (unsigned long long)stat.ps.total);
+ return scnprintf(buf, size, " missed=%llu total=%llu depth=%u",
+ (unsigned long long)stat.ps.missed,
+ (unsigned long long)stat.ps.total,
+ iolat->rq_depth.max_depth);
+}
+
static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
size_t size)
{
struct iolatency_grp *iolat = pd_to_lat(pd);
- unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
- unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
+ unsigned long long avg_lat;
+ unsigned long long cur_win;
+
+ if (iolat->ssd)
+ return iolatency_ssd_stat(iolat, buf, size);
+ avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
+ cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
if (iolat->rq_depth.max_depth == UINT_MAX)
return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
avg_lat, cur_win);
@@ -858,8 +942,8 @@ static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
iolat = kzalloc_node(sizeof(*iolat), gfp, node);
if (!iolat)
return NULL;
- iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat),
- __alignof__(struct blk_rq_stat), gfp);
+ iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat),
+ __alignof__(struct latency_stat), gfp);
if (!iolat->stats) {
kfree(iolat);
return NULL;
@@ -876,15 +960,21 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
u64 now = ktime_to_ns(ktime_get());
int cpu;
+ if (blk_queue_nonrot(blkg->q))
+ iolat->ssd = true;
+ else
+ iolat->ssd = false;
+
for_each_possible_cpu(cpu) {
- struct blk_rq_stat *stat;
+ struct latency_stat *stat;
stat = per_cpu_ptr(iolat->stats, cpu);
- blk_rq_stat_init(stat);
+ latency_stat_init(iolat, stat);
}
+ latency_stat_init(iolat, &iolat->cur_stat);
rq_wait_init(&iolat->rq_wait);
spin_lock_init(&iolat->child_lat.lock);
- iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q);
+ iolat->rq_depth.queue_depth = blkg->q->nr_requests;
iolat->rq_depth.max_depth = UINT_MAX;
iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
iolat->blkiolat = blkiolat;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index aaec38cc37b8..42a46744c11b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -12,6 +12,69 @@
#include "blk.h"
+/*
+ * Check if the two bvecs from two bios can be merged to one segment. If yes,
+ * no need to check gap between the two bios since the 1st bio and the 1st bvec
+ * in the 2nd bio can be handled in one segment.
+ */
+static inline bool bios_segs_mergeable(struct request_queue *q,
+ struct bio *prev, struct bio_vec *prev_last_bv,
+ struct bio_vec *next_first_bv)
+{
+ if (!biovec_phys_mergeable(q, prev_last_bv, next_first_bv))
+ return false;
+ if (prev->bi_seg_back_size + next_first_bv->bv_len >
+ queue_max_segment_size(q))
+ return false;
+ return true;
+}
+
+static inline bool bio_will_gap(struct request_queue *q,
+ struct request *prev_rq, struct bio *prev, struct bio *next)
+{
+ struct bio_vec pb, nb;
+
+ if (!bio_has_data(prev) || !queue_virt_boundary(q))
+ return false;
+
+ /*
+ * Don't merge if the 1st bio starts with non-zero offset, otherwise it
+ * is quite difficult to respect the sg gap limit. We work hard to
+ * merge a huge number of small single bios in case of mkfs.
+ */
+ if (prev_rq)
+ bio_get_first_bvec(prev_rq->bio, &pb);
+ else
+ bio_get_first_bvec(prev, &pb);
+ if (pb.bv_offset)
+ return true;
+
+ /*
+ * We don't need to worry about the situation that the merged segment
+ * ends in unaligned virt boundary:
+ *
+ * - if 'pb' ends aligned, the merged segment ends aligned
+ * - if 'pb' ends unaligned, the next bio must include
+ * one single bvec of 'nb', otherwise the 'nb' can't
+ * merge with 'pb'
+ */
+ bio_get_last_bvec(prev, &pb);
+ bio_get_first_bvec(next, &nb);
+ if (bios_segs_mergeable(q, prev, &pb, &nb))
+ return false;
+ return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
+}
+
+static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
+{
+ return bio_will_gap(req->q, req, req->biotail, bio);
+}
+
+static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
+{
+ return bio_will_gap(req->q, NULL, bio, req->bio);
+}
+
static struct bio *blk_bio_discard_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
@@ -134,9 +197,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
if (bvprvp && blk_queue_cluster(q)) {
if (seg_size + bv.bv_len > queue_max_segment_size(q))
goto new_segment;
- if (!BIOVEC_PHYS_MERGEABLE(bvprvp, &bv))
- goto new_segment;
- if (!BIOVEC_SEG_BOUNDARY(q, bvprvp, &bv))
+ if (!biovec_phys_mergeable(q, bvprvp, &bv))
goto new_segment;
seg_size += bv.bv_len;
@@ -267,9 +328,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
if (seg_size + bv.bv_len
> queue_max_segment_size(q))
goto new_segment;
- if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv))
- goto new_segment;
- if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv))
+ if (!biovec_phys_mergeable(q, &bvprv, &bv))
goto new_segment;
seg_size += bv.bv_len;
@@ -349,17 +408,7 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
bio_get_last_bvec(bio, &end_bv);
bio_get_first_bvec(nxt, &nxt_bv);
- if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv))
- return 0;
-
- /*
- * bio and nxt are contiguous in memory; check if the queue allows
- * these two to be merged into one
- */
- if (BIOVEC_SEG_BOUNDARY(q, &end_bv, &nxt_bv))
- return 1;
-
- return 0;
+ return biovec_phys_mergeable(q, &end_bv, &nxt_bv);
}
static inline void
@@ -373,10 +422,7 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
if (*sg && *cluster) {
if ((*sg)->length + nbytes > queue_max_segment_size(q))
goto new_segment;
-
- if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
- goto new_segment;
- if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
+ if (!biovec_phys_mergeable(q, bvprv, bvec))
goto new_segment;
(*sg)->length += nbytes;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index cb1e6cf7ac48..a5ea86835fcb 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -102,6 +102,14 @@ static int blk_flags_show(struct seq_file *m, const unsigned long flags,
return 0;
}
+static int queue_pm_only_show(void *data, struct seq_file *m)
+{
+ struct request_queue *q = data;
+
+ seq_printf(m, "%d\n", atomic_read(&q->pm_only));
+ return 0;
+}
+
#define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name
static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(QUEUED),
@@ -132,7 +140,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(REGISTERED),
QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
QUEUE_FLAG_NAME(QUIESCED),
- QUEUE_FLAG_NAME(PREEMPT_ONLY),
};
#undef QUEUE_FLAG_NAME
@@ -209,6 +216,7 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf,
static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
{ "poll_stat", 0400, queue_poll_stat_show },
{ "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
+ { "pm_only", 0600, queue_pm_only_show, NULL },
{ "state", 0600, queue_state_show, queue_state_write },
{ "write_hints", 0600, queue_write_hint_show, queue_write_hint_store },
{ "zone_wlock", 0400, queue_zone_wlock_show, NULL },
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 4e028ee42430..8a9544203173 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -49,12 +49,12 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
return true;
}
-static inline void blk_mq_sched_completed_request(struct request *rq)
+static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
{
struct elevator_queue *e = rq->q->elevator;
if (e && e->type->ops.mq.completed_request)
- e->type->ops.mq.completed_request(rq);
+ e->type->ops.mq.completed_request(rq, now);
}
static inline void blk_mq_sched_started_request(struct request *rq)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 41317c50a446..cfda95b85d34 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -232,13 +232,26 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
/*
* We can hit rq == NULL here, because the tagging functions
- * test and set the bit before assining ->rqs[].
+ * test and set the bit before assigning ->rqs[].
*/
if (rq && rq->q == hctx->queue)
iter_data->fn(hctx, rq, iter_data->data, reserved);
return true;
}
+/**
+ * bt_for_each - iterate over the requests associated with a hardware queue
+ * @hctx: Hardware queue to examine.
+ * @bt: sbitmap to examine. This is either the breserved_tags member
+ * or the bitmap_tags member of struct blk_mq_tags.
+ * @fn: Pointer to the function that will be called for each request
+ * associated with @hctx that has been assigned a driver tag.
+ * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved)
+ * where rq is a pointer to a request.
+ * @data: Will be passed as third argument to @fn.
+ * @reserved: Indicates whether @bt is the breserved_tags member or the
+ * bitmap_tags member of struct blk_mq_tags.
+ */
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
busy_iter_fn *fn, void *data, bool reserved)
{
@@ -280,6 +293,18 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
return true;
}
+/**
+ * bt_tags_for_each - iterate over the requests in a tag map
+ * @tags: Tag map to iterate over.
+ * @bt: sbitmap to examine. This is either the breserved_tags member
+ * or the bitmap_tags member of struct blk_mq_tags.
+ * @fn: Pointer to the function that will be called for each started
+ * request. @fn will be called as follows: @fn(rq, @data,
+ * @reserved) where rq is a pointer to a request.
+ * @data: Will be passed as second argument to @fn.
+ * @reserved: Indicates whether @bt is the breserved_tags member or the
+ * bitmap_tags member of struct blk_mq_tags.
+ */
static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
busy_tag_iter_fn *fn, void *data, bool reserved)
{
@@ -294,6 +319,15 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
}
+/**
+ * blk_mq_all_tag_busy_iter - iterate over all started requests in a tag map
+ * @tags: Tag map to iterate over.
+ * @fn: Pointer to the function that will be called for each started
+ * request. @fn will be called as follows: @fn(rq, @priv,
+ * reserved) where rq is a pointer to a request. 'reserved'
+ * indicates whether or not @rq is a reserved request.
+ * @priv: Will be passed as second argument to @fn.
+ */
static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
busy_tag_iter_fn *fn, void *priv)
{
@@ -302,6 +336,15 @@ static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false);
}
+/**
+ * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set
+ * @tagset: Tag set to iterate over.
+ * @fn: Pointer to the function that will be called for each started
+ * request. @fn will be called as follows: @fn(rq, @priv,
+ * reserved) where rq is a pointer to a request. 'reserved'
+ * indicates whether or not @rq is a reserved request.
+ * @priv: Will be passed as second argument to @fn.
+ */
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv)
{
@@ -314,6 +357,20 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
+/**
+ * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
+ * @q: Request queue to examine.
+ * @fn: Pointer to the function that will be called for each request
+ * on @q. @fn will be called as follows: @fn(hctx, rq, @priv,
+ * reserved) where rq is a pointer to a request and hctx points
+ * to the hardware queue associated with the request. 'reserved'
+ * indicates whether or not @rq is a reserved request.
+ * @priv: Will be passed as third argument to @fn.
+ *
+ * Note: if @q->tag_set is shared with other request queues then @fn will be
+ * called for all requests on all queues that share that tag set and not only
+ * for requests associated with @q.
+ */
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv)
{
@@ -321,9 +378,11 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
int i;
/*
- * __blk_mq_update_nr_hw_queues will update the nr_hw_queues and
- * queue_hw_ctx after freeze the queue, so we use q_usage_counter
- * to avoid race with it.
+ * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
+ * while the queue is frozen. So we can use q_usage_counter to avoid
+ * racing with it. __blk_mq_update_nr_hw_queues() uses
+ * synchronize_rcu() to ensure this function left the critical section
+ * below.
*/
if (!percpu_ref_tryget(&q->q_usage_counter))
return;
@@ -332,7 +391,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
struct blk_mq_tags *tags = hctx->tags;
/*
- * If not software queues are currently mapped to this
+ * If no software queues are currently mapped to this
* hardware queue, there's nothing to check
*/
if (!blk_mq_hw_queue_mapped(hctx))
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e3c39ea8e17b..1dc157c85a83 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -33,6 +33,7 @@
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
+#include "blk-pm.h"
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
@@ -198,7 +199,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
WARN_ON_ONCE(freeze_depth < 0);
if (!freeze_depth) {
- percpu_ref_reinit(&q->q_usage_counter);
+ percpu_ref_resurrect(&q->q_usage_counter);
wake_up_all(&q->mq_freeze_wq);
}
}
@@ -475,6 +476,7 @@ static void __blk_mq_free_request(struct request *rq)
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
const int sched_tag = rq->internal_tag;
+ blk_pm_mark_last_busy(rq);
if (rq->tag != -1)
blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
if (sched_tag != -1)
@@ -526,6 +528,9 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
blk_stat_add(rq, now);
}
+ if (rq->internal_tag != -1)
+ blk_mq_sched_completed_request(rq, now);
+
blk_account_io_done(rq, now);
if (rq->end_io) {
@@ -562,8 +567,6 @@ static void __blk_mq_complete_request(struct request *rq)
if (!blk_mq_mark_complete(rq))
return;
- if (rq->internal_tag != -1)
- blk_mq_sched_completed_request(rq);
if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
rq->q->softirq_done_fn(rq);
diff --git a/block/blk-pm.c b/block/blk-pm.c
new file mode 100644
index 000000000000..f8fdae01bea2
--- /dev/null
+++ b/block/blk-pm.c
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/blk-mq.h>
+#include <linux/blk-pm.h>
+#include <linux/blkdev.h>
+#include <linux/pm_runtime.h>
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+
+/**
+ * blk_pm_runtime_init - Block layer runtime PM initialization routine
+ * @q: the queue of the device
+ * @dev: the device the queue belongs to
+ *
+ * Description:
+ * Initialize runtime-PM-related fields for @q and start auto suspend for
+ * @dev. Drivers that want to take advantage of request-based runtime PM
+ * should call this function after @dev has been initialized, and its
+ * request queue @q has been allocated, and runtime PM for it can not happen
+ * yet(either due to disabled/forbidden or its usage_count > 0). In most
+ * cases, driver should call this function before any I/O has taken place.
+ *
+ * This function takes care of setting up using auto suspend for the device,
+ * the autosuspend delay is set to -1 to make runtime suspend impossible
+ * until an updated value is either set by user or by driver. Drivers do
+ * not need to touch other autosuspend settings.
+ *
+ * The block layer runtime PM is request based, so only works for drivers
+ * that use request as their IO unit instead of those directly use bio's.
+ */
+void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
+{
+ q->dev = dev;
+ q->rpm_status = RPM_ACTIVE;
+ pm_runtime_set_autosuspend_delay(q->dev, -1);
+ pm_runtime_use_autosuspend(q->dev);
+}
+EXPORT_SYMBOL(blk_pm_runtime_init);
+
+/**
+ * blk_pre_runtime_suspend - Pre runtime suspend check
+ * @q: the queue of the device
+ *
+ * Description:
+ * This function will check if runtime suspend is allowed for the device
+ * by examining if there are any requests pending in the queue. If there
+ * are requests pending, the device can not be runtime suspended; otherwise,
+ * the queue's status will be updated to SUSPENDING and the driver can
+ * proceed to suspend the device.
+ *
+ * For the not allowed case, we mark last busy for the device so that
+ * runtime PM core will try to autosuspend it some time later.
+ *
+ * This function should be called near the start of the device's
+ * runtime_suspend callback.
+ *
+ * Return:
+ * 0 - OK to runtime suspend the device
+ * -EBUSY - Device should not be runtime suspended
+ */
+int blk_pre_runtime_suspend(struct request_queue *q)
+{
+ int ret = 0;
+
+ if (!q->dev)
+ return ret;
+
+ WARN_ON_ONCE(q->rpm_status != RPM_ACTIVE);
+
+ /*
+ * Increase the pm_only counter before checking whether any
+ * non-PM blk_queue_enter() calls are in progress to avoid that any
+ * new non-PM blk_queue_enter() calls succeed before the pm_only
+ * counter is decreased again.
+ */
+ blk_set_pm_only(q);
+ ret = -EBUSY;
+ /* Switch q_usage_counter from per-cpu to atomic mode. */
+ blk_freeze_queue_start(q);
+ /*
+ * Wait until atomic mode has been reached. Since that
+ * involves calling call_rcu(), it is guaranteed that later
+ * blk_queue_enter() calls see the pm-only state. See also
+ * http://lwn.net/Articles/573497/.
+ */
+ percpu_ref_switch_to_atomic_sync(&q->q_usage_counter);
+ if (percpu_ref_is_zero(&q->q_usage_counter))
+ ret = 0;
+ /* Switch q_usage_counter back to per-cpu mode. */
+ blk_mq_unfreeze_queue(q);
+
+ spin_lock_irq(q->queue_lock);
+ if (ret < 0)
+ pm_runtime_mark_last_busy(q->dev);
+ else
+ q->rpm_status = RPM_SUSPENDING;
+ spin_unlock_irq(q->queue_lock);
+
+ if (ret)
+ blk_clear_pm_only(q);
+
+ return ret;
+}
+EXPORT_SYMBOL(blk_pre_runtime_suspend);
+
+/**
+ * blk_post_runtime_suspend - Post runtime suspend processing
+ * @q: the queue of the device
+ * @err: return value of the device's runtime_suspend function
+ *
+ * Description:
+ * Update the queue's runtime status according to the return value of the
+ * device's runtime suspend function and mark last busy for the device so
+ * that PM core will try to auto suspend the device at a later time.
+ *
+ * This function should be called near the end of the device's
+ * runtime_suspend callback.
+ */
+void blk_post_runtime_suspend(struct request_queue *q, int err)
+{
+ if (!q->dev)
+ return;
+
+ spin_lock_irq(q->queue_lock);
+ if (!err) {
+ q->rpm_status = RPM_SUSPENDED;
+ } else {
+ q->rpm_status = RPM_ACTIVE;
+ pm_runtime_mark_last_busy(q->dev);
+ }
+ spin_unlock_irq(q->queue_lock);
+
+ if (err)
+ blk_clear_pm_only(q);
+}
+EXPORT_SYMBOL(blk_post_runtime_suspend);
+
+/**
+ * blk_pre_runtime_resume - Pre runtime resume processing
+ * @q: the queue of the device
+ *
+ * Description:
+ * Update the queue's runtime status to RESUMING in preparation for the
+ * runtime resume of the device.
+ *
+ * This function should be called near the start of the device's
+ * runtime_resume callback.
+ */
+void blk_pre_runtime_resume(struct request_queue *q)
+{
+ if (!q->dev)
+ return;
+
+ spin_lock_irq(q->queue_lock);
+ q->rpm_status = RPM_RESUMING;
+ spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL(blk_pre_runtime_resume);
+
+/**
+ * blk_post_runtime_resume - Post runtime resume processing
+ * @q: the queue of the device
+ * @err: return value of the device's runtime_resume function
+ *
+ * Description:
+ * Update the queue's runtime status according to the return value of the
+ * device's runtime_resume function. If it is successfully resumed, process
+ * the requests that are queued into the device's queue when it is resuming
+ * and then mark last busy and initiate autosuspend for it.
+ *
+ * This function should be called near the end of the device's
+ * runtime_resume callback.
+ */
+void blk_post_runtime_resume(struct request_queue *q, int err)
+{
+ if (!q->dev)
+ return;
+
+ spin_lock_irq(q->queue_lock);
+ if (!err) {
+ q->rpm_status = RPM_ACTIVE;
+ pm_runtime_mark_last_busy(q->dev);
+ pm_request_autosuspend(q->dev);
+ } else {
+ q->rpm_status = RPM_SUSPENDED;
+ }
+ spin_unlock_irq(q->queue_lock);
+
+ if (!err)
+ blk_clear_pm_only(q);
+}
+EXPORT_SYMBOL(blk_post_runtime_resume);
+
+/**
+ * blk_set_runtime_active - Force runtime status of the queue to be active
+ * @q: the queue of the device
+ *
+ * If the device is left runtime suspended during system suspend the resume
+ * hook typically resumes the device and corrects runtime status
+ * accordingly. However, that does not affect the queue runtime PM status
+ * which is still "suspended". This prevents processing requests from the
+ * queue.
+ *
+ * This function can be used in driver's resume hook to correct queue
+ * runtime PM status and re-enable peeking requests from the queue. It
+ * should be called before first request is added to the queue.
+ */
+void blk_set_runtime_active(struct request_queue *q)
+{
+ spin_lock_irq(q->queue_lock);
+ q->rpm_status = RPM_ACTIVE;
+ pm_runtime_mark_last_busy(q->dev);
+ pm_request_autosuspend(q->dev);
+ spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL(blk_set_runtime_active);
diff --git a/block/blk-pm.h b/block/blk-pm.h
new file mode 100644
index 000000000000..a8564ea72a41
--- /dev/null
+++ b/block/blk-pm.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _BLOCK_BLK_PM_H_
+#define _BLOCK_BLK_PM_H_
+
+#include <linux/pm_runtime.h>
+
+#ifdef CONFIG_PM
+static inline void blk_pm_request_resume(struct request_queue *q)
+{
+ if (q->dev && (q->rpm_status == RPM_SUSPENDED ||
+ q->rpm_status == RPM_SUSPENDING))
+ pm_request_resume(q->dev);
+}
+
+static inline void blk_pm_mark_last_busy(struct request *rq)
+{
+ if (rq->q->dev && !(rq->rq_flags & RQF_PM))
+ pm_runtime_mark_last_busy(rq->q->dev);
+}
+
+static inline void blk_pm_requeue_request(struct request *rq)
+{
+ lockdep_assert_held(rq->q->queue_lock);
+
+ if (rq->q->dev && !(rq->rq_flags & RQF_PM))
+ rq->q->nr_pending--;
+}
+
+static inline void blk_pm_add_request(struct request_queue *q,
+ struct request *rq)
+{
+ lockdep_assert_held(q->queue_lock);
+
+ if (q->dev && !(rq->rq_flags & RQF_PM))
+ q->nr_pending++;
+}
+
+static inline void blk_pm_put_request(struct request *rq)
+{
+ lockdep_assert_held(rq->q->queue_lock);
+
+ if (rq->q->dev && !(rq->rq_flags & RQF_PM))
+ --rq->q->nr_pending;
+}
+#else
+static inline void blk_pm_request_resume(struct request_queue *q)
+{
+}
+
+static inline void blk_pm_mark_last_busy(struct request *rq)
+{
+}
+
+static inline void blk_pm_requeue_request(struct request *rq)
+{
+}
+
+static inline void blk_pm_add_request(struct request_queue *q,
+ struct request *rq)
+{
+}
+
+static inline void blk_pm_put_request(struct request *rq)
+{
+}
+#endif
+
+#endif /* _BLOCK_BLK_PM_H_ */
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 7587b1c3caaf..90561af85a62 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -190,6 +190,7 @@ void blk_stat_enable_accounting(struct request_queue *q)
blk_queue_flag_set(QUEUE_FLAG_STATS, q);
spin_unlock(&q->stats->lock);
}
+EXPORT_SYMBOL_GPL(blk_stat_enable_accounting);
struct blk_queue_stats *blk_alloc_queue_stats(void)
{
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 01d0620a4e4a..4bda70e8db48 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -84,8 +84,7 @@ struct throtl_service_queue {
* RB tree of active children throtl_grp's, which are sorted by
* their ->disptime.
*/
- struct rb_root pending_tree; /* RB tree of active tgs */
- struct rb_node *first_pending; /* first node in the tree */
+ struct rb_root_cached pending_tree; /* RB tree of active tgs */
unsigned int nr_pending; /* # queued in the tree */
unsigned long first_pending_disptime; /* disptime of the first tg */
struct timer_list pending_timer; /* fires on first_pending_disptime */
@@ -475,7 +474,7 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq)
{
INIT_LIST_HEAD(&sq->queued[0]);
INIT_LIST_HEAD(&sq->queued[1]);
- sq->pending_tree = RB_ROOT;
+ sq->pending_tree = RB_ROOT_CACHED;
timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
}
@@ -616,31 +615,23 @@ static void throtl_pd_free(struct blkg_policy_data *pd)
static struct throtl_grp *
throtl_rb_first(struct throtl_service_queue *parent_sq)
{
+ struct rb_node *n;
/* Service tree is empty */
if (!parent_sq->nr_pending)
return NULL;
- if (!parent_sq->first_pending)
- parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
-
- if (parent_sq->first_pending)
- return rb_entry_tg(parent_sq->first_pending);
-
- return NULL;
-}
-
-static void rb_erase_init(struct rb_node *n, struct rb_root *root)
-{
- rb_erase(n, root);
- RB_CLEAR_NODE(n);
+ n = rb_first_cached(&parent_sq->pending_tree);
+ WARN_ON_ONCE(!n);
+ if (!n)
+ return NULL;
+ return rb_entry_tg(n);
}
static void throtl_rb_erase(struct rb_node *n,
struct throtl_service_queue *parent_sq)
{
- if (parent_sq->first_pending == n)
- parent_sq->first_pending = NULL;
- rb_erase_init(n, &parent_sq->pending_tree);
+ rb_erase_cached(n, &parent_sq->pending_tree);
+ RB_CLEAR_NODE(n);
--parent_sq->nr_pending;
}
@@ -658,11 +649,11 @@ static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
static void tg_service_queue_add(struct throtl_grp *tg)
{
struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
- struct rb_node **node = &parent_sq->pending_tree.rb_node;
+ struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node;
struct rb_node *parent = NULL;
struct throtl_grp *__tg;
unsigned long key = tg->disptime;
- int left = 1;
+ bool leftmost = true;
while (*node != NULL) {
parent = *node;
@@ -672,15 +663,13 @@ static void tg_service_queue_add(struct throtl_grp *tg)
node = &parent->rb_left;
else {
node = &parent->rb_right;
- left = 0;
+ leftmost = false;
}
}
- if (left)
- parent_sq->first_pending = &tg->rb_node;
-
rb_link_node(&tg->rb_node, parent, node);
- rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
+ rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree,
+ leftmost);
}
static void __throtl_enqueue_tg(struct throtl_grp *tg)
@@ -2126,21 +2115,11 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
}
#endif
-static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
-{
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- /* fallback to root_blkg if we fail to get a blkg ref */
- if (bio->bi_css && (bio_associate_blkg(bio, tg_to_blkg(tg)) == -ENODEV))
- bio_associate_blkg(bio, bio->bi_disk->queue->root_blkg);
- bio_issue_init(&bio->bi_issue, bio_sectors(bio));
-#endif
-}
-
bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
struct bio *bio)
{
struct throtl_qnode *qn = NULL;
- struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
+ struct throtl_grp *tg = blkg_to_tg(blkg);
struct throtl_service_queue *sq;
bool rw = bio_data_dir(bio);
bool throttled = false;
@@ -2159,7 +2138,6 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
if (unlikely(blk_queue_bypass(q)))
goto out_unlock;
- blk_throtl_assoc_bio(tg, bio);
blk_throtl_update_idletime(tg);
sq = &tg->service_queue;
diff --git a/block/blk.h b/block/blk.h
index 9db4e389582c..58c030f727e9 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -4,6 +4,7 @@
#include <linux/idr.h>
#include <linux/blk-mq.h>
+#include <xen/xen.h>
#include "blk-mq.h"
/* Amount of time in which a process may batch requests */
@@ -149,6 +150,41 @@ static inline void blk_queue_enter_live(struct request_queue *q)
percpu_ref_get(&q->q_usage_counter);
}
+static inline bool biovec_phys_mergeable(struct request_queue *q,
+ struct bio_vec *vec1, struct bio_vec *vec2)
+{
+ unsigned long mask = queue_segment_boundary(q);
+ phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset;
+ phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset;
+
+ if (addr1 + vec1->bv_len != addr2)
+ return false;
+ if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2))
+ return false;
+ if ((addr1 | mask) != ((addr2 + vec2->bv_len - 1) | mask))
+ return false;
+ return true;
+}
+
+static inline bool __bvec_gap_to_prev(struct request_queue *q,
+ struct bio_vec *bprv, unsigned int offset)
+{
+ return offset ||
+ ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+}
+
+/*
+ * Check if adding a bio_vec after bprv with offset would create a gap in
+ * the SG list. Most drivers don't care about this, but some do.
+ */
+static inline bool bvec_gap_to_prev(struct request_queue *q,
+ struct bio_vec *bprv, unsigned int offset)
+{
+ if (!queue_virt_boundary(q))
+ return false;
+ return __bvec_gap_to_prev(q, bprv, offset);
+}
+
#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *);
@@ -158,7 +194,38 @@ static inline bool bio_integrity_endio(struct bio *bio)
return __bio_integrity_endio(bio);
return true;
}
-#else
+
+static inline bool integrity_req_gap_back_merge(struct request *req,
+ struct bio *next)
+{
+ struct bio_integrity_payload *bip = bio_integrity(req->bio);
+ struct bio_integrity_payload *bip_next = bio_integrity(next);
+
+ return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
+ bip_next->bip_vec[0].bv_offset);
+}
+
+static inline bool integrity_req_gap_front_merge(struct request *req,
+ struct bio *bio)
+{
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+ struct bio_integrity_payload *bip_next = bio_integrity(req->bio);
+
+ return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
+ bip_next->bip_vec[0].bv_offset);
+}
+#else /* CONFIG_BLK_DEV_INTEGRITY */
+static inline bool integrity_req_gap_back_merge(struct request *req,
+ struct bio *next)
+{
+ return false;
+}
+static inline bool integrity_req_gap_front_merge(struct request *req,
+ struct bio *bio)
+{
+ return false;
+}
+
static inline void blk_flush_integrity(void)
{
}
@@ -166,7 +233,7 @@ static inline bool bio_integrity_endio(struct bio *bio)
{
return true;
}
-#endif
+#endif /* CONFIG_BLK_DEV_INTEGRITY */
void blk_timeout_work(struct work_struct *work);
unsigned long blk_rq_timeout(unsigned long timeout);
diff --git a/block/bounce.c b/block/bounce.c
index bc63b3a2d18c..b30071ac4ec6 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -257,7 +257,9 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
}
}
- bio_clone_blkcg_association(bio, bio_src);
+ bio_clone_blkg_association(bio, bio_src);
+
+ blkcg_bio_issue_init(bio);
return bio;
}
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 2eb87444b157..d219e9a1af65 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3753,7 +3753,7 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
uint64_t serial_nr;
rcu_read_lock();
- serial_nr = bio_blkcg(bio)->css.serial_nr;
+ serial_nr = __bio_blkcg(bio)->css.serial_nr;
rcu_read_unlock();
/*
@@ -3818,7 +3818,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
struct cfq_group *cfqg;
rcu_read_lock();
- cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
+ cfqg = cfq_lookup_cfqg(cfqd, __bio_blkcg(bio));
if (!cfqg) {
cfqq = &cfqd->oom_cfqq;
goto out;
diff --git a/block/elevator.c b/block/elevator.c
index fae58b2f906f..8fdcd64ae12e 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -41,6 +41,7 @@
#include "blk.h"
#include "blk-mq-sched.h"
+#include "blk-pm.h"
#include "blk-wbt.h"
static DEFINE_SPINLOCK(elv_list_lock);
@@ -557,27 +558,6 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
}
-#ifdef CONFIG_PM
-static void blk_pm_requeue_request(struct request *rq)
-{
- if (rq->q->dev && !(rq->rq_flags & RQF_PM))
- rq->q->nr_pending--;
-}
-
-static void blk_pm_add_request(struct request_queue *q, struct request *rq)
-{
- if (q->dev && !(rq->rq_flags & RQF_PM) && q->nr_pending++ == 0 &&
- (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING))
- pm_request_resume(q->dev);
-}
-#else
-static inline void blk_pm_requeue_request(struct request *rq) {}
-static inline void blk_pm_add_request(struct request_queue *q,
- struct request *rq)
-{
-}
-#endif
-
void elv_requeue_request(struct request_queue *q, struct request *rq)
{
/*
diff --git a/block/genhd.c b/block/genhd.c
index be5bab20b2ab..cff6bdf27226 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -567,7 +567,8 @@ static int exact_lock(dev_t devt, void *data)
return 0;
}
-static void register_disk(struct device *parent, struct gendisk *disk)
+static void register_disk(struct device *parent, struct gendisk *disk,
+ const struct attribute_group **groups)
{
struct device *ddev = disk_to_dev(disk);
struct block_device *bdev;
@@ -582,6 +583,10 @@ static void register_disk(struct device *parent, struct gendisk *disk)
/* delay uevents, until we scanned partition table */
dev_set_uevent_suppress(ddev, 1);
+ if (groups) {
+ WARN_ON(ddev->groups);
+ ddev->groups = groups;
+ }
if (device_add(ddev))
return;
if (!sysfs_deprecated) {
@@ -647,6 +652,7 @@ exit:
* __device_add_disk - add disk information to kernel list
* @parent: parent device for the disk
* @disk: per-device partitioning information
+ * @groups: Additional per-device sysfs groups
* @register_queue: register the queue if set to true
*
* This function registers the partitioning information in @disk
@@ -655,6 +661,7 @@ exit:
* FIXME: error handling
*/
static void __device_add_disk(struct device *parent, struct gendisk *disk,
+ const struct attribute_group **groups,
bool register_queue)
{
dev_t devt;
@@ -698,7 +705,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
blk_register_region(disk_devt(disk), disk->minors, NULL,
exact_match, exact_lock, disk);
}
- register_disk(parent, disk);
+ register_disk(parent, disk, groups);
if (register_queue)
blk_register_queue(disk);
@@ -712,15 +719,17 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
blk_integrity_add(disk);
}
-void device_add_disk(struct device *parent, struct gendisk *disk)
+void device_add_disk(struct device *parent, struct gendisk *disk,
+ const struct attribute_group **groups)
+
{
- __device_add_disk(parent, disk, true);
+ __device_add_disk(parent, disk, groups, true);
}
EXPORT_SYMBOL(device_add_disk);
void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
{
- __device_add_disk(parent, disk, false);
+ __device_add_disk(parent, disk, NULL, false);
}
EXPORT_SYMBOL(device_add_disk_no_queue_reg);
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index a1660bafc912..eccac01a10b6 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -29,19 +29,30 @@
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
-#include "blk-stat.h"
-/* Scheduling domains. */
+#define CREATE_TRACE_POINTS
+#include <trace/events/kyber.h>
+
+/*
+ * Scheduling domains: the device is divided into multiple domains based on the
+ * request type.
+ */
enum {
KYBER_READ,
- KYBER_SYNC_WRITE,
- KYBER_OTHER, /* Async writes, discard, etc. */
+ KYBER_WRITE,
+ KYBER_DISCARD,
+ KYBER_OTHER,
KYBER_NUM_DOMAINS,
};
-enum {
- KYBER_MIN_DEPTH = 256,
+static const char *kyber_domain_names[] = {
+ [KYBER_READ] = "READ",
+ [KYBER_WRITE] = "WRITE",
+ [KYBER_DISCARD] = "DISCARD",
+ [KYBER_OTHER] = "OTHER",
+};
+enum {
/*
* In order to prevent starvation of synchronous requests by a flood of
* asynchronous requests, we reserve 25% of requests for synchronous
@@ -51,25 +62,87 @@ enum {
};
/*
- * Initial device-wide depths for each scheduling domain.
+ * Maximum device-wide depth for each scheduling domain.
*
- * Even for fast devices with lots of tags like NVMe, you can saturate
- * the device with only a fraction of the maximum possible queue depth.
- * So, we cap these to a reasonable value.
+ * Even for fast devices with lots of tags like NVMe, you can saturate the
+ * device with only a fraction of the maximum possible queue depth. So, we cap
+ * these to a reasonable value.
*/
static const unsigned int kyber_depth[] = {
[KYBER_READ] = 256,
- [KYBER_SYNC_WRITE] = 128,
- [KYBER_OTHER] = 64,
+ [KYBER_WRITE] = 128,
+ [KYBER_DISCARD] = 64,
+ [KYBER_OTHER] = 16,
};
/*
- * Scheduling domain batch sizes. We favor reads.
+ * Default latency targets for each scheduling domain.
+ */
+static const u64 kyber_latency_targets[] = {
+ [KYBER_READ] = 2ULL * NSEC_PER_MSEC,
+ [KYBER_WRITE] = 10ULL * NSEC_PER_MSEC,
+ [KYBER_DISCARD] = 5ULL * NSEC_PER_SEC,
+};
+
+/*
+ * Batch size (number of requests we'll dispatch in a row) for each scheduling
+ * domain.
*/
static const unsigned int kyber_batch_size[] = {
[KYBER_READ] = 16,
- [KYBER_SYNC_WRITE] = 8,
- [KYBER_OTHER] = 8,
+ [KYBER_WRITE] = 8,
+ [KYBER_DISCARD] = 1,
+ [KYBER_OTHER] = 1,
+};
+
+/*
+ * Requests latencies are recorded in a histogram with buckets defined relative
+ * to the target latency:
+ *
+ * <= 1/4 * target latency
+ * <= 1/2 * target latency
+ * <= 3/4 * target latency
+ * <= target latency
+ * <= 1 1/4 * target latency
+ * <= 1 1/2 * target latency
+ * <= 1 3/4 * target latency
+ * > 1 3/4 * target latency
+ */
+enum {
+ /*
+ * The width of the latency histogram buckets is
+ * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
+ */
+ KYBER_LATENCY_SHIFT = 2,
+ /*
+ * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
+ * thus, "good".
+ */
+ KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT,
+ /* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */
+ KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT,
+};
+
+/*
+ * We measure both the total latency and the I/O latency (i.e., latency after
+ * submitting to the device).
+ */
+enum {
+ KYBER_TOTAL_LATENCY,
+ KYBER_IO_LATENCY,
+};
+
+static const char *kyber_latency_type_names[] = {
+ [KYBER_TOTAL_LATENCY] = "total",
+ [KYBER_IO_LATENCY] = "I/O",
+};
+
+/*
+ * Per-cpu latency histograms: total latency and I/O latency for each scheduling
+ * domain except for KYBER_OTHER.
+ */
+struct kyber_cpu_latency {
+ atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
};
/*
@@ -88,12 +161,9 @@ struct kyber_ctx_queue {
struct kyber_queue_data {
struct request_queue *q;
- struct blk_stat_callback *cb;
-
/*
- * The device is divided into multiple scheduling domains based on the
- * request type. Each domain has a fixed number of in-flight requests of
- * that type device-wide, limited by these tokens.
+ * Each scheduling domain has a limited number of in-flight requests
+ * device-wide, limited by these tokens.
*/
struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
@@ -103,8 +173,19 @@ struct kyber_queue_data {
*/
unsigned int async_depth;
+ struct kyber_cpu_latency __percpu *cpu_latency;
+
+ /* Timer for stats aggregation and adjusting domain tokens. */
+ struct timer_list timer;
+
+ unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
+
+ unsigned long latency_timeout[KYBER_OTHER];
+
+ int domain_p99[KYBER_OTHER];
+
/* Target latencies in nanoseconds. */
- u64 read_lat_nsec, write_lat_nsec;
+ u64 latency_targets[KYBER_OTHER];
};
struct kyber_hctx_data {
@@ -124,233 +205,219 @@ static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
static unsigned int kyber_sched_domain(unsigned int op)
{
- if ((op & REQ_OP_MASK) == REQ_OP_READ)
+ switch (op & REQ_OP_MASK) {
+ case REQ_OP_READ:
return KYBER_READ;
- else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
- return KYBER_SYNC_WRITE;
- else
+ case REQ_OP_WRITE:
+ return KYBER_WRITE;
+ case REQ_OP_DISCARD:
+ return KYBER_DISCARD;
+ default:
return KYBER_OTHER;
+ }
}
-enum {
- NONE = 0,
- GOOD = 1,
- GREAT = 2,
- BAD = -1,
- AWFUL = -2,
-};
-
-#define IS_GOOD(status) ((status) > 0)
-#define IS_BAD(status) ((status) < 0)
-
-static int kyber_lat_status(struct blk_stat_callback *cb,
- unsigned int sched_domain, u64 target)
+static void flush_latency_buckets(struct kyber_queue_data *kqd,
+ struct kyber_cpu_latency *cpu_latency,
+ unsigned int sched_domain, unsigned int type)
{
- u64 latency;
-
- if (!cb->stat[sched_domain].nr_samples)
- return NONE;
+ unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
+ atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type];
+ unsigned int bucket;
- latency = cb->stat[sched_domain].mean;
- if (latency >= 2 * target)
- return AWFUL;
- else if (latency > target)
- return BAD;
- else if (latency <= target / 2)
- return GREAT;
- else /* (latency <= target) */
- return GOOD;
+ for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
+ buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0);
}
/*
- * Adjust the read or synchronous write depth given the status of reads and
- * writes. The goal is that the latencies of the two domains are fair (i.e., if
- * one is good, then the other is good).
+ * Calculate the histogram bucket with the given percentile rank, or -1 if there
+ * aren't enough samples yet.
*/
-static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
- unsigned int sched_domain, int this_status,
- int other_status)
+static int calculate_percentile(struct kyber_queue_data *kqd,
+ unsigned int sched_domain, unsigned int type,
+ unsigned int percentile)
{
- unsigned int orig_depth, depth;
+ unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
+ unsigned int bucket, samples = 0, percentile_samples;
+
+ for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
+ samples += buckets[bucket];
+
+ if (!samples)
+ return -1;
/*
- * If this domain had no samples, or reads and writes are both good or
- * both bad, don't adjust the depth.
+ * We do the calculation once we have 500 samples or one second passes
+ * since the first sample was recorded, whichever comes first.
*/
- if (this_status == NONE ||
- (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
- (IS_BAD(this_status) && IS_BAD(other_status)))
- return;
-
- orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
+ if (!kqd->latency_timeout[sched_domain])
+ kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL);
+ if (samples < 500 &&
+ time_is_after_jiffies(kqd->latency_timeout[sched_domain])) {
+ return -1;
+ }
+ kqd->latency_timeout[sched_domain] = 0;
- if (other_status == NONE) {
- depth++;
- } else {
- switch (this_status) {
- case GOOD:
- if (other_status == AWFUL)
- depth -= max(depth / 4, 1U);
- else
- depth -= max(depth / 8, 1U);
- break;
- case GREAT:
- if (other_status == AWFUL)
- depth /= 2;
- else
- depth -= max(depth / 4, 1U);
+ percentile_samples = DIV_ROUND_UP(samples * percentile, 100);
+ for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) {
+ if (buckets[bucket] >= percentile_samples)
break;
- case BAD:
- depth++;
- break;
- case AWFUL:
- if (other_status == GREAT)
- depth += 2;
- else
- depth++;
- break;
- }
+ percentile_samples -= buckets[bucket];
}
+ memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type]));
- depth = clamp(depth, 1U, kyber_depth[sched_domain]);
- if (depth != orig_depth)
- sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
+ trace_kyber_latency(kqd->q, kyber_domain_names[sched_domain],
+ kyber_latency_type_names[type], percentile,
+ bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples);
+
+ return bucket;
}
-/*
- * Adjust the depth of other requests given the status of reads and synchronous
- * writes. As long as either domain is doing fine, we don't throttle, but if
- * both domains are doing badly, we throttle heavily.
- */
-static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
- int read_status, int write_status,
- bool have_samples)
-{
- unsigned int orig_depth, depth;
- int status;
-
- orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
-
- if (read_status == NONE && write_status == NONE) {
- depth += 2;
- } else if (have_samples) {
- if (read_status == NONE)
- status = write_status;
- else if (write_status == NONE)
- status = read_status;
- else
- status = max(read_status, write_status);
- switch (status) {
- case GREAT:
- depth += 2;
- break;
- case GOOD:
- depth++;
- break;
- case BAD:
- depth -= max(depth / 4, 1U);
- break;
- case AWFUL:
- depth /= 2;
- break;
- }
+static void kyber_resize_domain(struct kyber_queue_data *kqd,
+ unsigned int sched_domain, unsigned int depth)
+{
+ depth = clamp(depth, 1U, kyber_depth[sched_domain]);
+ if (depth != kqd->domain_tokens[sched_domain].sb.depth) {
+ sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
+ trace_kyber_adjust(kqd->q, kyber_domain_names[sched_domain],
+ depth);
}
-
- depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
- if (depth != orig_depth)
- sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
}
-/*
- * Apply heuristics for limiting queue depths based on gathered latency
- * statistics.
- */
-static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
+static void kyber_timer_fn(struct timer_list *t)
{
- struct kyber_queue_data *kqd = cb->data;
- int read_status, write_status;
+ struct kyber_queue_data *kqd = from_timer(kqd, t, timer);
+ unsigned int sched_domain;
+ int cpu;
+ bool bad = false;
+
+ /* Sum all of the per-cpu latency histograms. */
+ for_each_online_cpu(cpu) {
+ struct kyber_cpu_latency *cpu_latency;
+
+ cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu);
+ for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+ flush_latency_buckets(kqd, cpu_latency, sched_domain,
+ KYBER_TOTAL_LATENCY);
+ flush_latency_buckets(kqd, cpu_latency, sched_domain,
+ KYBER_IO_LATENCY);
+ }
+ }
- read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
- write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
+ /*
+ * Check if any domains have a high I/O latency, which might indicate
+ * congestion in the device. Note that we use the p90; we don't want to
+ * be too sensitive to outliers here.
+ */
+ for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+ int p90;
- kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
- kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
- kyber_adjust_other_depth(kqd, read_status, write_status,
- cb->stat[KYBER_OTHER].nr_samples != 0);
+ p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY,
+ 90);
+ if (p90 >= KYBER_GOOD_BUCKETS)
+ bad = true;
+ }
/*
- * Continue monitoring latencies if we aren't hitting the targets or
- * we're still throttling other requests.
+ * Adjust the scheduling domain depths. If we determined that there was
+ * congestion, we throttle all domains with good latencies. Either way,
+ * we ease up on throttling domains with bad latencies.
*/
- if (!blk_stat_is_active(kqd->cb) &&
- ((IS_BAD(read_status) || IS_BAD(write_status) ||
- kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
- blk_stat_activate_msecs(kqd->cb, 100);
+ for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+ unsigned int orig_depth, depth;
+ int p99;
+
+ p99 = calculate_percentile(kqd, sched_domain,
+ KYBER_TOTAL_LATENCY, 99);
+ /*
+ * This is kind of subtle: different domains will not
+ * necessarily have enough samples to calculate the latency
+ * percentiles during the same window, so we have to remember
+ * the p99 for the next time we observe congestion; once we do,
+ * we don't want to throttle again until we get more data, so we
+ * reset it to -1.
+ */
+ if (bad) {
+ if (p99 < 0)
+ p99 = kqd->domain_p99[sched_domain];
+ kqd->domain_p99[sched_domain] = -1;
+ } else if (p99 >= 0) {
+ kqd->domain_p99[sched_domain] = p99;
+ }
+ if (p99 < 0)
+ continue;
+
+ /*
+ * If this domain has bad latency, throttle less. Otherwise,
+ * throttle more iff we determined that there is congestion.
+ *
+ * The new depth is scaled linearly with the p99 latency vs the
+ * latency target. E.g., if the p99 is 3/4 of the target, then
+ * we throttle down to 3/4 of the current depth, and if the p99
+ * is 2x the target, then we double the depth.
+ */
+ if (bad || p99 >= KYBER_GOOD_BUCKETS) {
+ orig_depth = kqd->domain_tokens[sched_domain].sb.depth;
+ depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
+ kyber_resize_domain(kqd, sched_domain, depth);
+ }
+ }
}
-static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
+static unsigned int kyber_sched_tags_shift(struct request_queue *q)
{
/*
* All of the hardware queues have the same depth, so we can just grab
* the shift of the first one.
*/
- return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
-}
-
-static int kyber_bucket_fn(const struct request *rq)
-{
- return kyber_sched_domain(rq->cmd_flags);
+ return q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
}
static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
struct kyber_queue_data *kqd;
- unsigned int max_tokens;
unsigned int shift;
int ret = -ENOMEM;
int i;
- kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
+ kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
if (!kqd)
goto err;
+
kqd->q = q;
- kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
- KYBER_NUM_DOMAINS, kqd);
- if (!kqd->cb)
+ kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!kqd->cpu_latency)
goto err_kqd;
- /*
- * The maximum number of tokens for any scheduling domain is at least
- * the queue depth of a single hardware queue. If the hardware doesn't
- * have many tags, still provide a reasonable number.
- */
- max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
- KYBER_MIN_DEPTH);
+ timer_setup(&kqd->timer, kyber_timer_fn, 0);
+
for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
WARN_ON(!kyber_depth[i]);
WARN_ON(!kyber_batch_size[i]);
ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
- max_tokens, -1, false, GFP_KERNEL,
- q->node);
+ kyber_depth[i], -1, false,
+ GFP_KERNEL, q->node);
if (ret) {
while (--i >= 0)
sbitmap_queue_free(&kqd->domain_tokens[i]);
- goto err_cb;
+ goto err_buckets;
}
- sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
}
- shift = kyber_sched_tags_shift(kqd);
- kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
+ for (i = 0; i < KYBER_OTHER; i++) {
+ kqd->domain_p99[i] = -1;
+ kqd->latency_targets[i] = kyber_latency_targets[i];
+ }
- kqd->read_lat_nsec = 2000000ULL;
- kqd->write_lat_nsec = 10000000ULL;
+ shift = kyber_sched_tags_shift(q);
+ kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
return kqd;
-err_cb:
- blk_stat_free_callback(kqd->cb);
+err_buckets:
+ free_percpu(kqd->cpu_latency);
err_kqd:
kfree(kqd);
err:
@@ -372,25 +439,24 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
return PTR_ERR(kqd);
}
+ blk_stat_enable_accounting(q);
+
eq->elevator_data = kqd;
q->elevator = eq;
- blk_stat_add_callback(q, kqd->cb);
-
return 0;
}
static void kyber_exit_sched(struct elevator_queue *e)
{
struct kyber_queue_data *kqd = e->elevator_data;
- struct request_queue *q = kqd->q;
int i;
- blk_stat_remove_callback(q, kqd->cb);
+ del_timer_sync(&kqd->timer);
for (i = 0; i < KYBER_NUM_DOMAINS; i++)
sbitmap_queue_free(&kqd->domain_tokens[i]);
- blk_stat_free_callback(kqd->cb);
+ free_percpu(kqd->cpu_latency);
kfree(kqd);
}
@@ -558,41 +624,44 @@ static void kyber_finish_request(struct request *rq)
rq_clear_domain_token(kqd, rq);
}
-static void kyber_completed_request(struct request *rq)
+static void add_latency_sample(struct kyber_cpu_latency *cpu_latency,
+ unsigned int sched_domain, unsigned int type,
+ u64 target, u64 latency)
{
- struct request_queue *q = rq->q;
- struct kyber_queue_data *kqd = q->elevator->elevator_data;
- unsigned int sched_domain;
- u64 now, latency, target;
+ unsigned int bucket;
+ u64 divisor;
- /*
- * Check if this request met our latency goal. If not, quickly gather
- * some statistics and start throttling.
- */
- sched_domain = kyber_sched_domain(rq->cmd_flags);
- switch (sched_domain) {
- case KYBER_READ:
- target = kqd->read_lat_nsec;
- break;
- case KYBER_SYNC_WRITE:
- target = kqd->write_lat_nsec;
- break;
- default:
- return;
+ if (latency > 0) {
+ divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1);
+ bucket = min_t(unsigned int, div64_u64(latency - 1, divisor),
+ KYBER_LATENCY_BUCKETS - 1);
+ } else {
+ bucket = 0;
}
- /* If we are already monitoring latencies, don't check again. */
- if (blk_stat_is_active(kqd->cb))
- return;
+ atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]);
+}
- now = ktime_get_ns();
- if (now < rq->io_start_time_ns)
+static void kyber_completed_request(struct request *rq, u64 now)
+{
+ struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
+ struct kyber_cpu_latency *cpu_latency;
+ unsigned int sched_domain;
+ u64 target;
+
+ sched_domain = kyber_sched_domain(rq->cmd_flags);
+ if (sched_domain == KYBER_OTHER)
return;
- latency = now - rq->io_start_time_ns;
+ cpu_latency = get_cpu_ptr(kqd->cpu_latency);
+ target = kqd->latency_targets[sched_domain];
+ add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY,
+ target, now - rq->start_time_ns);
+ add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target,
+ now - rq->io_start_time_ns);
+ put_cpu_ptr(kqd->cpu_latency);
- if (latency > target)
- blk_stat_activate_msecs(kqd->cb, 10);
+ timer_reduce(&kqd->timer, jiffies + HZ / 10);
}
struct flush_kcq_data {
@@ -713,6 +782,9 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
rq_set_domain_token(rq, nr);
list_del_init(&rq->queuelist);
return rq;
+ } else {
+ trace_kyber_throttled(kqd->q,
+ kyber_domain_names[khd->cur_domain]);
}
} else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
nr = kyber_get_domain_token(kqd, khd, hctx);
@@ -723,6 +795,9 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
rq_set_domain_token(rq, nr);
list_del_init(&rq->queuelist);
return rq;
+ } else {
+ trace_kyber_throttled(kqd->q,
+ kyber_domain_names[khd->cur_domain]);
}
}
@@ -790,17 +865,17 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
return false;
}
-#define KYBER_LAT_SHOW_STORE(op) \
-static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \
- char *page) \
+#define KYBER_LAT_SHOW_STORE(domain, name) \
+static ssize_t kyber_##name##_lat_show(struct elevator_queue *e, \
+ char *page) \
{ \
struct kyber_queue_data *kqd = e->elevator_data; \
\
- return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \
+ return sprintf(page, "%llu\n", kqd->latency_targets[domain]); \
} \
\
-static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
- const char *page, size_t count) \
+static ssize_t kyber_##name##_lat_store(struct elevator_queue *e, \
+ const char *page, size_t count) \
{ \
struct kyber_queue_data *kqd = e->elevator_data; \
unsigned long long nsec; \
@@ -810,12 +885,12 @@ static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
if (ret) \
return ret; \
\
- kqd->op##_lat_nsec = nsec; \
+ kqd->latency_targets[domain] = nsec; \
\
return count; \
}
-KYBER_LAT_SHOW_STORE(read);
-KYBER_LAT_SHOW_STORE(write);
+KYBER_LAT_SHOW_STORE(KYBER_READ, read);
+KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
#undef KYBER_LAT_SHOW_STORE
#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
@@ -882,7 +957,8 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \
return 0; \
}
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
-KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
+KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write)
+KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
#undef KYBER_DEBUGFS_DOMAIN_ATTRS
@@ -900,20 +976,7 @@ static int kyber_cur_domain_show(void *data, struct seq_file *m)
struct blk_mq_hw_ctx *hctx = data;
struct kyber_hctx_data *khd = hctx->sched_data;
- switch (khd->cur_domain) {
- case KYBER_READ:
- seq_puts(m, "READ\n");
- break;
- case KYBER_SYNC_WRITE:
- seq_puts(m, "SYNC_WRITE\n");
- break;
- case KYBER_OTHER:
- seq_puts(m, "OTHER\n");
- break;
- default:
- seq_printf(m, "%u\n", khd->cur_domain);
- break;
- }
+ seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]);
return 0;
}
@@ -930,7 +993,8 @@ static int kyber_batching_show(void *data, struct seq_file *m)
{#name "_tokens", 0400, kyber_##name##_tokens_show}
static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
KYBER_QUEUE_DOMAIN_ATTRS(read),
- KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
+ KYBER_QUEUE_DOMAIN_ATTRS(write),
+ KYBER_QUEUE_DOMAIN_ATTRS(discard),
KYBER_QUEUE_DOMAIN_ATTRS(other),
{"async_depth", 0400, kyber_async_depth_show},
{},
@@ -942,7 +1006,8 @@ static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
{#name "_waiting", 0400, kyber_##name##_waiting_show}
static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
KYBER_HCTX_DOMAIN_ATTRS(read),
- KYBER_HCTX_DOMAIN_ATTRS(sync_write),
+ KYBER_HCTX_DOMAIN_ATTRS(write),
+ KYBER_HCTX_DOMAIN_ATTRS(discard),
KYBER_HCTX_DOMAIN_ATTRS(other),
{"cur_domain", 0400, kyber_cur_domain_show},
{"batching", 0400, kyber_batching_show},