summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-03-08 14:12:17 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2019-03-08 14:12:17 -0800
commit80201fe175cbf7f3e372f53eba0a881a702ad926 (patch)
tree8026c68d52763614268a9c3c80759ad386bd5967 /block
parent4221b807d1f73c03d22543416d303b60a5d1ef31 (diff)
parentaaeee62c841cc1e48231e1d60c304d2da9c4e41c (diff)
Merge tag 'for-5.1/block-20190302' of git://git.kernel.dk/linux-block
Pull block layer updates from Jens Axboe: "Not a huge amount of changes in this round, the biggest one is that we finally have Ming's multi-page bvec support merged. Apart from that, this pull request contains: - Small series that avoids quiescing the queue for sysfs changes that match what we currently have (Aleksei) - Series of bcache fixes (via Coly) - Series of lightnvm fixes (via Mathias) - NVMe pull request from Christoph. Nothing major, just SPDX/license cleanups, RR mp policy (Hannes), and little fixes (Bart, Chaitanya). - BFQ series (Paolo) - Save blk-mq cpu -> hw queue mapping, removing a pointer indirection for the fast path (Jianchao) - fops->iopoll() added for async IO polling, this is a feature that the upcoming io_uring interface will use (Christoph, me) - Partition scan loop fixes (Dongli) - mtip32xx conversion from managed resource API (Christoph) - cdrom registration race fix (Guenter) - MD pull from Song, two minor fixes. - Various documentation fixes (Marcos) - Multi-page bvec feature. This brings a lot of nice improvements with it, like more efficient splitting, larger IOs can be supported without growing the bvec table size, and so on. 
(Ming) - Various little fixes to core and drivers" * tag 'for-5.1/block-20190302' of git://git.kernel.dk/linux-block: (117 commits) block: fix updating bio's front segment size block: Replace function name in string with __func__ nbd: propagate genlmsg_reply return code floppy: remove set but not used variable 'q' null_blk: fix checking for REQ_FUA block: fix NULL pointer dereference in register_disk fs: fix guard_bio_eod to check for real EOD errors blk-mq: use HCTX_TYPE_DEFAULT but not 0 to index blk_mq_tag_set->map block: optimize bvec iteration in bvec_iter_advance block: introduce mp_bvec_for_each_page() for iterating over page block: optimize blk_bio_segment_split for single-page bvec block: optimize __blk_segment_map_sg() for single-page bvec block: introduce bvec_nth_page() iomap: wire up the iopoll method block: add bio_set_polled() helper block: wire up block device iopoll method fs: add an iopoll method to struct file_operations loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part() loop: do not print warn message if partition scan is successful block: bounce: make sure that bvec table is updated ...
Diffstat (limited to 'block')
-rw-r--r--block/bfq-iosched.c705
-rw-r--r--block/bfq-iosched.h11
-rw-r--r--block/bfq-wf2q.c18
-rw-r--r--block/bio.c49
-rw-r--r--block/blk-cgroup.c2
-rw-r--r--block/blk-merge.c231
-rw-r--r--block/blk-mq-debugfs.c3
-rw-r--r--block/blk-mq-sched.c2
-rw-r--r--block/blk-mq-tag.c2
-rw-r--r--block/blk-mq.c33
-rw-r--r--block/blk-mq.h20
-rw-r--r--block/blk-settings.c9
-rw-r--r--block/blk-sysfs.c22
-rw-r--r--block/blk.h2
-rw-r--r--block/bounce.c10
-rw-r--r--block/elevator.c5
-rw-r--r--block/genhd.c18
17 files changed, 658 insertions, 484 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index cd307767a134..4c592496a16a 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -230,11 +230,16 @@ static struct kmem_cache *bfq_pool;
#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
/* hw_tag detection: parallel requests threshold and min samples needed. */
-#define BFQ_HW_QUEUE_THRESHOLD 4
+#define BFQ_HW_QUEUE_THRESHOLD 3
#define BFQ_HW_QUEUE_SAMPLES 32
#define BFQQ_SEEK_THR (sector_t)(8 * 100)
#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
+#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \
+ (get_sdist(last_pos, rq) > \
+ BFQQ_SEEK_THR && \
+ (!blk_queue_nonrot(bfqd->queue) || \
+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT))
#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19)
@@ -624,26 +629,6 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
}
/*
- * Tell whether there are active queues with different weights or
- * active groups.
- */
-static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
-{
- /*
- * For queue weights to differ, queue_weights_tree must contain
- * at least two nodes.
- */
- return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
- (bfqd->queue_weights_tree.rb_node->rb_left ||
- bfqd->queue_weights_tree.rb_node->rb_right)
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- ) ||
- (bfqd->num_groups_with_pending_reqs > 0
-#endif
- );
-}
-
-/*
* The following function returns true if every queue must receive the
* same share of the throughput (this condition is used when deciding
* whether idling may be disabled, see the comments in the function
@@ -651,25 +636,48 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
*
* Such a scenario occurs when:
* 1) all active queues have the same weight,
- * 2) all active groups at the same level in the groups tree have the same
- * weight,
+ * 2) all active queues belong to the same I/O-priority class,
* 3) all active groups at the same level in the groups tree have the same
+ * weight,
+ * 4) all active groups at the same level in the groups tree have the same
* number of children.
*
* Unfortunately, keeping the necessary state for evaluating exactly
* the last two symmetry sub-conditions above would be quite complex
- * and time consuming. Therefore this function evaluates, instead,
- * only the following stronger two sub-conditions, for which it is
+ * and time consuming. Therefore this function evaluates, instead,
+ * only the following stronger three sub-conditions, for which it is
* much easier to maintain the needed state:
* 1) all active queues have the same weight,
- * 2) there are no active groups.
+ * 2) all active queues belong to the same I/O-priority class,
+ * 3) there are no active groups.
* In particular, the last condition is always true if hierarchical
* support or the cgroups interface are not enabled, thus no state
* needs to be maintained in this case.
*/
static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
{
- return !bfq_varied_queue_weights_or_active_groups(bfqd);
+ /*
+ * For queue weights to differ, queue_weights_tree must contain
+ * at least two nodes.
+ */
+ bool varied_queue_weights = !RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
+ (bfqd->queue_weights_tree.rb_node->rb_left ||
+ bfqd->queue_weights_tree.rb_node->rb_right);
+
+ bool multiple_classes_busy =
+ (bfqd->busy_queues[0] && bfqd->busy_queues[1]) ||
+ (bfqd->busy_queues[0] && bfqd->busy_queues[2]) ||
+ (bfqd->busy_queues[1] && bfqd->busy_queues[2]);
+
+ /*
+ * Symmetric only if weights do not vary, one class is busy and,
+ * with group scheduling built in, no group has pending requests.
+ */
+ return !(varied_queue_weights || multiple_classes_busy
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ || bfqd->num_groups_with_pending_reqs > 0
+#endif
+ );
}
/*
@@ -728,15 +736,14 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
/*
* In the unlucky event of an allocation failure, we just
* exit. This will cause the weight of queue to not be
- * considered in bfq_varied_queue_weights_or_active_groups,
- * which, in its turn, causes the scenario to be deemed
- * wrongly symmetric in case bfqq's weight would have been
- * the only weight making the scenario asymmetric. On the
- * bright side, no unbalance will however occur when bfqq
- * becomes inactive again (the invocation of this function
- * is triggered by an activation of queue). In fact,
- * bfq_weights_tree_remove does nothing if
- * !bfqq->weight_counter.
+ * considered in bfq_symmetric_scenario, which, in its turn,
+ * causes the scenario to be deemed wrongly symmetric in case
+ * bfqq's weight would have been the only weight making the
+ * scenario asymmetric. On the bright side, no unbalance will
+ * however occur when bfqq becomes inactive again (the
+ * invocation of this function is triggered by an activation
+ * of queue). In fact, bfq_weights_tree_remove does nothing
+ * if !bfqq->weight_counter.
*/
if (unlikely(!bfqq->weight_counter))
return;
@@ -747,6 +754,7 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
inc_counter:
bfqq->weight_counter->num_active++;
+ bfqq->ref++;
}
/*
@@ -771,6 +779,7 @@ void __bfq_weights_tree_remove(struct bfq_data *bfqd,
reset_entity_pointer:
bfqq->weight_counter = NULL;
+ bfq_put_queue(bfqq);
}
/*
@@ -782,9 +791,6 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
{
struct bfq_entity *entity = bfqq->entity.parent;
- __bfq_weights_tree_remove(bfqd, bfqq,
- &bfqd->queue_weights_tree);
-
for_each_entity(entity) {
struct bfq_sched_data *sd = entity->my_sched_data;
@@ -818,6 +824,15 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
bfqd->num_groups_with_pending_reqs--;
}
}
+
+ /*
+ * Next function is invoked last, because it causes bfqq to be
+ * freed if the following holds: bfqq is not in service and
+ * has no dispatched request. DO NOT use bfqq after the next
+ * function invocation.
+ */
+ __bfq_weights_tree_remove(bfqd, bfqq,
+ &bfqd->queue_weights_tree);
}
/*
@@ -873,7 +888,8 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
static unsigned long bfq_serv_to_charge(struct request *rq,
struct bfq_queue *bfqq)
{
- if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 ||
+ !bfq_symmetric_scenario(bfqq->bfqd))
return blk_rq_sectors(rq);
return blk_rq_sectors(rq) * bfq_async_charge_factor;
@@ -907,8 +923,10 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
*/
return;
- new_budget = max_t(unsigned long, bfqq->max_budget,
- bfq_serv_to_charge(next_rq, bfqq));
+ new_budget = max_t(unsigned long,
+ max_t(unsigned long, bfqq->max_budget,
+ bfq_serv_to_charge(next_rq, bfqq)),
+ entity->service);
if (entity->budget != new_budget) {
entity->budget = new_budget;
bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
@@ -1011,7 +1029,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
static int bfqq_process_refs(struct bfq_queue *bfqq)
{
- return bfqq->ref - bfqq->allocated - bfqq->entity.on_st;
+ return bfqq->ref - bfqq->allocated - bfqq->entity.on_st -
+ (bfqq->weight_counter != NULL);
}
/* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
@@ -1380,7 +1399,15 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
{
struct bfq_entity *entity = &bfqq->entity;
- if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
+ /*
+ * In the next compound condition, we check also whether there
+ * is some budget left, because otherwise there is no point in
+ * trying to go on serving bfqq with this same budget: bfqq
+ * would be expired immediately after being selected for
+ * service. This would only cause useless overhead.
+ */
+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time &&
+ bfq_bfqq_budget_left(bfqq) > 0) {
/*
* We do not clear the flag non_blocking_wait_rq here, as
* the latter is used in bfq_activate_bfqq to signal
@@ -2217,14 +2244,15 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
return NULL;
/* If there is only one backlogged queue, don't search. */
- if (bfqd->busy_queues == 1)
+ if (bfq_tot_busy_queues(bfqd) == 1)
return NULL;
in_service_bfqq = bfqd->in_service_queue;
if (in_service_bfqq && in_service_bfqq != bfqq &&
likely(in_service_bfqq != &bfqd->oom_bfqq) &&
- bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
+ bfq_rq_close_to_sector(io_struct, request,
+ bfqd->in_serv_last_pos) &&
bfqq->entity.parent == in_service_bfqq->entity.parent &&
bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
@@ -2742,7 +2770,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
if ((bfqd->rq_in_driver > 0 ||
now_ns - bfqd->last_completion < BFQ_MIN_TT)
- && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
+ && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq))
bfqd->sequential_samples++;
bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
@@ -2764,6 +2792,8 @@ update_rate_and_reset:
bfq_update_rate_reset(bfqd, rq);
update_last_values:
bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
+ if (RQ_BFQQ(rq) == bfqd->in_service_queue)
+ bfqd->in_serv_last_pos = bfqd->last_position;
bfqd->last_dispatch = now_ns;
}
@@ -3274,16 +3304,32 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
* requests, then the request pattern is isochronous
* (see the comments on the function
* bfq_bfqq_softrt_next_start()). Thus we can compute
- * soft_rt_next_start. If, instead, the queue still
- * has outstanding requests, then we have to wait for
- * the completion of all the outstanding requests to
- * discover whether the request pattern is actually
- * isochronous.
+ * soft_rt_next_start. And we do it, unless bfqq is in
+ * interactive weight raising. We do not do it in the
+ * latter subcase, for the following reason. bfqq may
+ * be conveying the I/O needed to load a soft
+ * real-time application. Such an application will
+ * actually exhibit a soft real-time I/O pattern after
+ * it finally starts doing its job. But, if
+ * soft_rt_next_start is computed here for an
+ * interactive bfqq, and bfqq had received a lot of
+ * service before remaining with no outstanding
+ * request (likely to happen on a fast device), then
+ * soft_rt_next_start would be assigned such a high
+ * value that, for a very long time, bfqq would be
+ * prevented from being possibly considered as soft
+ * real time.
+ *
+ * If, instead, the queue still has outstanding
+ * requests, then we have to wait for the completion
+ * of all the outstanding requests to discover whether
+ * the request pattern is actually isochronous.
*/
- if (bfqq->dispatched == 0)
+ if (bfqq->dispatched == 0 &&
+ bfqq->wr_coeff != bfqd->bfq_wr_coeff)
bfqq->soft_rt_next_start =
bfq_bfqq_softrt_next_start(bfqd, bfqq);
- else {
+ else if (bfqq->dispatched > 0) {
/*
* Schedule an update of soft_rt_next_start to when
* the task may be discovered to be isochronous.
@@ -3376,53 +3422,13 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
bfq_bfqq_budget_timeout(bfqq);
}
-/*
- * For a queue that becomes empty, device idling is allowed only if
- * this function returns true for the queue. As a consequence, since
- * device idling plays a critical role in both throughput boosting and
- * service guarantees, the return value of this function plays a
- * critical role in both these aspects as well.
- *
- * In a nutshell, this function returns true only if idling is
- * beneficial for throughput or, even if detrimental for throughput,
- * idling is however necessary to preserve service guarantees (low
- * latency, desired throughput distribution, ...). In particular, on
- * NCQ-capable devices, this function tries to return false, so as to
- * help keep the drives' internal queues full, whenever this helps the
- * device boost the throughput without causing any service-guarantee
- * issue.
- *
- * In more detail, the return value of this function is obtained by,
- * first, computing a number of boolean variables that take into
- * account throughput and service-guarantee issues, and, then,
- * combining these variables in a logical expression. Most of the
- * issues taken into account are not trivial. We discuss these issues
- * individually while introducing the variables.
- */
-static bool bfq_better_to_idle(struct bfq_queue *bfqq)
+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
{
- struct bfq_data *bfqd = bfqq->bfqd;
bool rot_without_queueing =
!blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag,
bfqq_sequential_and_IO_bound,
- idling_boosts_thr, idling_boosts_thr_without_issues,
- idling_needed_for_service_guarantees,
- asymmetric_scenario;
-
- if (bfqd->strict_guarantees)
- return true;
-
- /*
- * Idling is performed only if slice_idle > 0. In addition, we
- * do not idle if
- * (a) bfqq is async
- * (b) bfqq is in the idle io prio class: in this case we do
- * not idle because we want to minimize the bandwidth that
- * queues in this class can steal to higher-priority queues
- */
- if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
- bfq_class_idle(bfqq))
- return false;
+ idling_boosts_thr;
bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) &&
bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq);
@@ -3454,8 +3460,7 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
bfqq_sequential_and_IO_bound);
/*
- * The value of the next variable,
- * idling_boosts_thr_without_issues, is equal to that of
+ * The return value of this function is equal to that of
* idling_boosts_thr, unless a special case holds. In this
* special case, described below, idling may cause problems to
* weight-raised queues.
@@ -3472,217 +3477,252 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
* which enqueue several requests in advance, and further
* reorder internally-queued requests.
*
- * For this reason, we force to false the value of
- * idling_boosts_thr_without_issues if there are weight-raised
- * busy queues. In this case, and if bfqq is not weight-raised,
- * this guarantees that the device is not idled for bfqq (if,
- * instead, bfqq is weight-raised, then idling will be
- * guaranteed by another variable, see below). Combined with
- * the timestamping rules of BFQ (see [1] for details), this
- * behavior causes bfqq, and hence any sync non-weight-raised
- * queue, to get a lower number of requests served, and thus
- * to ask for a lower number of requests from the request
- * pool, before the busy weight-raised queues get served
- * again. This often mitigates starvation problems in the
- * presence of heavy write workloads and NCQ, thereby
- * guaranteeing a higher application and system responsiveness
- * in these hostile scenarios.
+ * For this reason, we force to false the return value if
+ * there are weight-raised busy queues. In this case, and if
+ * bfqq is not weight-raised, this guarantees that the device
+ * is not idled for bfqq (if, instead, bfqq is weight-raised,
+ * then idling will be guaranteed by another variable, see
+ * below). Combined with the timestamping rules of BFQ (see
+ * [1] for details), this behavior causes bfqq, and hence any
+ * sync non-weight-raised queue, to get a lower number of
+ * requests served, and thus to ask for a lower number of
+ * requests from the request pool, before the busy
+ * weight-raised queues get served again. This often mitigates
+ * starvation problems in the presence of heavy write
+ * workloads and NCQ, thereby guaranteeing a higher
+ * application and system responsiveness in these hostile
+ * scenarios.
*/
- idling_boosts_thr_without_issues = idling_boosts_thr &&
+ return idling_boosts_thr &&
bfqd->wr_busy_queues == 0;
+}
- /*
- * There is then a case where idling must be performed not
- * for throughput concerns, but to preserve service
- * guarantees.
- *
- * To introduce this case, we can note that allowing the drive
- * to enqueue more than one request at a time, and hence
- * delegating de facto final scheduling decisions to the
- * drive's internal scheduler, entails loss of control on the
- * actual request service order. In particular, the critical
- * situation is when requests from different processes happen
- * to be present, at the same time, in the internal queue(s)
- * of the drive. In such a situation, the drive, by deciding
- * the service order of the internally-queued requests, does
- * determine also the actual throughput distribution among
- * these processes. But the drive typically has no notion or
- * concern about per-process throughput distribution, and
- * makes its decisions only on a per-request basis. Therefore,
- * the service distribution enforced by the drive's internal
- * scheduler is likely to coincide with the desired
- * device-throughput distribution only in a completely
- * symmetric scenario where:
- * (i) each of these processes must get the same throughput as
- * the others;
- * (ii) the I/O of each process has the same properties, in
- * terms of locality (sequential or random), direction
- * (reads or writes), request sizes, greediness
- * (from I/O-bound to sporadic), and so on.
- * In fact, in such a scenario, the drive tends to treat
- * the requests of each of these processes in about the same
- * way as the requests of the others, and thus to provide
- * each of these processes with about the same throughput
- * (which is exactly the desired throughput distribution). In
- * contrast, in any asymmetric scenario, device idling is
- * certainly needed to guarantee that bfqq receives its
- * assigned fraction of the device throughput (see [1] for
- * details).
- * The problem is that idling may significantly reduce
- * throughput with certain combinations of types of I/O and
- * devices. An important example is sync random I/O, on flash
- * storage with command queueing. So, unless bfqq falls in the
- * above cases where idling also boosts throughput, it would
- * be important to check conditions (i) and (ii) accurately,
- * so as to avoid idling when not strictly needed for service
- * guarantees.
- *
- * Unfortunately, it is extremely difficult to thoroughly
- * check condition (ii). And, in case there are active groups,
- * it becomes very difficult to check condition (i) too. In
- * fact, if there are active groups, then, for condition (i)
- * to become false, it is enough that an active group contains
- * more active processes or sub-groups than some other active
- * group. More precisely, for condition (i) to hold because of
- * such a group, it is not even necessary that the group is
- * (still) active: it is sufficient that, even if the group
- * has become inactive, some of its descendant processes still
- * have some request already dispatched but still waiting for
- * completion. In fact, requests have still to be guaranteed
- * their share of the throughput even after being
- * dispatched. In this respect, it is easy to show that, if a
- * group frequently becomes inactive while still having
- * in-flight requests, and if, when this happens, the group is
- * not considered in the calculation of whether the scenario
- * is asymmetric, then the group may fail to be guaranteed its
- * fair share of the throughput (basically because idling may
- * not be performed for the descendant processes of the group,
- * but it had to be). We address this issue with the
- * following bi-modal behavior, implemented in the function
- * bfq_symmetric_scenario().
- *
- * If there are groups with requests waiting for completion
- * (as commented above, some of these groups may even be
- * already inactive), then the scenario is tagged as
- * asymmetric, conservatively, without checking any of the
- * conditions (i) and (ii). So the device is idled for bfqq.
- * This behavior matches also the fact that groups are created
- * exactly if controlling I/O is a primary concern (to
- * preserve bandwidth and latency guarantees).
- *
- * On the opposite end, if there are no groups with requests
- * waiting for completion, then only condition (i) is actually
- * controlled, i.e., provided that condition (i) holds, idling
- * is not performed, regardless of whether condition (ii)
- * holds. In other words, only if condition (i) does not hold,
- * then idling is allowed, and the device tends to be
- * prevented from queueing many requests, possibly of several
- * processes. Since there are no groups with requests waiting
- * for completion, then, to control condition (i) it is enough
- * to check just whether all the queues with requests waiting
- * for completion also have the same weight.
- *
- * Not checking condition (ii) evidently exposes bfqq to the
- * risk of getting less throughput than its fair share.
- * However, for queues with the same weight, a further
- * mechanism, preemption, mitigates or even eliminates this
- * problem. And it does so without consequences on overall
- * throughput. This mechanism and its benefits are explained
- * in the next three paragraphs.
- *
- * Even if a queue, say Q, is expired when it remains idle, Q
- * can still preempt the new in-service queue if the next
- * request of Q arrives soon (see the comments on
- * bfq_bfqq_update_budg_for_activation). If all queues and
- * groups have the same weight, this form of preemption,
- * combined with the hole-recovery heuristic described in the
- * comments on function bfq_bfqq_update_budg_for_activation,
- * are enough to preserve a correct bandwidth distribution in
- * the mid term, even without idling. In fact, even if not
- * idling allows the internal queues of the device to contain
- * many requests, and thus to reorder requests, we can rather
- * safely assume that the internal scheduler still preserves a
- * minimum of mid-term fairness.
- *
- * More precisely, this preemption-based, idleless approach
- * provides fairness in terms of IOPS, and not sectors per
- * second. This can be seen with a simple example. Suppose
- * that there are two queues with the same weight, but that
- * the first queue receives requests of 8 sectors, while the
- * second queue receives requests of 1024 sectors. In
- * addition, suppose that each of the two queues contains at
- * most one request at a time, which implies that each queue
- * always remains idle after it is served. Finally, after
- * remaining idle, each queue receives very quickly a new
- * request. It follows that the two queues are served
- * alternatively, preempting each other if needed. This
- * implies that, although both queues have the same weight,
- * the queue with large requests receives a service that is
- * 1024/8 times as high as the service received by the other
- * queue.
- *
- * The motivation for using preemption instead of idling (for
- * queues with the same weight) is that, by not idling,
- * service guarantees are preserved (completely or at least in
- * part) without minimally sacrificing throughput. And, if
- * there is no active group, then the primary expectation for
- * this device is probably a high throughput.
- *
- * We are now left only with explaining the additional
- * compound condition that is checked below for deciding
- * whether the scenario is asymmetric. To explain this
- * compound condition, we need to add that the function
- * bfq_symmetric_scenario checks the weights of only
- * non-weight-raised queues, for efficiency reasons (see
- * comments on bfq_weights_tree_add()). Then the fact that
- * bfqq is weight-raised is checked explicitly here. More
- * precisely, the compound condition below takes into account
- * also the fact that, even if bfqq is being weight-raised,
- * the scenario is still symmetric if all queues with requests
- * waiting for completion happen to be
- * weight-raised. Actually, we should be even more precise
- * here, and differentiate between interactive weight raising
- * and soft real-time weight raising.
- *
- * As a side note, it is worth considering that the above
- * device-idling countermeasures may however fail in the
- * following unlucky scenario: if idling is (correctly)
- * disabled in a time period during which all symmetry
- * sub-conditions hold, and hence the device is allowed to
- * enqueue many requests, but at some later point in time some
- * sub-condition stops to hold, then it may become impossible
- * to let requests be served in the desired order until all
- * the requests already queued in the device have been served.
- */
- asymmetric_scenario = (bfqq->wr_coeff > 1 &&
- bfqd->wr_busy_queues < bfqd->busy_queues) ||
+/*
+ * There is a case where idling must be performed not for
+ * throughput concerns, but to preserve service guarantees.
+ *
+ * To introduce this case, we can note that allowing the drive
+ * to enqueue more than one request at a time, and hence
+ * delegating de facto final scheduling decisions to the
+ * drive's internal scheduler, entails loss of control on the
+ * actual request service order. In particular, the critical
+ * situation is when requests from different processes happen
+ * to be present, at the same time, in the internal queue(s)
+ * of the drive. In such a situation, the drive, by deciding
+ * the service order of the internally-queued requests, does
+ * determine also the actual throughput distribution among
+ * these processes. But the drive typically has no notion or
+ * concern about per-process throughput distribution, and
+ * makes its decisions only on a per-request basis. Therefore,
+ * the service distribution enforced by the drive's internal
+ * scheduler is likely to coincide with the desired
+ * device-throughput distribution only in a completely
+ * symmetric scenario where:
+ * (i) each of these processes must get the same throughput as
+ * the others;
+ * (ii) the I/O of each process has the same properties, in
+ * terms of locality (sequential or random), direction
+ * (reads or writes), request sizes, greediness
+ * (from I/O-bound to sporadic), and so on.
+ * In fact, in such a scenario, the drive tends to treat
+ * the requests of each of these processes in about the same
+ * way as the requests of the others, and thus to provide
+ * each of these processes with about the same throughput
+ * (which is exactly the desired throughput distribution). In
+ * contrast, in any asymmetric scenario, device idling is
+ * certainly needed to guarantee that bfqq receives its
+ * assigned fraction of the device throughput (see [1] for
+ * details).
+ * The problem is that idling may significantly reduce
+ * throughput with certain combinations of types of I/O and
+ * devices. An important example is sync random I/O, on flash
+ * storage with command queueing. So, unless bfqq falls in the
+ * above cases where idling also boosts throughput, it would
+ * be important to check conditions (i) and (ii) accurately,
+ * so as to avoid idling when not strictly needed for service
+ * guarantees.
+ *
+ * Unfortunately, it is extremely difficult to thoroughly
+ * check condition (ii). And, in case there are active groups,
+ * it becomes very difficult to check condition (i) too. In
+ * fact, if there are active groups, then, for condition (i)
+ * to become false, it is enough that an active group contains
+ * more active processes or sub-groups than some other active
+ * group. More precisely, for condition (i) to become false because
+ * of such a group, it is not even necessary that the group is
+ * (still) active: it is sufficient that, even if the group
+ * has become inactive, some of its descendant processes still
+ * have some request already dispatched but still waiting for
+ * completion. In fact, requests have still to be guaranteed
+ * their share of the throughput even after being
+ * dispatched. In this respect, it is easy to show that, if a
+ * group frequently becomes inactive while still having
+ * in-flight requests, and if, when this happens, the group is
+ * not considered in the calculation of whether the scenario
+ * is asymmetric, then the group may fail to be guaranteed its
+ * fair share of the throughput (basically because idling may
+ * not be performed for the descendant processes of the group,
+ * but it had to be). We address this issue with the
+ * following bi-modal behavior, implemented in the function
+ * bfq_symmetric_scenario().
+ *
+ * If there are groups with requests waiting for completion
+ * (as commented above, some of these groups may even be
+ * already inactive), then the scenario is tagged as
+ * asymmetric, conservatively, without checking any of the
+ * conditions (i) and (ii). So the device is idled for bfqq.
+ * This behavior matches also the fact that groups are created
+ * exactly if controlling I/O is a primary concern (to
+ * preserve bandwidth and latency guarantees).
+ *
+ * On the opposite end, if there are no groups with requests
+ * waiting for completion, then only condition (i) is actually
+ * checked, i.e., provided that condition (i) holds, idling
+ * is not performed, regardless of whether condition (ii)
+ * holds. In other words, only if condition (i) does not hold,
+ * then idling is allowed, and the device tends to be
+ * prevented from queueing many requests, possibly of several
+ * processes. Since there are no groups with requests waiting
+ * for completion, then, to check condition (i), it is enough
+ * to check just whether all the queues with requests waiting
+ * for completion also have the same weight.
+ *
+ * Not checking condition (ii) evidently exposes bfqq to the
+ * risk of getting less throughput than its fair share.
+ * However, for queues with the same weight, a further
+ * mechanism, preemption, mitigates or even eliminates this
+ * problem. And it does so without consequences on overall
+ * throughput. This mechanism and its benefits are explained
+ * in the next three paragraphs.
+ *
+ * Even if a queue, say Q, is expired when it remains idle, Q
+ * can still preempt the new in-service queue if the next
+ * request of Q arrives soon (see the comments on
+ * bfq_bfqq_update_budg_for_activation). If all queues and
+ * groups have the same weight, this form of preemption,
+ * combined with the hole-recovery heuristic described in the
+ * comments on function bfq_bfqq_update_budg_for_activation,
+ * is enough to preserve a correct bandwidth distribution in
+ * the mid term, even without idling. In fact, even if not
+ * idling allows the internal queues of the device to contain
+ * many requests, and thus to reorder requests, we can rather
+ * safely assume that the internal scheduler still preserves a
+ * minimum of mid-term fairness.
+ *
+ * More precisely, this preemption-based, idleless approach
+ * provides fairness in terms of IOPS, and not sectors per
+ * second. This can be seen with a simple example. Suppose
+ * that there are two queues with the same weight, but that
+ * the first queue receives requests of 8 sectors, while the
+ * second queue receives requests of 1024 sectors. In
+ * addition, suppose that each of the two queues contains at
+ * most one request at a time, which implies that each queue
+ * always remains idle after it is served. Finally, after
+ * remaining idle, each queue receives very quickly a new
+ * request. It follows that the two queues are served
+ * alternately, preempting each other if needed. This
+ * implies that, although both queues have the same weight,
+ * the queue with large requests receives a service that is
+ * 1024/8 times as high as the service received by the other
+ * queue.
+ *
+ * The motivation for using preemption instead of idling (for
+ * queues with the same weight) is that, by not idling,
+ * service guarantees are preserved (completely or at least in
+ * part) without sacrificing throughput at all. And, if
+ * there is no active group, then the primary expectation for
+ * this device is probably a high throughput.
+ *
+ * We are now left only with explaining the additional
+ * compound condition that is checked below for deciding
+ * whether the scenario is asymmetric. To explain this
+ * compound condition, we need to add that the function
+ * bfq_symmetric_scenario checks the weights of only
+ * non-weight-raised queues, for efficiency reasons (see
+ * comments on bfq_weights_tree_add()). Then the fact that
+ * bfqq is weight-raised is checked explicitly here. More
+ * precisely, the compound condition below takes into account
+ * also the fact that, even if bfqq is being weight-raised,
+ * the scenario is still symmetric if all queues with requests
+ * waiting for completion happen to be
+ * weight-raised. Actually, we should be even more precise
+ * here, and differentiate between interactive weight raising
+ * and soft real-time weight raising.
+ *
+ * As a side note, it is worth considering that the above
+ * device-idling countermeasures may however fail in the
+ * following unlucky scenario: if idling is (correctly)
+ * disabled in a time period during which all symmetry
+ * sub-conditions hold, and hence the device is allowed to
+ * enqueue many requests, but at some later point in time some
+ * sub-condition ceases to hold, then it may become impossible
+ * to let requests be served in the desired order until all
+ * the requests already queued in the device have been served.
+ */
+static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ return (bfqq->wr_coeff > 1 &&
+ bfqd->wr_busy_queues <
+ bfq_tot_busy_queues(bfqd)) ||
!bfq_symmetric_scenario(bfqd);
+}
+
+/*
+ * For a queue that becomes empty, device idling is allowed only if
+ * this function returns true for that queue. As a consequence, since
+ * device idling plays a critical role for both throughput boosting
+ * and service guarantees, the return value of this function plays a
+ * critical role as well.
+ *
+ * In a nutshell, this function returns true only if idling is
+ * beneficial for throughput or, even if detrimental for throughput,
+ * idling is however necessary to preserve service guarantees (low
+ * latency, desired throughput distribution, ...). In particular, on
+ * NCQ-capable devices, this function tries to return false, so as to
+ * help keep the drives' internal queues full, whenever this helps the
+ * device boost the throughput without causing any service-guarantee
+ * issue.
+ *
+ * Most of the issues taken into account to get the return value of
+ * this function are not trivial. We discuss these issues in the two
+ * functions providing the main pieces of information needed by this
+ * function.
+ */
+static bool bfq_better_to_idle(struct bfq_queue *bfqq)
+{
+ struct bfq_data *bfqd = bfqq->bfqd;
+ bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar;
+
+ if (unlikely(bfqd->strict_guarantees))
+ return true;
/*
- * Finally, there is a case where maximizing throughput is the
- * best choice even if it may cause unfairness toward
- * bfqq. Such a case is when bfqq became active in a burst of
- * queue activations. Queues that became active during a large
- * burst benefit only from throughput, as discussed in the
- * comments on bfq_handle_burst. Thus, if bfqq became active
- * in a burst and not idling the device maximizes throughput,
- * then the device must no be idled, because not idling the
- * device provides bfqq and all other queues in the burst with
- * maximum benefit. Combining this and the above case, we can
- * now establish when idling is actually needed to preserve
- * service guarantees.
+ * Idling is performed only if slice_idle > 0. In addition, we
+ * do not idle if
+ * (a) bfqq is async
+ * (b) bfqq is in the idle io prio class: in this case we do
+ * not idle because we want to minimize the bandwidth that
+ * queues in this class can steal from higher-priority queues
*/
- idling_needed_for_service_guarantees =
- asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
+ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
+ bfq_class_idle(bfqq))
+ return false;
+
+ idling_boosts_thr_with_no_issue =
+ idling_boosts_thr_without_issues(bfqd, bfqq);
+
+ idling_needed_for_service_guar =
+ idling_needed_for_service_guarantees(bfqd, bfqq);
/*
- * We have now all the components we need to compute the
+ * We have now the two components we need to compute the
* return value of the function, which is true only if idling
* either boosts the throughput (without issues), or is
* necessary to preserve service guarantees.
*/
- return idling_boosts_thr_without_issues ||
- idling_needed_for_service_guarantees;
+ return idling_boosts_thr_with_no_issue ||
+ idling_needed_for_service_guar;
}
/*
@@ -3934,7 +3974,7 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
* belongs to CLASS_IDLE and other queues are waiting for
* service.
*/
- if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq)))
+ if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)))
goto return_rq;
bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
@@ -3952,7 +3992,7 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
* most a call to dispatch for nothing
*/
return !list_empty_careful(&bfqd->dispatch) ||
- bfqd->busy_queues > 0;
+ bfq_tot_busy_queues(bfqd) > 0;
}
static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
@@ -4006,9 +4046,10 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
goto start_rq;
}
- bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
+ bfq_log(bfqd, "dispatch requests: %d busy queues",
+ bfq_tot_busy_queues(bfqd));
- if (bfqd->busy_queues == 0)
+ if (bfq_tot_busy_queues(bfqd) == 0)
goto exit;
/*
@@ -4488,10 +4529,7 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct request *rq)
{
bfqq->seek_history <<= 1;
- bfqq->seek_history |=
- get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
- (!blk_queue_nonrot(bfqd->queue) ||
- blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
+ bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq);
}
static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
@@ -4560,28 +4598,31 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
/*
- * There is just this request queued: if the request
- * is small and the queue is not to be expired, then
- * just exit.
+ * There is just this request queued: if
+ * - the request is small, and
+ * - we are idling to boost throughput, and
+ * - the queue is not to be expired,
+ * then just exit.
*
* In this way, if the device is being idled to wait
* for a new request from the in-service queue, we
* avoid unplugging the device and committing the
- * device to serve just a small request. On the
- * contrary, we wait for the block layer to decide
- * when to unplug the device: hopefully, new requests
- * will be merged to this one quickly, then the device
- * will be unplugged and larger requests will be
- * dispatched.
+ * device to serve just a small request. In contrast
+ * we wait for the block layer to decide when to
+ * unplug the device: hopefully, new requests will be
+ * merged to this one quickly, then the device will be
+ * unplugged and larger requests will be dispatched.
*/
- if (small_req && !budget_timeout)
+ if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) &&
+ !budget_timeout)
return;
/*
- * A large enough request arrived, or the queue is to
- * be expired: in both cases disk idling is to be
- * stopped, so clear wait_request flag and reset
- * timer.
+ * A large enough request arrived, or idling is being
+ * performed to preserve service guarantees, or
+ * finally the queue is to be expired: in all these
+ * cases disk idling is to be stopped, so clear
+ * wait_request flag and reset timer.
*/
bfq_clear_bfqq_wait_request(bfqq);
hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
@@ -4607,8 +4648,6 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
bool waiting, idle_timer_disabled = false;
if (new_bfqq) {
- if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
- new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
/*
* Release the request's reference to the old bfqq
* and make sure one is taken to the shared queue.
@@ -4751,6 +4790,8 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
static void bfq_update_hw_tag(struct bfq_data *bfqd)
{
+ struct bfq_queue *bfqq = bfqd->in_service_queue;
+
bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
bfqd->rq_in_driver);
@@ -4763,7 +4804,18 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
* sum is not exact, as it's not taking into account deactivated
* requests.
*/
- if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
+ if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD)
+ return;
+
+ /*
+ * If active queue hasn't enough requests and can idle, bfq might not
+ * dispatch sufficient requests to hardware. Don't zero hw_tag in this
+ * case
+ */
+ if (bfqq && bfq_bfqq_has_short_ttime(bfqq) &&
+ bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] <
+ BFQ_HW_QUEUE_THRESHOLD &&
+ bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD)
return;
if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
@@ -4834,11 +4886,14 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
* isochronous, and both requisites for this condition to hold
* are now satisfied, then compute soft_rt_next_start (see the
* comments on the function bfq_bfqq_softrt_next_start()). We
- * schedule this delayed check when bfqq expires, if it still
- * has in-flight requests.
+ * do not compute soft_rt_next_start if bfqq is in interactive
+ * weight raising (see the comments in bfq_bfqq_expire() for
+ * an explanation). We schedule this delayed update when bfqq
+ * expires, if it still has in-flight requests.
*/
if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
- RB_EMPTY_ROOT(&bfqq->sort_list))
+ RB_EMPTY_ROOT(&bfqq->sort_list) &&
+ bfqq->wr_coeff != bfqd->bfq_wr_coeff)
bfqq->soft_rt_next_start =
bfq_bfqq_softrt_next_start(bfqd, bfqq);
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 0b02bf302de0..062e1c4787f4 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -501,10 +501,11 @@ struct bfq_data {
unsigned int num_groups_with_pending_reqs;
/*
- * Number of bfq_queues containing requests (including the
- * queue in service, even if it is idling).
+ * Per-class (RT, BE, IDLE) number of bfq_queues containing
+ * requests (including the queue in service, even if it is
+ * idling).
*/
- int busy_queues;
+ unsigned int busy_queues[3];
/* number of weight-raised busy @bfq_queues */
int wr_busy_queues;
/* number of queued requests */
@@ -537,6 +538,9 @@ struct bfq_data {
/* on-disk position of the last served request */
sector_t last_position;
+ /* position of the last served request for the in-service queue */
+ sector_t in_serv_last_pos;
+
/* time of last request completion (ns) */
u64 last_completion;
@@ -974,6 +978,7 @@ extern struct blkcg_policy blkcg_policy_bfq;
struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
+unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd);
struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
struct bfq_entity *bfq_entity_of(struct rb_node *node);
unsigned short bfq_ioprio_to_weight(int ioprio);
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 72adbbe975d5..63311d1ff1ed 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -44,6 +44,12 @@ static unsigned int bfq_class_idx(struct bfq_entity *entity)
BFQ_DEFAULT_GRP_CLASS - 1;
}
+unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd)
+{
+ return bfqd->busy_queues[0] + bfqd->busy_queues[1] +
+ bfqd->busy_queues[2];
+}
+
static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
bool expiration);
@@ -1513,7 +1519,7 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
struct bfq_sched_data *sd;
struct bfq_queue *bfqq;
- if (bfqd->busy_queues == 0)
+ if (bfq_tot_busy_queues(bfqd) == 0)
return NULL;
/*
@@ -1665,10 +1671,7 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_clear_bfqq_busy(bfqq);
- bfqd->busy_queues--;
-
- if (!bfqq->dispatched)
- bfq_weights_tree_remove(bfqd, bfqq);
+ bfqd->busy_queues[bfqq->ioprio_class - 1]--;
if (bfqq->wr_coeff > 1)
bfqd->wr_busy_queues--;
@@ -1676,6 +1679,9 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqg_stats_update_dequeue(bfqq_group(bfqq));
bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
+
+ if (!bfqq->dispatched)
+ bfq_weights_tree_remove(bfqd, bfqq);
}
/*
@@ -1688,7 +1694,7 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_activate_bfqq(bfqd, bfqq);
bfq_mark_bfqq_busy(bfqq);
- bfqd->busy_queues++;
+ bfqd->busy_queues[bfqq->ioprio_class - 1]++;
if (!bfqq->dispatched)
if (bfqq->wr_coeff == 1)
diff --git a/block/bio.c b/block/bio.c
index 4db1008309ed..83a2dfa417ca 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -753,6 +753,8 @@ EXPORT_SYMBOL(bio_add_pc_page);
* @page: page to add
* @len: length of the data to add
* @off: offset of the data in @page
+ * @same_page: if %true only merge if the new data is in the same physical
+ * page as the last segment of the bio.
*
* Try to add the data at @page + @off to the last bvec of @bio. This is a
* useful optimisation for file systems with a block size smaller than the
@@ -761,19 +763,25 @@ EXPORT_SYMBOL(bio_add_pc_page);
* Return %true on success or %false on failure.
*/
bool __bio_try_merge_page(struct bio *bio, struct page *page,
- unsigned int len, unsigned int off)
+ unsigned int len, unsigned int off, bool same_page)
{
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
if (bio->bi_vcnt > 0) {
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+ phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) +
+ bv->bv_offset + bv->bv_len - 1;
+ phys_addr_t page_addr = page_to_phys(page);
- if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
- bv->bv_len += len;
- bio->bi_iter.bi_size += len;
- return true;
- }
+ if (vec_end_addr + 1 != page_addr + off)
+ return false;
+ if (same_page && (vec_end_addr & PAGE_MASK) != page_addr)
+ return false;
+
+ bv->bv_len += len;
+ bio->bi_iter.bi_size += len;
+ return true;
}
return false;
}
@@ -819,7 +827,7 @@ EXPORT_SYMBOL_GPL(__bio_add_page);
int bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
- if (!__bio_try_merge_page(bio, page, len, offset)) {
+ if (!__bio_try_merge_page(bio, page, len, offset, false)) {
if (bio_full(bio))
return 0;
__bio_add_page(bio, page, len, offset);
@@ -1072,8 +1080,9 @@ static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
{
int i;
struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
ssize_t ret;
ret = copy_page_from_iter(bvec->bv_page,
@@ -1103,8 +1112,9 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
{
int i;
struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
ssize_t ret;
ret = copy_page_to_iter(bvec->bv_page,
@@ -1126,8 +1136,9 @@ void bio_free_pages(struct bio *bio)
{
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
__free_page(bvec->bv_page);
}
EXPORT_SYMBOL(bio_free_pages);
@@ -1295,6 +1306,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
struct bio *bio;
int ret;
struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
if (!iov_iter_count(iter))
return ERR_PTR(-EINVAL);
@@ -1368,7 +1380,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
return bio;
out_unmap:
- bio_for_each_segment_all(bvec, bio, j) {
+ bio_for_each_segment_all(bvec, bio, j, iter_all) {
put_page(bvec->bv_page);
}
bio_put(bio);
@@ -1379,11 +1391,12 @@ static void __bio_unmap_user(struct bio *bio)
{
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
/*
* make sure we dirty pages we wrote to
*/
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
if (bio_data_dir(bio) == READ)
set_page_dirty_lock(bvec->bv_page);
@@ -1475,8 +1488,9 @@ static void bio_copy_kern_endio_read(struct bio *bio)
char *p = bio->bi_private;
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
p += bvec->bv_len;
}
@@ -1585,8 +1599,9 @@ void bio_set_pages_dirty(struct bio *bio)
{
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
if (!PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
}
@@ -1596,8 +1611,9 @@ static void bio_release_pages(struct bio *bio)
{
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
put_page(bvec->bv_page);
}
@@ -1644,8 +1660,9 @@ void bio_check_pages_dirty(struct bio *bio)
struct bio_vec *bvec;
unsigned long flags;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
goto defer;
}
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2bed5725aa03..77f37ef8ef06 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1269,7 +1269,7 @@ void blkcg_drain_queue(struct request_queue *q)
* blkcg_exit_queue - exit and release blkcg part of request_queue
* @q: request_queue being released
*
- * Called from blk_release_queue(). Responsible for exiting blkcg part.
+ * Called from blk_exit_queue(). Responsible for exiting blkcg part.
*/
void blkcg_exit_queue(struct request_queue *q)
{
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 71e9ac03f621..22467f475ab4 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -161,6 +161,73 @@ static inline unsigned get_max_io_size(struct request_queue *q,
return sectors;
}
+static unsigned get_max_segment_size(struct request_queue *q,
+ unsigned offset)
+{
+ unsigned long mask = queue_segment_boundary(q);
+
+ /* default segment boundary mask means no boundary limit */
+ if (mask == BLK_SEG_BOUNDARY_MASK)
+ return queue_max_segment_size(q);
+
+ return min_t(unsigned long, mask - (mask & offset) + 1,
+ queue_max_segment_size(q));
+}
+
+/*
+ * Split the bvec @bv into segments, and update all kinds of
+ * variables.
+ */
+static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
+ unsigned *nsegs, unsigned *last_seg_size,
+ unsigned *front_seg_size, unsigned *sectors)
+{
+ unsigned len = bv->bv_len;
+ unsigned total_len = 0;
+ unsigned new_nsegs = 0, seg_size = 0;
+
+ /*
+ * Multi-page bvec may be too big to hold in one segment, so the
+ * current bvec has to be split into multiple segments.
+ */
+ while (len && new_nsegs + *nsegs < queue_max_segments(q)) {
+ seg_size = get_max_segment_size(q, bv->bv_offset + total_len);
+ seg_size = min(seg_size, len);
+
+ new_nsegs++;
+ total_len += seg_size;
+ len -= seg_size;
+
+ if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
+ break;
+ }
+
+ if (!new_nsegs)
+ return !!len;
+
+ /* update front segment size */
+ if (!*nsegs) {
+ unsigned first_seg_size;
+
+ if (new_nsegs == 1)
+ first_seg_size = get_max_segment_size(q, bv->bv_offset);
+ else
+ first_seg_size = queue_max_segment_size(q);
+
+ if (*front_seg_size < first_seg_size)
+ *front_seg_size = first_seg_size;
+ }
+
+ /* update other variables */
+ *last_seg_size = seg_size;
+ *nsegs += new_nsegs;
+ if (sectors)
+ *sectors += total_len >> 9;
+
+ /* split in the middle of the bvec if len != 0 */
+ return !!len;
+}
+
static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
@@ -174,7 +241,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *new = NULL;
const unsigned max_sectors = get_max_io_size(q, bio);
- bio_for_each_segment(bv, bio, iter) {
+ bio_for_each_bvec(bv, bio, iter) {
/*
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
@@ -189,8 +256,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
*/
if (nsegs < queue_max_segments(q) &&
sectors < max_sectors) {
- nsegs++;
- sectors = max_sectors;
+ /* split in the middle of bvec */
+ bv.bv_len = (max_sectors - sectors) << 9;
+ bvec_split_segs(q, &bv, &nsegs,
+ &seg_size,
+ &front_seg_size,
+ &sectors);
}
goto split;
}
@@ -206,21 +277,28 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
bvprvp = &bvprv;
sectors += bv.bv_len >> 9;
+ if (nsegs == 1 && seg_size > front_seg_size)
+ front_seg_size = seg_size;
+
continue;
}
new_segment:
if (nsegs == queue_max_segments(q))
goto split;
- if (nsegs == 1 && seg_size > front_seg_size)
- front_seg_size = seg_size;
-
- nsegs++;
bvprv = bv;
bvprvp = &bvprv;
- seg_size = bv.bv_len;
- sectors += bv.bv_len >> 9;
+ if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
+ nsegs++;
+ seg_size = bv.bv_len;
+ sectors += bv.bv_len >> 9;
+ if (nsegs == 1 && seg_size > front_seg_size)
+ front_seg_size = seg_size;
+ } else if (bvec_split_segs(q, &bv, &nsegs, &seg_size,
+ &front_seg_size, &sectors)) {
+ goto split;
+ }
}
do_split = false;
@@ -233,8 +311,6 @@ split:
bio = new;
}
- if (nsegs == 1 && seg_size > front_seg_size)
- front_seg_size = seg_size;
bio->bi_seg_front_size = front_seg_size;
if (seg_size > bio->bi_seg_back_size)
bio->bi_seg_back_size = seg_size;
@@ -291,18 +367,20 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
EXPORT_SYMBOL(blk_queue_split);
static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
- struct bio *bio,
- bool no_sg_merge)
+ struct bio *bio)
{
struct bio_vec bv, bvprv = { NULL };
int prev = 0;
unsigned int seg_size, nr_phys_segs;
+ unsigned front_seg_size;
struct bio *fbio, *bbio;
struct bvec_iter iter;
if (!bio)
return 0;
+ front_seg_size = bio->bi_seg_front_size;
+
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
@@ -316,14 +394,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
seg_size = 0;
nr_phys_segs = 0;
for_each_bio(bio) {
- bio_for_each_segment(bv, bio, iter) {
- /*
- * If SG merging is disabled, each bio vector is
- * a segment
- */
- if (no_sg_merge)
- goto new_segment;
-
+ bio_for_each_bvec(bv, bio, iter) {
if (prev) {
if (seg_size + bv.bv_len
> queue_max_segment_size(q))
@@ -333,23 +404,23 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
seg_size += bv.bv_len;
bvprv = bv;
+
+ if (nr_phys_segs == 1 && seg_size >
+ front_seg_size)
+ front_seg_size = seg_size;
+
continue;
}
new_segment:
- if (nr_phys_segs == 1 && seg_size >
- fbio->bi_seg_front_size)
- fbio->bi_seg_front_size = seg_size;
-
- nr_phys_segs++;
bvprv = bv;
prev = 1;
- seg_size = bv.bv_len;
+ bvec_split_segs(q, &bv, &nr_phys_segs, &seg_size,
+ &front_seg_size, NULL);
}
bbio = bio;
}
- if (nr_phys_segs == 1 && seg_size > fbio->bi_seg_front_size)
- fbio->bi_seg_front_size = seg_size;
+ fbio->bi_seg_front_size = front_seg_size;
if (seg_size > bbio->bi_seg_back_size)
bbio->bi_seg_back_size = seg_size;
@@ -358,33 +429,16 @@ new_segment:
void blk_recalc_rq_segments(struct request *rq)
{
- bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE,
- &rq->q->queue_flags);
-
- rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio,
- no_sg_merge);
+ rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio);
}
void blk_recount_segments(struct request_queue *q, struct bio *bio)
{
- unsigned short seg_cnt;
-
- /* estimate segment number by bi_vcnt for non-cloned bio */
- if (bio_flagged(bio, BIO_CLONED))
- seg_cnt = bio_segments(bio);
- else
- seg_cnt = bio->bi_vcnt;
+ struct bio *nxt = bio->bi_next;
- if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
- (seg_cnt < queue_max_segments(q)))
- bio->bi_phys_segments = seg_cnt;
- else {
- struct bio *nxt = bio->bi_next;
-
- bio->bi_next = NULL;
- bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false);
- bio->bi_next = nxt;
- }
+ bio->bi_next = NULL;
+ bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
+ bio->bi_next = nxt;
bio_set_flag(bio, BIO_SEG_VALID);
}
@@ -407,6 +461,54 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
return biovec_phys_mergeable(q, &end_bv, &nxt_bv);
}
+static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
+ struct scatterlist *sglist)
+{
+ if (!*sg)
+ return sglist;
+
+ /*
+ * If the driver previously mapped a shorter list, we could see a
+ * termination bit prematurely unless it fully inits the sg table
+ * on each mapping. We KNOW that there must be more entries here
+ * or the driver would be buggy, so force clear the termination bit
+ * to avoid doing a full sg_init_table() in drivers for each command.
+ */
+ sg_unmark_end(*sg);
+ return sg_next(*sg);
+}
+
+static unsigned blk_bvec_map_sg(struct request_queue *q,
+ struct bio_vec *bvec, struct scatterlist *sglist,
+ struct scatterlist **sg)
+{
+ unsigned nbytes = bvec->bv_len;
+ unsigned nsegs = 0, total = 0, offset = 0;
+
+ while (nbytes > 0) {
+ unsigned seg_size;
+ struct page *pg;
+ unsigned idx;
+
+ *sg = blk_next_sg(sg, sglist);
+
+ seg_size = get_max_segment_size(q, bvec->bv_offset + total);
+ seg_size = min(nbytes, seg_size);
+
+ offset = (total + bvec->bv_offset) % PAGE_SIZE;
+ idx = (total + bvec->bv_offset) / PAGE_SIZE;
+ pg = bvec_nth_page(bvec->bv_page, idx);
+
+ sg_set_page(*sg, pg, seg_size, offset);
+
+ total += seg_size;
+ nbytes -= seg_size;
+ nsegs++;
+ }
+
+ return nsegs;
+}
+
static inline void
__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
struct scatterlist *sglist, struct bio_vec *bvprv,
@@ -424,25 +526,12 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
(*sg)->length += nbytes;
} else {
new_segment:
- if (!*sg)
- *sg = sglist;
- else {
- /*
- * If the driver previously mapped a shorter
- * list, we could see a termination bit
- * prematurely unless it fully inits the sg
- * table on each mapping. We KNOW that there
- * must be more entries here or the driver
- * would be buggy, so force clear the
- * termination bit to avoid doing a full
- * sg_init_table() in drivers for each command.
- */
- sg_unmark_end(*sg);
- *sg = sg_next(*sg);
- }
-
- sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
- (*nsegs)++;
+ if (bvec->bv_offset + bvec->bv_len <= PAGE_SIZE) {
+ *sg = blk_next_sg(sg, sglist);
+ sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
+ (*nsegs) += 1;
+ } else
+ (*nsegs) += blk_bvec_map_sg(q, bvec, sglist, sg);
}
*bvprv = *bvec;
}
@@ -464,7 +553,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
int nsegs = 0;
for_each_bio(bio)
- bio_for_each_segment(bvec, bio, iter)
+ bio_for_each_bvec(bvec, bio, iter)
__blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg,
&nsegs);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 7921573aebbc..bac34b72b33b 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -128,11 +128,9 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(DEAD),
QUEUE_FLAG_NAME(INIT_DONE),
- QUEUE_FLAG_NAME(NO_SG_MERGE),
QUEUE_FLAG_NAME(POLL),
QUEUE_FLAG_NAME(WC),
QUEUE_FLAG_NAME(FUA),
- QUEUE_FLAG_NAME(FLUSH_NQ),
QUEUE_FLAG_NAME(DAX),
QUEUE_FLAG_NAME(STATS),
QUEUE_FLAG_NAME(POLL_STATS),
@@ -251,7 +249,6 @@ static const char *const alloc_policy_name[] = {
static const char *const hctx_flag_name[] = {
HCTX_FLAG_NAME(SHOULD_MERGE),
HCTX_FLAG_NAME(TAG_SHARED),
- HCTX_FLAG_NAME(SG_MERGE),
HCTX_FLAG_NAME(BLOCKING),
HCTX_FLAG_NAME(NO_SCHED),
};
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 140933e4a7d1..40905539afed 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -321,7 +321,7 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
struct elevator_queue *e = q->elevator;
struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx->cpu);
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
bool ret = false;
enum hctx_type type;
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 2089c6c62f44..a4931fc7be8a 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -170,7 +170,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
data->ctx = blk_mq_get_ctx(data->q);
data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
- data->ctx->cpu);
+ data->ctx);
tags = blk_mq_tags_from_data(data);
if (data->flags & BLK_MQ_REQ_RESERVED)
bt = &tags->breserved_tags;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9437a5eb07cf..4e502db8b10c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -364,7 +364,7 @@ static struct request *blk_mq_get_request(struct request_queue *q,
}
if (likely(!data->hctx))
data->hctx = blk_mq_map_queue(q, data->cmd_flags,
- data->ctx->cpu);
+ data->ctx);
if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT;
@@ -2069,7 +2069,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
struct blk_mq_tags *tags;
int node;
- node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
@@ -2125,7 +2125,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
size_t rq_size, left;
int node;
- node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
@@ -2424,7 +2424,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
* If the cpu isn't present, the cpu is mapped to first hctx.
*/
for_each_possible_cpu(i) {
- hctx_idx = set->map[0].mq_map[i];
+ hctx_idx = set->map[HCTX_TYPE_DEFAULT].mq_map[i];
/* unmapped hw queue can be remapped after CPU topo changed */
if (!set->tags[hctx_idx] &&
!__blk_mq_alloc_rq_map(set, hctx_idx)) {
@@ -2434,16 +2434,19 @@ static void blk_mq_map_swqueue(struct request_queue *q)
* case, remap the current ctx to hctx[0] which
* is guaranteed to always have tags allocated
*/
- set->map[0].mq_map[i] = 0;
+ set->map[HCTX_TYPE_DEFAULT].mq_map[i] = 0;
}
ctx = per_cpu_ptr(q->queue_ctx, i);
for (j = 0; j < set->nr_maps; j++) {
- if (!set->map[j].nr_queues)
+ if (!set->map[j].nr_queues) {
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
+ HCTX_TYPE_DEFAULT, i);
continue;
+ }
hctx = blk_mq_map_queue_type(q, j, i);
-
+ ctx->hctxs[j] = hctx;
/*
* If the CPU is already set in the mask, then we've
* mapped this one already. This can happen if
@@ -2463,6 +2466,10 @@ static void blk_mq_map_swqueue(struct request_queue *q)
*/
BUG_ON(!hctx->nr_ctx);
}
+
+ for (; j < HCTX_MAX_TYPES; j++)
+ ctx->hctxs[j] = blk_mq_map_queue_type(q,
+ HCTX_TYPE_DEFAULT, i);
}
mutex_unlock(&q->sysfs_lock);
@@ -2734,7 +2741,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
int node;
struct blk_mq_hw_ctx *hctx;
- node = blk_mq_hw_queue_to_node(&set->map[0], i);
+ node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
/*
* If the hw queue has been mapped to another numa node,
* we need to realloc the hctx. If allocation fails, fallback
@@ -2838,9 +2845,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
set->map[HCTX_TYPE_POLL].nr_queues)
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
- if (!(set->flags & BLK_MQ_F_SG_MERGE))
- blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
-
q->sg_reserved_size = INT_MAX;
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
@@ -2968,7 +2972,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
return set->ops->map_queues(set);
} else {
BUG_ON(set->nr_maps > 1);
- return blk_mq_map_queues(&set->map[0]);
+ return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
}
}
@@ -3090,6 +3094,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
if (!set)
return -EINVAL;
+ if (q->nr_requests == nr)
+ return 0;
+
blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
@@ -3235,7 +3242,7 @@ fallback:
pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
nr_hw_queues, prev_nr_hw_queues);
set->nr_hw_queues = prev_nr_hw_queues;
- blk_mq_map_queues(&set->map[0]);
+ blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
goto fallback;
}
blk_mq_map_swqueue(q);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index d0b3dd54ef8d..c11353a3749d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -23,6 +23,7 @@ struct blk_mq_ctx {
unsigned int cpu;
unsigned short index_hw[HCTX_MAX_TYPES];
+ struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES];
/* incremented at dispatch time */
unsigned long rq_dispatched[2];
@@ -96,26 +97,23 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
* blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
* @q: request queue
* @flags: request command flags
- * @cpu: CPU
+ * @ctx: software queue cpu ctx
*/
static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
unsigned int flags,
- unsigned int cpu)
+ struct blk_mq_ctx *ctx)
{
enum hctx_type type = HCTX_TYPE_DEFAULT;
- if ((flags & REQ_HIPRI) &&
- q->tag_set->nr_maps > HCTX_TYPE_POLL &&
- q->tag_set->map[HCTX_TYPE_POLL].nr_queues &&
- test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+ /*
+ * The caller ensures that if REQ_HIPRI is set, polling is enabled.
+ */
+ if (flags & REQ_HIPRI)
type = HCTX_TYPE_POLL;
-
- else if (((flags & REQ_OP_MASK) == REQ_OP_READ) &&
- q->tag_set->nr_maps > HCTX_TYPE_READ &&
- q->tag_set->map[HCTX_TYPE_READ].nr_queues)
+ else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
type = HCTX_TYPE_READ;
- return blk_mq_map_queue_type(q, type, cpu);
+ return ctx->hctxs[type];
}
/*
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 3e7038e475ee..6375afaedcec 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -799,15 +799,6 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
}
EXPORT_SYMBOL(blk_queue_update_dma_alignment);
-void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
-{
- if (queueable)
- blk_queue_flag_clear(QUEUE_FLAG_FLUSH_NQ, q);
- else
- blk_queue_flag_set(QUEUE_FLAG_FLUSH_NQ, q);
-}
-EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
-
/**
* blk_set_queue_depth - tell the block layer about the device queue depth
* @q: the request queue for the device
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 590d1ef2f961..59685918167e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -468,6 +468,9 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
else if (val >= 0)
val *= 1000ULL;
+ if (wbt_get_min_lat(q) == val)
+ return count;
+
/*
* Ensure that the queue is idled, in case the latency update
* ends up either enabling or disabling wbt completely. We can't
@@ -817,21 +820,16 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
}
/**
- * __blk_release_queue - release a request queue when it is no longer needed
+ * __blk_release_queue - release a request queue
* @work: pointer to the release_work member of the request queue to be released
*
* Description:
- * blk_release_queue is the counterpart of blk_init_queue(). It should be
- * called when a request queue is being released; typically when a block
- * device is being de-registered. Its primary task it to free the queue
- * itself.
- *
- * Notes:
- * The low level driver must have finished any outstanding requests first
- * via blk_cleanup_queue().
- *
- * Although blk_release_queue() may be called with preemption disabled,
- * __blk_release_queue() may sleep.
+ * This function is called when a block device is being unregistered. The
+ * process of releasing a request queue starts with blk_cleanup_queue, which
+ * sets the appropriate flags and then calls blk_put_queue, which decrements
+ * the reference counter of the request queue. Once the reference counter
+ * of the request queue reaches zero, blk_release_queue is called to release
+ * all allocated resources of the request queue.
*/
static void __blk_release_queue(struct work_struct *work)
{
diff --git a/block/blk.h b/block/blk.h
index 848278c52030..5d636ee41663 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -38,7 +38,7 @@ extern struct ida blk_queue_ida;
static inline struct blk_flush_queue *
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
{
- return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx->cpu)->fq;
+ return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
}
static inline void __blk_get_queue(struct request_queue *q)
diff --git a/block/bounce.c b/block/bounce.c
index ffb9e9ecfa7e..47eb7e936e22 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -165,11 +165,12 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool)
struct bio_vec *bvec, orig_vec;
int i;
struct bvec_iter orig_iter = bio_orig->bi_iter;
+ struct bvec_iter_all iter_all;
/*
* free up bounce indirect pages used
*/
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i, iter_all) {
orig_vec = bio_iter_iovec(bio_orig, orig_iter);
if (bvec->bv_page != orig_vec.bv_page) {
dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
@@ -313,7 +314,12 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL :
&bounce_bio_set);
- bio_for_each_segment_all(to, bio, i) {
+ /*
+ * Bvec table can't be updated by bio_for_each_segment_all(),
+ * so retrieve the bvecs from the table directly. This is safe
+ * because the 'bio' contains only single-page bvecs.
+ */
+ for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) {
struct page *page = to->bv_page;
if (page_to_pfn(page) <= q->limits.bounce_pfn)
diff --git a/block/elevator.c b/block/elevator.c
index f05e90d4e695..d6d835a08de6 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -667,8 +667,11 @@ static int __elevator_change(struct request_queue *q, const char *name)
/*
* Special case for mq, turn off scheduling
*/
- if (!strncmp(name, "none", 4))
+ if (!strncmp(name, "none", 4)) {
+ if (!q->elevator)
+ return 0;
return elevator_switch(q, NULL);
+ }
strlcpy(elevator_name, name, sizeof(elevator_name));
e = elevator_get(q, strstrip(elevator_name), true);
diff --git a/block/genhd.c b/block/genhd.c
index 1dd8fd6613b8..703267865f14 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -365,8 +365,8 @@ int register_blkdev(unsigned int major, const char *name)
}
if (index == 0) {
- printk("register_blkdev: failed to get major for %s\n",
- name);
+ printk("%s: failed to get major for %s\n",
+ __func__, name);
ret = -EBUSY;
goto out;
}
@@ -375,8 +375,8 @@ int register_blkdev(unsigned int major, const char *name)
}
if (major >= BLKDEV_MAJOR_MAX) {
- pr_err("register_blkdev: major requested (%u) is greater than the maximum (%u) for %s\n",
- major, BLKDEV_MAJOR_MAX-1, name);
+ pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
+ __func__, major, BLKDEV_MAJOR_MAX-1, name);
ret = -EINVAL;
goto out;
@@ -655,10 +655,12 @@ exit:
kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
disk_part_iter_exit(&piter);
- err = sysfs_create_link(&ddev->kobj,
- &disk->queue->backing_dev_info->dev->kobj,
- "bdi");
- WARN_ON(err);
+ if (disk->queue->backing_dev_info->dev) {
+ err = sysfs_create_link(&ddev->kobj,
+ &disk->queue->backing_dev_info->dev->kobj,
+ "bdi");
+ WARN_ON(err);
+ }
}
/**