Diffstat (limited to 'block')
-rw-r--r--  block/bfq-iosched.c   135
-rw-r--r--  block/blk-cgroup.c      9
-rw-r--r--  block/blk-iolatency.c   3
-rw-r--r--  block/blk-mq-sched.h    9
-rw-r--r--  block/blk-mq.c          4
-rw-r--r--  block/blk-rq-qos.c      7
-rw-r--r--  block/blk-settings.c    3
-rw-r--r--  block/blk-sysfs.c       3
-rw-r--r--  block/genhd.c           2
9 files changed, 108 insertions, 67 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 72860325245a..b33be928d164 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -1924,12 +1924,13 @@ static void bfq_add_request(struct request *rq)
* confirmed no later than during the next
* I/O-plugging interval for bfqq.
*/
- if (!bfq_bfqq_has_short_ttime(bfqq) &&
+ if (bfqd->last_completed_rq_bfqq &&
+ !bfq_bfqq_has_short_ttime(bfqq) &&
ktime_get_ns() - bfqd->last_completion <
200 * NSEC_PER_USEC) {
if (bfqd->last_completed_rq_bfqq != bfqq &&
- bfqd->last_completed_rq_bfqq !=
- bfqq->waker_bfqq) {
+ bfqd->last_completed_rq_bfqq !=
+ bfqq->waker_bfqq) {
/*
* First synchronization detected with
* a candidate waker queue, or with a
@@ -2250,9 +2251,14 @@ static void bfq_request_merged(struct request_queue *q, struct request *req,
blk_rq_pos(container_of(rb_prev(&req->rb_node),
struct request, rb_node))) {
struct bfq_queue *bfqq = bfq_init_rq(req);
- struct bfq_data *bfqd = bfqq->bfqd;
+ struct bfq_data *bfqd;
struct request *prev, *next_rq;
+ if (!bfqq)
+ return;
+
+ bfqd = bfqq->bfqd;
+
/* Reposition request in its sort_list */
elv_rb_del(&bfqq->sort_list, req);
elv_rb_add(&bfqq->sort_list, req);
@@ -2299,6 +2305,9 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
struct bfq_queue *bfqq = bfq_init_rq(rq),
*next_bfqq = bfq_init_rq(next);
+ if (!bfqq)
+ return;
+
/*
* If next and rq belong to the same bfq_queue and next is older
* than rq, then reposition rq in the fifo (by substituting next
@@ -3354,38 +3363,57 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
* there is no active group, then the primary expectation for
* this device is probably a high throughput.
*
- * We are now left only with explaining the additional
- * compound condition that is checked below for deciding
- * whether the scenario is asymmetric. To explain this
- * compound condition, we need to add that the function
+ * We are now left only with explaining the two sub-conditions in the
+ * additional compound condition that is checked below for deciding
+ * whether the scenario is asymmetric. To explain the first
+ * sub-condition, we need to add that the function
* bfq_asymmetric_scenario checks the weights of only
- * non-weight-raised queues, for efficiency reasons (see
- * comments on bfq_weights_tree_add()). Then the fact that
- * bfqq is weight-raised is checked explicitly here. More
- * precisely, the compound condition below takes into account
- * also the fact that, even if bfqq is being weight-raised,
- * the scenario is still symmetric if all queues with requests
- * waiting for completion happen to be
- * weight-raised. Actually, we should be even more precise
- * here, and differentiate between interactive weight raising
- * and soft real-time weight raising.
+ * non-weight-raised queues, for efficiency reasons (see comments on
+ * bfq_weights_tree_add()). Then the fact that bfqq is weight-raised
+ * is checked explicitly here. More precisely, the compound condition
+ * below takes into account also the fact that, even if bfqq is being
+ * weight-raised, the scenario is still symmetric if all queues with
+ * requests waiting for completion happen to be
+ * weight-raised. Actually, we should be even more precise here, and
+ * differentiate between interactive weight raising and soft real-time
+ * weight raising.
+ *
+ * The second sub-condition checked in the compound condition is
+ * whether there is a fair amount of already in-flight I/O not
+ * belonging to bfqq. If so, I/O dispatching is to be plugged, for the
+ * following reason. The drive may decide to serve in-flight
+ * non-bfqq's I/O requests before bfqq's ones, thereby delaying the
+ * arrival of new I/O requests for bfqq (recall that bfqq is sync). If
+ * I/O-dispatching is not plugged, then, while bfqq remains empty, a
+ * basically uncontrolled amount of I/O from other queues may be
+ * dispatched too, possibly causing the service of bfqq's I/O to be
+ * delayed even longer in the drive. This problem gets more and more
+ * serious as the speed and the queue depth of the drive grow,
+ * because, as these two quantities grow, the probability to find no
+ * queue busy but many requests in flight grows too. By contrast,
+ * plugging I/O dispatching minimizes the delay induced by already
+ * in-flight I/O, and enables bfqq to recover the bandwidth it may
+ * lose because of this delay.
*
* As a side note, it is worth considering that the above
- * device-idling countermeasures may however fail in the
- * following unlucky scenario: if idling is (correctly)
- * disabled in a time period during which all symmetry
- * sub-conditions hold, and hence the device is allowed to
- * enqueue many requests, but at some later point in time some
- * sub-condition stops to hold, then it may become impossible
- * to let requests be served in the desired order until all
- * the requests already queued in the device have been served.
+ * device-idling countermeasures may however fail in the following
+ * unlucky scenario: if I/O-dispatch plugging is (correctly) disabled
+ * in a time period during which all symmetry sub-conditions hold, and
+ * therefore the device is allowed to enqueue many requests, but at
+ * some later point in time some sub-condition stops to hold, then it
+ * may become impossible to make requests be served in the desired
+ * order until all the requests already queued in the device have been
+ * served. The last sub-condition commented above somewhat mitigates
+ * this problem for weight-raised queues.
*/
static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
struct bfq_queue *bfqq)
{
return (bfqq->wr_coeff > 1 &&
- bfqd->wr_busy_queues <
- bfq_tot_busy_queues(bfqd)) ||
+ (bfqd->wr_busy_queues <
+ bfq_tot_busy_queues(bfqd) ||
+ bfqd->rq_in_driver >=
+ bfqq->dispatched + 4)) ||
bfq_asymmetric_scenario(bfqd, bfqq);
}
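
To make the new plugging condition in idling_needed_for_service_guarantees() easier to read outside the diff, here is a minimal user-space sketch of the same predicate. The structures are simplified stand-ins that keep only the fields the check uses, bfq_asymmetric_scenario() is reduced to a precomputed flag, and the threshold of 4 extra in-flight requests is copied verbatim from the hunk above.

#include <stdbool.h>

/* Simplified stand-ins for struct bfq_queue / struct bfq_data (sketch only). */
struct bfqq_sketch {
        unsigned int wr_coeff;     /* > 1 while the queue is weight-raised */
        unsigned int dispatched;   /* this queue's requests currently in the drive */
};

struct bfqd_sketch {
        unsigned int wr_busy_queues;   /* busy weight-raised queues */
        unsigned int tot_busy_queues;  /* all busy queues */
        unsigned int rq_in_driver;     /* all requests currently in the drive */
        bool asymmetric;               /* stand-in for bfq_asymmetric_scenario() */
};

/*
 * Plug I/O dispatching when bfqq is weight-raised and either (a) some busy
 * queue is not weight-raised, or (b) a fair amount of I/O not belonging to
 * bfqq is already in flight; or when the scenario is asymmetric anyway.
 */
static bool idling_needed_sketch(const struct bfqd_sketch *bfqd,
                                 const struct bfqq_sketch *bfqq)
{
        return (bfqq->wr_coeff > 1 &&
                (bfqd->wr_busy_queues < bfqd->tot_busy_queues ||
                 bfqd->rq_in_driver >= bfqq->dispatched + 4)) ||
               bfqd->asymmetric;
}

Condition (b) is the sub-condition this patch adds: with at least 4 more requests in the drive than bfqq itself has dispatched, plugging prevents an uncontrolled amount of other queues' I/O from piling up in front of bfqq's while bfqq is momentarily empty.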
@@ -4745,6 +4773,8 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
*/
void bfq_put_queue(struct bfq_queue *bfqq)
{
+ struct bfq_queue *item;
+ struct hlist_node *n;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
struct bfq_group *bfqg = bfqq_group(bfqq);
#endif
@@ -4789,6 +4819,36 @@ void bfq_put_queue(struct bfq_queue *bfqq)
bfqq->bfqd->burst_size--;
}
+ /*
+ * bfqq does not exist any longer, so it cannot be woken by
+ * any other queue, and cannot wake any other queue. Then bfqq
+ * must be removed from the woken list of its possible waker
+ * queue, and all queues in the woken list of bfqq must stop
+ * having a waker queue. Strictly speaking, these updates
+ * should be performed when bfqq remains with no I/O source
+ * attached to it, which happens before bfqq gets freed. In
+ * particular, this happens when the last process associated
+ * with bfqq exits or gets associated with a different
+ * queue. However, both events lead to bfqq being freed soon,
+ * and dangling references would come out only after bfqq gets
+ * freed. So these updates are done here, as a simple and safe
+ * way to handle all cases.
+ */
+ /* remove bfqq from woken list */
+ if (!hlist_unhashed(&bfqq->woken_list_node))
+ hlist_del_init(&bfqq->woken_list_node);
+
+ /* reset waker for all queues in woken list */
+ hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
+ woken_list_node) {
+ item->waker_bfqq = NULL;
+ bfq_clear_bfqq_has_waker(item);
+ hlist_del_init(&item->woken_list_node);
+ }
+
+ if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq)
+ bfqq->bfqd->last_completed_rq_bfqq = NULL;
+
kmem_cache_free(bfq_pool, bfqq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
bfqg_and_blkg_put(bfqg);
@@ -4816,9 +4876,6 @@ static void bfq_put_cooperator(struct bfq_queue *bfqq)
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
- struct bfq_queue *item;
- struct hlist_node *n;
-
if (bfqq == bfqd->in_service_queue) {
__bfq_bfqq_expire(bfqd, bfqq, BFQQE_BUDGET_TIMEOUT);
bfq_schedule_dispatch(bfqd);
@@ -4828,18 +4885,6 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_put_cooperator(bfqq);
- /* remove bfqq from woken list */
- if (!hlist_unhashed(&bfqq->woken_list_node))
- hlist_del_init(&bfqq->woken_list_node);
-
- /* reset waker for all queues in woken list */
- hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
- woken_list_node) {
- item->waker_bfqq = NULL;
- bfq_clear_bfqq_has_waker(item);
- hlist_del_init(&item->woken_list_node);
- }
-
bfq_put_queue(bfqq); /* release process reference */
}
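
The comment added to bfq_put_queue() describes a two-sided teardown of the waker/woken relationship, which this patch moves there from bfq_exit_bfqq() (the hunk just above removes the old copy). A minimal user-space sketch of the same pattern, using a plain singly linked list instead of the kernel hlist API (all names are illustrative):

#include <stddef.h>

/* Illustrative queue with a waker pointer and a list of queues it wakes. */
struct queue_sketch {
        struct queue_sketch *waker;        /* queue that wakes us, or NULL */
        struct queue_sketch *woken_head;   /* queues we wake */
        struct queue_sketch *woken_next;   /* link in our waker's woken list */
};

/* On destruction, detach q from both sides of the relationship. */
static void teardown_waker_links(struct queue_sketch *q)
{
        /* 1) Remove q from its waker's woken list, if it has a waker. */
        if (q->waker) {
                struct queue_sketch **pp = &q->waker->woken_head;

                while (*pp && *pp != q)
                        pp = &(*pp)->woken_next;
                if (*pp)
                        *pp = q->woken_next;
                q->waker = NULL;
        }

        /* 2) Every queue that q used to wake loses its waker. */
        while (q->woken_head) {
                struct queue_sketch *item = q->woken_head;

                q->woken_head = item->woken_next;
                item->woken_next = NULL;
                item->waker = NULL;
        }
}

Performing the teardown in the final put, rather than on process exit, covers every path on which the queue can be freed, at the cost of clearing the links slightly later than strictly necessary; the comment in the hunk above spells out this trade-off.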
@@ -5417,12 +5462,12 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
spin_lock_irq(&bfqd->lock);
bfqq = bfq_init_rq(rq);
- if (at_head || blk_rq_is_passthrough(rq)) {
+ if (!bfqq || at_head || blk_rq_is_passthrough(rq)) {
if (at_head)
list_add(&rq->queuelist, &bfqd->dispatch);
else
list_add_tail(&rq->queuelist, &bfqd->dispatch);
- } else { /* bfqq is assumed to be non null here */
+ } else {
idle_timer_disabled = __bfq_insert_request(bfqd, rq);
/*
* Update bfqq, because, if a queue merge has occurred
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 24ed26957367..55a7dc227dfb 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -54,7 +54,7 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
-static bool blkcg_debug_stats = false;
+bool blkcg_debug_stats = false;
static struct workqueue_struct *blkcg_punt_bio_wq;
static bool blkcg_policy_enabled(struct request_queue *q,
@@ -944,10 +944,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
dbytes, dios);
}
- if (!blkcg_debug_stats)
- goto next;
-
- if (atomic_read(&blkg->use_delay)) {
+ if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
has_stats = true;
off += scnprintf(buf+off, size-off,
" use_delay=%d delay_nsec=%llu",
@@ -967,7 +964,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
has_stats = true;
off += written;
}
-next:
+
if (has_stats) {
if (off < size - 1) {
off += scnprintf(buf+off, size-off, "\n");
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index d973c38ee4fd..0fff7b56df0e 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -917,6 +917,9 @@ static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
unsigned long long avg_lat;
unsigned long long cur_win;
+ if (!blkcg_debug_stats)
+ return 0;
+
if (iolat->ssd)
return iolatency_ssd_stat(iolat, buf, size);
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index cf22ab00fefb..126021fc3a11 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -61,15 +61,6 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
e->type->ops.completed_request(rq, now);
}
-static inline void blk_mq_sched_started_request(struct request *rq)
-{
- struct request_queue *q = rq->q;
- struct elevator_queue *e = q->elevator;
-
- if (e && e->type->ops.started_request)
- e->type->ops.started_request(rq);
-}
-
static inline void blk_mq_sched_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f567146f9ed7..e0b849bfe74d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -669,8 +669,6 @@ void blk_mq_start_request(struct request *rq)
{
struct request_queue *q = rq->q;
- blk_mq_sched_started_request(rq);
-
trace_block_rq_issue(q, rq);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
@@ -2664,8 +2662,6 @@ void blk_mq_release(struct request_queue *q)
struct blk_mq_hw_ctx *hctx, *next;
int i;
- cancel_delayed_work_sync(&q->requeue_work);
-
queue_for_each_hw_ctx(q, hctx, i)
WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 659ccb8b693f..3954c0dc1443 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -202,6 +202,7 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr,
return -1;
data->got_token = true;
+ smp_wmb();
list_del_init(&curr->entry);
wake_up_process(data->task);
return 1;
@@ -244,7 +245,9 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
return;
prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
+ has_sleeper = !wq_has_single_sleeper(&rqw->wait);
do {
+ /* The memory barrier in set_task_state saves us here. */
if (data.got_token)
break;
if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) {
@@ -255,12 +258,14 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
* which means we now have two. Put our local token
* and wake anyone else potentially waiting for one.
*/
+ smp_rmb();
if (data.got_token)
cleanup_cb(rqw, private_data);
break;
}
io_schedule();
- has_sleeper = false;
+ has_sleeper = true;
+ set_current_state(TASK_UNINTERRUPTIBLE);
} while (1);
finish_wait(&rqw->wait, &data.wq);
}
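
The smp_wmb()/smp_rmb() pair added around data.got_token follows the usual publish/observe pattern: the waker makes its write to got_token visible before anything the waiter may use to conclude it was handed a token, and the waiter orders its read of got_token after that observation. A rough user-space analogue using C11 fences (names and the exact synchronization points are simplified; in the kernel the two barriers sit in rq_qos_wake_function() and rq_qos_wait()):

#include <stdatomic.h>
#include <stdbool.h>

/* Illustrative analogue of the got_token handshake, not the kernel code. */
static atomic_bool got_token;   /* set by the waker for the chosen waiter */
static atomic_bool published;   /* stand-in for whatever the waiter observes next */

static void waker_side(void)
{
        atomic_store_explicit(&got_token, true, memory_order_relaxed);
        atomic_thread_fence(memory_order_release);    /* plays the role of smp_wmb() */
        atomic_store_explicit(&published, true, memory_order_relaxed);
}

static bool waiter_side(void)
{
        if (!atomic_load_explicit(&published, memory_order_relaxed))
                return false;
        atomic_thread_fence(memory_order_acquire);    /* plays the role of smp_rmb() */
        /* If the publication was seen, the token write is guaranteed visible too. */
        return atomic_load_explicit(&got_token, memory_order_relaxed);
}

The remaining changes in the hunk harden the wait loop itself: has_sleeper is recomputed once the task is actually on the wait queue, it is left true after a wakeup that carried no token, and the task state is set back to TASK_UNINTERRUPTIBLE before the next iteration so the loop sleeps correctly on the following pass.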
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 2ae348c101a0..2c1831207a8f 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -752,7 +752,8 @@ void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask)
 * page (which might not be identical to the Linux PAGE_SIZE). Because
* of that they are not limited by our notion of "segment size".
*/
- q->limits.max_segment_size = UINT_MAX;
+ if (mask)
+ q->limits.max_segment_size = UINT_MAX;
}
EXPORT_SYMBOL(blk_queue_virt_boundary);
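
The guard added to blk_queue_virt_boundary() keeps the segment-size relaxation tied to an actual boundary: only a non-zero mask means the device does its own page handling, so a call with mask == 0 must leave max_segment_size untouched. A trimmed-down sketch of the guarded setter (structure and names simplified):

#include <limits.h>

struct limits_sketch {
        unsigned long virt_boundary_mask;
        unsigned int  max_segment_size;
};

/* Relax the segment-size limit only when a real virt boundary is set. */
static void set_virt_boundary_sketch(struct limits_sketch *lim, unsigned long mask)
{
        lim->virt_boundary_mask = mask;
        if (mask)
                lim->max_segment_size = UINT_MAX;
}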
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 977c659dcd18..9bfa3ea4ed63 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -892,6 +892,9 @@ static void __blk_release_queue(struct work_struct *work)
blk_free_queue_stats(q->stats);
+ if (queue_is_mq(q))
+ cancel_delayed_work_sync(&q->requeue_work);
+
blk_exit_queue(q);
blk_queue_free_zone_bitmaps(q);
diff --git a/block/genhd.c b/block/genhd.c
index 97887e59f3b2..54f1f0d381f4 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1969,7 +1969,7 @@ static const struct attribute *disk_events_attrs[] = {
* The default polling interval can be specified by the kernel
* parameter block.events_dfl_poll_msecs which defaults to 0
* (disable). This can also be modified runtime by writing to
- * /sys/module/block/events_dfl_poll_msecs.
+ * /sys/module/block/parameters/events_dfl_poll_msecs.
*/
static int disk_events_set_dfl_poll_msecs(const char *val,
const struct kernel_param *kp)