From 86ff7c2a80cd357f6156a53b354f6a0b357dc0c9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 30 Jan 2018 22:04:57 -0500 Subject: blk-mq: introduce BLK_STS_DEV_RESOURCE This status is returned from the driver to the block layer if a device-related resource is unavailable, but the driver can guarantee that IO dispatch will be triggered again in the future once the resource becomes available. Convert some drivers to return BLK_STS_DEV_RESOURCE. Also, if the driver returns BLK_STS_RESOURCE and SCHED_RESTART is set, rerun the queue after a delay (BLK_MQ_RESOURCE_DELAY) to avoid IO stalls. BLK_MQ_RESOURCE_DELAY is 3 ms because both scsi-mq and nvme-fc already use that magic value. If a driver can make sure there is in-flight IO, it is safe to return BLK_STS_DEV_RESOURCE because: 1) if all in-flight IOs complete before SCHED_RESTART is examined in blk_mq_dispatch_rq_list(), SCHED_RESTART must have been cleared, so the queue is run immediately in this case by blk_mq_dispatch_rq_list(); 2) if there is any in-flight IO after/when SCHED_RESTART is examined in blk_mq_dispatch_rq_list(): - if SCHED_RESTART isn't set, the queue is run immediately, as handled in 1); - otherwise, this request will be dispatched after some in-flight IO completes, via blk_mq_sched_restart(); 3) if SCHED_RESTART is set concurrently in another context because of BLK_STS_RESOURCE, blk_mq_delay_run_hw_queue() covers the above two cases and makes sure an IO hang is avoided. One invariant is that the queue will be rerun if SCHED_RESTART is set. Suggested-by: Jens Axboe Tested-by: Laurence Oberman Signed-off-by: Ming Lei Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + block/blk-mq.c | 20 ++++++++++++++++---- drivers/block/null_blk.c | 2 +- drivers/block/virtio_blk.c | 2 +- drivers/block/xen-blkfront.c | 2 +- drivers/md/dm-rq.c | 5 ++--- drivers/nvme/host/fc.c | 12 ++---------- drivers/scsi/scsi_lib.c | 6 +++--- include/linux/blk_types.h | 18 ++++++++++++++++++ 9 files changed, 45 insertions(+), 23 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index a2005a485335..134fd34b681f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -145,6 +145,7 @@ static const struct { [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, + [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" }, [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, /* device mapper special case, should not leak out: */ diff --git a/block/blk-mq.c b/block/blk-mq.c index 01f271d40825..df93102e2149 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1162,6 +1162,8 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, return true; } +#define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ + bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, bool got_budget) { @@ -1169,6 +1171,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, struct request *rq, *nxt; bool no_tag = false; int errors, queued; + blk_status_t ret = BLK_STS_OK; if (list_empty(list)) return false; @@ -1181,7 +1184,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, errors = queued = 0; do { struct blk_mq_queue_data bd; - blk_status_t ret; rq = list_first_entry(list, struct request, queuelist); if (!blk_mq_get_driver_tag(rq, &hctx, false)) { @@ -1226,7 +1228,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, } ret = q->mq_ops->queue_rq(hctx, &bd); - if (ret == BLK_STS_RESOURCE) { + if (ret == BLK_STS_RESOURCE 
|| ret == BLK_STS_DEV_RESOURCE) { /* * If an I/O scheduler has been configured and we got a * driver tag for the next request already, free it @@ -1257,6 +1259,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * that is where we will continue on next queue run. */ if (!list_empty(list)) { + bool needs_restart; + spin_lock(&hctx->lock); list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); @@ -1280,10 +1284,17 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * - Some but not all block drivers stop a queue before * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. + * + * If driver returns BLK_STS_RESOURCE and SCHED_RESTART + * bit is set, run queue after a delay to avoid IO stalls + * that could otherwise occur if the queue is idle. */ - if (!blk_mq_sched_needs_restart(hctx) || + needs_restart = blk_mq_sched_needs_restart(hctx); + if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); + else if (needs_restart && (ret == BLK_STS_RESOURCE)) + blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); } return (queued + errors) != 0; @@ -1764,6 +1775,7 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, *cookie = new_cookie; break; case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: __blk_mq_requeue_request(rq); break; default: @@ -1826,7 +1838,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, hctx_lock(hctx, &srcu_idx); ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false); - if (ret == BLK_STS_RESOURCE) + if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) blk_mq_sched_insert_request(rq, false, true, false); else if (ret != BLK_STS_OK) blk_mq_end_request(rq, ret); diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 6655893a3a7a..287a09611c0f 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -1230,7 +1230,7 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd) return BLK_STS_OK; } else /* requeue request */ - return BLK_STS_RESOURCE; + return BLK_STS_DEV_RESOURCE; } } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 68846897d213..79908e6ddbf2 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -276,7 +276,7 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, /* Out of mem doesn't actually happen, since we fall back * to direct descriptors */ if (err == -ENOMEM || err == -ENOSPC) - return BLK_STS_RESOURCE; + return BLK_STS_DEV_RESOURCE; return BLK_STS_IOERR; } diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 891265acb10e..e126e4cac2ca 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -911,7 +911,7 @@ out_err: out_busy: blk_mq_stop_hw_queue(hctx); spin_unlock_irqrestore(&rinfo->ring_lock, flags); - return BLK_STS_RESOURCE; + return BLK_STS_DEV_RESOURCE; } static void blkif_complete_rq(struct request *rq) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index b7d175e94a02..348a0cb6963a 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -404,7 +404,7 @@ static blk_status_t dm_dispatch_clone_request(struct request *clone, struct requ clone->start_time = jiffies; r = blk_insert_cloned_request(clone->q, clone); - if (r != BLK_STS_OK && r != BLK_STS_RESOURCE) + if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != BLK_STS_DEV_RESOURCE) /* must complete clone in terms of original request */ 
dm_complete_request(rq, r); return r; @@ -496,7 +496,7 @@ check_again: trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), blk_rq_pos(rq)); ret = dm_dispatch_clone_request(clone, rq); - if (ret == BLK_STS_RESOURCE) { + if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { blk_rq_unprep_clone(clone); tio->ti->type->release_clone_rq(clone); tio->clone = NULL; @@ -769,7 +769,6 @@ static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, /* Undo dm_start_request() before requeuing */ rq_end_stats(md, rq); rq_completed(md, rq_data_dir(rq), false); - blk_mq_delay_run_hw_queue(hctx, 100/*ms*/); return BLK_STS_RESOURCE; } diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 99bf51c7e513..b856d7c919d2 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -35,8 +35,6 @@ enum nvme_fc_queue_flags { NVME_FC_Q_LIVE, }; -#define NVMEFC_QUEUE_DELAY 3 /* ms units */ - #define NVME_FC_DEFAULT_DEV_LOSS_TMO 60 /* seconds */ struct nvme_fc_queue { @@ -2231,7 +2229,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, * the target device is present */ if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) - goto busy; + return BLK_STS_RESOURCE; if (!nvme_fc_ctrl_get(ctrl)) return BLK_STS_IOERR; @@ -2311,16 +2309,10 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, ret != -EBUSY) return BLK_STS_IOERR; - goto busy; + return BLK_STS_RESOURCE; } return BLK_STS_OK; - -busy: - if (!(op->flags & FCOP_FLAGS_AEN) && queue->hctx) - blk_mq_delay_run_hw_queue(queue->hctx, NVMEFC_QUEUE_DELAY); - - return BLK_STS_RESOURCE; } static inline blk_status_t nvme_fc_is_ready(struct nvme_fc_queue *queue, diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index d9ca1dfab154..55be2550c555 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2030,9 +2030,9 @@ out_put_budget: case BLK_STS_OK: break; case BLK_STS_RESOURCE: - if (atomic_read(&sdev->device_busy) == 0 && - !scsi_device_blocked(sdev)) - blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY); + if (atomic_read(&sdev->device_busy) || + scsi_device_blocked(sdev)) + ret = BLK_STS_DEV_RESOURCE; break; default: /* diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c5d3db0d83f8..bf18b95ed92d 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -39,6 +39,24 @@ typedef u8 __bitwise blk_status_t; #define BLK_STS_AGAIN ((__force blk_status_t)12) +/* + * BLK_STS_DEV_RESOURCE is returned from the driver to the block layer if + * device related resources are unavailable, but the driver can guarantee + * that the queue will be rerun in the future once resources become + * available again. This is typically the case for device specific + * resources that are consumed for IO. If the driver fails allocating these + * resources, we know that inflight (or pending) IO will free these + * resources upon completion. + * + * This is different from BLK_STS_RESOURCE in that it explicitly references + * a device specific resource. For resources of wider scope, allocation + * failure can happen without having pending IO. This means that we can't + * rely on request completions freeing these resources, as IO may not be in + * flight. Examples of that are kernel memory allocations, DMA mappings, or + * any other system wide resources. 
+ */ +#define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13) + /** * blk_path_error - returns true if error may be path related * @error: status the request was completed with -- cgit From 445251d0f4d329aa061f323546cd6388a3bb7ab5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 1 Feb 2018 14:01:02 -0700 Subject: blk-mq: fix discard merge with scheduler attached I ran into an issue on my laptop that triggered a bug on the discard path: WARNING: CPU: 2 PID: 207 at drivers/nvme/host/core.c:527 nvme_setup_cmd+0x3d3/0x430 Modules linked in: rfcomm fuse ctr ccm bnep arc4 binfmt_misc snd_hda_codec_hdmi nls_iso8859_1 nls_cp437 vfat snd_hda_codec_conexant fat snd_hda_codec_generic iwlmvm snd_hda_intel snd_hda_codec snd_hwdep mac80211 snd_hda_core snd_pcm snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq x86_pkg_temp_thermal intel_powerclamp kvm_intel uvcvideo iwlwifi btusb snd_seq_device videobuf2_vmalloc btintel videobuf2_memops kvm snd_timer videobuf2_v4l2 bluetooth irqbypass videobuf2_core aesni_intel aes_x86_64 crypto_simd cryptd snd glue_helper videodev cfg80211 ecdh_generic soundcore hid_generic usbhid hid i915 psmouse e1000e ptp pps_core xhci_pci xhci_hcd intel_gtt CPU: 2 PID: 207 Comm: jbd2/nvme0n1p7- Tainted: G U 4.15.0+ #176 Hardware name: LENOVO 20FBCTO1WW/20FBCTO1WW, BIOS N1FET59W (1.33 ) 12/19/2017 RIP: 0010:nvme_setup_cmd+0x3d3/0x430 RSP: 0018:ffff880423e9f838 EFLAGS: 00010217 RAX: 0000000000000000 RBX: ffff880423e9f8c8 RCX: 0000000000010000 RDX: ffff88022b200010 RSI: 0000000000000002 RDI: 00000000327f0000 RBP: ffff880421251400 R08: ffff88022b200000 R09: 0000000000000009 R10: 0000000000000000 R11: 0000000000000000 R12: 000000000000ffff R13: ffff88042341e280 R14: 000000000000ffff R15: ffff880421251440 FS: 0000000000000000(0000) GS:ffff880441500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055b684795030 CR3: 0000000002e09006 CR4: 00000000001606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: nvme_queue_rq+0x40/0xa00 ? __sbitmap_queue_get+0x24/0x90 ? blk_mq_get_tag+0xa3/0x250 ? wait_woken+0x80/0x80 ? blk_mq_get_driver_tag+0x97/0xf0 blk_mq_dispatch_rq_list+0x7b/0x4a0 ? deadline_remove_request+0x49/0xb0 blk_mq_do_dispatch_sched+0x4f/0xc0 blk_mq_sched_dispatch_requests+0x106/0x170 __blk_mq_run_hw_queue+0x53/0xa0 __blk_mq_delay_run_hw_queue+0x83/0xa0 blk_mq_run_hw_queue+0x6c/0xd0 blk_mq_sched_insert_request+0x96/0x140 __blk_mq_try_issue_directly+0x3d/0x190 blk_mq_try_issue_directly+0x30/0x70 blk_mq_make_request+0x1a4/0x6a0 generic_make_request+0xfd/0x2f0 ? submit_bio+0x5c/0x110 submit_bio+0x5c/0x110 ? __blkdev_issue_discard+0x152/0x200 submit_bio_wait+0x43/0x60 ext4_process_freed_data+0x1cd/0x440 ? account_page_dirtied+0xe2/0x1a0 ext4_journal_commit_callback+0x4a/0xc0 jbd2_journal_commit_transaction+0x17e2/0x19e0 ? kjournald2+0xb0/0x250 kjournald2+0xb0/0x250 ? wait_woken+0x80/0x80 ? commit_timeout+0x10/0x10 kthread+0x111/0x130 ? kthread_create_worker_on_cpu+0x50/0x50 ? do_group_exit+0x3a/0xa0 ret_from_fork+0x1f/0x30 Code: 73 89 c1 83 ce 10 c1 e1 10 09 ca 83 f8 04 0f 87 0f ff ff ff 8b 4d 20 48 8b 7d 00 c1 e9 09 48 01 8c c7 00 08 00 00 e9 f8 fe ff ff <0f> ff 4c 89 c7 41 bc 0a 00 00 00 e8 0d 78 d6 ff e9 a1 fc ff ff ---[ end trace 50d361cc444506c8 ]--- print_req_error: I/O error, dev nvme0n1, sector 847167488 Decoding the assembly, the request claims to have 0xffff segments, while nvme counts two. 
This turns out to be because we don't check for a data carrying request on the mq scheduler path, and since blk_phys_contig_segment() returns true for a non-data request, we decrement the initial segment count of 0 and end up with 0xffff in the unsigned short. There are a few issues here: 1) We should initialize the segment count for a discard to 1. 2) The discard merging is currently using the data limits for segments and sectors. Fix this up by having attempt_merge() correctly identify the request, and by initializing the segment count correctly for discards. This can only be triggered with mq-deadline on discard capable devices right now, which isn't a common configuration. Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ block/blk-merge.c | 29 ++++++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 134fd34b681f..d0d104268f1a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3283,6 +3283,8 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, { if (bio_has_data(bio)) rq->nr_phys_segments = bio_phys_segments(q, bio); + else if (bio_op(bio) == REQ_OP_DISCARD) + rq->nr_phys_segments = 1; rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; diff --git a/block/blk-merge.c b/block/blk-merge.c index 8452fc7164cc..782940c65d8a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -550,6 +550,24 @@ static bool req_no_special_merge(struct request *req) return !q->mq_ops && req->special; } +static bool req_attempt_discard_merge(struct request_queue *q, struct request *req, + struct request *next) +{ + unsigned short segments = blk_rq_nr_discard_segments(req); + + if (segments >= queue_max_discard_segments(q)) + goto no_merge; + if (blk_rq_sectors(req) + bio_sectors(next->bio) > + blk_rq_get_max_sectors(req, blk_rq_pos(req))) + goto no_merge; + + req->nr_phys_segments = segments + blk_rq_nr_discard_segments(next); + return true; +no_merge: + req_set_nomerge(q, req); + return false; +} + static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { @@ -683,9 +701,13 @@ static struct request *attempt_merge(struct request_queue *q, * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn * will have updated segment counts, update sector - * counts here. + * counts here. Handle DISCARDs separately, as they + * have separate settings. 
*/ - if (!ll_merge_requests_fn(q, req, next)) + if (req_op(req) == REQ_OP_DISCARD) { + if (!req_attempt_discard_merge(q, req, next)) + return NULL; + } else if (!ll_merge_requests_fn(q, req, next)) return NULL; /* @@ -715,7 +737,8 @@ static struct request *attempt_merge(struct request_queue *q, req->__data_len += blk_rq_bytes(next); - elv_merge_requests(q, req, next); + if (req_op(req) != REQ_OP_DISCARD) + elv_merge_requests(q, req, next); /* * 'next' is going away, so update stats accordingly -- cgit From bea99a500773fdfdb16b7dbfbaa00af7a6f0dc3b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 1 Feb 2018 14:41:15 -0700 Subject: blk-mq-sched: Enable merging discard bio into request Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 55c0a745b427..25c14c58385c 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -259,6 +259,8 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, if (!*merged_request) elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE); return true; + case ELEVATOR_DISCARD_MERGE: + return bio_attempt_discard_merge(q, rq, bio); default: return false; } -- cgit
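The 0xffff segment count in the oops above is worth spelling out. The arithmetic can be reproduced in a standalone user-space program; this is an illustrative sketch only, with names mirroring the nr_phys_segments field of struct request, not actual kernel code:

	#include <stdio.h>

	int main(void)
	{
		/* A discard request never had its segment count initialized. */
		unsigned short nr_phys_segments = 0;

		/* ll_merge_requests_fn() sums both requests' counts ... */
		int total_phys_segments = nr_phys_segments + 0;

		/*
		 * ... and blk_phys_contig_segment(), which returns true for
		 * a non-data request, lets it drop one segment for the
		 * merged boundary.
		 */
		total_phys_segments--;

		/*
		 * The store back into the unsigned short request field
		 * truncates -1 to 0xffff -- the count the oops reports.
		 */
		nr_phys_segments = total_phys_segments;
		printf("0x%x\n", nr_phys_segments);	/* prints 0xffff */
		return 0;
	}

Compiled and run, this prints 0xffff, the value nvme_setup_cmd() trips over, which is why the fix both seeds discards with a segment count of 1 in blk_rq_bio_prep() and routes their merges around ll_merge_requests_fn().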
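For driver writers, the convention introduced by the BLK_STS_DEV_RESOURCE commit at the top of this series reduces to one question in ->queue_rq(): is the missing resource guaranteed to be freed by a completion on this device? A minimal sketch under that assumption follows; it is not a buildable driver, and struct my_dev plus the my_*() helpers are hypothetical placeholders, with only the choice of return values following the convention above:

	#include <linux/blk-mq.h>

	/* Hypothetical per-device state -- not from any real driver. */
	struct my_dev {
		atomic_t free_slots;	/* hardware queue slots */
	};

	static bool my_claim_hw_slot(struct my_dev *dev);
	static void my_release_hw_slot(struct my_dev *dev);
	static int my_map_dma(struct my_dev *dev, struct request *rq);
	static void my_submit(struct my_dev *dev, struct request *rq);

	static blk_status_t my_queue_rq(struct blk_mq_hw_ctx *hctx,
					const struct blk_mq_queue_data *bd)
	{
		struct my_dev *dev = hctx->queue->queuedata;
		struct request *rq = bd->rq;

		/*
		 * Device-owned resource: if no slot is free, IO must be in
		 * flight on this device, and its completion reruns the queue
		 * via SCHED_RESTART, so BLK_STS_DEV_RESOURCE is safe and
		 * avoids a pointless delayed rerun.
		 */
		if (!my_claim_hw_slot(dev))
			return BLK_STS_DEV_RESOURCE;

		blk_mq_start_request(rq);

		/*
		 * System-wide resource (e.g. memory for a DMA mapping): this
		 * can fail with nothing in flight, so no completion is
		 * guaranteed to rerun the queue. Return BLK_STS_RESOURCE and
		 * let blk_mq_dispatch_rq_list() fall back to the
		 * BLK_MQ_RESOURCE_DELAY (3 ms) rerun when SCHED_RESTART is
		 * set.
		 */
		if (my_map_dma(dev, rq) < 0) {
			my_release_hw_slot(dev);
			return BLK_STS_RESOURCE;
		}

		my_submit(dev, rq);
		return BLK_STS_OK;
	}

Returning BLK_STS_DEV_RESOURCE in the second case instead would risk a stall: with no IO in flight, no completion ever reruns the queue, which is exactly the hang the 3 ms fallback exists to prevent.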