diff options
Diffstat (limited to 'block/blk-core.c')
| -rw-r--r-- | block/blk-core.c | 265 |
1 files changed, 174 insertions, 91 deletions
diff --git a/block/blk-core.c b/block/blk-core.c index 00c74330fa92..8387fe50ea15 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -49,6 +49,7 @@ #include "blk-pm.h" #include "blk-cgroup.h" #include "blk-throttle.h" +#include "blk-ioprio.h" struct dentry *blk_debugfs_root; @@ -93,20 +94,6 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) } EXPORT_SYMBOL(blk_queue_flag_clear); -/** - * blk_queue_flag_test_and_set - atomically test and set a queue flag - * @flag: flag to be set - * @q: request queue - * - * Returns the previous value of @flag - 0 if the flag was not set and 1 if - * the flag was already set. - */ -bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) -{ - return test_and_set_bit(flag, &q->queue_flags); -} -EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); - #define REQ_OP_NAME(name) [REQ_OP_##name] = #name static const char *const blk_op_name[] = { REQ_OP_NAME(READ), @@ -155,7 +142,7 @@ static const struct { [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" }, [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" }, [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" }, - [BLK_STS_NEXUS] = { -EBADE, "critical nexus" }, + [BLK_STS_RESV_CONFLICT] = { -EBADE, "reservation conflict" }, [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, @@ -170,6 +157,11 @@ static const struct { [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" }, [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" }, + /* Command duration limit device-side timeout */ + [BLK_STS_DURATION_LIMIT] = { -ETIME, "duration limit exceeded" }, + + [BLK_STS_INVAL] = { -EINVAL, "invalid" }, + /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; @@ -205,6 +197,7 @@ const char *blk_status_to_str(blk_status_t status) return "<null>"; return blk_errors[idx].name; } +EXPORT_SYMBOL_GPL(blk_status_to_str); /** * blk_sync_queue - cancel any pending callbacks on a queue @@ -226,7 +219,7 @@ const char *blk_status_to_str(blk_status_t status) */ void blk_sync_queue(struct request_queue *q) { - del_timer_sync(&q->timeout); + timer_delete_sync(&q->timeout); cancel_work_sync(&q->timeout_work); } EXPORT_SYMBOL(blk_sync_queue); @@ -268,6 +261,8 @@ static void blk_free_queue(struct request_queue *q) blk_mq_release(q); ida_free(&blk_queue_ida, q->id); + lockdep_unregister_key(&q->io_lock_cls_key); + lockdep_unregister_key(&q->q_lock_cls_key); call_rcu(&q->rcu_head, blk_free_queue_rcu); } @@ -285,18 +280,20 @@ void blk_put_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_put_queue); -void blk_queue_start_drain(struct request_queue *q) +bool blk_queue_start_drain(struct request_queue *q) { /* * When queue DYING flag is set, we need to block new req * entering queue, so we call blk_freeze_queue_start() to * prevent I/O from crossing blk_queue_enter(). */ - blk_freeze_queue_start(q); + bool freeze = __blk_freeze_queue_start(q, current); if (queue_is_mq(q)) blk_mq_wake_waiters(q); /* Make blk_queue_enter() reexamine the DYING flag. */ wake_up_all(&q->mq_freeze_wq); + + return freeze; } /** @@ -328,6 +325,8 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) return -ENODEV; } + rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_); + rwsem_release(&q->q_lockdep_map, _RET_IP_); return 0; } @@ -359,6 +358,8 @@ int __bio_queue_enter(struct request_queue *q, struct bio *bio) goto dead; } + rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); + rwsem_release(&q->io_lockdep_map, _RET_IP_); return 0; dead: bio_io_error(bio); @@ -380,7 +381,7 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref) static void blk_rq_timed_out_timer(struct timer_list *t) { - struct request_queue *q = from_timer(q, t, timeout); + struct request_queue *q = timer_container_of(q, t, timeout); kblockd_schedule_work(&q->timeout_work); } @@ -389,24 +390,34 @@ static void blk_timeout_work(struct work_struct *work) { } -struct request_queue *blk_alloc_queue(int node_id) +struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) { struct request_queue *q; + int error; q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, node_id); if (!q) - return NULL; + return ERR_PTR(-ENOMEM); q->last_merge = NULL; q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL); - if (q->id < 0) + if (q->id < 0) { + error = q->id; goto fail_q; + } q->stats = blk_alloc_queue_stats(); - if (!q->stats) + if (!q->stats) { + error = -ENOMEM; goto fail_id; + } + + error = blk_set_default_limits(lim); + if (error) + goto fail_stats; + q->limits = *lim; q->node = node_id; @@ -418,23 +429,39 @@ struct request_queue *blk_alloc_queue(int node_id) refcount_set(&q->refs, 1); mutex_init(&q->debugfs_mutex); + mutex_init(&q->elevator_lock); mutex_init(&q->sysfs_lock); - mutex_init(&q->sysfs_dir_lock); + mutex_init(&q->limits_lock); + mutex_init(&q->rq_qos_mutex); spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); mutex_init(&q->mq_freeze_lock); + blkg_init_queue(q); + /* * Init percpu_ref in atomic mode so that it's faster to shutdown. * See blk_register_queue() for details. */ - if (percpu_ref_init(&q->q_usage_counter, + error = percpu_ref_init(&q->q_usage_counter, blk_queue_usage_counter_release, - PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) + PERCPU_REF_INIT_ATOMIC, GFP_KERNEL); + if (error) goto fail_stats; + lockdep_register_key(&q->io_lock_cls_key); + lockdep_register_key(&q->q_lock_cls_key); + lockdep_init_map(&q->io_lockdep_map, "&q->q_usage_counter(io)", + &q->io_lock_cls_key, 0); + lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)", + &q->q_lock_cls_key, 0); + + /* Teach lockdep about lock ordering (reclaim WRT queue freeze lock). */ + fs_reclaim_acquire(GFP_KERNEL); + rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); + rwsem_release(&q->io_lockdep_map, _RET_IP_); + fs_reclaim_release(GFP_KERNEL); - blk_set_default_limits(&q->limits); q->nr_requests = BLKDEV_DEFAULT_RQ; return q; @@ -445,7 +472,7 @@ fail_id: ida_free(&blk_queue_ida, q->id); fail_q: kmem_cache_free(blk_requestq_cachep, q); - return NULL; + return ERR_PTR(error); } /** @@ -477,7 +504,8 @@ __setup("fail_make_request=", setup_fail_make_request); bool should_fail_request(struct block_device *part, unsigned int bytes) { - return part->bd_make_it_fail && should_fail(&fail_make_request, bytes); + return bdev_test_flag(part, BD_MAKE_IT_FAIL) && + should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) @@ -496,13 +524,22 @@ static inline void bio_check_ro(struct bio *bio) if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) return; + + if (bdev_test_flag(bio->bi_bdev, BD_RO_WARNED)) + return; + + bdev_set_flag(bio->bi_bdev, BD_RO_WARNED); + + /* + * Use ioctl to set underlying disk of raid/dm to read-only + * will trigger this. + */ pr_warn("Trying to write to read-only block-device %pg\n", bio->bi_bdev); - /* Older lvm-tools actually trigger this */ } } -static noinline int should_fail_bio(struct bio *bio) +int should_fail_bio(struct bio *bio) { if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size)) return -EIO; @@ -520,9 +557,11 @@ static inline int bio_check_eod(struct bio *bio) sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); unsigned int nr_sectors = bio_sectors(bio); - if (nr_sectors && maxsector && + if (nr_sectors && (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { + if (!maxsector) + return -EIO; pr_info_ratelimited("%s: attempt to access beyond end of device\n" "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n", current->comm, bio->bi_bdev, bio->bi_opf, @@ -564,8 +603,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_NOTSUPP; /* The bio sector must point to the start of a sequential zone */ - if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) || - !bio_zone_is_seq(bio)) + if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector)) return BLK_STS_IOERR; /* @@ -587,17 +625,30 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, static void __submit_bio(struct bio *bio) { + /* If plug is not used, add new plug here to cache nsecs time. */ + struct blk_plug plug; + if (unlikely(!blk_crypto_bio_prep(&bio))) return; - if (!bio->bi_bdev->bd_has_submit_bio) { + blk_start_plug(&plug); + + if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) { blk_mq_submit_bio(bio); } else if (likely(bio_queue_enter(bio) == 0)) { struct gendisk *disk = bio->bi_bdev->bd_disk; - - disk->fops->submit_bio(bio); + + if ((bio->bi_opf & REQ_POLLED) && + !(disk->queue->limits.features & BLK_FEAT_POLL)) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + } else { + disk->fops->submit_bio(bio); + } blk_queue_exit(disk->queue); } + + blk_finish_plug(&plug); } /* @@ -611,13 +662,13 @@ static void __submit_bio(struct bio *bio) * bio_list of new bios to be added. ->submit_bio() may indeed add some more * bios through a recursive call to submit_bio_noacct. If it did, we find a * non-NULL value in bio_list and re-enter the loop from the top. - * - In this case we really did just take the bio of the top of the list (no + * - In this case we really did just take the bio off the top of the list (no * pretending) and so remove it from bio_list, and call into ->submit_bio() * again. * * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio. * bio_list_on_stack[1] contains bios that were submitted before the current - * ->submit_bio, but that haven't been processed yet. + * ->submit_bio(), but that haven't been processed yet. */ static void __submit_bio_noacct(struct bio *bio) { @@ -676,10 +727,9 @@ static void __submit_bio_noacct_mq(struct bio *bio) current->bio_list = NULL; } -void submit_bio_noacct_nocheck(struct bio *bio) +void submit_bio_noacct_nocheck(struct bio *bio, bool split) { blk_cgroup_bio_start(bio); - blkcg_bio_issue_init(bio); if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_queue(bio); @@ -693,15 +743,31 @@ void submit_bio_noacct_nocheck(struct bio *bio) /* * We only want one ->submit_bio to be active at a time, else stack * usage with stacked devices could be a problem. Use current->bio_list - * to collect a list of requests submited by a ->submit_bio method while - * it is active, and then process them after it returned. + * to collect a list of requests submitted by a ->submit_bio method + * while it is active, and then process them after it returned. */ - if (current->bio_list) - bio_list_add(¤t->bio_list[0], bio); - else if (!bio->bi_bdev->bd_has_submit_bio) + if (current->bio_list) { + if (split) + bio_list_add_head(¤t->bio_list[0], bio); + else + bio_list_add(¤t->bio_list[0], bio); + } else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) { __submit_bio_noacct_mq(bio); - else + } else { __submit_bio_noacct(bio); + } +} + +static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, + struct bio *bio) +{ + if (bio->bi_iter.bi_size > queue_atomic_write_unit_max_bytes(q)) + return BLK_STS_INVAL; + + if (bio->bi_iter.bi_size % queue_atomic_write_unit_min_bytes(q)) + return BLK_STS_INVAL; + + return BLK_STS_OK; } /** @@ -718,14 +784,9 @@ void submit_bio_noacct(struct bio *bio) struct block_device *bdev = bio->bi_bdev; struct request_queue *q = bdev_get_queue(bdev); blk_status_t status = BLK_STS_IOERR; - struct blk_plug *plug; might_sleep(); - plug = blk_mq_plug(bio); - if (plug && plug->nowait) - bio->bi_opf |= REQ_NOWAIT; - /* * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue does not support NOWAIT. @@ -739,7 +800,8 @@ void submit_bio_noacct(struct bio *bio) if (!bio_flagged(bio, BIO_REMAPPED)) { if (unlikely(bio_check_eod(bio))) goto end_io; - if (bdev->bd_partno && unlikely(blk_partition_remap(bio))) + if (bdev_is_partition(bdev) && + unlikely(blk_partition_remap(bio))) goto end_io; } @@ -751,7 +813,7 @@ void submit_bio_noacct(struct bio *bio) if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE && bio_op(bio) != REQ_OP_ZONE_APPEND)) goto end_io; - if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { + if (!bdev_write_cache(bdev)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!bio_sectors(bio)) { status = BLK_STS_OK; @@ -760,10 +822,22 @@ void submit_bio_noacct(struct bio *bio) } } - if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - bio_clear_polled(bio); - switch (bio_op(bio)) { + case REQ_OP_READ: + break; + case REQ_OP_WRITE: + if (bio->bi_opf & REQ_ATOMIC) { + status = blk_validate_atomic_write_op_size(q, bio); + if (status != BLK_STS_OK) + goto end_io; + } + break; + case REQ_OP_FLUSH: + /* + * REQ_OP_FLUSH can't be submitted through bios, it is only + * synthetized in struct request by the flush state machine. + */ + goto not_supported; case REQ_OP_DISCARD: if (!bdev_max_discard_sectors(bdev)) goto not_supported; @@ -777,28 +851,32 @@ void submit_bio_noacct(struct bio *bio) if (status != BLK_STS_OK) goto end_io; break; + case REQ_OP_WRITE_ZEROES: + if (!q->limits.max_write_zeroes_sectors) + goto not_supported; + break; case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: - if (!bdev_is_zoned(bio->bi_bdev)) - goto not_supported; - break; case REQ_OP_ZONE_RESET_ALL: - if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q)) - goto not_supported; - break; - case REQ_OP_WRITE_ZEROES: - if (!q->limits.max_write_zeroes_sectors) + if (!bdev_is_zoned(bio->bi_bdev)) goto not_supported; break; + case REQ_OP_DRV_IN: + case REQ_OP_DRV_OUT: + /* + * Driver private operations are only used with passthrough + * requests. + */ + fallthrough; default: - break; + goto not_supported; } if (blk_throtl_bio(bio)) return; - submit_bio_noacct_nocheck(bio); + submit_bio_noacct_nocheck(bio, false); return; not_supported: @@ -809,13 +887,21 @@ end_io: } EXPORT_SYMBOL(submit_bio_noacct); +static void bio_set_ioprio(struct bio *bio) +{ + /* Nobody set ioprio so far? Initialize it based on task's nice value */ + if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE) + bio->bi_ioprio = get_current_ioprio(); + blkcg_set_ioprio(bio); +} + /** * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O * * submit_bio() is used to submit I/O requests to block devices. It is passed a * fully set up &struct bio that describes the I/O that needs to be done. The - * bio will be send to the device described by the bi_bdev field. + * bio will be sent to the device described by the bi_bdev field. * * The success/failure status of the request, along with notification of * completion, is delivered asynchronously through the ->bi_end_io() callback @@ -831,6 +917,7 @@ void submit_bio(struct bio *bio) count_vm_events(PGPGOUT, bio_sectors(bio)); } + bio_set_ioprio(bio); submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); @@ -859,16 +946,9 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) return 0; q = bdev_get_queue(bdev); - if (cookie == BLK_QC_T_NONE || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + if (cookie == BLK_QC_T_NONE) return 0; - /* - * As the requests that require a zone lock are not plugged in the - * first place, directly accessing the plug instead of using - * blk_mq_plug() should not have any consequences during flushing for - * zoned devices. - */ blk_flush_plug(current->plug, false); /* @@ -887,7 +967,8 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) } else { struct gendisk *disk = q->disk; - if (disk && disk->fops->poll_bio) + if ((q->limits.features & BLK_FEAT_POLL) && disk && + disk->fops->poll_bio) ret = disk->fops->poll_bio(bio, iob, flags); } blk_queue_exit(q); @@ -910,7 +991,7 @@ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, * point to a freshly allocated bio at this point. If that happens * we have a few cases to consider: * - * 1) the bio is beeing initialized and bi_bdev is NULL. We can just + * 1) the bio is being initialized and bi_bdev is NULL. We can just * simply nothing in this case * 2) the bio points to a not poll enabled device. bio_poll will catch * this and return 0 @@ -940,11 +1021,12 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end) unsigned long stamp; again: stamp = READ_ONCE(part->bd_stamp); - if (unlikely(time_after(now, stamp))) { - if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now))) - __part_stat_add(part, io_ticks, end ? now - stamp : 1); - } - if (part->bd_partno) { + if (unlikely(time_after(now, stamp)) && + likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) && + (end || bdev_count_inflight(part))) + __part_stat_add(part, io_ticks, now - stamp); + + if (bdev_is_partition(part)) { part = bdev_whole(part); goto again; } @@ -1049,13 +1131,13 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) if (tsk->plug) return; - plug->mq_list = NULL; - plug->cached_rq = NULL; + plug->cur_ktime = 0; + rq_list_init(&plug->mq_list); + rq_list_init(&plug->cached_rqs); plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); plug->rq_count = 0; plug->multiple_queues = false; plug->has_elevator = false; - plug->nowait = false; INIT_LIST_HEAD(&plug->cb_list); /* @@ -1140,16 +1222,18 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) { if (!list_empty(&plug->cb_list)) flush_plug_callbacks(plug, from_schedule); - if (!rq_list_empty(plug->mq_list)) - blk_mq_flush_plug_list(plug, from_schedule); + blk_mq_flush_plug_list(plug, from_schedule); /* * Unconditionally flush out cached requests, even if the unplug * event came from schedule. Since we know hold references to the * queue for cached requests, we don't want a blocked task holding * up a queue freeze/quiesce event. */ - if (unlikely(!rq_list_empty(plug->cached_rq))) + if (unlikely(!rq_list_empty(&plug->cached_rqs))) blk_mq_free_plug_rqs(plug); + + plug->cur_ktime = 0; + current->flags &= ~PF_BLOCK_TS; } /** @@ -1197,8 +1281,7 @@ int __init blk_dev_init(void) if (!kblockd_workqueue) panic("Failed to create kblockd\n"); - blk_requestq_cachep = kmem_cache_create("request_queue", - sizeof(struct request_queue), 0, SLAB_PANIC, NULL); + blk_requestq_cachep = KMEM_CACHE(request_queue, SLAB_PANIC); blk_debugfs_root = debugfs_create_dir("block", NULL); |
