diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/bdev.c | 2 | ||||
-rw-r--r-- | block/bfq-cgroup.c | 6 | ||||
-rw-r--r-- | block/bfq-iosched.c | 16 | ||||
-rw-r--r-- | block/blk-cgroup.c | 5 | ||||
-rw-r--r-- | block/blk-core.c | 148 | ||||
-rw-r--r-- | block/blk-mq-debugfs.c | 1 | ||||
-rw-r--r-- | block/blk-mq.c | 22 | ||||
-rw-r--r-- | block/blk-settings.c | 20 | ||||
-rw-r--r-- | block/blk.h | 2 | ||||
-rw-r--r-- | block/genhd.c | 26 | ||||
-rw-r--r-- | block/kyber-iosched.c | 10 | ||||
-rw-r--r-- | block/partitions/core.c | 1 |
12 files changed, 163 insertions, 96 deletions
diff --git a/block/bdev.c b/block/bdev.c index cf2780cb44a7..485a258b0ab3 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -490,7 +490,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) bdev = I_BDEV(inode); mutex_init(&bdev->bd_fsfreeze_mutex); spin_lock_init(&bdev->bd_size_lock); - bdev->bd_disk = disk; bdev->bd_partno = partno; bdev->bd_inode = inode; bdev->bd_stats = alloc_percpu(struct disk_stats); @@ -498,6 +497,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) iput(inode); return NULL; } + bdev->bd_disk = disk; return bdev; } diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index e2f14508f2d6..85b8e1c3a762 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -666,6 +666,12 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); bfqg_and_blkg_put(bfqq_group(bfqq)); + if (entity->parent && + entity->parent->last_bfqq_created == bfqq) + entity->parent->last_bfqq_created = NULL; + else if (bfqd->last_bfqq_created == bfqq) + bfqd->last_bfqq_created = NULL; + entity->parent = bfqg->my_entity; entity->sched_data = &bfqg->sched_data; /* pin down bfqg and its associated blkg */ diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index dd13c2bbc29c..480e1a134859 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2662,15 +2662,6 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) * are likely to increase the throughput. */ bfqq->new_bfqq = new_bfqq; - /* - * The above assignment schedules the following redirections: - * each time some I/O for bfqq arrives, the process that - * generated that I/O is disassociated from bfqq and - * associated with new_bfqq. Here we increases new_bfqq->ref - * in advance, adding the number of processes that are - * expected to be associated with new_bfqq as they happen to - * issue I/O. - */ new_bfqq->ref += process_refs; return new_bfqq; } @@ -2733,10 +2724,6 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_queue *in_service_bfqq, *new_bfqq; - /* if a merge has already been setup, then proceed with that first */ - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - /* * Check delayed stable merge for rotational or non-queueing * devs. For this branch to be executed, bfqq must not be @@ -2838,6 +2825,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfq_too_late_for_merging(bfqq)) return NULL; + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 38b9f7684952..9a1c5839dd46 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1897,10 +1897,11 @@ void blk_cgroup_bio_start(struct bio *bio) { int rwd = blk_cgroup_io_type(bio), cpu; struct blkg_iostat_set *bis; + unsigned long flags; cpu = get_cpu(); bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); - u64_stats_update_begin(&bis->sync); + flags = u64_stats_update_begin_irqsave(&bis->sync); /* * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split @@ -1912,7 +1913,7 @@ void blk_cgroup_bio_start(struct bio *bio) } bis->cur.ios[rwd]++; - u64_stats_update_end(&bis->sync); + u64_stats_update_end_irqrestore(&bis->sync, flags); if (cgroup_subsys_on_dfl(io_cgrp_subsys)) cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu); put_cpu(); diff --git a/block/blk-core.c b/block/blk-core.c index 5454db2fa263..4d8f5fe91588 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -49,7 +49,6 @@ #include "blk-mq.h" #include "blk-mq-sched.h" #include "blk-pm.h" -#include "blk-rq-qos.h" struct dentry *blk_debugfs_root; @@ -337,23 +336,25 @@ void blk_put_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_put_queue); -void blk_set_queue_dying(struct request_queue *q) +void blk_queue_start_drain(struct request_queue *q) { - blk_queue_flag_set(QUEUE_FLAG_DYING, q); - /* * When queue DYING flag is set, we need to block new req * entering queue, so we call blk_freeze_queue_start() to * prevent I/O from crossing blk_queue_enter(). */ blk_freeze_queue_start(q); - if (queue_is_mq(q)) blk_mq_wake_waiters(q); - /* Make blk_queue_enter() reexamine the DYING flag. */ wake_up_all(&q->mq_freeze_wq); } + +void blk_set_queue_dying(struct request_queue *q) +{ + blk_queue_flag_set(QUEUE_FLAG_DYING, q); + blk_queue_start_drain(q); +} EXPORT_SYMBOL_GPL(blk_set_queue_dying); /** @@ -385,13 +386,8 @@ void blk_cleanup_queue(struct request_queue *q) */ blk_freeze_queue(q); - rq_qos_exit(q); - blk_queue_flag_set(QUEUE_FLAG_DEAD, q); - /* for synchronous bio-based driver finish in-flight integrity i/o */ - blk_flush_integrity(); - blk_sync_queue(q); if (queue_is_mq(q)) blk_mq_exit_queue(q); @@ -416,6 +412,30 @@ void blk_cleanup_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_cleanup_queue); +static bool blk_try_enter_queue(struct request_queue *q, bool pm) +{ + rcu_read_lock(); + if (!percpu_ref_tryget_live(&q->q_usage_counter)) + goto fail; + + /* + * The code that increments the pm_only counter must ensure that the + * counter is globally visible before the queue is unfrozen. + */ + if (blk_queue_pm_only(q) && + (!pm || queue_rpm_status(q) == RPM_SUSPENDED)) + goto fail_put; + + rcu_read_unlock(); + return true; + +fail_put: + percpu_ref_put(&q->q_usage_counter); +fail: + rcu_read_unlock(); + return false; +} + /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer @@ -425,40 +445,18 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { const bool pm = flags & BLK_MQ_REQ_PM; - while (true) { - bool success = false; - - rcu_read_lock(); - if (percpu_ref_tryget_live(&q->q_usage_counter)) { - /* - * The code that increments the pm_only counter is - * responsible for ensuring that that counter is - * globally visible before the queue is unfrozen. - */ - if ((pm && queue_rpm_status(q) != RPM_SUSPENDED) || - !blk_queue_pm_only(q)) { - success = true; - } else { - percpu_ref_put(&q->q_usage_counter); - } - } - rcu_read_unlock(); - - if (success) - return 0; - + while (!blk_try_enter_queue(q, pm)) { if (flags & BLK_MQ_REQ_NOWAIT) return -EBUSY; /* - * read pair of barrier in blk_freeze_queue_start(), - * we need to order reading __PERCPU_REF_DEAD flag of - * .q_usage_counter and reading .mq_freeze_depth or - * queue dying flag, otherwise the following wait may - * never return if the two reads are reordered. + * read pair of barrier in blk_freeze_queue_start(), we need to + * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and + * reading .mq_freeze_depth or queue dying flag, otherwise the + * following wait may never return if the two reads are + * reordered. */ smp_rmb(); - wait_event(q->mq_freeze_wq, (!q->mq_freeze_depth && blk_pm_resume_queue(pm, q)) || @@ -466,23 +464,43 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) if (blk_queue_dying(q)) return -ENODEV; } + + return 0; } static inline int bio_queue_enter(struct bio *bio) { - struct request_queue *q = bio->bi_bdev->bd_disk->queue; - bool nowait = bio->bi_opf & REQ_NOWAIT; - int ret; + struct gendisk *disk = bio->bi_bdev->bd_disk; + struct request_queue *q = disk->queue; - ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0); - if (unlikely(ret)) { - if (nowait && !blk_queue_dying(q)) + while (!blk_try_enter_queue(q, false)) { + if (bio->bi_opf & REQ_NOWAIT) { + if (test_bit(GD_DEAD, &disk->state)) + goto dead; bio_wouldblock_error(bio); - else - bio_io_error(bio); + return -EBUSY; + } + + /* + * read pair of barrier in blk_freeze_queue_start(), we need to + * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and + * reading .mq_freeze_depth or queue dying flag, otherwise the + * following wait may never return if the two reads are + * reordered. + */ + smp_rmb(); + wait_event(q->mq_freeze_wq, + (!q->mq_freeze_depth && + blk_pm_resume_queue(false, q)) || + test_bit(GD_DEAD, &disk->state)); + if (test_bit(GD_DEAD, &disk->state)) + goto dead; } - return ret; + return 0; +dead: + bio_io_error(bio); + return -ENODEV; } void blk_queue_exit(struct request_queue *q) @@ -899,11 +917,18 @@ static blk_qc_t __submit_bio(struct bio *bio) struct gendisk *disk = bio->bi_bdev->bd_disk; blk_qc_t ret = BLK_QC_T_NONE; - if (blk_crypto_bio_prep(&bio)) { - if (!disk->fops->submit_bio) - return blk_mq_submit_bio(bio); + if (unlikely(bio_queue_enter(bio) != 0)) + return BLK_QC_T_NONE; + + if (!submit_bio_checks(bio) || !blk_crypto_bio_prep(&bio)) + goto queue_exit; + if (disk->fops->submit_bio) { ret = disk->fops->submit_bio(bio); + goto queue_exit; } + return blk_mq_submit_bio(bio); + +queue_exit: blk_queue_exit(disk->queue); return ret; } @@ -941,9 +966,6 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct bio_list lower, same; - if (unlikely(bio_queue_enter(bio) != 0)) - continue; - /* * Create a fresh bio_list for all subordinate requests. */ @@ -979,23 +1001,12 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) { struct bio_list bio_list[2] = { }; - blk_qc_t ret = BLK_QC_T_NONE; + blk_qc_t ret; current->bio_list = bio_list; do { - struct gendisk *disk = bio->bi_bdev->bd_disk; - - if (unlikely(bio_queue_enter(bio) != 0)) - continue; - - if (!blk_crypto_bio_prep(&bio)) { - blk_queue_exit(disk->queue); - ret = BLK_QC_T_NONE; - continue; - } - - ret = blk_mq_submit_bio(bio); + ret = __submit_bio(bio); } while ((bio = bio_list_pop(&bio_list[0]))); current->bio_list = NULL; @@ -1013,9 +1024,6 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) */ blk_qc_t submit_bio_noacct(struct bio *bio) { - if (!submit_bio_checks(bio)) - return BLK_QC_T_NONE; - /* * We only want one ->submit_bio to be active at a time, else stack * usage with stacked devices could be a problem. Use current->bio_list diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 4b66d2776eda..3b38d15723de 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -129,6 +129,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(PCI_P2PDMA), QUEUE_FLAG_NAME(ZONE_RESETALL), QUEUE_FLAG_NAME(RQ_ALLOC_TIME), + QUEUE_FLAG_NAME(HCTX_ACTIVE), QUEUE_FLAG_NAME(NOWAIT), }; #undef QUEUE_FLAG_NAME diff --git a/block/blk-mq.c b/block/blk-mq.c index 108a352051be..652a31fc3bb3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -188,9 +188,11 @@ void blk_mq_freeze_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); -void blk_mq_unfreeze_queue(struct request_queue *q) +void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) { mutex_lock(&q->mq_freeze_lock); + if (force_atomic) + q->q_usage_counter.data->force_atomic = true; q->mq_freeze_depth--; WARN_ON_ONCE(q->mq_freeze_depth < 0); if (!q->mq_freeze_depth) { @@ -199,6 +201,11 @@ void blk_mq_unfreeze_queue(struct request_queue *q) } mutex_unlock(&q->mq_freeze_lock); } + +void blk_mq_unfreeze_queue(struct request_queue *q) +{ + __blk_mq_unfreeze_queue(q, false); +} EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); /* @@ -1318,6 +1325,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, int errors, queued; blk_status_t ret = BLK_STS_OK; LIST_HEAD(zone_list); + bool needs_resource = false; if (list_empty(list)) return false; @@ -1363,6 +1371,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, queued++; break; case BLK_STS_RESOURCE: + needs_resource = true; + fallthrough; case BLK_STS_DEV_RESOURCE: blk_mq_handle_dev_resource(rq, list); goto out; @@ -1373,6 +1383,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, * accept. */ blk_mq_handle_zone_resource(rq, &zone_list); + needs_resource = true; break; default: errors++; @@ -1399,7 +1410,6 @@ out: /* For non-shared tags, the RESTART check will suffice */ bool no_tag = prep == PREP_DISPATCH_NO_TAG && (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED); - bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET; if (nr_budgets) blk_mq_release_budgets(q, list); @@ -1440,14 +1450,16 @@ out: * If driver returns BLK_STS_RESOURCE and SCHED_RESTART * bit is set, run queue after a delay to avoid IO stalls * that could otherwise occur if the queue is idle. We'll do - * similar if we couldn't get budget and SCHED_RESTART is set. + * similar if we couldn't get budget or couldn't lock a zone + * and SCHED_RESTART is set. */ needs_restart = blk_mq_sched_needs_restart(hctx); + if (prep == PREP_DISPATCH_NO_BUDGET) + needs_resource = true; if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); - else if (needs_restart && (ret == BLK_STS_RESOURCE || - no_budget_avail)) + else if (needs_restart && needs_resource) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); blk_mq_update_dispatch_busy(hctx, true); diff --git a/block/blk-settings.c b/block/blk-settings.c index a7c857ad7d10..b880c70e22e4 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -842,6 +842,24 @@ bool blk_queue_can_use_dma_map_merging(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging); +static bool disk_has_partitions(struct gendisk *disk) +{ + unsigned long idx; + struct block_device *part; + bool ret = false; + + rcu_read_lock(); + xa_for_each(&disk->part_tbl, idx, part) { + if (bdev_is_partition(part)) { + ret = true; + break; + } + } + rcu_read_unlock(); + + return ret; +} + /** * blk_queue_set_zoned - configure a disk queue zoned model. * @disk: the gendisk of the queue to configure @@ -876,7 +894,7 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) * we do nothing special as far as the block layer is concerned. */ if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || - !xa_empty(&disk->part_tbl)) + disk_has_partitions(disk)) model = BLK_ZONED_NONE; break; case BLK_ZONED_NONE: diff --git a/block/blk.h b/block/blk.h index 7d2a0ba7ed21..6c3c00a8fe19 100644 --- a/block/blk.h +++ b/block/blk.h @@ -51,6 +51,8 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, void blk_free_flush_queue(struct blk_flush_queue *q); void blk_freeze_queue(struct request_queue *q); +void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); +void blk_queue_start_drain(struct request_queue *q); #define BIO_INLINE_VECS 4 struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, diff --git a/block/genhd.c b/block/genhd.c index 7b6e5e1cf956..ab12ae6e636e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -26,6 +26,7 @@ #include <linux/badblocks.h> #include "blk.h" +#include "blk-rq-qos.h" static struct kobject *block_depr; @@ -559,6 +560,8 @@ EXPORT_SYMBOL(device_add_disk); */ void del_gendisk(struct gendisk *disk) { + struct request_queue *q = disk->queue; + might_sleep(); if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN))) @@ -575,8 +578,17 @@ void del_gendisk(struct gendisk *disk) fsync_bdev(disk->part0); __invalidate_device(disk->part0, true); + /* + * Fail any new I/O. + */ + set_bit(GD_DEAD, &disk->state); set_capacity(disk, 0); + /* + * Prevent new I/O from crossing bio_queue_enter(). + */ + blk_queue_start_drain(q); + if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); @@ -598,6 +610,18 @@ void del_gendisk(struct gendisk *disk) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); + + blk_mq_freeze_queue_wait(q); + + rq_qos_exit(q); + blk_sync_queue(q); + blk_flush_integrity(); + /* + * Allow using passthrough request again after the queue is torn down. + */ + blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q); + __blk_mq_unfreeze_queue(q, true); + } EXPORT_SYMBOL(del_gendisk); @@ -1056,6 +1080,7 @@ static void disk_release(struct device *dev) struct gendisk *disk = dev_to_disk(dev); might_sleep(); + WARN_ON_ONCE(disk_live(disk)); disk_release_events(disk); kfree(disk->random); @@ -1268,6 +1293,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, out_destroy_part_tbl: xa_destroy(&disk->part_tbl); + disk->part0->bd_disk = NULL; iput(disk->part0->bd_inode); out_free_bdi: bdi_put(disk->bdi); diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 15a8be57203d..a0ffbabfac2c 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -151,6 +151,7 @@ struct kyber_ctx_queue { struct kyber_queue_data { struct request_queue *q; + dev_t dev; /* * Each scheduling domain has a limited number of in-flight requests @@ -257,7 +258,7 @@ static int calculate_percentile(struct kyber_queue_data *kqd, } memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type])); - trace_kyber_latency(kqd->q, kyber_domain_names[sched_domain], + trace_kyber_latency(kqd->dev, kyber_domain_names[sched_domain], kyber_latency_type_names[type], percentile, bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples); @@ -270,7 +271,7 @@ static void kyber_resize_domain(struct kyber_queue_data *kqd, depth = clamp(depth, 1U, kyber_depth[sched_domain]); if (depth != kqd->domain_tokens[sched_domain].sb.depth) { sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth); - trace_kyber_adjust(kqd->q, kyber_domain_names[sched_domain], + trace_kyber_adjust(kqd->dev, kyber_domain_names[sched_domain], depth); } } @@ -366,6 +367,7 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) goto err; kqd->q = q; + kqd->dev = disk_devt(q->disk); kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency, GFP_KERNEL | __GFP_ZERO); @@ -774,7 +776,7 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd, list_del_init(&rq->queuelist); return rq; } else { - trace_kyber_throttled(kqd->q, + trace_kyber_throttled(kqd->dev, kyber_domain_names[khd->cur_domain]); } } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) { @@ -787,7 +789,7 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd, list_del_init(&rq->queuelist); return rq; } else { - trace_kyber_throttled(kqd->q, + trace_kyber_throttled(kqd->dev, kyber_domain_names[khd->cur_domain]); } } diff --git a/block/partitions/core.c b/block/partitions/core.c index 58c4c362c94f..7bea19dd9458 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -423,6 +423,7 @@ out_del: device_del(pdev); out_put: put_device(pdev); + return ERR_PTR(err); out_put_disk: put_disk(disk); return ERR_PTR(err); |