summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r--block/bdev.c12
-rw-r--r--block/blk-cgroup.c9
-rw-r--r--block/blk-core.c9
-rw-r--r--block/blk-flush.c12
-rw-r--r--block/blk-ia-ranges.c4
-rw-r--r--block/blk-mq-sched.c4
-rw-r--r--block/blk-mq.c69
-rw-r--r--block/blk-mq.h30
-rw-r--r--block/blk-sysfs.c10
-rw-r--r--block/blk-zoned.c15
-rw-r--r--block/blk.h2
-rw-r--r--block/elevator.c10
-rw-r--r--block/genhd.c8
-rw-r--r--block/ioctl.c24
-rw-r--r--block/ioprio.c9
15 files changed, 142 insertions, 85 deletions
diff --git a/block/bdev.c b/block/bdev.c
index b4dab2fb6a74..b1d087e5e205 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -753,8 +753,7 @@ struct block_device *blkdev_get_no_open(dev_t dev)
if (!bdev)
return NULL;
- if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) ||
- !try_module_get(bdev->bd_disk->fops->owner)) {
+ if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN)) {
put_device(&bdev->bd_device);
return NULL;
}
@@ -764,7 +763,6 @@ struct block_device *blkdev_get_no_open(dev_t dev)
void blkdev_put_no_open(struct block_device *bdev)
{
- module_put(bdev->bd_disk->fops->owner);
put_device(&bdev->bd_device);
}
@@ -820,12 +818,14 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
ret = -ENXIO;
if (!disk_live(disk))
goto abort_claiming;
+ if (!try_module_get(disk->fops->owner))
+ goto abort_claiming;
if (bdev_is_partition(bdev))
ret = blkdev_get_part(bdev, mode);
else
ret = blkdev_get_whole(bdev, mode);
if (ret)
- goto abort_claiming;
+ goto put_module;
if (mode & FMODE_EXCL) {
bd_finish_claiming(bdev, holder);
@@ -847,7 +847,8 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
if (unblock_events)
disk_unblock_events(disk);
return bdev;
-
+put_module:
+ module_put(disk->fops->owner);
abort_claiming:
if (mode & FMODE_EXCL)
bd_abort_claiming(bdev, holder);
@@ -956,6 +957,7 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
blkdev_put_whole(bdev, mode);
mutex_unlock(&disk->open_mutex);
+ module_put(disk->fops->owner);
blkdev_put_no_open(bdev);
}
EXPORT_SYMBOL(blkdev_put);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 88b1fce90520..663aabfeba18 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -640,7 +640,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
*/
ret = blk_queue_enter(q, 0);
if (ret)
- return ret;
+ goto fail;
rcu_read_lock();
spin_lock_irq(&q->queue_lock);
@@ -676,13 +676,13 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
- goto fail;
+ goto fail_exit_queue;
}
if (radix_tree_preload(GFP_KERNEL)) {
blkg_free(new_blkg);
ret = -ENOMEM;
- goto fail;
+ goto fail_exit_queue;
}
rcu_read_lock();
@@ -722,9 +722,10 @@ fail_preloaded:
fail_unlock:
spin_unlock_irq(&q->queue_lock);
rcu_read_unlock();
+fail_exit_queue:
+ blk_queue_exit(q);
fail:
blkdev_put_no_open(bdev);
- blk_queue_exit(q);
/*
* If queue was bypassing, we should retry. Do so after a
* short msleep(). It isn't strictly necessary but queue
diff --git a/block/blk-core.c b/block/blk-core.c
index b043de2baaac..1378d084c770 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -363,8 +363,10 @@ void blk_cleanup_queue(struct request_queue *q)
blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
blk_sync_queue(q);
- if (queue_is_mq(q))
+ if (queue_is_mq(q)) {
+ blk_mq_cancel_work_sync(q);
blk_mq_exit_queue(q);
+ }
/*
* In theory, request pool of sched_tags belongs to request queue.
@@ -809,10 +811,8 @@ noinline_for_stack bool submit_bio_checks(struct bio *bio)
if (unlikely(!current->io_context))
create_task_io_context(current, GFP_ATOMIC, q->node);
- if (blk_throtl_bio(bio)) {
- blkcg_bio_issue_init(bio);
+ if (blk_throtl_bio(bio))
return false;
- }
blk_cgroup_bio_start(bio);
blkcg_bio_issue_init(bio);
@@ -1017,6 +1017,7 @@ EXPORT_SYMBOL(submit_bio);
/**
* bio_poll - poll for BIO completions
* @bio: bio to poll for
+ * @iob: batches of IO
* @flags: BLK_POLL_* flags that control the behavior
*
* Poll for completions on queue associated with the bio. Returns number of
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 8e364bda5166..1fce6d16e6d3 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -379,7 +379,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
* @rq is being submitted. Analyze what needs to be done and put it on the
* right queue.
*/
-bool blk_insert_flush(struct request *rq)
+void blk_insert_flush(struct request *rq)
{
struct request_queue *q = rq->q;
unsigned long fflags = q->queue_flags; /* may change, cache */
@@ -409,7 +409,7 @@ bool blk_insert_flush(struct request *rq)
*/
if (!policy) {
blk_mq_end_request(rq, 0);
- return true;
+ return;
}
BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */
@@ -420,8 +420,10 @@ bool blk_insert_flush(struct request *rq)
* for normal execution.
*/
if ((policy & REQ_FSEQ_DATA) &&
- !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH)))
- return false;
+ !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
+ blk_mq_request_bypass_insert(rq, false, true);
+ return;
+ }
/*
* @rq should go through flush machinery. Mark it part of flush
@@ -437,8 +439,6 @@ bool blk_insert_flush(struct request *rq)
spin_lock_irq(&fq->mq_flush_lock);
blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
spin_unlock_irq(&fq->mq_flush_lock);
-
- return true;
}
/**
diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c
index c246c425d0d7..b925f3db3ab7 100644
--- a/block/blk-ia-ranges.c
+++ b/block/blk-ia-ranges.c
@@ -104,8 +104,8 @@ static struct kobj_type blk_ia_ranges_ktype = {
};
/**
- * disk_register_ia_ranges - register with sysfs a set of independent
- * access ranges
+ * disk_register_independent_access_ranges - register with sysfs a set of
+ * independent access ranges
* @disk: Target disk
* @new_iars: New set of independent access ranges
*
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 4be652fa38e7..ba21449439cc 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -370,9 +370,6 @@ bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
bool ret = false;
enum hctx_type type;
- if (bio_queue_enter(bio))
- return false;
-
if (e && e->type->ops.bio_merge) {
ret = e->type->ops.bio_merge(q, bio, nr_segs);
goto out_put;
@@ -397,7 +394,6 @@ bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
spin_unlock(&ctx->lock);
out_put:
- blk_queue_exit(q);
return ret;
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 629cf421417f..8874a63ae952 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -860,13 +860,14 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
if (iob->need_ts)
__blk_mq_end_request_acct(rq, now);
+ rq_qos_done(rq->q, rq);
+
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
if (!refcount_dec_and_test(&rq->ref))
continue;
blk_crypto_free_request(rq);
blk_pm_mark_last_busy(rq);
- rq_qos_done(rq->q, rq);
if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) {
if (cur_hctx)
@@ -2495,8 +2496,9 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
return BLK_MAX_REQUEST_COUNT;
}
-static bool blk_attempt_bio_merge(struct request_queue *q, struct bio *bio,
- unsigned int nr_segs, bool *same_queue_rq)
+static bool blk_mq_attempt_bio_merge(struct request_queue *q,
+ struct bio *bio, unsigned int nr_segs,
+ bool *same_queue_rq)
{
if (!blk_queue_nomerges(q) && bio_mergeable(bio)) {
if (blk_attempt_plug_merge(q, bio, nr_segs, same_queue_rq))
@@ -2520,12 +2522,8 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
};
struct request *rq;
- if (unlikely(bio_queue_enter(bio)))
+ if (blk_mq_attempt_bio_merge(q, bio, nsegs, same_queue_rq))
return NULL;
- if (unlikely(!submit_bio_checks(bio)))
- goto put_exit;
- if (blk_attempt_bio_merge(q, bio, nsegs, same_queue_rq))
- goto put_exit;
rq_qos_throttle(q, bio);
@@ -2542,26 +2540,42 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
rq_qos_cleanup(q, bio);
if (bio->bi_opf & REQ_NOWAIT)
bio_wouldblock_error(bio);
-put_exit:
- blk_queue_exit(q);
+
return NULL;
}
+static inline bool blk_mq_can_use_cached_rq(struct request *rq, struct bio *bio)
+{
+ if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type)
+ return false;
+
+ if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf))
+ return false;
+
+ return true;
+}
+
static inline struct request *blk_mq_get_request(struct request_queue *q,
struct blk_plug *plug,
struct bio *bio,
unsigned int nsegs,
bool *same_queue_rq)
{
- if (plug) {
- struct request *rq;
+ struct request *rq;
+ bool checked = false;
+ if (plug) {
rq = rq_list_peek(&plug->cached_rq);
if (rq && rq->q == q) {
if (unlikely(!submit_bio_checks(bio)))
return NULL;
- if (blk_attempt_bio_merge(q, bio, nsegs, same_queue_rq))
+ if (blk_mq_attempt_bio_merge(q, bio, nsegs,
+ same_queue_rq))
return NULL;
+ checked = true;
+ if (!blk_mq_can_use_cached_rq(rq, bio))
+ goto fallback;
+ rq->cmd_flags = bio->bi_opf;
plug->cached_rq = rq_list_next(rq);
INIT_LIST_HEAD(&rq->queuelist);
rq_qos_throttle(q, bio);
@@ -2569,7 +2583,17 @@ static inline struct request *blk_mq_get_request(struct request_queue *q,
}
}
- return blk_mq_get_new_requests(q, plug, bio, nsegs, same_queue_rq);
+fallback:
+ if (unlikely(bio_queue_enter(bio)))
+ return NULL;
+ if (unlikely(!checked && !submit_bio_checks(bio)))
+ goto out_put;
+ rq = blk_mq_get_new_requests(q, plug, bio, nsegs, same_queue_rq);
+ if (rq)
+ return rq;
+out_put:
+ blk_queue_exit(q);
+ return NULL;
}
/**
@@ -2624,8 +2648,10 @@ void blk_mq_submit_bio(struct bio *bio)
return;
}
- if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
+ if (op_is_flush(bio->bi_opf)) {
+ blk_insert_flush(rq);
return;
+ }
if (plug && (q->nr_hw_queues == 1 ||
blk_mq_is_shared_tags(rq->mq_hctx->flags) ||
@@ -4394,6 +4420,19 @@ unsigned int blk_mq_rq_cpu(struct request *rq)
}
EXPORT_SYMBOL(blk_mq_rq_cpu);
+void blk_mq_cancel_work_sync(struct request_queue *q)
+{
+ if (queue_is_mq(q)) {
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+ cancel_delayed_work_sync(&q->requeue_work);
+
+ queue_for_each_hw_ctx(q, hctx, i)
+ cancel_delayed_work_sync(&hctx->run_work);
+ }
+}
+
static int __init blk_mq_init(void)
{
int i;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index cb0b5482ca5e..afcf9931a489 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -89,15 +89,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]];
}
-/*
- * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
- * @q: request queue
- * @flags: request command flags
- * @ctx: software queue cpu ctx
- */
-static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
- unsigned int flags,
- struct blk_mq_ctx *ctx)
+static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags)
{
enum hctx_type type = HCTX_TYPE_DEFAULT;
@@ -108,8 +100,20 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
type = HCTX_TYPE_POLL;
else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
type = HCTX_TYPE_READ;
-
- return ctx->hctxs[type];
+ return type;
+}
+
+/*
+ * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
+ * @q: request queue
+ * @flags: request command flags
+ * @ctx: software queue cpu ctx
+ */
+static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
+ unsigned int flags,
+ struct blk_mq_ctx *ctx)
+{
+ return ctx->hctxs[blk_mq_get_hctx_type(flags)];
}
/*
@@ -124,6 +128,8 @@ extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
void blk_mq_free_plug_rqs(struct blk_plug *plug);
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
+void blk_mq_cancel_work_sync(struct request_queue *q);
+
void blk_mq_release(struct request_queue *q);
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
@@ -149,7 +155,7 @@ struct blk_mq_alloc_data {
blk_mq_req_flags_t flags;
unsigned int shallow_depth;
unsigned int cmd_flags;
- unsigned int rq_flags;
+ req_flags_t rq_flags;
/* allocate multiple requests/tags in one go */
unsigned int nr_tags;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index cef1f713370b..cd75b0f73dc6 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -791,16 +791,6 @@ static void blk_release_queue(struct kobject *kobj)
blk_free_queue_stats(q->stats);
- if (queue_is_mq(q)) {
- struct blk_mq_hw_ctx *hctx;
- int i;
-
- cancel_delayed_work_sync(&q->requeue_work);
-
- queue_for_each_hw_ctx(q, hctx, i)
- cancel_delayed_work_sync(&hctx->run_work);
- }
-
blk_exit_queue(q);
blk_queue_free_zone_bitmaps(q);
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 1d0c76c18fc5..774ecc598bee 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -429,9 +429,10 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
op = REQ_OP_ZONE_RESET;
/* Invalidate the page cache, including dirty pages. */
+ filemap_invalidate_lock(bdev->bd_inode->i_mapping);
ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
if (ret)
- return ret;
+ goto fail;
break;
case BLKOPENZONE:
op = REQ_OP_ZONE_OPEN;
@@ -449,15 +450,9 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors,
GFP_KERNEL);
- /*
- * Invalidate the page cache again for zone reset: writes can only be
- * direct for zoned devices so concurrent writes would not add any page
- * to the page cache after/during reset. The page cache may be filled
- * again due to concurrent reads though and dropping the pages for
- * these is fine.
- */
- if (!ret && cmd == BLKRESETZONE)
- ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
+fail:
+ if (cmd == BLKRESETZONE)
+ filemap_invalidate_unlock(bdev->bd_inode->i_mapping);
return ret;
}
diff --git a/block/blk.h b/block/blk.h
index b4fed2033e48..ccde6e6f1736 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -271,7 +271,7 @@ void __blk_account_io_done(struct request *req, u64 now);
*/
#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED)
-bool blk_insert_flush(struct request *rq);
+void blk_insert_flush(struct request *rq);
int elevator_switch_mq(struct request_queue *q,
struct elevator_type *new_e);
diff --git a/block/elevator.c b/block/elevator.c
index 1f39f6e8ebb9..19a78d5516ba 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -694,12 +694,18 @@ void elevator_init_mq(struct request_queue *q)
if (!e)
return;
+ /*
+ * We are called before adding disk, when there isn't any FS I/O,
+ * so freezing queue plus canceling dispatch work is enough to
+ * drain any dispatch activities originated from passthrough
+ * requests, then no need to quiesce queue which may add long boot
+ * latency, especially when lots of disks are involved.
+ */
blk_mq_freeze_queue(q);
- blk_mq_quiesce_queue(q);
+ blk_mq_cancel_work_sync(q);
err = blk_mq_init_sched(q, e);
- blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
if (err) {
diff --git a/block/genhd.c b/block/genhd.c
index ca2fbab1d425..30362aeacac4 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -394,8 +394,8 @@ static void disk_scan_partitions(struct gendisk *disk)
* This function registers the partitioning information in @disk
* with the kernel.
*/
-int device_add_disk(struct device *parent, struct gendisk *disk,
- const struct attribute_group **groups)
+int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
+ const struct attribute_group **groups)
{
struct device *ddev = disk_to_dev(disk);
@@ -544,7 +544,7 @@ out_disk_release_events:
out_free_ext_minor:
if (disk->major == BLOCK_EXT_MAJOR)
blk_free_ext_minor(disk->first_minor);
- return WARN_ON_ONCE(ret); /* keep until all callers handle errors */
+ return ret;
}
EXPORT_SYMBOL(device_add_disk);
@@ -1111,6 +1111,8 @@ static void disk_release(struct device *dev)
might_sleep();
WARN_ON_ONCE(disk_live(disk));
+ blk_mq_cancel_work_sync(disk->queue);
+
disk_release_events(disk);
kfree(disk->random);
xa_destroy(&disk->part_tbl);
diff --git a/block/ioctl.c b/block/ioctl.c
index d6af0ac97e57..0a1d10ac2e1a 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -113,6 +113,7 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
uint64_t range[2];
uint64_t start, len;
struct request_queue *q = bdev_get_queue(bdev);
+ struct inode *inode = bdev->bd_inode;
int err;
if (!(mode & FMODE_WRITE))
@@ -135,12 +136,17 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
if (start + len > bdev_nr_bytes(bdev))
return -EINVAL;
+ filemap_invalidate_lock(inode->i_mapping);
err = truncate_bdev_range(bdev, mode, start, start + len - 1);
if (err)
- return err;
+ goto fail;
- return blkdev_issue_discard(bdev, start >> 9, len >> 9,
- GFP_KERNEL, flags);
+ err = blkdev_issue_discard(bdev, start >> 9, len >> 9,
+ GFP_KERNEL, flags);
+
+fail:
+ filemap_invalidate_unlock(inode->i_mapping);
+ return err;
}
static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
@@ -148,6 +154,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
{
uint64_t range[2];
uint64_t start, end, len;
+ struct inode *inode = bdev->bd_inode;
int err;
if (!(mode & FMODE_WRITE))
@@ -170,12 +177,17 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
return -EINVAL;
/* Invalidate the page cache, including dirty pages */
+ filemap_invalidate_lock(inode->i_mapping);
err = truncate_bdev_range(bdev, mode, start, end);
if (err)
- return err;
+ goto fail;
+
+ err = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
+ BLKDEV_ZERO_NOUNMAP);
- return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
- BLKDEV_ZERO_NOUNMAP);
+fail:
+ filemap_invalidate_unlock(inode->i_mapping);
+ return err;
}
static int put_ushort(unsigned short __user *argp, unsigned short val)
diff --git a/block/ioprio.c b/block/ioprio.c
index 0e4ff245f2bf..313c14a70bbd 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -69,7 +69,14 @@ int ioprio_check_cap(int ioprio)
switch (class) {
case IOPRIO_CLASS_RT:
- if (!capable(CAP_SYS_NICE) && !capable(CAP_SYS_ADMIN))
+ /*
+ * Originally this only checked for CAP_SYS_ADMIN,
+ * which was implicitly allowed for pid 0 by security
+ * modules such as SELinux. Make sure we check
+ * CAP_SYS_ADMIN first to avoid a denial/avc for
+ * possibly missing CAP_SYS_NICE permission.
+ */
+ if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
return -EPERM;
fallthrough;
/* rt has prio field too */