Diffstat (limited to 'block')
 block/bfq-iosched.c        |  66
 block/bfq-iosched.h        |  13
 block/bio-integrity-auto.c |   4
 block/bio-integrity.c      |   3
 block/bio.c                |  24
 block/blk-integrity.c      |  70
 block/blk-ioc.c            |  16
 block/blk-mq-cpumap.c      |  46
 block/blk-mq-debugfs.c     |  12
 block/blk-mq-dma.c         | 161
 block/blk-mq-sched.c       | 223
 block/blk-mq-sched.h       |  12
 block/blk-mq.c             | 102
 block/blk-settings.c       | 159
 block/blk-sysfs.c          |  27
 block/blk-zoned.c          |  43
 block/blk.h                |  44
 block/elevator.c           |  65
 block/elevator.h           |  16
 block/fops.c               | 108
 block/ioctl.c              |   3
 block/kyber-iosched.c      |  20
 block/mq-deadline.c        |  30
 block/t10-pi.c             |  16
 24 files changed, 927 insertions(+), 356 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 0cb1e9873aab..3bf76902f07f 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -454,17 +454,10 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
*/
static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q)
{
- struct bfq_io_cq *icq;
- unsigned long flags;
-
if (!current->io_context)
return NULL;
- spin_lock_irqsave(&q->queue_lock, flags);
- icq = icq_to_bic(ioc_lookup_icq(q));
- spin_unlock_irqrestore(&q->queue_lock, flags);
-
- return icq;
+ return icq_to_bic(ioc_lookup_icq(q));
}
/*
@@ -701,17 +694,13 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
struct bfq_data *bfqd = data->q->elevator->elevator_data;
struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
- int depth;
- unsigned limit = data->q->nr_requests;
- unsigned int act_idx;
+ unsigned int limit, act_idx;
/* Sync reads have full depth available */
- if (op_is_sync(opf) && !op_is_write(opf)) {
- depth = 0;
- } else {
- depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];
- limit = (limit * depth) >> bfqd->full_depth_shift;
- }
+ if (op_is_sync(opf) && !op_is_write(opf))
+ limit = data->q->nr_requests;
+ else
+ limit = bfqd->async_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];
for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) {
/* Fast path to check if bfqq is already allocated. */
@@ -725,14 +714,16 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
* available requests and thus starve other entities.
*/
if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) {
- depth = 1;
+ limit = 1;
break;
}
}
+
bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
- __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth);
- if (depth)
- data->shallow_depth = depth;
+ __func__, bfqd->wr_busy_queues, op_is_sync(opf), limit);
+
+ if (limit < data->q->nr_requests)
+ data->shallow_depth = limit;
}
static struct bfq_queue *
@@ -2457,15 +2448,8 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
struct bfq_data *bfqd = q->elevator->elevator_data;
- struct request *free = NULL;
- /*
- * bfq_bic_lookup grabs the queue_lock: invoke it now and
- * store its return value for later use, to avoid nesting
- * queue_lock inside the bfqd->lock. We assume that the bic
- * returned by bfq_bic_lookup does not go away before
- * bfqd->lock is taken.
- */
struct bfq_io_cq *bic = bfq_bic_lookup(q);
+ struct request *free = NULL;
bool ret;
spin_lock_irq(&bfqd->lock);
@@ -7128,9 +7112,8 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
*/
static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
{
- unsigned int depth = 1U << bt->sb.shift;
+ unsigned int nr_requests = bfqd->queue->nr_requests;
- bfqd->full_depth_shift = bt->sb.shift;
/*
* In-word depths if no bfq_queue is being weight-raised:
* leaving 25% of tags only for sync reads.
@@ -7142,13 +7125,13 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
* limit 'something'.
*/
/* no more than 50% of tags for async I/O */
- bfqd->word_depths[0][0] = max(depth >> 1, 1U);
+ bfqd->async_depths[0][0] = max(nr_requests >> 1, 1U);
/*
* no more than 75% of tags for sync writes (25% extra tags
* w.r.t. async I/O, to prevent async I/O from starving sync
* writes)
*/
- bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U);
+ bfqd->async_depths[0][1] = max((nr_requests * 3) >> 2, 1U);
/*
* In-word depths in case some bfq_queue is being weight-
@@ -7158,9 +7141,9 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
* shortage.
*/
/* no more than ~18% of tags for async I/O */
- bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U);
+ bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U);
/* no more than ~37% of tags for sync writes (~20% extra tags) */
- bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U);
+ bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U);
}
static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
@@ -7232,22 +7215,16 @@ static void bfq_init_root_group(struct bfq_group *root_group,
root_group->sched_data.bfq_class_idle_last_service = jiffies;
}
-static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
+static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq)
{
struct bfq_data *bfqd;
- struct elevator_queue *eq;
unsigned int i;
struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges;
- eq = elevator_alloc(q, e);
- if (!eq)
- return -ENOMEM;
-
bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
- if (!bfqd) {
- kobject_put(&eq->kobj);
+ if (!bfqd)
return -ENOMEM;
- }
+
eq->elevator_data = bfqd;
spin_lock_irq(&q->queue_lock);
@@ -7405,7 +7382,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
out_free:
kfree(bfqd);
- kobject_put(&eq->kobj);
return -ENOMEM;
}
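
The table computed by bfq_update_depths() is now expressed directly in units of
q->nr_requests rather than sbitmap word depth. A minimal, userspace-only sketch
of the same arithmetic follows; nr_requests = 256 is an assumed example value,
not anything set by this patch:

/*
 * Illustration only: recompute the async_depths table from
 * bfq_update_depths() for an assumed nr_requests of 256.
 */
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
	unsigned int nr_requests = 256;		/* assumed scheduler depth */
	unsigned int async_depths[2][2];

	/* no bfq_queue is weight-raised */
	async_depths[0][0] = MAX(nr_requests >> 1, 1U);		/* 50% for async */
	async_depths[0][1] = MAX((nr_requests * 3) >> 2, 1U);	/* 75% for sync writes */
	/* some bfq_queue is weight-raised */
	async_depths[1][0] = MAX((nr_requests * 3) >> 4, 1U);	/* ~18% for async */
	async_depths[1][1] = MAX((nr_requests * 6) >> 4, 1U);	/* ~37% for sync writes */

	printf("%u %u %u %u\n", async_depths[0][0], async_depths[0][1],
	       async_depths[1][0], async_depths[1][1]);
	return 0;	/* prints: 128 192 48 96 */
}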
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 687a3a7ba784..34a498e6b2a5 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -427,9 +427,6 @@ struct bfq_iocq_bfqq_data {
*/
bool saved_IO_bound;
- u64 saved_io_start_time;
- u64 saved_tot_idle_time;
-
/*
* Same purpose as the previous fields for the values of the
* field keeping the queue's belonging to a large burst
@@ -450,6 +447,9 @@ struct bfq_iocq_bfqq_data {
*/
unsigned int saved_weight;
+ u64 saved_io_start_time;
+ u64 saved_tot_idle_time;
+
/*
* Similar to previous fields: save wr information.
*/
@@ -457,13 +457,13 @@ struct bfq_iocq_bfqq_data {
unsigned long saved_last_wr_start_finish;
unsigned long saved_service_from_wr;
unsigned long saved_wr_start_at_switch_to_srt;
- unsigned int saved_wr_cur_max_time;
struct bfq_ttime saved_ttime;
+ unsigned int saved_wr_cur_max_time;
/* Save also injection state */
- u64 saved_last_serv_time_ns;
unsigned int saved_inject_limit;
unsigned long saved_decrease_time_jif;
+ u64 saved_last_serv_time_ns;
/* candidate queue for a stable merge (due to close creation time) */
struct bfq_queue *stable_merge_bfqq;
@@ -813,8 +813,7 @@ struct bfq_data {
* Depth limits used in bfq_limit_depth (see comments on the
* function)
*/
- unsigned int word_depths[2][2];
- unsigned int full_depth_shift;
+ unsigned int async_depths[2][2];
/*
* Number of independent actuators. This is equal to 1 in
diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c
index 9c6657664792..687952f63bbb 100644
--- a/block/bio-integrity-auto.c
+++ b/block/bio-integrity-auto.c
@@ -54,10 +54,10 @@ static bool bi_offload_capable(struct blk_integrity *bi)
{
switch (bi->csum_type) {
case BLK_INTEGRITY_CSUM_CRC64:
- return bi->tuple_size == sizeof(struct crc64_pi_tuple);
+ return bi->metadata_size == sizeof(struct crc64_pi_tuple);
case BLK_INTEGRITY_CSUM_CRC:
case BLK_INTEGRITY_CSUM_IP:
- return bi->tuple_size == sizeof(struct t10_pi_tuple);
+ return bi->metadata_size == sizeof(struct t10_pi_tuple);
default:
pr_warn_once("%s: unknown integrity checksum type:%d\n",
__func__, bi->csum_type);
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 10912988c8f5..6b077ca937f6 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -128,6 +128,9 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
if (bip->bip_vcnt > 0) {
struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1];
+ if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
+ return 0;
+
if (bvec_try_merge_hw_page(q, bv, page, len, offset)) {
bip->bip_iter.bi_size += len;
return len;
diff --git a/block/bio.c b/block/bio.c
index 3c0a558c90f5..3b371a5da159 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -653,13 +653,13 @@ static void bio_truncate(struct bio *bio, unsigned new_size)
bio_for_each_segment(bv, bio, iter) {
if (done + bv.bv_len > new_size) {
- unsigned offset;
+ size_t offset;
if (!truncated)
offset = new_size - done;
else
offset = 0;
- zero_user(bv.bv_page, bv.bv_offset + offset,
+ memzero_page(bv.bv_page, bv.bv_offset + offset,
bv.bv_len - offset);
truncated = true;
}
@@ -930,8 +930,6 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
return false;
if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
return false;
- if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
- return false;
if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) {
if (IS_ENABLED(CONFIG_KMSAN))
@@ -982,6 +980,9 @@ void __bio_add_page(struct bio *bio, struct page *page,
WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
WARN_ON_ONCE(bio_full(bio, len));
+ if (is_pci_p2pdma_page(page))
+ bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE;
+
bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off);
bio->bi_iter.bi_size += len;
bio->bi_vcnt++;
@@ -1022,11 +1023,16 @@ int bio_add_page(struct bio *bio, struct page *page,
if (bio->bi_iter.bi_size > UINT_MAX - len)
return 0;
- if (bio->bi_vcnt > 0 &&
- bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
- page, len, offset)) {
- bio->bi_iter.bi_size += len;
- return len;
+ if (bio->bi_vcnt > 0) {
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+ if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
+ return 0;
+
+ if (bvec_try_merge_page(bv, page, len, offset)) {
+ bio->bi_iter.bi_size += len;
+ return len;
+ }
}
if (bio->bi_vcnt >= bio->bi_max_vecs)
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index e4e2567061f9..056b8948369d 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -13,6 +13,7 @@
#include <linux/scatterlist.h>
#include <linux/export.h>
#include <linux/slab.h>
+#include <linux/t10-pi.h>
#include "blk.h"
@@ -54,6 +55,73 @@ new_segment:
return segments;
}
+int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd,
+ struct logical_block_metadata_cap __user *argp)
+{
+ struct blk_integrity *bi = blk_get_integrity(bdev->bd_disk);
+ struct logical_block_metadata_cap meta_cap = {};
+ size_t usize = _IOC_SIZE(cmd);
+
+ if (_IOC_DIR(cmd) != _IOC_DIR(FS_IOC_GETLBMD_CAP) ||
+ _IOC_TYPE(cmd) != _IOC_TYPE(FS_IOC_GETLBMD_CAP) ||
+ _IOC_NR(cmd) != _IOC_NR(FS_IOC_GETLBMD_CAP) ||
+ _IOC_SIZE(cmd) < LBMD_SIZE_VER0)
+ return -ENOIOCTLCMD;
+
+ if (!bi)
+ goto out;
+
+ if (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE)
+ meta_cap.lbmd_flags |= LBMD_PI_CAP_INTEGRITY;
+ if (bi->flags & BLK_INTEGRITY_REF_TAG)
+ meta_cap.lbmd_flags |= LBMD_PI_CAP_REFTAG;
+ meta_cap.lbmd_interval = 1 << bi->interval_exp;
+ meta_cap.lbmd_size = bi->metadata_size;
+ meta_cap.lbmd_pi_size = bi->pi_tuple_size;
+ meta_cap.lbmd_pi_offset = bi->pi_offset;
+ meta_cap.lbmd_opaque_size = bi->metadata_size - bi->pi_tuple_size;
+ if (meta_cap.lbmd_opaque_size && !bi->pi_offset)
+ meta_cap.lbmd_opaque_offset = bi->pi_tuple_size;
+
+ switch (bi->csum_type) {
+ case BLK_INTEGRITY_CSUM_NONE:
+ meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_NONE;
+ break;
+ case BLK_INTEGRITY_CSUM_IP:
+ meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_IP;
+ break;
+ case BLK_INTEGRITY_CSUM_CRC:
+ meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_CRC16_T10DIF;
+ break;
+ case BLK_INTEGRITY_CSUM_CRC64:
+ meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_CRC64_NVME;
+ break;
+ }
+
+ if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE)
+ meta_cap.lbmd_app_tag_size = 2;
+
+ if (bi->flags & BLK_INTEGRITY_REF_TAG) {
+ switch (bi->csum_type) {
+ case BLK_INTEGRITY_CSUM_CRC64:
+ meta_cap.lbmd_ref_tag_size =
+ sizeof_field(struct crc64_pi_tuple, ref_tag);
+ break;
+ case BLK_INTEGRITY_CSUM_CRC:
+ case BLK_INTEGRITY_CSUM_IP:
+ meta_cap.lbmd_ref_tag_size =
+ sizeof_field(struct t10_pi_tuple, ref_tag);
+ break;
+ default:
+ break;
+ }
+ }
+
+out:
+ return copy_struct_to_user(argp, usize, &meta_cap, sizeof(meta_cap),
+ NULL);
+}
+
/**
* blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
* @rq: request to map
@@ -239,7 +307,7 @@ static ssize_t format_show(struct device *dev, struct device_attribute *attr,
{
struct blk_integrity *bi = dev_to_bi(dev);
- if (!bi->tuple_size)
+ if (!bi->metadata_size)
return sysfs_emit(page, "none\n");
return sysfs_emit(page, "%s\n", blk_integrity_profile_name(bi));
}
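
blk_get_meta_cap() above is the backend for the new FS_IOC_GETLBMD_CAP ioctl. A
hedged userspace sketch of querying it follows; it assumes the uapi definitions
introduced alongside this series (FS_IOC_GETLBMD_CAP and struct
logical_block_metadata_cap) are visible through <linux/fs.h>:

/* Sketch: report the logical block metadata capabilities of a block device. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	struct logical_block_metadata_cap cap = { };
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <blockdev>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, FS_IOC_GETLBMD_CAP, &cap) < 0) {
		perror("FS_IOC_GETLBMD_CAP");
		close(fd);
		return 1;
	}
	printf("%u metadata bytes per %u-byte interval, PI tuple %u at offset %u\n",
	       cap.lbmd_size, cap.lbmd_interval,
	       cap.lbmd_pi_size, cap.lbmd_pi_offset);
	close(fd);
	return 0;
}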
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index ce82770c72ab..9fda3906e5f5 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -308,24 +308,23 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk)
#ifdef CONFIG_BLK_ICQ
/**
- * ioc_lookup_icq - lookup io_cq from ioc
+ * ioc_lookup_icq - lookup io_cq from ioc in io issue path
* @q: the associated request_queue
*
* Look up io_cq associated with @ioc - @q pair from @ioc. Must be called
- * with @q->queue_lock held.
+ * from the I/O issue path. Returns NULL if current is issuing I/O to @q for
+ * the first time, otherwise returns a valid icq.
*/
struct io_cq *ioc_lookup_icq(struct request_queue *q)
{
struct io_context *ioc = current->io_context;
struct io_cq *icq;
- lockdep_assert_held(&q->queue_lock);
-
/*
* icq's are indexed from @ioc using radix tree and hint pointer,
- * both of which are protected with RCU. All removals are done
- * holding both q and ioc locks, and we're holding q lock - if we
- * find a icq which points to us, it's guaranteed to be valid.
+ * both of which are protected with RCU. The I/O issue path ensures that
+ * both the request_queue and the current task stay valid, so the found
+ * icq is guaranteed to be valid until the I/O is done.
*/
rcu_read_lock();
icq = rcu_dereference(ioc->icq_hint);
@@ -419,10 +418,7 @@ struct io_cq *ioc_find_get_icq(struct request_queue *q)
task_unlock(current);
} else {
get_io_context(ioc);
-
- spin_lock_irq(&q->queue_lock);
icq = ioc_lookup_icq(q);
- spin_unlock_irq(&q->queue_lock);
}
if (!icq) {
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 444798c5374f..705da074ad6c 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -12,16 +12,56 @@
#include <linux/cpu.h>
#include <linux/group_cpus.h>
#include <linux/device/bus.h>
+#include <linux/sched/isolation.h>
#include "blk.h"
#include "blk-mq.h"
+static unsigned int blk_mq_num_queues(const struct cpumask *mask,
+ unsigned int max_queues)
+{
+ unsigned int num;
+
+ num = cpumask_weight(mask);
+ return min_not_zero(num, max_queues);
+}
+
+/**
+ * blk_mq_num_possible_queues - Calc nr of queues for multiqueue devices
+ * @max_queues: The maximum number of queues the hardware/driver
+ * supports. If max_queues is 0, the argument is
+ * ignored.
+ *
+ * Calculates the number of queues to be used for a multiqueue
+ * device based on the number of possible CPUs.
+ */
+unsigned int blk_mq_num_possible_queues(unsigned int max_queues)
+{
+ return blk_mq_num_queues(cpu_possible_mask, max_queues);
+}
+EXPORT_SYMBOL_GPL(blk_mq_num_possible_queues);
+
+/**
+ * blk_mq_num_online_queues - Calc nr of queues for multiqueue devices
+ * @max_queues: The maximum number of queues the hardware/driver
+ * supports. If max_queues is 0, the argument is
+ * ignored.
+ *
+ * Calculates the number of queues to be used for a multiqueue
+ * device based on the number of online CPUs.
+ */
+unsigned int blk_mq_num_online_queues(unsigned int max_queues)
+{
+ return blk_mq_num_queues(cpu_online_mask, max_queues);
+}
+EXPORT_SYMBOL_GPL(blk_mq_num_online_queues);
+
void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
{
const struct cpumask *masks;
- unsigned int queue, cpu;
+ unsigned int queue, cpu, nr_masks;
- masks = group_cpus_evenly(qmap->nr_queues);
+ masks = group_cpus_evenly(qmap->nr_queues, &nr_masks);
if (!masks) {
for_each_possible_cpu(cpu)
qmap->mq_map[cpu] = qmap->queue_offset;
@@ -29,7 +69,7 @@ void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
}
for (queue = 0; queue < qmap->nr_queues; queue++) {
- for_each_cpu(cpu, &masks[queue])
+ for_each_cpu(cpu, &masks[queue % nr_masks])
qmap->mq_map[cpu] = qmap->queue_offset + queue;
}
kfree(masks);
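
The two new helpers give drivers a common way to bound their hardware queue
count by CPU topology. A minimal sketch of the intended use, where foo_dev,
foo_mq_ops and foo_hw_max_queues() are hypothetical driver details, not part of
this patch:

/* Sketch: size a tag set against possible CPUs rather than a fixed number. */
static int foo_init_tag_set(struct foo_dev *fdev)
{
	struct blk_mq_tag_set *set = &fdev->tag_set;

	/*
	 * Never allocate more hardware queues than CPUs that can ever be
	 * brought online; passing 0 as the limit means "no driver limit".
	 */
	set->nr_hw_queues = blk_mq_num_possible_queues(foo_hw_max_queues(fdev));
	set->queue_depth = 64;
	set->ops = &foo_mq_ops;
	set->numa_node = NUMA_NO_NODE;
	return blk_mq_alloc_tag_set(set);
}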
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 29b3540dd180..7ed3e71f2fc0 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -521,7 +521,7 @@ CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
static int blk_mq_debugfs_show(struct seq_file *m, void *v)
{
const struct blk_mq_debugfs_attr *attr = m->private;
- void *data = d_inode(m->file->f_path.dentry->d_parent)->i_private;
+ void *data = debugfs_get_aux(m->file);
return attr->show(data, m);
}
@@ -531,7 +531,7 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf,
{
struct seq_file *m = file->private_data;
const struct blk_mq_debugfs_attr *attr = m->private;
- void *data = d_inode(file->f_path.dentry->d_parent)->i_private;
+ void *data = debugfs_get_aux(file);
/*
* Attributes that only implement .seq_ops are read-only and 'attr' is
@@ -546,7 +546,7 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf,
static int blk_mq_debugfs_open(struct inode *inode, struct file *file)
{
const struct blk_mq_debugfs_attr *attr = inode->i_private;
- void *data = d_inode(file->f_path.dentry->d_parent)->i_private;
+ void *data = debugfs_get_aux(file);
struct seq_file *m;
int ret;
@@ -612,11 +612,9 @@ static void debugfs_create_files(struct dentry *parent, void *data,
if (IS_ERR_OR_NULL(parent))
return;
- d_inode(parent)->i_private = data;
-
for (; attr->name; attr++)
- debugfs_create_file(attr->name, attr->mode, parent,
- (void *)attr, &blk_mq_debugfs_fops);
+ debugfs_create_file_aux(attr->name, attr->mode, parent,
+ (void *)attr, data, &blk_mq_debugfs_fops);
}
void blk_mq_debugfs_register(struct request_queue *q)
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 82bae475dfa4..ad283017caef 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2025 Christoph Hellwig
*/
+#include <linux/blk-mq-dma.h>
#include "blk.h"
struct phys_vec {
@@ -61,6 +62,166 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
return true;
}
+/*
+ * The IOVA-based DMA API wants to be able to coalesce at the minimal IOMMU page
+ * size granularity (which is guaranteed to be <= PAGE_SIZE and usually 4k), so
+ * we need to ensure our segments are aligned to this as well.
+ *
+ * Note that there is no point in using the slightly more complicated IOVA based
+ * path for single segment mappings.
+ */
+static inline bool blk_can_dma_map_iova(struct request *req,
+ struct device *dma_dev)
+{
+ return !((queue_virt_boundary(req->q) + 1) &
+ dma_get_merge_boundary(dma_dev));
+}
+
+static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
+{
+ iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr);
+ iter->len = vec->len;
+ return true;
+}
+
+static bool blk_dma_map_direct(struct request *req, struct device *dma_dev,
+ struct blk_dma_iter *iter, struct phys_vec *vec)
+{
+ iter->addr = dma_map_page(dma_dev, phys_to_page(vec->paddr),
+ offset_in_page(vec->paddr), vec->len, rq_dma_dir(req));
+ if (dma_mapping_error(dma_dev, iter->addr)) {
+ iter->status = BLK_STS_RESOURCE;
+ return false;
+ }
+ iter->len = vec->len;
+ return true;
+}
+
+static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
+ struct dma_iova_state *state, struct blk_dma_iter *iter,
+ struct phys_vec *vec)
+{
+ enum dma_data_direction dir = rq_dma_dir(req);
+ unsigned int mapped = 0;
+ int error;
+
+ iter->addr = state->addr;
+ iter->len = dma_iova_size(state);
+
+ do {
+ error = dma_iova_link(dma_dev, state, vec->paddr, mapped,
+ vec->len, dir, 0);
+ if (error)
+ break;
+ mapped += vec->len;
+ } while (blk_map_iter_next(req, &iter->iter, vec));
+
+ error = dma_iova_sync(dma_dev, state, 0, mapped);
+ if (error) {
+ iter->status = errno_to_blk_status(error);
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * blk_rq_dma_map_iter_start - map the first DMA segment for a request
+ * @req: request to map
+ * @dma_dev: device to map to
+ * @state: DMA IOVA state
+ * @iter: block layer DMA iterator
+ *
+ * Start DMA mapping @req to @dma_dev. @state and @iter are provided by the
+ * caller and don't need to be initialized. @state needs to be stored for use
+ * at unmap time, @iter is only needed at map time.
+ *
+ * Returns %false if there is no segment to map, including due to an error, or
+ * %true if it did map a segment.
+ *
+ * If a segment was mapped, the DMA address for it is returned in @iter.addr and
+ * the length in @iter.len. If no segment was mapped the status code is
+ * returned in @iter.status.
+ *
+ * The caller can call blk_rq_dma_map_coalesce() to check if further segments
+ * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
+ * to try to map the following segments.
+ */
+bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
+ struct dma_iova_state *state, struct blk_dma_iter *iter)
+{
+ unsigned int total_len = blk_rq_payload_bytes(req);
+ struct phys_vec vec;
+
+ iter->iter.bio = req->bio;
+ iter->iter.iter = req->bio->bi_iter;
+ memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
+ iter->status = BLK_STS_OK;
+
+ /*
+ * Grab the first segment ASAP because we'll need it to check for P2P
+ * transfers.
+ */
+ if (!blk_map_iter_next(req, &iter->iter, &vec))
+ return false;
+
+ if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) {
+ switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
+ phys_to_page(vec.paddr))) {
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ return blk_dma_map_bus(iter, &vec);
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ /*
+ * P2P transfers through the host bridge are treated the
+ * same as non-P2P transfers below and during unmap.
+ */
+ req->cmd_flags &= ~REQ_P2PDMA;
+ break;
+ default:
+ iter->status = BLK_STS_INVAL;
+ return false;
+ }
+ }
+
+ if (blk_can_dma_map_iova(req, dma_dev) &&
+ dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len))
+ return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec);
+ return blk_dma_map_direct(req, dma_dev, iter, &vec);
+}
+EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);
+
+/**
+ * blk_rq_dma_map_iter_next - map the next DMA segment for a request
+ * @req: request to map
+ * @dma_dev: device to map to
+ * @state: DMA IOVA state
+ * @iter: block layer DMA iterator
+ *
+ * Iterate to the next mapping after a previous call to
+ * blk_rq_dma_map_iter_start(). See there for a detailed description of the
+ * arguments.
+ *
+ * Returns %false if there is no segment to map, including due to an error, or
+ * %true if it did map a segment.
+ *
+ * If a segment was mapped, the DMA address for it is returned in @iter.addr and
+ * the length in @iter.len. If no segment was mapped the status code is
+ * returned in @iter.status.
+ */
+bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
+ struct dma_iova_state *state, struct blk_dma_iter *iter)
+{
+ struct phys_vec vec;
+
+ if (!blk_map_iter_next(req, &iter->iter, &vec))
+ return false;
+
+ if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
+ return blk_dma_map_bus(iter, &vec);
+ return blk_dma_map_direct(req, dma_dev, iter, &vec);
+}
+EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next);
+
static inline struct scatterlist *
blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
{
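
Putting the two new iterator entry points together, a consumer loop looks
roughly like the sketch below; foo_prog_segment() is a stand-in for programming
one hardware SGL/PRP entry and is not part of this patch:

/* Sketch: map a request one DMA segment at a time with the new iterator. */
static blk_status_t foo_map_rq(struct device *dma_dev, struct request *req,
			       struct dma_iova_state *state)
{
	struct blk_dma_iter iter;

	if (!blk_rq_dma_map_iter_start(req, dma_dev, state, &iter))
		return iter.status;	/* BLK_STS_OK here means nothing to map */

	do {
		foo_prog_segment(req, iter.addr, iter.len);
	} while (blk_rq_dma_map_iter_next(req, dma_dev, state, &iter));

	/* iter.status distinguishes a mapping error from normal completion */
	return iter.status;
}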
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 55a0fd105147..e2ce4a28e6c9 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -374,64 +374,17 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
-static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
- struct blk_mq_hw_ctx *hctx,
- unsigned int hctx_idx)
-{
- if (blk_mq_is_shared_tags(q->tag_set->flags)) {
- hctx->sched_tags = q->sched_shared_tags;
- return 0;
- }
-
- hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
- q->nr_requests);
-
- if (!hctx->sched_tags)
- return -ENOMEM;
- return 0;
-}
-
-static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
-{
- blk_mq_free_rq_map(queue->sched_shared_tags);
- queue->sched_shared_tags = NULL;
-}
-
/* called in queue's release handler, tagset has gone away */
static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{
struct blk_mq_hw_ctx *hctx;
unsigned long i;
- queue_for_each_hw_ctx(q, hctx, i) {
- if (hctx->sched_tags) {
- if (!blk_mq_is_shared_tags(flags))
- blk_mq_free_rq_map(hctx->sched_tags);
- hctx->sched_tags = NULL;
- }
- }
+ queue_for_each_hw_ctx(q, hctx, i)
+ hctx->sched_tags = NULL;
if (blk_mq_is_shared_tags(flags))
- blk_mq_exit_sched_shared_tags(q);
-}
-
-static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
-{
- struct blk_mq_tag_set *set = queue->tag_set;
-
- /*
- * Set initial depth at max so that we don't need to reallocate for
- * updating nr_requests.
- */
- queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
- BLK_MQ_NO_HCTX_IDX,
- MAX_SCHED_RQ);
- if (!queue->sched_shared_tags)
- return -ENOMEM;
-
- blk_mq_tag_update_sched_shared_tags(queue);
-
- return 0;
+ q->sched_shared_tags = NULL;
}
void blk_mq_sched_reg_debugfs(struct request_queue *q)
@@ -458,8 +411,140 @@ void blk_mq_sched_unreg_debugfs(struct request_queue *q)
mutex_unlock(&q->debugfs_mutex);
}
+void blk_mq_free_sched_tags(struct elevator_tags *et,
+ struct blk_mq_tag_set *set)
+{
+ unsigned long i;
+
+ /* Shared tags are stored at index 0 in @tags. */
+ if (blk_mq_is_shared_tags(set->flags))
+ blk_mq_free_map_and_rqs(set, et->tags[0], BLK_MQ_NO_HCTX_IDX);
+ else {
+ for (i = 0; i < et->nr_hw_queues; i++)
+ blk_mq_free_map_and_rqs(set, et->tags[i], i);
+ }
+
+ kfree(et);
+}
+
+void blk_mq_free_sched_tags_batch(struct xarray *et_table,
+ struct blk_mq_tag_set *set)
+{
+ struct request_queue *q;
+ struct elevator_tags *et;
+
+ lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ /*
+ * Accessing q->elevator without holding q->elevator_lock is
+ * safe because we're holding here set->update_nr_hwq_lock in
+ * the writer context. So, scheduler update/switch code (which
+ * acquires the same lock but in the reader context) can't run
+ * concurrently.
+ */
+ if (q->elevator) {
+ et = xa_load(et_table, q->id);
+ if (unlikely(!et))
+ WARN_ON_ONCE(1);
+ else
+ blk_mq_free_sched_tags(et, set);
+ }
+ }
+}
+
+struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+ unsigned int nr_hw_queues)
+{
+ unsigned int nr_tags;
+ int i;
+ struct elevator_tags *et;
+ gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+
+ if (blk_mq_is_shared_tags(set->flags))
+ nr_tags = 1;
+ else
+ nr_tags = nr_hw_queues;
+
+ et = kmalloc(sizeof(struct elevator_tags) +
+ nr_tags * sizeof(struct blk_mq_tags *), gfp);
+ if (!et)
+ return NULL;
+ /*
+ * Default to twice the smaller of the hardware queue depth and
+ * 128, since we don't split into sync/async like the old code
+ * did. Additionally, this is a per-hw queue depth.
+ */
+ et->nr_requests = 2 * min_t(unsigned int, set->queue_depth,
+ BLKDEV_DEFAULT_RQ);
+ et->nr_hw_queues = nr_hw_queues;
+
+ if (blk_mq_is_shared_tags(set->flags)) {
+ /* Shared tags are stored at index 0 in @tags. */
+ et->tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX,
+ MAX_SCHED_RQ);
+ if (!et->tags[0])
+ goto out;
+ } else {
+ for (i = 0; i < et->nr_hw_queues; i++) {
+ et->tags[i] = blk_mq_alloc_map_and_rqs(set, i,
+ et->nr_requests);
+ if (!et->tags[i])
+ goto out_unwind;
+ }
+ }
+
+ return et;
+out_unwind:
+ while (--i >= 0)
+ blk_mq_free_map_and_rqs(set, et->tags[i], i);
+out:
+ kfree(et);
+ return NULL;
+}
+
+int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
+ struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
+{
+ struct request_queue *q;
+ struct elevator_tags *et;
+ gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+
+ lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ /*
+ * Accessing q->elevator without holding q->elevator_lock is
+ * safe because we're holding here set->update_nr_hwq_lock in
+ * the writer context. So, scheduler update/switch code (which
+ * acquires the same lock but in the reader context) can't run
+ * concurrently.
+ */
+ if (q->elevator) {
+ et = blk_mq_alloc_sched_tags(set, nr_hw_queues);
+ if (!et)
+ goto out_unwind;
+ if (xa_insert(et_table, q->id, et, gfp))
+ goto out_free_tags;
+ }
+ }
+ return 0;
+out_free_tags:
+ blk_mq_free_sched_tags(et, set);
+out_unwind:
+ list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
+ if (q->elevator) {
+ et = xa_load(et_table, q->id);
+ if (et)
+ blk_mq_free_sched_tags(et, set);
+ }
+ }
+ return -ENOMEM;
+}
+
/* caller must have a reference to @e, will grab another one if successful */
-int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
+int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
+ struct elevator_tags *et)
{
unsigned int flags = q->tag_set->flags;
struct blk_mq_hw_ctx *hctx;
@@ -467,36 +552,33 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
unsigned long i;
int ret;
- /*
- * Default to double of smaller one between hw queue_depth and 128,
- * since we don't split into sync/async like the old code did.
- * Additionally, this is a per-hw queue depth.
- */
- q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
- BLKDEV_DEFAULT_RQ);
+ eq = elevator_alloc(q, e, et);
+ if (!eq)
+ return -ENOMEM;
+
+ q->nr_requests = et->nr_requests;
if (blk_mq_is_shared_tags(flags)) {
- ret = blk_mq_init_sched_shared_tags(q);
- if (ret)
- return ret;
+ /* Shared tags are stored at index 0 in @et->tags. */
+ q->sched_shared_tags = et->tags[0];
+ blk_mq_tag_update_sched_shared_tags(q);
}
queue_for_each_hw_ctx(q, hctx, i) {
- ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
- if (ret)
- goto err_free_map_and_rqs;
+ if (blk_mq_is_shared_tags(flags))
+ hctx->sched_tags = q->sched_shared_tags;
+ else
+ hctx->sched_tags = et->tags[i];
}
- ret = e->ops.init_sched(q, e);
+ ret = e->ops.init_sched(q, eq);
if (ret)
- goto err_free_map_and_rqs;
+ goto out;
queue_for_each_hw_ctx(q, hctx, i) {
if (e->ops.init_hctx) {
ret = e->ops.init_hctx(hctx, i);
if (ret) {
- eq = q->elevator;
- blk_mq_sched_free_rqs(q);
blk_mq_exit_sched(q, eq);
kobject_put(&eq->kobj);
return ret;
@@ -505,10 +587,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
}
return 0;
-err_free_map_and_rqs:
- blk_mq_sched_free_rqs(q);
+out:
blk_mq_sched_tags_teardown(q, flags);
-
+ kobject_put(&eq->kobj);
q->elevator = NULL;
return ret;
}
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 1326526bb733..b554e1d55950 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -18,10 +18,20 @@ void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
-int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
+int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
+ struct elevator_tags *et);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
void blk_mq_sched_free_rqs(struct request_queue *q);
+struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+ unsigned int nr_hw_queues);
+int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
+ struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
+void blk_mq_free_sched_tags(struct elevator_tags *et,
+ struct blk_mq_tag_set *set);
+void blk_mq_free_sched_tags_batch(struct xarray *et_table,
+ struct blk_mq_tag_set *set);
+
static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4806b867e37d..b67d6c02eceb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -883,7 +883,8 @@ static void blk_complete_request(struct request *req)
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- blk_zone_update_request_bio(req, bio);
+ if (blk_req_bio_is_zone_append(req, bio))
+ blk_zone_append_update_request_bio(req, bio);
if (!is_flush)
bio_endio(bio);
@@ -982,7 +983,8 @@ bool blk_update_request(struct request *req, blk_status_t error,
/* Don't actually finish bio if it's part of flush sequence */
if (!bio->bi_iter.bi_size) {
- blk_zone_update_request_bio(req, bio);
+ if (blk_req_bio_is_zone_append(req, bio))
+ blk_zone_append_update_request_bio(req, bio);
if (!is_flush)
bio_endio(bio);
}
@@ -3169,8 +3171,10 @@ void blk_mq_submit_bio(struct bio *bio)
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
goto queue_exit;
- if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
- goto queue_exit;
+ if (bio_needs_zone_write_plugging(bio)) {
+ if (blk_zone_plug_bio(bio, nr_segs))
+ goto queue_exit;
+ }
new_request:
if (rq) {
@@ -4966,6 +4970,61 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
return ret;
}
+/*
+ * Switch back to the elevator type stored in the xarray.
+ */
+static void blk_mq_elv_switch_back(struct request_queue *q,
+ struct xarray *elv_tbl, struct xarray *et_tbl)
+{
+ struct elevator_type *e = xa_load(elv_tbl, q->id);
+ struct elevator_tags *t = xa_load(et_tbl, q->id);
+
+ /* The elv_update_nr_hw_queues unfreezes the queue. */
+ elv_update_nr_hw_queues(q, e, t);
+
+ /* Drop the reference acquired in blk_mq_elv_switch_none. */
+ if (e)
+ elevator_put(e);
+}
+
+/*
+ * Stores elevator type in xarray and set current elevator to none. It uses
+ * q->id as an index to store the elevator type into the xarray.
+ */
+static int blk_mq_elv_switch_none(struct request_queue *q,
+ struct xarray *elv_tbl)
+{
+ int ret = 0;
+
+ lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock);
+
+ /*
+ * Accessing q->elevator without holding q->elevator_lock is safe here
+ * because we're called from nr_hw_queue update which is protected by
+ * set->update_nr_hwq_lock in the writer context. So, scheduler update/
+ * switch code (which acquires the same lock in the reader context)
+ * can't run concurrently.
+ */
+ if (q->elevator) {
+
+ ret = xa_insert(elv_tbl, q->id, q->elevator->type, GFP_KERNEL);
+ if (WARN_ON_ONCE(ret))
+ return ret;
+
+ /*
+ * Before we switch elevator to 'none', take a reference to
+ * the elevator module so that while nr_hw_queue update is
+ * running, no one can remove elevator module. We'd put the
+ * reference to elevator module later when we switch back
+ * elevator.
+ */
+ __elevator_get(q->elevator->type);
+
+ elevator_set_none(q);
+ }
+ return ret;
+}
+
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int nr_hw_queues)
{
@@ -4973,6 +5032,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int prev_nr_hw_queues = set->nr_hw_queues;
unsigned int memflags;
int i;
+ struct xarray elv_tbl, et_tbl;
lockdep_assert_held(&set->tag_list_lock);
@@ -4984,6 +5044,13 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
return;
memflags = memalloc_noio_save();
+
+ xa_init(&et_tbl);
+ if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0)
+ goto out_memalloc_restore;
+
+ xa_init(&elv_tbl);
+
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_debugfs_unregister_hctxs(q);
blk_mq_sysfs_unregister_hctxs(q);
@@ -4992,11 +5059,17 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_freeze_queue_nomemsave(q);
- if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) {
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_unfreeze_queue_nomemrestore(q);
- goto reregister;
- }
+ /*
+ * Switch IO scheduler to 'none', cleaning up the data associated
+ * with the previous scheduler. We will switch back once we are done
+ * updating the new sw to hw queue mappings.
+ */
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ if (blk_mq_elv_switch_none(q, &elv_tbl))
+ goto switch_back;
+
+ if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
+ goto switch_back;
fallback:
blk_mq_update_queue_map(set);
@@ -5016,12 +5089,11 @@ fallback:
}
blk_mq_map_swqueue(q);
}
-
- /* elv_update_nr_hw_queues() unfreeze queue for us */
+switch_back:
+ /* The blk_mq_elv_switch_back unfreezes queue for us. */
list_for_each_entry(q, &set->tag_list, tag_set_list)
- elv_update_nr_hw_queues(q);
+ blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl);
-reregister:
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_sysfs_register_hctxs(q);
blk_mq_debugfs_register_hctxs(q);
@@ -5029,6 +5101,10 @@ reregister:
blk_mq_remove_hw_queues_cpuhp(q);
blk_mq_add_hw_queues_cpuhp(q);
}
+
+ xa_destroy(&elv_tbl);
+ xa_destroy(&et_tbl);
+out_memalloc_restore:
memalloc_noio_restore(memflags);
/* Free the excess tags when nr_hw_queues shrink. */
diff --git a/block/blk-settings.c b/block/blk-settings.c
index a000daafbfb4..07874e9b609f 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -14,6 +14,8 @@
#include <linux/jiffies.h>
#include <linux/gfp.h>
#include <linux/dma-mapping.h>
+#include <linux/t10-pi.h>
+#include <linux/crc64.h>
#include "blk.h"
#include "blk-rq-qos.h"
@@ -50,6 +52,8 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_sectors = UINT_MAX;
lim->max_dev_sectors = UINT_MAX;
lim->max_write_zeroes_sectors = UINT_MAX;
+ lim->max_hw_wzeroes_unmap_sectors = UINT_MAX;
+ lim->max_user_wzeroes_unmap_sectors = UINT_MAX;
lim->max_hw_zone_append_sectors = UINT_MAX;
lim->max_user_discard_sectors = UINT_MAX;
}
@@ -58,16 +62,24 @@ EXPORT_SYMBOL(blk_set_stacking_limits);
void blk_apply_bdi_limits(struct backing_dev_info *bdi,
struct queue_limits *lim)
{
+ u64 io_opt = lim->io_opt;
+
/*
* For read-ahead of large files to be effective, we need to read ahead
- * at least twice the optimal I/O size.
+ * at least twice the optimal I/O size. For rotational devices that do
+ * not report an optimal I/O size (e.g. ATA HDDs), use the maximum I/O
+ * size to avoid falling back to the (rather inefficient) small default
+ * read-ahead size.
*
* There is no hardware limitation for the read-ahead size and the user
* might have increased the read-ahead size through sysfs, so don't ever
* decrease it.
*/
+ if (!io_opt && (lim->features & BLK_FEAT_ROTATIONAL))
+ io_opt = (u64)lim->max_sectors << SECTOR_SHIFT;
+
bdi->ra_pages = max3(bdi->ra_pages,
- lim->io_opt * 2 / PAGE_SIZE,
+ io_opt * 2 >> PAGE_SHIFT,
VM_READAHEAD_PAGES);
bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT;
}
@@ -114,7 +126,7 @@ static int blk_validate_integrity_limits(struct queue_limits *lim)
{
struct blk_integrity *bi = &lim->integrity;
- if (!bi->tuple_size) {
+ if (!bi->metadata_size) {
if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE ||
bi->tag_size || ((bi->flags & BLK_INTEGRITY_REF_TAG))) {
pr_warn("invalid PI settings.\n");
@@ -135,6 +147,42 @@ static int blk_validate_integrity_limits(struct queue_limits *lim)
return -EINVAL;
}
+ if (bi->pi_tuple_size > bi->metadata_size) {
+ pr_warn("pi_tuple_size (%u) exceeds metadata_size (%u)\n",
+ bi->pi_tuple_size,
+ bi->metadata_size);
+ return -EINVAL;
+ }
+
+ switch (bi->csum_type) {
+ case BLK_INTEGRITY_CSUM_NONE:
+ if (bi->pi_tuple_size) {
+ pr_warn("pi_tuple_size must be 0 when checksum type is none\n");
+ return -EINVAL;
+ }
+ break;
+ case BLK_INTEGRITY_CSUM_CRC:
+ case BLK_INTEGRITY_CSUM_IP:
+ if (bi->pi_tuple_size != sizeof(struct t10_pi_tuple)) {
+ pr_warn("pi_tuple_size mismatch for T10 PI: expected %zu, got %u\n",
+ sizeof(struct t10_pi_tuple),
+ bi->pi_tuple_size);
+ return -EINVAL;
+ }
+ break;
+ case BLK_INTEGRITY_CSUM_CRC64:
+ if (bi->pi_tuple_size != sizeof(struct crc64_pi_tuple)) {
+ pr_warn("pi_tuple_size mismatch for CRC64 PI: expected %zu, got %u\n",
+ sizeof(struct crc64_pi_tuple),
+ bi->pi_tuple_size);
+ return -EINVAL;
+ }
+ break;
+ }
+
if (!bi->interval_exp)
bi->interval_exp = ilog2(lim->logical_block_size);
@@ -181,6 +229,8 @@ static void blk_atomic_writes_update_limits(struct queue_limits *lim)
static void blk_validate_atomic_write_limits(struct queue_limits *lim)
{
unsigned int boundary_sectors;
+ unsigned int atomic_write_hw_max_sectors =
+ lim->atomic_write_hw_max >> SECTOR_SHIFT;
if (!(lim->features & BLK_FEAT_ATOMIC_WRITES))
goto unsupported;
@@ -202,6 +252,10 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim)
lim->atomic_write_hw_max))
goto unsupported;
+ if (WARN_ON_ONCE(lim->chunk_sectors &&
+ atomic_write_hw_max_sectors > lim->chunk_sectors))
+ goto unsupported;
+
boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT;
if (boundary_sectors) {
@@ -266,8 +320,12 @@ int blk_validate_limits(struct queue_limits *lim)
pr_warn("Invalid logical block size (%d)\n", lim->logical_block_size);
return -EINVAL;
}
- if (lim->physical_block_size < lim->logical_block_size)
+ if (lim->physical_block_size < lim->logical_block_size) {
lim->physical_block_size = lim->logical_block_size;
+ } else if (!is_power_of_2(lim->physical_block_size)) {
+ pr_warn("Invalid physical block size (%d)\n", lim->physical_block_size);
+ return -EINVAL;
+ }
/*
* The minimum I/O size defaults to the physical block size unless
@@ -333,15 +391,28 @@ int blk_validate_limits(struct queue_limits *lim)
if (!lim->max_segments)
lim->max_segments = BLK_MAX_SEGMENTS;
+ if (lim->max_hw_wzeroes_unmap_sectors &&
+ lim->max_hw_wzeroes_unmap_sectors != lim->max_write_zeroes_sectors)
+ return -EINVAL;
+ lim->max_wzeroes_unmap_sectors = min(lim->max_hw_wzeroes_unmap_sectors,
+ lim->max_user_wzeroes_unmap_sectors);
+
lim->max_discard_sectors =
min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors);
+ /*
+ * When discard is not supported, discard_granularity should be reported
+ * as 0 to userspace.
+ */
+ if (lim->max_discard_sectors)
+ lim->discard_granularity =
+ max(lim->discard_granularity, lim->physical_block_size);
+ else
+ lim->discard_granularity = 0;
+
if (!lim->max_discard_segments)
lim->max_discard_segments = 1;
- if (lim->discard_granularity < lim->physical_block_size)
- lim->discard_granularity = lim->physical_block_size;
-
/*
* By default there is no limit on the segment boundary alignment,
* but if there is one it can't be smaller than the page size as
@@ -418,10 +489,11 @@ int blk_set_default_limits(struct queue_limits *lim)
{
/*
* Most defaults are set by capping the bounds in blk_validate_limits,
- * but max_user_discard_sectors is special and needs an explicit
- * initialization to the max value here.
+ * but these limits are special and need an explicit initialization to
+ * the max value here.
*/
lim->max_user_discard_sectors = UINT_MAX;
+ lim->max_user_wzeroes_unmap_sectors = UINT_MAX;
return blk_validate_limits(lim);
}
@@ -589,41 +661,50 @@ static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t,
return true;
}
-
-/* Check stacking of first bottom device */
-static bool blk_stack_atomic_writes_head(struct queue_limits *t,
- struct queue_limits *b)
+static void blk_stack_atomic_writes_chunk_sectors(struct queue_limits *t)
{
- if (b->atomic_write_hw_boundary &&
- !blk_stack_atomic_writes_boundary_head(t, b))
- return false;
+ unsigned int chunk_bytes;
- if (t->io_min <= SECTOR_SIZE) {
- /* No chunk sectors, so use bottom device values directly */
- t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
- t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min;
- t->atomic_write_hw_max = b->atomic_write_hw_max;
- return true;
- }
+ if (!t->chunk_sectors)
+ return;
+
+ /*
+ * If chunk sectors is so large that its value in bytes overflows
+ * UINT_MAX, then just shift it down so it definitely will fit.
+ * We don't support atomic writes of such a large size anyway.
+ */
+ if (check_shl_overflow(t->chunk_sectors, SECTOR_SHIFT, &chunk_bytes))
+ chunk_bytes = t->chunk_sectors;
/*
* Find values for limits which work for chunk size.
* b->atomic_write_hw_unit_{min, max} may not be aligned with chunk
- * size (t->io_min), as chunk size is not restricted to a power-of-2.
+ * size, as the chunk size is not restricted to a power-of-2.
* So we need to find highest power-of-2 which works for the chunk
* size.
- * As an example scenario, we could have b->unit_max = 16K and
- * t->io_min = 24K. For this case, reduce t->unit_max to a value
- * aligned with both limits, i.e. 8K in this example.
+ * As an example scenario, we could have t->unit_max = 16K and a chunk
+ * size of 24K. For this case, reduce t->unit_max to a value aligned
+ * with both limits, i.e. 8K in this example.
*/
- t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
- while (t->io_min % t->atomic_write_hw_unit_max)
- t->atomic_write_hw_unit_max /= 2;
+ t->atomic_write_hw_unit_max = min(t->atomic_write_hw_unit_max,
+ max_pow_of_two_factor(chunk_bytes));
- t->atomic_write_hw_unit_min = min(b->atomic_write_hw_unit_min,
+ t->atomic_write_hw_unit_min = min(t->atomic_write_hw_unit_min,
t->atomic_write_hw_unit_max);
- t->atomic_write_hw_max = min(b->atomic_write_hw_max, t->io_min);
+ t->atomic_write_hw_max = min(t->atomic_write_hw_max, chunk_bytes);
+}
+
+/* Check stacking of first bottom device */
+static bool blk_stack_atomic_writes_head(struct queue_limits *t,
+ struct queue_limits *b)
+{
+ if (b->atomic_write_hw_boundary &&
+ !blk_stack_atomic_writes_boundary_head(t, b))
+ return false;
+ t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
+ t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min;
+ t->atomic_write_hw_max = b->atomic_write_hw_max;
return true;
}
@@ -651,6 +732,7 @@ static void blk_stack_atomic_writes_limits(struct queue_limits *t,
if (!blk_stack_atomic_writes_head(t, b))
goto unsupported;
+ blk_stack_atomic_writes_chunk_sectors(t);
return;
unsupported:
@@ -708,6 +790,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
b->max_write_zeroes_sectors);
+ t->max_user_wzeroes_unmap_sectors =
+ min(t->max_user_wzeroes_unmap_sectors,
+ b->max_user_wzeroes_unmap_sectors);
+ t->max_hw_wzeroes_unmap_sectors =
+ min(t->max_hw_wzeroes_unmap_sectors,
+ b->max_hw_wzeroes_unmap_sectors);
+
t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors,
b->max_hw_zone_append_sectors);
@@ -779,7 +868,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
}
/* chunk_sectors a multiple of the physical block size? */
- if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) {
+ if (t->chunk_sectors % (t->physical_block_size >> SECTOR_SHIFT)) {
t->chunk_sectors = 0;
t->flags |= BLK_FLAG_MISALIGNED;
ret = -1;
@@ -875,7 +964,7 @@ bool queue_limits_stack_integrity(struct queue_limits *t,
return true;
if (ti->flags & BLK_INTEGRITY_STACKED) {
- if (ti->tuple_size != bi->tuple_size)
+ if (ti->metadata_size != bi->metadata_size)
goto incompatible;
if (ti->interval_exp != bi->interval_exp)
goto incompatible;
@@ -891,7 +980,7 @@ bool queue_limits_stack_integrity(struct queue_limits *t,
ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) |
(bi->flags & BLK_INTEGRITY_REF_TAG);
ti->csum_type = bi->csum_type;
- ti->tuple_size = bi->tuple_size;
+ ti->metadata_size = bi->metadata_size;
ti->pi_offset = bi->pi_offset;
ti->interval_exp = bi->interval_exp;
ti->tag_size = bi->tag_size;
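
For the read-ahead change above, a worked example of the new sizing for a
rotational disk that reports no optimal I/O size; max_sectors = 2560 and a
4 KiB PAGE_SIZE are assumed values:

/* Illustration only: the substituted io_opt and resulting ra_pages. */
#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;			/* assumed PAGE_SIZE */
	unsigned int max_sectors = 2560;		/* assumed: 1280 KiB */
	unsigned long long io_opt = (unsigned long long)max_sectors << 9;

	/*
	 * ra_pages becomes at least twice the substituted optimal I/O size
	 * (the kernel additionally takes the max with the current value and
	 * VM_READAHEAD_PAGES).
	 */
	unsigned long ra_pages = io_opt * 2 / page_size;

	printf("%lu pages (%llu KiB)\n", ra_pages,
	       (unsigned long long)ra_pages * page_size / 1024);
	return 0;	/* prints: 640 pages (2560 KiB) */
}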
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b2b9b89d6967..396cded255ea 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -161,6 +161,8 @@ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_discard_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_wzeroes_unmap_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_wzeroes_unmap_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors)
@@ -205,6 +207,24 @@ static int queue_max_discard_sectors_store(struct gendisk *disk,
return 0;
}
+static int queue_max_wzeroes_unmap_sectors_store(struct gendisk *disk,
+ const char *page, size_t count, struct queue_limits *lim)
+{
+ unsigned long max_zeroes_bytes, max_hw_zeroes_bytes;
+ ssize_t ret;
+
+ ret = queue_var_store(&max_zeroes_bytes, page, count);
+ if (ret < 0)
+ return ret;
+
+ max_hw_zeroes_bytes = lim->max_hw_wzeroes_unmap_sectors << SECTOR_SHIFT;
+ if (max_zeroes_bytes != 0 && max_zeroes_bytes != max_hw_zeroes_bytes)
+ return -EINVAL;
+
+ lim->max_user_wzeroes_unmap_sectors = max_zeroes_bytes >> SECTOR_SHIFT;
+ return 0;
+}
+
static int
queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count,
struct queue_limits *lim)
@@ -514,6 +534,10 @@ QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");
QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes");
+QUEUE_LIM_RO_ENTRY(queue_max_hw_wzeroes_unmap_sectors,
+ "write_zeroes_unmap_max_hw_bytes");
+QUEUE_LIM_RW_ENTRY(queue_max_wzeroes_unmap_sectors,
+ "write_zeroes_unmap_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
@@ -662,6 +686,8 @@ static struct attribute *queue_attrs[] = {
&queue_atomic_write_unit_min_entry.attr,
&queue_atomic_write_unit_max_entry.attr,
&queue_max_write_zeroes_sectors_entry.attr,
+ &queue_max_hw_wzeroes_unmap_sectors_entry.attr,
+ &queue_max_wzeroes_unmap_sectors_entry.attr,
&queue_max_zone_append_sectors_entry.attr,
&queue_zone_write_granularity_entry.attr,
&queue_rotational_entry.attr,
@@ -960,4 +986,5 @@ void blk_unregister_queue(struct gendisk *disk)
elevator_set_none(q);
blk_debugfs_remove(disk);
+ kobject_put(&disk->queue_kobj);
}
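
A short userspace sketch of the new write_zeroes_unmap_max_bytes control; per
queue_max_wzeroes_unmap_sectors_store() above, only 0 or the value reported by
write_zeroes_unmap_max_hw_bytes is accepted. The /sys/block/sda path is an
assumed example device:

/* Sketch: opt a disk out of unmap-capable Write Zeroes. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/block/sda/queue/write_zeroes_unmap_max_bytes";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "0\n");	/* 0 disables unmap write zeroes for this queue */
	return fclose(f) ? 1 : 0;
}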
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 351d659280e1..ef43aaca49f4 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -17,6 +17,8 @@
#include <linux/refcount.h>
#include <linux/mempool.h>
+#include <trace/events/block.h>
+
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-mq-debugfs.h"
@@ -177,6 +179,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev)
struct bio bio;
bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
+ trace_blkdev_zone_mgmt(&bio, 0);
return submit_bio_wait(&bio);
}
@@ -240,6 +243,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
cond_resched();
}
+ trace_blkdev_zone_mgmt(bio, nr_sectors);
ret = submit_bio_wait(bio);
bio_put(bio);
@@ -818,6 +822,8 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
* at the tail of the list to preserve the sequential write order.
*/
bio_list_add(&zwplug->bio_list, bio);
+ trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no,
+ bio->bi_iter.bi_sector, bio_sectors(bio));
zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
@@ -1116,25 +1122,7 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
{
struct block_device *bdev = bio->bi_bdev;
- if (!bdev->bd_disk->zone_wplugs_hash)
- return false;
-
- /*
- * If the BIO already has the plugging flag set, then it was already
- * handled through this path and this is a submission from the zone
- * plug bio submit work.
- */
- if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
- return false;
-
- /*
- * We do not need to do anything special for empty flush BIOs, e.g
- * BIOs such as issued by blkdev_issue_flush(). The is because it is
- * the responsibility of the user to first wait for the completion of
- * write operations for flush to have any effect on the persistence of
- * the written data.
- */
- if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
+ if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash))
return false;
/*
@@ -1205,6 +1193,20 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
spin_unlock_irqrestore(&zwplug->lock, flags);
}
+void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio)
+{
+ /*
+ * For zone append requests, the request sector indicates the location
+ * at which the BIO data was written. Return this value to the BIO
+ * issuer through the BIO iter sector.
+ * For plugged zone writes, which include emulated zone append, we need
+ * the original BIO sector so that blk_zone_write_plug_bio_endio() can
+ * lookup the zone write plug.
+ */
+ bio->bi_iter.bi_sector = rq->__sector;
+ trace_blk_zone_append_update_request_bio(rq);
+}
+
void blk_zone_write_plug_bio_endio(struct bio *bio)
{
struct gendisk *disk = bio->bi_bdev->bd_disk;
@@ -1299,6 +1301,9 @@ again:
goto put_zwplug;
}
+ trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no,
+ bio->bi_iter.bi_sector, bio_sectors(bio));
+
if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
blk_zone_wplug_bio_io_error(zwplug, bio);
goto again;
diff --git a/block/blk.h b/block/blk.h
index 37ec459fe656..0a2eccf28ca4 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -12,6 +12,16 @@
#include "blk-crypto-internal.h"
struct elevator_type;
+struct elevator_tags;
+
+/*
+ * Default upper limit for the software max_sectors limit used for regular I/Os.
+ * This can be increased through sysfs.
+ *
+ * This should not be confused with the max_hw_sector limit that is entirely
+ * controlled by the block device driver, usually based on hardware limits.
+ */
+#define BLK_DEF_MAX_SECTORS_CAP (SZ_4M >> SECTOR_SHIFT)
#define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9)
#define BLK_MIN_SEGMENT_SIZE 4096
@@ -321,7 +331,8 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
bool blk_insert_flush(struct request *rq);
-void elv_update_nr_hw_queues(struct request_queue *q);
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e,
+ struct elevator_tags *t);
void elevator_set_default(struct request_queue *q);
void elevator_set_none(struct request_queue *q);
@@ -467,23 +478,15 @@ static inline bool bio_zone_write_plugging(struct bio *bio)
{
return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
}
-void blk_zone_write_plug_bio_merged(struct bio *bio);
-void blk_zone_write_plug_init_request(struct request *rq);
-static inline void blk_zone_update_request_bio(struct request *rq,
- struct bio *bio)
+static inline bool blk_req_bio_is_zone_append(struct request *rq,
+ struct bio *bio)
{
- /*
- * For zone append requests, the request sector indicates the location
- * at which the BIO data was written. Return this value to the BIO
- * issuer through the BIO iter sector.
- * For plugged zone writes, which include emulated zone append, we need
- * the original BIO sector so that blk_zone_write_plug_bio_endio() can
- * lookup the zone write plug.
- */
- if (req_op(rq) == REQ_OP_ZONE_APPEND ||
- bio_flagged(bio, BIO_EMULATES_ZONE_APPEND))
- bio->bi_iter.bi_sector = rq->__sector;
+ return req_op(rq) == REQ_OP_ZONE_APPEND ||
+ bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
}
+void blk_zone_write_plug_bio_merged(struct bio *bio);
+void blk_zone_write_plug_init_request(struct request *rq);
+void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio);
void blk_zone_write_plug_bio_endio(struct bio *bio);
static inline void blk_zone_bio_endio(struct bio *bio)
{
@@ -516,14 +519,19 @@ static inline bool bio_zone_write_plugging(struct bio *bio)
{
return false;
}
+static inline bool blk_req_bio_is_zone_append(struct request *req,
+ struct bio *bio)
+{
+ return false;
+}
static inline void blk_zone_write_plug_bio_merged(struct bio *bio)
{
}
static inline void blk_zone_write_plug_init_request(struct request *rq)
{
}
-static inline void blk_zone_update_request_bio(struct request *rq,
- struct bio *bio)
+static inline void blk_zone_append_update_request_bio(struct request *rq,
+ struct bio *bio)
{
}
static inline void blk_zone_bio_endio(struct bio *bio)
diff --git a/block/elevator.c b/block/elevator.c
index ab22542e6cf0..fe96c6f4753c 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -54,6 +54,8 @@ struct elv_change_ctx {
struct elevator_queue *old;
/* for registering new elevator */
struct elevator_queue *new;
+ /* holds sched tags data */
+ struct elevator_tags *et;
};
static DEFINE_SPINLOCK(elv_list_lock);
@@ -132,7 +134,7 @@ static struct elevator_type *elevator_find_get(const char *name)
static const struct kobj_type elv_ktype;
struct elevator_queue *elevator_alloc(struct request_queue *q,
- struct elevator_type *e)
+ struct elevator_type *e, struct elevator_tags *et)
{
struct elevator_queue *eq;
@@ -145,10 +147,10 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
kobject_init(&eq->kobj, &elv_ktype);
mutex_init(&eq->sysfs_lock);
hash_init(eq->hash);
+ eq->et = et;
return eq;
}
-EXPORT_SYMBOL(elevator_alloc);
static void elevator_release(struct kobject *kobj)
{
@@ -166,7 +168,6 @@ static void elevator_exit(struct request_queue *q)
lockdep_assert_held(&q->elevator_lock);
ioc_clear_queue(q);
- blk_mq_sched_free_rqs(q);
mutex_lock(&e->sysfs_lock);
blk_mq_exit_sched(q, e);
@@ -592,7 +593,7 @@ static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
}
if (new_e) {
- ret = blk_mq_init_sched(q, new_e);
+ ret = blk_mq_init_sched(q, new_e, ctx->et);
if (ret)
goto out_unfreeze;
ctx->new = q->elevator;
@@ -627,8 +628,10 @@ static void elv_exit_and_release(struct request_queue *q)
elevator_exit(q);
mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue(q, memflags);
- if (e)
+ if (e) {
+ blk_mq_free_sched_tags(e->et, q->tag_set);
kobject_put(&e->kobj);
+ }
}
static int elevator_change_done(struct request_queue *q,
@@ -641,6 +644,7 @@ static int elevator_change_done(struct request_queue *q,
&ctx->old->flags);
elv_unregister_queue(q, ctx->old);
+ blk_mq_free_sched_tags(ctx->old->et, q->tag_set);
kobject_put(&ctx->old->kobj);
if (enable_wbt)
wbt_enable_default(q->disk);
@@ -659,9 +663,16 @@ static int elevator_change_done(struct request_queue *q,
static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
{
unsigned int memflags;
+ struct blk_mq_tag_set *set = q->tag_set;
int ret = 0;
- lockdep_assert_held(&q->tag_set->update_nr_hwq_lock);
+ lockdep_assert_held(&set->update_nr_hwq_lock);
+
+ if (strncmp(ctx->name, "none", 4)) {
+ ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues);
+ if (!ctx->et)
+ return -ENOMEM;
+ }
memflags = blk_mq_freeze_queue(q);
/*
@@ -681,6 +692,11 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
blk_mq_unfreeze_queue(q, memflags);
if (!ret)
ret = elevator_change_done(q, ctx);
+ /*
+ * Free the scheduler tags if they were allocated but the elevator switch failed.
+ */
+ if (ctx->et && !ctx->new)
+ blk_mq_free_sched_tags(ctx->et, set);
return ret;
}
@@ -689,24 +705,32 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
* The I/O scheduler depends on the number of hardware queues, so this forces a
* reattachment when nr_hw_queues changes.
*/
-void elv_update_nr_hw_queues(struct request_queue *q)
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e,
+ struct elevator_tags *t)
{
+ struct blk_mq_tag_set *set = q->tag_set;
struct elv_change_ctx ctx = {};
int ret = -ENODEV;
WARN_ON_ONCE(q->mq_freeze_depth == 0);
- mutex_lock(&q->elevator_lock);
- if (q->elevator && !blk_queue_dying(q) && blk_queue_registered(q)) {
- ctx.name = q->elevator->type->elevator_name;
+ if (e && !blk_queue_dying(q) && blk_queue_registered(q)) {
+ ctx.name = e->elevator_name;
+ ctx.et = t;
+ mutex_lock(&q->elevator_lock);
/* force to reattach elevator after nr_hw_queue is updated */
ret = elevator_switch(q, &ctx);
+ mutex_unlock(&q->elevator_lock);
}
- mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue_nomemrestore(q);
if (!ret)
WARN_ON_ONCE(elevator_change_done(q, &ctx));
+ /*
+ * Free the scheduler tags if they were allocated but the elevator switch failed.
+ */
+ if (t && !ctx.new)
+ blk_mq_free_sched_tags(t, set);
}
/*
@@ -719,7 +743,8 @@ void elevator_set_default(struct request_queue *q)
.name = "mq-deadline",
.no_uevent = true,
};
- int err = 0;
+ int err;
+ struct elevator_type *e;
/* now we allow to switch elevator */
blk_queue_flag_clear(QUEUE_FLAG_NO_ELV_SWITCH, q);
@@ -732,12 +757,18 @@ void elevator_set_default(struct request_queue *q)
* have multiple queues or mq-deadline is not available, default
* to "none".
*/
- if (elevator_find_get(ctx.name) && (q->nr_hw_queues == 1 ||
- blk_mq_is_shared_tags(q->tag_set->flags)))
+ e = elevator_find_get(ctx.name);
+ if (!e)
+ return;
+
+ if ((q->nr_hw_queues == 1 ||
+ blk_mq_is_shared_tags(q->tag_set->flags))) {
err = elevator_change(q, &ctx);
- if (err < 0)
- pr_warn("\"%s\" elevator initialization, failed %d, "
- "falling back to \"none\"\n", ctx.name, err);
+ if (err < 0)
+ pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n",
+ ctx.name, err);
+ }
+ elevator_put(e);
}
void elevator_set_none(struct request_queue *q)
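To make the new tag ownership easier to follow, here is a condensed sketch (illustration only; queue freezing and the elevator_lock are elided) of the allocate-early / free-on-failure pattern that both elevator_change() and elv_update_nr_hw_queues() now follow:

static int elv_switch_sketch(struct request_queue *q, struct elv_change_ctx *ctx)
{
	struct blk_mq_tag_set *set = q->tag_set;
	int ret;

	/* "none" needs no scheduler tags; any real scheduler allocates up front. */
	if (strncmp(ctx->name, "none", 4)) {
		ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues);
		if (!ctx->et)
			return -ENOMEM;
	}

	ret = elevator_switch(q, ctx);	/* on success the new elevator owns ctx->et */

	/* If no new elevator was installed, the tags were never attached; free them. */
	if (ctx->et && !ctx->new)
		blk_mq_free_sched_tags(ctx->et, set);
	return ret;
}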
diff --git a/block/elevator.h b/block/elevator.h
index a07ce773a38f..adc5c157e17e 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -23,8 +23,17 @@ enum elv_merge {
struct blk_mq_alloc_data;
struct blk_mq_hw_ctx;
+struct elevator_tags {
+ /* num. of hardware queues for which tags are allocated */
+ unsigned int nr_hw_queues;
+ /* depth used while allocating tags */
+ unsigned int nr_requests;
+ /* shared tag is stored at index 0 */
+ struct blk_mq_tags *tags[];
+};
+
struct elevator_mq_ops {
- int (*init_sched)(struct request_queue *, struct elevator_type *);
+ int (*init_sched)(struct request_queue *, struct elevator_queue *);
void (*exit_sched)(struct elevator_queue *);
int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int);
void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);
@@ -113,6 +122,7 @@ struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
struct elevator_queue
{
struct elevator_type *type;
+ struct elevator_tags *et;
void *elevator_data;
struct kobject kobj;
struct mutex sysfs_lock;
@@ -152,8 +162,8 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *page);
ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count);
extern bool elv_bio_merge_ok(struct request *, struct bio *);
-extern struct elevator_queue *elevator_alloc(struct request_queue *,
- struct elevator_type *);
+struct elevator_queue *elevator_alloc(struct request_queue *,
+ struct elevator_type *, struct elevator_tags *);
/*
* Helper functions.
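Since tags[] is a flexible array with one blk_mq_tags pointer per hardware queue, an allocator would size the structure with struct_size(). A hypothetical sketch (the real blk_mq_alloc_sched_tags() also allocates the per-queue tag maps, which is omitted here):

static struct elevator_tags *elevator_tags_alloc_sketch(unsigned int nr_hw_queues,
							unsigned int nr_requests)
{
	struct elevator_tags *et;

	/* Header plus nr_hw_queues tag pointers, zeroed. */
	et = kzalloc(struct_size(et, tags, nr_hw_queues), GFP_KERNEL);
	if (!et)
		return NULL;

	et->nr_hw_queues = nr_hw_queues;
	et->nr_requests = nr_requests;
	return et;
}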
diff --git a/block/fops.c b/block/fops.c
index 1309861d4c2c..82451ac8ff25 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -496,18 +496,21 @@ static void blkdev_readahead(struct readahead_control *rac)
mpage_readahead(rac, blkdev_get_block);
}
-static int blkdev_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
+static int blkdev_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
+ unsigned len, struct folio **foliop,
+ void **fsdata)
{
return block_write_begin(mapping, pos, len, foliop, blkdev_get_block);
}
-static int blkdev_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied, struct folio *folio,
- void *fsdata)
+static int blkdev_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
int ret;
- ret = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ ret = block_write_end(pos, len, copied, folio);
folio_unlock(folio);
folio_put(folio);
@@ -537,30 +540,42 @@ static void blkdev_readahead(struct readahead_control *rac)
iomap_readahead(rac, &blkdev_iomap_ops);
}
-static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc,
- struct inode *inode, loff_t offset, unsigned int len)
+static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 offset, unsigned int len, u64 end_pos)
{
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read(wpc->inode);
if (WARN_ON_ONCE(offset >= isize))
return -EIO;
- if (offset >= wpc->iomap.offset &&
- offset < wpc->iomap.offset + wpc->iomap.length)
- return 0;
- return blkdev_iomap_begin(inode, offset, isize - offset,
- IOMAP_WRITE, &wpc->iomap, NULL);
+
+ if (offset < wpc->iomap.offset ||
+ offset >= wpc->iomap.offset + wpc->iomap.length) {
+ int error;
+
+ error = blkdev_iomap_begin(wpc->inode, offset, isize - offset,
+ IOMAP_WRITE, &wpc->iomap, NULL);
+ if (error)
+ return error;
+ }
+
+ return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
}
static const struct iomap_writeback_ops blkdev_writeback_ops = {
- .map_blocks = blkdev_map_blocks,
+ .writeback_range = blkdev_writeback_range,
+ .writeback_submit = iomap_ioend_writeback_submit,
};
static int blkdev_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct iomap_writepage_ctx wpc = { };
+ struct iomap_writepage_ctx wpc = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .ops = &blkdev_writeback_ops
+ };
- return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops);
+ return iomap_writepages(&wpc);
}
const struct address_space_operations def_blk_aops = {
@@ -711,7 +726,8 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
{
- return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL);
+ return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL,
+ NULL);
}
/*
@@ -841,7 +857,7 @@ reexpand:
#define BLKDEV_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
- FALLOC_FL_ZERO_RANGE)
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_WRITE_ZEROES)
static long blkdev_fallocate(struct file *file, int mode, loff_t start,
loff_t len)
@@ -850,11 +866,19 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
struct block_device *bdev = I_BDEV(inode);
loff_t end = start + len - 1;
loff_t isize;
+ unsigned int flags;
int error;
/* Fail if we don't recognize the flags. */
if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
return -EOPNOTSUPP;
+ /*
+ * Don't allow writing zeroes if the device does not enable the
+ * unmap write zeroes operation.
+ */
+ if ((mode & FALLOC_FL_WRITE_ZEROES) &&
+ !bdev_write_zeroes_unmap_sectors(bdev))
+ return -EOPNOTSUPP;
/* Don't go off the end of the device. */
isize = bdev_nr_bytes(bdev);
@@ -877,48 +901,46 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
inode_lock(inode);
filemap_invalidate_lock(inode->i_mapping);
- /*
- * Invalidate the page cache, including dirty pages, for valid
- * de-allocate mode calls to fallocate().
- */
switch (mode) {
case FALLOC_FL_ZERO_RANGE:
case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
- error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
- if (error)
- goto fail;
-
- error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
- len >> SECTOR_SHIFT, GFP_KERNEL,
- BLKDEV_ZERO_NOUNMAP);
+ flags = BLKDEV_ZERO_NOUNMAP;
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
- error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
- if (error)
- goto fail;
-
- error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
- len >> SECTOR_SHIFT, GFP_KERNEL,
- BLKDEV_ZERO_NOFALLBACK);
+ flags = BLKDEV_ZERO_NOFALLBACK;
+ break;
+ case FALLOC_FL_WRITE_ZEROES:
+ flags = 0;
break;
default:
error = -EOPNOTSUPP;
+ goto fail;
}
+ /*
+ * Invalidate the page cache, including dirty pages, for valid
+ * de-allocate mode calls to fallocate().
+ */
+ error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
+ if (error)
+ goto fail;
+
+ error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
+ len >> SECTOR_SHIFT, GFP_KERNEL, flags);
fail:
filemap_invalidate_unlock(inode->i_mapping);
inode_unlock(inode);
return error;
}
-static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
+static int blkdev_mmap_prepare(struct vm_area_desc *desc)
{
- struct inode *bd_inode = bdev_file_inode(file);
+ struct file *file = desc->file;
- if (bdev_read_only(I_BDEV(bd_inode)))
- return generic_file_readonly_mmap(file, vma);
+ if (bdev_read_only(I_BDEV(bdev_file_inode(file))))
+ return generic_file_readonly_mmap_prepare(desc);
- return generic_file_mmap(file, vma);
+ return generic_file_mmap_prepare(desc);
}
const struct file_operations def_blk_fops = {
@@ -928,7 +950,7 @@ const struct file_operations def_blk_fops = {
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
.iopoll = iocb_bio_iopoll,
- .mmap = blkdev_mmap,
+ .mmap_prepare = blkdev_mmap_prepare,
.fsync = blkdev_fsync,
.unlocked_ioctl = blkdev_ioctl,
#ifdef CONFIG_COMPAT
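For context, the FALLOC_FL_WRITE_ZEROES branch means an ordinary fallocate() call can now request an explicit write-zeroes operation on a block device, and the kernel returns -EOPNOTSUPP when the device does not support unmap write zeroes. A minimal user-space sketch, assuming a libc/uapi that already defines FALLOC_FL_WRITE_ZEROES:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>	/* FALLOC_FL_WRITE_ZEROES (new flag, assumed present) */

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <blockdev>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Zero the first 1 MiB by issuing write zeroes to the device. */
	if (fallocate(fd, FALLOC_FL_WRITE_ZEROES, 0, 1 << 20))
		perror("fallocate(FALLOC_FL_WRITE_ZEROES)");
	close(fd);
	return 0;
}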
diff --git a/block/ioctl.c b/block/ioctl.c
index e472cc1030c6..f7b0006ca45d 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -13,6 +13,7 @@
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/io_uring/cmd.h>
+#include <linux/blk-integrity.h>
#include <uapi/linux/blkdev.h>
#include "blk.h"
#include "blk-crypto-internal.h"
@@ -644,7 +645,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode,
case IOC_PR_CLEAR:
return blkdev_pr_clear(bdev, mode, argp);
default:
- return -ENOIOCTLCMD;
+ return blk_get_meta_cap(bdev, cmd, argp);
}
}
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 4dba8405bd01..70cbc7b2deb4 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -157,10 +157,7 @@ struct kyber_queue_data {
*/
struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
- /*
- * Async request percentage, converted to per-word depth for
- * sbitmap_get_shallow().
- */
+ /* Number of allowed async requests. */
unsigned int async_depth;
struct kyber_cpu_latency __percpu *cpu_latency;
@@ -402,20 +399,13 @@ err:
return ERR_PTR(ret);
}
-static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
+static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq)
{
struct kyber_queue_data *kqd;
- struct elevator_queue *eq;
-
- eq = elevator_alloc(q, e);
- if (!eq)
- return -ENOMEM;
kqd = kyber_queue_data_alloc(q);
- if (IS_ERR(kqd)) {
- kobject_put(&eq->kobj);
+ if (IS_ERR(kqd))
return PTR_ERR(kqd);
- }
blk_stat_enable_accounting(q);
@@ -454,10 +444,8 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
{
struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
struct blk_mq_tags *tags = hctx->sched_tags;
- unsigned int shift = tags->bitmap_tags.sb.shift;
-
- kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
+ kqd->async_depth = hctx->queue->nr_requests * KYBER_ASYNC_PERCENT / 100U;
sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
}
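Worked example (assuming KYBER_ASYNC_PERCENT is 75, as defined earlier in kyber-iosched.c): async_depth is now an absolute number of requests rather than a per-sbitmap-word depth, so for a queue with 256 scheduler requests:

/* Illustration only, not kernel code. */
static unsigned int kyber_async_depth_example(unsigned int nr_requests)
{
	/* 256 * 75 / 100 == 192 requests may be consumed by async I/O. */
	return nr_requests * 75 / 100;
}

The same absolute value is then passed to sbitmap_queue_min_shallow_depth() in the hunk above.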
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 2edf1cac06d5..b9b7cdf1d3c9 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -488,20 +488,6 @@ unlock:
}
/*
- * 'depth' is a number in the range 1..INT_MAX representing a number of
- * requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since
- * 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow().
- * Values larger than q->nr_requests have the same effect as q->nr_requests.
- */
-static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth)
-{
- struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags;
- const unsigned int nrr = hctx->queue->nr_requests;
-
- return ((qdepth << bt->sb.shift) + nrr - 1) / nrr;
-}
-
-/*
* Called by __blk_mq_alloc_request(). The shallow_depth value set by this
* function is used by __blk_mq_get_tag().
*/
@@ -517,7 +503,7 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
* Throttle asynchronous requests and writes such that these requests
* do not block the allocation of synchronous requests.
*/
- data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth);
+ data->shallow_depth = dd->async_depth;
}
/* Called by blk_mq_update_nr_requests(). */
@@ -568,20 +554,14 @@ static void dd_exit_sched(struct elevator_queue *e)
/*
* initialize elevator private data (deadline_data).
*/
-static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
+static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq)
{
struct deadline_data *dd;
- struct elevator_queue *eq;
enum dd_prio prio;
- int ret = -ENOMEM;
-
- eq = elevator_alloc(q, e);
- if (!eq)
- return ret;
dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
if (!dd)
- goto put_eq;
+ return -ENOMEM;
eq->elevator_data = dd;
@@ -608,10 +588,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
q->elevator = eq;
return 0;
-
-put_eq:
- kobject_put(&eq->kobj);
- return ret;
}
/*
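To see what the removed conversion used to do (numbers purely illustrative, assuming 64-bit sbitmap words, i.e. sb.shift == 6): dd_to_word_depth() rescaled an absolute request count into sbitmap's per-word range, whereas dd_limit_depth() now stores dd->async_depth in shallow_depth directly.

/* The removed scaling, reproduced only to illustrate the arithmetic. */
static unsigned int dd_to_word_depth_removed(unsigned int qdepth,
					     unsigned int shift,
					     unsigned int nr_requests)
{
	/* e.g. qdepth == 192, shift == 6, nr_requests == 256 -> 48 */
	return ((qdepth << shift) + nr_requests - 1) / nr_requests;
}

With the new code the same configuration simply yields shallow_depth == 192.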
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 851db518ee5e..0c4ed9702146 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -56,7 +56,7 @@ static void t10_pi_generate(struct blk_integrity_iter *iter,
pi->ref_tag = 0;
iter->data_buf += iter->interval;
- iter->prot_buf += bi->tuple_size;
+ iter->prot_buf += bi->metadata_size;
iter->seed++;
}
}
@@ -105,7 +105,7 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
next:
iter->data_buf += iter->interval;
- iter->prot_buf += bi->tuple_size;
+ iter->prot_buf += bi->metadata_size;
iter->seed++;
}
@@ -125,7 +125,7 @@ next:
static void t10_pi_type1_prepare(struct request *rq)
{
struct blk_integrity *bi = &rq->q->limits.integrity;
- const int tuple_sz = bi->tuple_size;
+ const int tuple_sz = bi->metadata_size;
u32 ref_tag = t10_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio;
@@ -177,7 +177,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
{
struct blk_integrity *bi = &rq->q->limits.integrity;
unsigned intervals = nr_bytes >> bi->interval_exp;
- const int tuple_sz = bi->tuple_size;
+ const int tuple_sz = bi->metadata_size;
u32 ref_tag = t10_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio;
@@ -234,7 +234,7 @@ static void ext_pi_crc64_generate(struct blk_integrity_iter *iter,
put_unaligned_be48(0ULL, pi->ref_tag);
iter->data_buf += iter->interval;
- iter->prot_buf += bi->tuple_size;
+ iter->prot_buf += bi->metadata_size;
iter->seed++;
}
}
@@ -289,7 +289,7 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
next:
iter->data_buf += iter->interval;
- iter->prot_buf += bi->tuple_size;
+ iter->prot_buf += bi->metadata_size;
iter->seed++;
}
@@ -299,7 +299,7 @@ next:
static void ext_pi_type1_prepare(struct request *rq)
{
struct blk_integrity *bi = &rq->q->limits.integrity;
- const int tuple_sz = bi->tuple_size;
+ const int tuple_sz = bi->metadata_size;
u64 ref_tag = ext_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio;
@@ -340,7 +340,7 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
{
struct blk_integrity *bi = &rq->q->limits.integrity;
unsigned intervals = nr_bytes >> bi->interval_exp;
- const int tuple_sz = bi->tuple_size;
+ const int tuple_sz = bi->metadata_size;
u64 ref_tag = ext_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio;
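The tuple_size -> metadata_size rename is mechanical, but the loop it feeds is worth keeping in mind: each data interval advances the protection buffer by the full per-interval metadata size, which can exceed the PI tuple itself when extra opaque metadata sits alongside it. A stripped-down sketch of that walk (illustration only; guard/app/ref-tag handling omitted):

/* Illustration of the per-interval walk used by the generate/verify helpers. */
static void pi_walk_sketch(struct blk_integrity_iter *iter,
			   struct blk_integrity *bi)
{
	unsigned int i;

	for (i = 0; i < iter->data_size; i += iter->interval) {
		/* The PI tuple sits at pi_offset inside each metadata chunk. */
		void *pi = iter->prot_buf + bi->pi_offset;

		(void)pi;	/* fill or check guard/app/ref tags here */

		iter->data_buf += iter->interval;
		iter->prot_buf += bi->metadata_size;
		iter->seed++;
	}
}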