Diffstat (limited to 'block')
-rw-r--r--   block/bio-integrity.c      3
-rw-r--r--   block/bio.c               20
-rw-r--r--   block/blk-mq-cpumap.c     46
-rw-r--r--   block/blk-mq-debugfs.c    12
-rw-r--r--   block/blk-mq-dma.c       161
-rw-r--r--   block/blk-mq.c            96
-rw-r--r--   block/blk-settings.c      62
-rw-r--r--   block/blk-zoned.c         43
-rw-r--r--   block/blk.h               42
-rw-r--r--   block/elevator.c          10
10 files changed, 400 insertions, 95 deletions
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 10912988c8f5..6b077ca937f6 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -128,6 +128,9 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
if (bip->bip_vcnt > 0) {
struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1];
+ if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
+ return 0;
+
if (bvec_try_merge_hw_page(q, bv, page, len, offset)) {
bip->bip_iter.bi_size += len;
return len;
diff --git a/block/bio.c b/block/bio.c
index 3c0a558c90f5..92c512e876c8 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -930,8 +930,6 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
return false;
if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
return false;
- if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
- return false;
if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) {
if (IS_ENABLED(CONFIG_KMSAN))
@@ -982,6 +980,9 @@ void __bio_add_page(struct bio *bio, struct page *page,
WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
WARN_ON_ONCE(bio_full(bio, len));
+ if (is_pci_p2pdma_page(page))
+ bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE;
+
bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off);
bio->bi_iter.bi_size += len;
bio->bi_vcnt++;
@@ -1022,11 +1023,16 @@ int bio_add_page(struct bio *bio, struct page *page,
if (bio->bi_iter.bi_size > UINT_MAX - len)
return 0;
- if (bio->bi_vcnt > 0 &&
- bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
- page, len, offset)) {
- bio->bi_iter.bi_size += len;
- return len;
+ if (bio->bi_vcnt > 0) {
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+ if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
+ return 0;
+
+ if (bvec_try_merge_page(bv, page, len, offset)) {
+ bio->bi_iter.bi_size += len;
+ return len;
+ }
}
if (bio->bi_vcnt >= bio->bi_max_vecs)
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 444798c5374f..705da074ad6c 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -12,16 +12,56 @@
#include <linux/cpu.h>
#include <linux/group_cpus.h>
#include <linux/device/bus.h>
+#include <linux/sched/isolation.h>
#include "blk.h"
#include "blk-mq.h"
+static unsigned int blk_mq_num_queues(const struct cpumask *mask,
+ unsigned int max_queues)
+{
+ unsigned int num;
+
+ num = cpumask_weight(mask);
+ return min_not_zero(num, max_queues);
+}
+
+/**
+ * blk_mq_num_possible_queues - Calc nr of queues for multiqueue devices
+ * @max_queues: The maximum number of queues the hardware/driver
+ * supports. If max_queues is 0, the argument is
+ * ignored.
+ *
+ * Calculates the number of queues to be used for a multiqueue
+ * device based on the number of possible CPUs.
+ */
+unsigned int blk_mq_num_possible_queues(unsigned int max_queues)
+{
+ return blk_mq_num_queues(cpu_possible_mask, max_queues);
+}
+EXPORT_SYMBOL_GPL(blk_mq_num_possible_queues);
+
+/**
+ * blk_mq_num_online_queues - Calc nr of queues for multiqueue devices
+ * @max_queues: The maximum number of queues the hardware/driver
+ * supports. If max_queues is 0, the argument is
+ * ignored.
+ *
+ * Calculates the number of queues to be used for a multiqueue
+ * device based on the number of online CPUs.
+ */
+unsigned int blk_mq_num_online_queues(unsigned int max_queues)
+{
+ return blk_mq_num_queues(cpu_online_mask, max_queues);
+}
+EXPORT_SYMBOL_GPL(blk_mq_num_online_queues);
+
void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
{
const struct cpumask *masks;
- unsigned int queue, cpu;
+ unsigned int queue, cpu, nr_masks;
- masks = group_cpus_evenly(qmap->nr_queues);
+ masks = group_cpus_evenly(qmap->nr_queues, &nr_masks);
if (!masks) {
for_each_possible_cpu(cpu)
qmap->mq_map[cpu] = qmap->queue_offset;
@@ -29,7 +69,7 @@ void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
}
for (queue = 0; queue < qmap->nr_queues; queue++) {
- for_each_cpu(cpu, &masks[queue])
+ for_each_cpu(cpu, &masks[queue % nr_masks])
qmap->mq_map[cpu] = qmap->queue_offset + queue;
}
kfree(masks);
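
A hedged sketch of how a driver might use the new blk_mq_num_possible_queues() helper when sizing its tag set; the mydrv_* names and the 64-queue hardware limit are illustrative assumptions, not part of this series:

#include <linux/blk-mq.h>

/* Assumed hardware limit for this example; a real driver reads it from HW. */
#define MYDRV_MAX_HW_QUEUES	64

static int mydrv_init_tag_set(struct blk_mq_tag_set *set,
			      const struct blk_mq_ops *ops)
{
	/* Never allocate more queues than there are possible CPUs. */
	set->nr_hw_queues = blk_mq_num_possible_queues(MYDRV_MAX_HW_QUEUES);
	set->nr_maps = 1;
	set->ops = ops;
	set->queue_depth = 128;
	set->numa_node = NUMA_NO_NODE;
	return blk_mq_alloc_tag_set(set);
}
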
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 29b3540dd180..7ed3e71f2fc0 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -521,7 +521,7 @@ CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
static int blk_mq_debugfs_show(struct seq_file *m, void *v)
{
const struct blk_mq_debugfs_attr *attr = m->private;
- void *data = d_inode(m->file->f_path.dentry->d_parent)->i_private;
+ void *data = debugfs_get_aux(m->file);
return attr->show(data, m);
}
@@ -531,7 +531,7 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf,
{
struct seq_file *m = file->private_data;
const struct blk_mq_debugfs_attr *attr = m->private;
- void *data = d_inode(file->f_path.dentry->d_parent)->i_private;
+ void *data = debugfs_get_aux(file);
/*
* Attributes that only implement .seq_ops are read-only and 'attr' is
@@ -546,7 +546,7 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf,
static int blk_mq_debugfs_open(struct inode *inode, struct file *file)
{
const struct blk_mq_debugfs_attr *attr = inode->i_private;
- void *data = d_inode(file->f_path.dentry->d_parent)->i_private;
+ void *data = debugfs_get_aux(file);
struct seq_file *m;
int ret;
@@ -612,11 +612,9 @@ static void debugfs_create_files(struct dentry *parent, void *data,
if (IS_ERR_OR_NULL(parent))
return;
- d_inode(parent)->i_private = data;
-
for (; attr->name; attr++)
- debugfs_create_file(attr->name, attr->mode, parent,
- (void *)attr, &blk_mq_debugfs_fops);
+ debugfs_create_file_aux(attr->name, attr->mode, parent,
+ (void *)attr, data, &blk_mq_debugfs_fops);
}
void blk_mq_debugfs_register(struct request_queue *q)
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 82bae475dfa4..ad283017caef 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2025 Christoph Hellwig
*/
+#include <linux/blk-mq-dma.h>
#include "blk.h"
struct phys_vec {
@@ -61,6 +62,166 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
return true;
}
+/*
+ * The IOVA-based DMA API wants to be able to coalesce at the minimal IOMMU page
+ * size granularity (which is guaranteed to be <= PAGE_SIZE and usually 4k), so
+ * we need to ensure our segments are aligned to this as well.
+ *
+ * Note that there is no point in using the slightly more complicated IOVA based
+ * path for single segment mappings.
+ */
+static inline bool blk_can_dma_map_iova(struct request *req,
+ struct device *dma_dev)
+{
+ return !((queue_virt_boundary(req->q) + 1) &
+ dma_get_merge_boundary(dma_dev));
+}
+
+static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
+{
+ iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr);
+ iter->len = vec->len;
+ return true;
+}
+
+static bool blk_dma_map_direct(struct request *req, struct device *dma_dev,
+ struct blk_dma_iter *iter, struct phys_vec *vec)
+{
+ iter->addr = dma_map_page(dma_dev, phys_to_page(vec->paddr),
+ offset_in_page(vec->paddr), vec->len, rq_dma_dir(req));
+ if (dma_mapping_error(dma_dev, iter->addr)) {
+ iter->status = BLK_STS_RESOURCE;
+ return false;
+ }
+ iter->len = vec->len;
+ return true;
+}
+
+static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
+ struct dma_iova_state *state, struct blk_dma_iter *iter,
+ struct phys_vec *vec)
+{
+ enum dma_data_direction dir = rq_dma_dir(req);
+ unsigned int mapped = 0;
+ int error;
+
+ iter->addr = state->addr;
+ iter->len = dma_iova_size(state);
+
+ do {
+ error = dma_iova_link(dma_dev, state, vec->paddr, mapped,
+ vec->len, dir, 0);
+ if (error)
+ break;
+ mapped += vec->len;
+ } while (blk_map_iter_next(req, &iter->iter, vec));
+
+ error = dma_iova_sync(dma_dev, state, 0, mapped);
+ if (error) {
+ iter->status = errno_to_blk_status(error);
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * blk_rq_dma_map_iter_start - map the first DMA segment for a request
+ * @req: request to map
+ * @dma_dev: device to map to
+ * @state: DMA IOVA state
+ * @iter: block layer DMA iterator
+ *
+ * Start DMA mapping @req to @dma_dev. @state and @iter are provided by the
+ * caller and don't need to be initialized. @state needs to be stored for use
+ * at unmap time, @iter is only needed at map time.
+ *
+ * Returns %false if there is no segment to map, including due to an error, or
+ * %true if it did map a segment.
+ *
+ * If a segment was mapped, the DMA address for it is returned in @iter.addr and
+ * the length in @iter.len. If no segment was mapped the status code is
+ * returned in @iter.status.
+ *
+ * The caller can call blk_rq_dma_map_coalesce() to check if further segments
+ * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
+ * to try to map the following segments.
+ */
+bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
+ struct dma_iova_state *state, struct blk_dma_iter *iter)
+{
+ unsigned int total_len = blk_rq_payload_bytes(req);
+ struct phys_vec vec;
+
+ iter->iter.bio = req->bio;
+ iter->iter.iter = req->bio->bi_iter;
+ memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
+ iter->status = BLK_STS_OK;
+
+ /*
+ * Grab the first segment ASAP because we'll need it to check for P2P
+ * transfers.
+ */
+ if (!blk_map_iter_next(req, &iter->iter, &vec))
+ return false;
+
+ if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) {
+ switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
+ phys_to_page(vec.paddr))) {
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ return blk_dma_map_bus(iter, &vec);
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ /*
+ * P2P transfers through the host bridge are treated the
+ * same as non-P2P transfers below and during unmap.
+ */
+ req->cmd_flags &= ~REQ_P2PDMA;
+ break;
+ default:
+ iter->status = BLK_STS_INVAL;
+ return false;
+ }
+ }
+
+ if (blk_can_dma_map_iova(req, dma_dev) &&
+ dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len))
+ return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec);
+ return blk_dma_map_direct(req, dma_dev, iter, &vec);
+}
+EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);
+
+/**
+ * blk_rq_dma_map_iter_next - map the next DMA segment for a request
+ * @req: request to map
+ * @dma_dev: device to map to
+ * @state: DMA IOVA state
+ * @iter: block layer DMA iterator
+ *
+ * Iterate to the next mapping after a previous call to
+ * blk_rq_dma_map_iter_start(). See there for a detailed description of the
+ * arguments.
+ *
+ * Returns %false if there is no segment to map, including due to an error, or
+ * %true if it did map a segment.
+ *
+ * If a segment was mapped, the DMA address for it is returned in @iter.addr and
+ * the length in @iter.len. If no segment was mapped the status code is
+ * returned in @iter.status.
+ */
+bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
+ struct dma_iova_state *state, struct blk_dma_iter *iter)
+{
+ struct phys_vec vec;
+
+ if (!blk_map_iter_next(req, &iter->iter, &vec))
+ return false;
+
+ if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
+ return blk_dma_map_bus(iter, &vec);
+ return blk_dma_map_direct(req, dma_dev, iter, &vec);
+}
+EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next);
+
static inline struct scatterlist *
blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
{
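
The kerneldoc above implies a simple driver-side mapping loop; a minimal sketch under the assumption of a hypothetical mydrv_add_sge() helper that records one DMA address/length pair in a device descriptor (only the blk_rq_dma_map_iter_* calls come from this diff):

#include <linux/blk-mq-dma.h>

/* Hypothetical helper: stores one DMA segment in the device descriptor. */
static void mydrv_add_sge(struct request *req, dma_addr_t addr, u32 len);

static blk_status_t mydrv_map_rq(struct device *dma_dev, struct request *req,
				 struct dma_iova_state *state)
{
	struct blk_dma_iter iter;

	if (!blk_rq_dma_map_iter_start(req, dma_dev, state, &iter))
		return iter.status;	/* no payload, or mapping error */

	do {
		mydrv_add_sge(req, iter.addr, iter.len);
	} while (blk_rq_dma_map_iter_next(req, dma_dev, state, &iter));

	return iter.status;		/* BLK_STS_OK unless iteration failed */
}
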
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4806b867e37d..9692fa4c3ef2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -883,7 +883,8 @@ static void blk_complete_request(struct request *req)
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- blk_zone_update_request_bio(req, bio);
+ if (blk_req_bio_is_zone_append(req, bio))
+ blk_zone_append_update_request_bio(req, bio);
if (!is_flush)
bio_endio(bio);
@@ -982,7 +983,8 @@ bool blk_update_request(struct request *req, blk_status_t error,
/* Don't actually finish bio if it's part of flush sequence */
if (!bio->bi_iter.bi_size) {
- blk_zone_update_request_bio(req, bio);
+ if (blk_req_bio_is_zone_append(req, bio))
+ blk_zone_append_update_request_bio(req, bio);
if (!is_flush)
bio_endio(bio);
}
@@ -3169,8 +3171,10 @@ void blk_mq_submit_bio(struct bio *bio)
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
goto queue_exit;
- if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
- goto queue_exit;
+ if (bio_needs_zone_write_plugging(bio)) {
+ if (blk_zone_plug_bio(bio, nr_segs))
+ goto queue_exit;
+ }
new_request:
if (rq) {
@@ -4966,6 +4970,60 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
return ret;
}
+/*
+ * Switch back to the elevator type stored in the xarray.
+ */
+static void blk_mq_elv_switch_back(struct request_queue *q,
+ struct xarray *elv_tbl)
+{
+ struct elevator_type *e = xa_load(elv_tbl, q->id);
+
+ /* elv_update_nr_hw_queues() unfreezes the queue. */
+ elv_update_nr_hw_queues(q, e);
+
+ /* Drop the reference acquired in blk_mq_elv_switch_none. */
+ if (e)
+ elevator_put(e);
+}
+
+/*
+ * Store the elevator type in the xarray and set the current elevator to
+ * none, using q->id as the index under which the type is stored.
+ */
+static int blk_mq_elv_switch_none(struct request_queue *q,
+ struct xarray *elv_tbl)
+{
+ int ret = 0;
+
+ lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock);
+
+ /*
+ * Accessing q->elevator without holding q->elevator_lock is safe here
+ * because we're called from nr_hw_queue update which is protected by
+ * set->update_nr_hwq_lock in the writer context. So, scheduler update/
+ * switch code (which acquires the same lock in the reader context)
+ * can't run concurrently.
+ */
+ if (q->elevator) {
+
+ ret = xa_insert(elv_tbl, q->id, q->elevator->type, GFP_KERNEL);
+ if (WARN_ON_ONCE(ret))
+ return ret;
+
+ /*
+ * Before we switch elevator to 'none', take a reference to
+ * the elevator module so that while nr_hw_queue update is
+ * running, no one can remove elevator module. We'd put the
+ * reference to elevator module later when we switch back
+ * elevator.
+ */
+ __elevator_get(q->elevator->type);
+
+ elevator_set_none(q);
+ }
+ return ret;
+}
+
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int nr_hw_queues)
{
@@ -4973,6 +5031,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int prev_nr_hw_queues = set->nr_hw_queues;
unsigned int memflags;
int i;
+ struct xarray elv_tbl;
lockdep_assert_held(&set->tag_list_lock);
@@ -4984,6 +5043,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
return;
memflags = memalloc_noio_save();
+
+ xa_init(&elv_tbl);
+
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_debugfs_unregister_hctxs(q);
blk_mq_sysfs_unregister_hctxs(q);
@@ -4992,11 +5054,17 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_freeze_queue_nomemsave(q);
- if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) {
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_unfreeze_queue_nomemrestore(q);
- goto reregister;
- }
+ /*
+ * Switch IO scheduler to 'none', cleaning up the data associated
+ * with the previous scheduler. We will switch back once we are done
+ * updating the new sw to hw queue mappings.
+ */
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ if (blk_mq_elv_switch_none(q, &elv_tbl))
+ goto switch_back;
+
+ if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
+ goto switch_back;
fallback:
blk_mq_update_queue_map(set);
@@ -5016,12 +5084,11 @@ fallback:
}
blk_mq_map_swqueue(q);
}
-
- /* elv_update_nr_hw_queues() unfreeze queue for us */
+switch_back:
+ /* blk_mq_elv_switch_back() unfreezes the queue for us. */
list_for_each_entry(q, &set->tag_list, tag_set_list)
- elv_update_nr_hw_queues(q);
+ blk_mq_elv_switch_back(q, &elv_tbl);
-reregister:
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_sysfs_register_hctxs(q);
blk_mq_debugfs_register_hctxs(q);
@@ -5029,6 +5096,9 @@ reregister:
blk_mq_remove_hw_queues_cpuhp(q);
blk_mq_add_hw_queues_cpuhp(q);
}
+
+ xa_destroy(&elv_tbl);
+
memalloc_noio_restore(memflags);
/* Free the excess tags when nr_hw_queues shrink. */
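
The elevator save/restore around the nr_hw_queues update is just the standard xarray idiom keyed by an integer id; a minimal self-contained sketch of the same pattern, with names that are illustrative and not from this diff:

#include <linux/xarray.h>

struct tracked { unsigned long id; void *state; };

static int stash_state(struct xarray *tbl, struct tracked *t)
{
	/* xa_insert() fails with -EBUSY if the index is already occupied. */
	return xa_insert(tbl, t->id, t->state, GFP_KERNEL);
}

static void *fetch_state(struct xarray *tbl, struct tracked *t)
{
	return xa_load(tbl, t->id);	/* NULL if nothing was stashed */
}

/*
 * Usage mirrors __blk_mq_update_nr_hw_queues(): xa_init(&tbl), stash_state()
 * before the update, fetch_state() after it, xa_destroy(&tbl) once every
 * entry has been consumed.
 */
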
diff --git a/block/blk-settings.c b/block/blk-settings.c
index e0cc7a1fe4ac..91449147bae9 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -221,6 +221,8 @@ static void blk_atomic_writes_update_limits(struct queue_limits *lim)
static void blk_validate_atomic_write_limits(struct queue_limits *lim)
{
unsigned int boundary_sectors;
+ unsigned int atomic_write_hw_max_sectors =
+ lim->atomic_write_hw_max >> SECTOR_SHIFT;
if (!(lim->features & BLK_FEAT_ATOMIC_WRITES))
goto unsupported;
@@ -242,6 +244,10 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim)
lim->atomic_write_hw_max))
goto unsupported;
+ if (WARN_ON_ONCE(lim->chunk_sectors &&
+ atomic_write_hw_max_sectors > lim->chunk_sectors))
+ goto unsupported;
+
boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT;
if (boundary_sectors) {
@@ -636,41 +642,50 @@ static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t,
return true;
}
-
-/* Check stacking of first bottom device */
-static bool blk_stack_atomic_writes_head(struct queue_limits *t,
- struct queue_limits *b)
+static void blk_stack_atomic_writes_chunk_sectors(struct queue_limits *t)
{
- if (b->atomic_write_hw_boundary &&
- !blk_stack_atomic_writes_boundary_head(t, b))
- return false;
+ unsigned int chunk_bytes;
- if (t->io_min <= SECTOR_SIZE) {
- /* No chunk sectors, so use bottom device values directly */
- t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
- t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min;
- t->atomic_write_hw_max = b->atomic_write_hw_max;
- return true;
- }
+ if (!t->chunk_sectors)
+ return;
+
+ /*
+ * If chunk sectors is so large that its value in bytes overflows
+ * UINT_MAX, then just shift it down so it definitely will fit.
+ * We don't support atomic writes of such a large size anyway.
+ */
+ if (check_shl_overflow(t->chunk_sectors, SECTOR_SHIFT, &chunk_bytes))
+ chunk_bytes = t->chunk_sectors;
/*
* Find values for limits which work for chunk size.
* b->atomic_write_hw_unit_{min, max} may not be aligned with chunk
- * size (t->io_min), as chunk size is not restricted to a power-of-2.
+ * size, as the chunk size is not restricted to a power-of-2.
* So we need to find highest power-of-2 which works for the chunk
* size.
- * As an example scenario, we could have b->unit_max = 16K and
- * t->io_min = 24K. For this case, reduce t->unit_max to a value
- * aligned with both limits, i.e. 8K in this example.
+ * As an example scenario, we could have t->unit_max = 16K and
+ * t->chunk_sectors = 24K. For this case, reduce t->unit_max to a
+ * value aligned with both limits, i.e. 8K in this example.
*/
- t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
- while (t->io_min % t->atomic_write_hw_unit_max)
- t->atomic_write_hw_unit_max /= 2;
+ t->atomic_write_hw_unit_max = min(t->atomic_write_hw_unit_max,
+ max_pow_of_two_factor(chunk_bytes));
- t->atomic_write_hw_unit_min = min(b->atomic_write_hw_unit_min,
+ t->atomic_write_hw_unit_min = min(t->atomic_write_hw_unit_min,
t->atomic_write_hw_unit_max);
- t->atomic_write_hw_max = min(b->atomic_write_hw_max, t->io_min);
+ t->atomic_write_hw_max = min(t->atomic_write_hw_max, chunk_bytes);
+}
+/* Check stacking of first bottom device */
+static bool blk_stack_atomic_writes_head(struct queue_limits *t,
+ struct queue_limits *b)
+{
+ if (b->atomic_write_hw_boundary &&
+ !blk_stack_atomic_writes_boundary_head(t, b))
+ return false;
+
+ t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
+ t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min;
+ t->atomic_write_hw_max = b->atomic_write_hw_max;
return true;
}
@@ -698,6 +713,7 @@ static void blk_stack_atomic_writes_limits(struct queue_limits *t,
if (!blk_stack_atomic_writes_head(t, b))
goto unsupported;
+ blk_stack_atomic_writes_chunk_sectors(t);
return;
unsupported:
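
A worked example of the chunk alignment done by blk_stack_atomic_writes_chunk_sectors(); the local helper below only mirrors what max_pow_of_two_factor() is used for here (the largest power of two dividing n, i.e. n & -n) and is not part of the diff:

static unsigned int largest_pow2_factor(unsigned int n)
{
	return n & -n;	/* e.g. 24K (0x6000) -> 8K (0x2000) */
}

/*
 * With t->atomic_write_hw_unit_max = 16K and chunk_bytes = 24K:
 *	min(16K, largest_pow2_factor(24K)) = min(16K, 8K) = 8K
 * Every power-of-two atomic write unit up to 8K then divides the 24K chunk,
 * so an atomic write can never straddle a chunk boundary.
 */
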
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 351d659280e1..ef43aaca49f4 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -17,6 +17,8 @@
#include <linux/refcount.h>
#include <linux/mempool.h>
+#include <trace/events/block.h>
+
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-mq-debugfs.h"
@@ -177,6 +179,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev)
struct bio bio;
bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
+ trace_blkdev_zone_mgmt(&bio, 0);
return submit_bio_wait(&bio);
}
@@ -240,6 +243,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
cond_resched();
}
+ trace_blkdev_zone_mgmt(bio, nr_sectors);
ret = submit_bio_wait(bio);
bio_put(bio);
@@ -818,6 +822,8 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
* at the tail of the list to preserve the sequential write order.
*/
bio_list_add(&zwplug->bio_list, bio);
+ trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no,
+ bio->bi_iter.bi_sector, bio_sectors(bio));
zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
@@ -1116,25 +1122,7 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
{
struct block_device *bdev = bio->bi_bdev;
- if (!bdev->bd_disk->zone_wplugs_hash)
- return false;
-
- /*
- * If the BIO already has the plugging flag set, then it was already
- * handled through this path and this is a submission from the zone
- * plug bio submit work.
- */
- if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
- return false;
-
- /*
- * We do not need to do anything special for empty flush BIOs, e.g
- * BIOs such as issued by blkdev_issue_flush(). The is because it is
- * the responsibility of the user to first wait for the completion of
- * write operations for flush to have any effect on the persistence of
- * the written data.
- */
- if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
+ if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash))
return false;
/*
@@ -1205,6 +1193,20 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
spin_unlock_irqrestore(&zwplug->lock, flags);
}
+void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio)
+{
+ /*
+ * For zone append requests, the request sector indicates the location
+ * at which the BIO data was written. Return this value to the BIO
+ * issuer through the BIO iter sector.
+ * For plugged zone writes, which include emulated zone append, we need
+ * the original BIO sector so that blk_zone_write_plug_bio_endio() can
+ * lookup the zone write plug.
+ */
+ bio->bi_iter.bi_sector = rq->__sector;
+ trace_blk_zone_append_update_request_bio(rq);
+}
+
void blk_zone_write_plug_bio_endio(struct bio *bio)
{
struct gendisk *disk = bio->bi_bdev->bd_disk;
@@ -1299,6 +1301,9 @@ again:
goto put_zwplug;
}
+ trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no,
+ bio->bi_iter.bi_sector, bio_sectors(bio));
+
if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
blk_zone_wplug_bio_io_error(zwplug, bio);
goto again;
diff --git a/block/blk.h b/block/blk.h
index 37ec459fe656..76901a39997f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -13,6 +13,15 @@
struct elevator_type;
+/*
+ * Default upper limit for the software max_sectors limit used for regular I/Os.
+ * This can be increased through sysfs.
+ *
+ * This should not be confused with the max_hw_sector limit that is entirely
+ * controlled by the block device driver, usually based on hardware limits.
+ */
+#define BLK_DEF_MAX_SECTORS_CAP (SZ_4M >> SECTOR_SHIFT)
+
#define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9)
#define BLK_MIN_SEGMENT_SIZE 4096
@@ -321,7 +330,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
bool blk_insert_flush(struct request *rq);
-void elv_update_nr_hw_queues(struct request_queue *q);
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e);
void elevator_set_default(struct request_queue *q);
void elevator_set_none(struct request_queue *q);
@@ -467,23 +476,15 @@ static inline bool bio_zone_write_plugging(struct bio *bio)
{
return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
}
-void blk_zone_write_plug_bio_merged(struct bio *bio);
-void blk_zone_write_plug_init_request(struct request *rq);
-static inline void blk_zone_update_request_bio(struct request *rq,
- struct bio *bio)
+static inline bool blk_req_bio_is_zone_append(struct request *rq,
+ struct bio *bio)
{
- /*
- * For zone append requests, the request sector indicates the location
- * at which the BIO data was written. Return this value to the BIO
- * issuer through the BIO iter sector.
- * For plugged zone writes, which include emulated zone append, we need
- * the original BIO sector so that blk_zone_write_plug_bio_endio() can
- * lookup the zone write plug.
- */
- if (req_op(rq) == REQ_OP_ZONE_APPEND ||
- bio_flagged(bio, BIO_EMULATES_ZONE_APPEND))
- bio->bi_iter.bi_sector = rq->__sector;
+ return req_op(rq) == REQ_OP_ZONE_APPEND ||
+ bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
}
+void blk_zone_write_plug_bio_merged(struct bio *bio);
+void blk_zone_write_plug_init_request(struct request *rq);
+void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio);
void blk_zone_write_plug_bio_endio(struct bio *bio);
static inline void blk_zone_bio_endio(struct bio *bio)
{
@@ -516,14 +517,19 @@ static inline bool bio_zone_write_plugging(struct bio *bio)
{
return false;
}
+static inline bool blk_req_bio_is_zone_append(struct request *req,
+ struct bio *bio)
+{
+ return false;
+}
static inline void blk_zone_write_plug_bio_merged(struct bio *bio)
{
}
static inline void blk_zone_write_plug_init_request(struct request *rq)
{
}
-static inline void blk_zone_update_request_bio(struct request *rq,
- struct bio *bio)
+static inline void blk_zone_append_update_request_bio(struct request *rq,
+ struct bio *bio)
{
}
static inline void blk_zone_bio_endio(struct bio *bio)
diff --git a/block/elevator.c b/block/elevator.c
index a960bdc869bc..88f8f36bed98 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -689,21 +689,21 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
* The I/O scheduler depends on the number of hardware queues, this forces a
* reattachment when nr_hw_queues changes.
*/
-void elv_update_nr_hw_queues(struct request_queue *q)
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e)
{
struct elv_change_ctx ctx = {};
int ret = -ENODEV;
WARN_ON_ONCE(q->mq_freeze_depth == 0);
- mutex_lock(&q->elevator_lock);
- if (q->elevator && !blk_queue_dying(q) && blk_queue_registered(q)) {
- ctx.name = q->elevator->type->elevator_name;
+ if (e && !blk_queue_dying(q) && blk_queue_registered(q)) {
+ ctx.name = e->elevator_name;
+ mutex_lock(&q->elevator_lock);
/* force to reattach elevator after nr_hw_queue is updated */
ret = elevator_switch(q, &ctx);
+ mutex_unlock(&q->elevator_lock);
}
- mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue_nomemrestore(q);
if (!ret)
WARN_ON_ONCE(elevator_change_done(q, &ctx));