summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r--block/bio-integrity-auto.c4
-rw-r--r--block/bio-integrity.c3
-rw-r--r--block/bio.c20
-rw-r--r--block/blk-integrity.c70
-rw-r--r--block/blk-mq-cpumap.c46
-rw-r--r--block/blk-mq-dma.c161
-rw-r--r--block/blk-mq.c96
-rw-r--r--block/blk-settings.c126
-rw-r--r--block/blk-sysfs.c26
-rw-r--r--block/blk-zoned.c43
-rw-r--r--block/blk.h42
-rw-r--r--block/elevator.c29
-rw-r--r--block/fops.c108
-rw-r--r--block/ioctl.c3
-rw-r--r--block/t10-pi.c16
15 files changed, 639 insertions, 154 deletions
diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c
index 9c6657664792..687952f63bbb 100644
--- a/block/bio-integrity-auto.c
+++ b/block/bio-integrity-auto.c
@@ -54,10 +54,10 @@ static bool bi_offload_capable(struct blk_integrity *bi)
{
switch (bi->csum_type) {
case BLK_INTEGRITY_CSUM_CRC64:
- return bi->tuple_size == sizeof(struct crc64_pi_tuple);
+ return bi->metadata_size == sizeof(struct crc64_pi_tuple);
case BLK_INTEGRITY_CSUM_CRC:
case BLK_INTEGRITY_CSUM_IP:
- return bi->tuple_size == sizeof(struct t10_pi_tuple);
+ return bi->metadata_size == sizeof(struct t10_pi_tuple);
default:
pr_warn_once("%s: unknown integrity checksum type:%d\n",
__func__, bi->csum_type);
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 10912988c8f5..6b077ca937f6 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -128,6 +128,9 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
if (bip->bip_vcnt > 0) {
struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1];
+ if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
+ return 0;
+
if (bvec_try_merge_hw_page(q, bv, page, len, offset)) {
bip->bip_iter.bi_size += len;
return len;
diff --git a/block/bio.c b/block/bio.c
index 3c0a558c90f5..92c512e876c8 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -930,8 +930,6 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
return false;
if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
return false;
- if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
- return false;
if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) {
if (IS_ENABLED(CONFIG_KMSAN))
@@ -982,6 +980,9 @@ void __bio_add_page(struct bio *bio, struct page *page,
WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
WARN_ON_ONCE(bio_full(bio, len));
+ if (is_pci_p2pdma_page(page))
+ bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE;
+
bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off);
bio->bi_iter.bi_size += len;
bio->bi_vcnt++;
@@ -1022,11 +1023,16 @@ int bio_add_page(struct bio *bio, struct page *page,
if (bio->bi_iter.bi_size > UINT_MAX - len)
return 0;
- if (bio->bi_vcnt > 0 &&
- bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
- page, len, offset)) {
- bio->bi_iter.bi_size += len;
- return len;
+ if (bio->bi_vcnt > 0) {
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+ if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
+ return 0;
+
+ if (bvec_try_merge_page(bv, page, len, offset)) {
+ bio->bi_iter.bi_size += len;
+ return len;
+ }
}
if (bio->bi_vcnt >= bio->bi_max_vecs)
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index e4e2567061f9..056b8948369d 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -13,6 +13,7 @@
#include <linux/scatterlist.h>
#include <linux/export.h>
#include <linux/slab.h>
+#include <linux/t10-pi.h>
#include "blk.h"
@@ -54,6 +55,73 @@ new_segment:
return segments;
}
+/**
+ * blk_get_meta_cap - report logical block metadata capabilities to user space
+ * @bdev: block device whose disk integrity profile is queried
+ * @cmd: ioctl command word; the size encoded in it is used as the size of
+ * the user buffer
+ * @argp: user buffer receiving a struct logical_block_metadata_cap
+ *
+ * Return: -ENOIOCTLCMD when @cmd does not match FS_IOC_GETLBMD_CAP in
+ * direction, type and number, or when its encoded size is below
+ * LBMD_SIZE_VER0; otherwise the result of copy_struct_to_user(). A disk
+ * without an integrity profile gets the zero-initialised structure.
+ */
+int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd,
+ struct logical_block_metadata_cap __user *argp)
+{
+ struct blk_integrity *bi = blk_get_integrity(bdev->bd_disk);
+ struct logical_block_metadata_cap meta_cap = {};
+ size_t usize = _IOC_SIZE(cmd);
+
+ /* Only the size field of the command is allowed to vary. */
+ if (_IOC_DIR(cmd) != _IOC_DIR(FS_IOC_GETLBMD_CAP) ||
+ _IOC_TYPE(cmd) != _IOC_TYPE(FS_IOC_GETLBMD_CAP) ||
+ _IOC_NR(cmd) != _IOC_NR(FS_IOC_GETLBMD_CAP) ||
+ _IOC_SIZE(cmd) < LBMD_SIZE_VER0)
+ return -ENOIOCTLCMD;
+
+ /* No integrity profile: copy out the all-zero structure. */
+ if (!bi)
+ goto out;
+
+ if (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE)
+ meta_cap.lbmd_flags |= LBMD_PI_CAP_INTEGRITY;
+ if (bi->flags & BLK_INTEGRITY_REF_TAG)
+ meta_cap.lbmd_flags |= LBMD_PI_CAP_REFTAG;
+ meta_cap.lbmd_interval = 1 << bi->interval_exp;
+ meta_cap.lbmd_size = bi->metadata_size;
+ meta_cap.lbmd_pi_size = bi->pi_tuple_size;
+ meta_cap.lbmd_pi_offset = bi->pi_offset;
+ /* Metadata bytes that are not part of the PI tuple are reported as opaque. */
+ meta_cap.lbmd_opaque_size = bi->metadata_size - bi->pi_tuple_size;
+ /* With PI at offset 0 the opaque bytes start right after the PI tuple. */
+ if (meta_cap.lbmd_opaque_size && !bi->pi_offset)
+ meta_cap.lbmd_opaque_offset = bi->pi_tuple_size;
+
+ /* Translate the block layer checksum type into the user-visible one. */
+ switch (bi->csum_type) {
+ case BLK_INTEGRITY_CSUM_NONE:
+ meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_NONE;
+ break;
+ case BLK_INTEGRITY_CSUM_IP:
+ meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_IP;
+ break;
+ case BLK_INTEGRITY_CSUM_CRC:
+ meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_CRC16_T10DIF;
+ break;
+ case BLK_INTEGRITY_CSUM_CRC64:
+ meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_CRC64_NVME;
+ break;
+ }
+
+ /* Every checksummed format reports a 2-byte application tag. */
+ if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE)
+ meta_cap.lbmd_app_tag_size = 2;
+
+ /* The reference tag width comes from the tuple layout of the checksum type. */
+ if (bi->flags & BLK_INTEGRITY_REF_TAG) {
+ switch (bi->csum_type) {
+ case BLK_INTEGRITY_CSUM_CRC64:
+ meta_cap.lbmd_ref_tag_size =
+ sizeof_field(struct crc64_pi_tuple, ref_tag);
+ break;
+ case BLK_INTEGRITY_CSUM_CRC:
+ case BLK_INTEGRITY_CSUM_IP:
+ meta_cap.lbmd_ref_tag_size =
+ sizeof_field(struct t10_pi_tuple, ref_tag);
+ break;
+ default:
+ break;
+ }
+ }
+
+out:
+ return copy_struct_to_user(argp, usize, &meta_cap, sizeof(meta_cap),
+ NULL);
+}
+
/**
* blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
* @rq: request to map
@@ -239,7 +307,7 @@ static ssize_t format_show(struct device *dev, struct device_attribute *attr,
{
struct blk_integrity *bi = dev_to_bi(dev);
- if (!bi->tuple_size)
+ if (!bi->metadata_size)
return sysfs_emit(page, "none\n");
return sysfs_emit(page, "%s\n", blk_integrity_profile_name(bi));
}
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 444798c5374f..705da074ad6c 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -12,16 +12,56 @@
#include <linux/cpu.h>
#include <linux/group_cpus.h>
#include <linux/device/bus.h>
+#include <linux/sched/isolation.h>
#include "blk.h"
#include "blk-mq.h"
+/*
+ * Number of queues to use for the CPUs set in @mask: the weight of @mask
+ * combined with @max_queues through min_not_zero().
+ */
+static unsigned int blk_mq_num_queues(const struct cpumask *mask,
+				      unsigned int max_queues)
+{
+	return min_not_zero(cpumask_weight(mask), max_queues);
+}
+
+/**
+ * blk_mq_num_possible_queues - Calc nr of queues for multiqueue devices
+ * @max_queues: The maximum number of queues the hardware/driver
+ * supports. If max_queues is 0, the argument is
+ * ignored.
+ *
+ * Calculates the number of queues to be used for a multiqueue
+ * device based on the number of possible CPUs.
+ *
+ * Return: min_not_zero() of the weight of cpu_possible_mask and @max_queues.
+ */
+unsigned int blk_mq_num_possible_queues(unsigned int max_queues)
+{
+ return blk_mq_num_queues(cpu_possible_mask, max_queues);
+}
+EXPORT_SYMBOL_GPL(blk_mq_num_possible_queues);
+
+/**
+ * blk_mq_num_online_queues - Calc nr of queues for multiqueue devices
+ * @max_queues: The maximum number of queues the hardware/driver
+ * supports. If max_queues is 0, the argument is
+ * ignored.
+ *
+ * Calculates the number of queues to be used for a multiqueue
+ * device based on the number of online CPUs.
+ *
+ * Return: min_not_zero() of the weight of cpu_online_mask and @max_queues.
+ */
+unsigned int blk_mq_num_online_queues(unsigned int max_queues)
+{
+ return blk_mq_num_queues(cpu_online_mask, max_queues);
+}
+EXPORT_SYMBOL_GPL(blk_mq_num_online_queues);
+
void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
{
const struct cpumask *masks;
- unsigned int queue, cpu;
+ unsigned int queue, cpu, nr_masks;
- masks = group_cpus_evenly(qmap->nr_queues);
+ masks = group_cpus_evenly(qmap->nr_queues, &nr_masks);
if (!masks) {
for_each_possible_cpu(cpu)
qmap->mq_map[cpu] = qmap->queue_offset;
@@ -29,7 +69,7 @@ void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
}
for (queue = 0; queue < qmap->nr_queues; queue++) {
- for_each_cpu(cpu, &masks[queue])
+ for_each_cpu(cpu, &masks[queue % nr_masks])
qmap->mq_map[cpu] = qmap->queue_offset + queue;
}
kfree(masks);
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 82bae475dfa4..ad283017caef 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2025 Christoph Hellwig
*/
+#include <linux/blk-mq-dma.h>
#include "blk.h"
struct phys_vec {
@@ -61,6 +62,166 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
return true;
}
+/*
+ * The IOVA-based DMA API wants to be able to coalesce at the minimal IOMMU page
+ * size granularity (which is guaranteed to be <= PAGE_SIZE and usually 4k), so
+ * we need to ensure our segments are aligned to this as well.
+ *
+ * Note that there is no point in using the slightly more complicated IOVA based
+ * path for single segment mappings.
+ *
+ * Returns true when (queue_virt_boundary(req->q) + 1) shares no set bits with
+ * dma_get_merge_boundary(dma_dev).
+ */
+static inline bool blk_can_dma_map_iova(struct request *req,
+ struct device *dma_dev)
+{
+ return !((queue_virt_boundary(req->q) + 1) &
+ dma_get_merge_boundary(dma_dev));
+}
+
+/*
+ * Describe @vec in @iter using a P2PDMA bus address: the address comes from
+ * pci_p2pdma_bus_addr_map() on the P2PDMA state kept in @iter, the length is
+ * copied from @vec. Always returns true.
+ */
+static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
+{
+ iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr);
+ iter->len = vec->len;
+ return true;
+}
+
+/*
+ * Map the single physical range described by @vec with dma_map_page() and
+ * report it through @iter.
+ *
+ * Returns true with @iter->addr and @iter->len filled in on success.  On a
+ * mapping error @iter->status is set to BLK_STS_RESOURCE and false is
+ * returned.
+ */
+static bool blk_dma_map_direct(struct request *req, struct device *dma_dev,
+		struct blk_dma_iter *iter, struct phys_vec *vec)
+{
+	struct page *page = phys_to_page(vec->paddr);
+
+	iter->addr = dma_map_page(dma_dev, page, offset_in_page(vec->paddr),
+				  vec->len, rq_dma_dir(req));
+	if (!dma_mapping_error(dma_dev, iter->addr)) {
+		iter->len = vec->len;
+		return true;
+	}
+
+	iter->status = BLK_STS_RESOURCE;
+	return false;
+}
+
+/*
+ * Link @vec and every following segment of @req into the IOVA range held in
+ * @state, then sync the linked range.
+ *
+ * @iter->addr and @iter->len are taken from @state (state->addr and
+ * dma_iova_size()) before any segment is linked. On a sync error
+ * @iter->status is set from the errno and false is returned.
+ */
+static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
+ struct dma_iova_state *state, struct blk_dma_iter *iter,
+ struct phys_vec *vec)
+{
+ enum dma_data_direction dir = rq_dma_dir(req);
+ unsigned int mapped = 0;
+ int error;
+
+ iter->addr = state->addr;
+ iter->len = dma_iova_size(state);
+
+ /* @mapped is the running byte offset of the next segment in the range. */
+ do {
+ error = dma_iova_link(dma_dev, state, vec->paddr, mapped,
+ vec->len, dir, 0);
+ if (error)
+ break;
+ mapped += vec->len;
+ } while (blk_map_iter_next(req, &iter->iter, vec));
+
+ /*
+ * NOTE(review): an error from dma_iova_link() only ends the loop; the
+ * value is then overwritten by the dma_iova_sync() result below, so a
+ * link failure is reported only if the sync fails as well - confirm
+ * this is intended.
+ */
+ error = dma_iova_sync(dma_dev, state, 0, mapped);
+ if (error) {
+ iter->status = errno_to_blk_status(error);
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * blk_rq_dma_map_iter_start - map the first DMA segment for a request
+ * @req: request to map
+ * @dma_dev: device to map to
+ * @state: DMA IOVA state
+ * @iter: block layer DMA iterator
+ *
+ * Start DMA mapping @req to @dma_dev. @state and @iter are provided by the
+ * caller and don't need to be initialized. @state needs to be stored for use
+ * at unmap time, @iter is only needed at map time.
+ *
+ * Returns %false if there is no segment to map, including due to an error, or
+ * %true if it did map a segment.
+ *
+ * If a segment was mapped, the DMA address for it is returned in @iter.addr and
+ * the length in @iter.len. If no segment was mapped the status code is
+ * returned in @iter.status.
+ *
+ * The caller can call blk_rq_dma_map_coalesce() to check if further segments
+ * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
+ * to try to map the following segments.
+ */
+bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
+ struct dma_iova_state *state, struct blk_dma_iter *iter)
+{
+ unsigned int total_len = blk_rq_payload_bytes(req);
+ struct phys_vec vec;
+
+ /* Start iterating at the request's first bio with cleared P2PDMA state. */
+ iter->iter.bio = req->bio;
+ iter->iter.iter = req->bio->bi_iter;
+ memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
+ iter->status = BLK_STS_OK;
+
+ /*
+ * Grab the first segment ASAP because we'll need it to check for P2P
+ * transfers.
+ */
+ if (!blk_map_iter_next(req, &iter->iter, &vec))
+ return false;
+
+ if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) {
+ switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
+ phys_to_page(vec.paddr))) {
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ return blk_dma_map_bus(iter, &vec);
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ /*
+ * P2P transfers through the host bridge are treated the
+ * same as non-P2P transfers below and during unmap.
+ */
+ req->cmd_flags &= ~REQ_P2PDMA;
+ break;
+ default:
+ iter->status = BLK_STS_INVAL;
+ return false;
+ }
+ }
+
+ /*
+ * Take the IOVA path when the alignment check passes and an IOVA range
+ * for the whole payload could be allocated; otherwise map just this
+ * segment directly.
+ */
+ if (blk_can_dma_map_iova(req, dma_dev) &&
+ dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len))
+ return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec);
+ return blk_dma_map_direct(req, dma_dev, iter, &vec);
+}
+EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);
+
+/**
+ * blk_rq_dma_map_iter_next - map the next DMA segment for a request
+ * @req: request to map
+ * @dma_dev: device to map to
+ * @state: DMA IOVA state
+ * @iter: block layer DMA iterator
+ *
+ * Iterate to the next mapping after a previous call to
+ * blk_rq_dma_map_iter_start(). See there for a detailed description of the
+ * arguments.
+ *
+ * Returns %false if there is no segment to map, including due to an error, or
+ * %true if it did map a segment.
+ *
+ * If a segment was mapped, the DMA address for it is returned in @iter.addr and
+ * the length in @iter.len. If no segment was mapped the status code is
+ * returned in @iter.status.
+ */
+bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
+ struct dma_iova_state *state, struct blk_dma_iter *iter)
+{
+ struct phys_vec vec;
+
+ if (!blk_map_iter_next(req, &iter->iter, &vec))
+ return false;
+
+ /*
+ * Use the bus-address path when the P2PDMA state kept in @iter says
+ * so; every other segment is mapped directly.
+ */
+ if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
+ return blk_dma_map_bus(iter, &vec);
+ return blk_dma_map_direct(req, dma_dev, iter, &vec);
+}
+EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next);
+
static inline struct scatterlist *
blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
{
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4806b867e37d..9692fa4c3ef2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -883,7 +883,8 @@ static void blk_complete_request(struct request *req)
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- blk_zone_update_request_bio(req, bio);
+ if (blk_req_bio_is_zone_append(req, bio))
+ blk_zone_append_update_request_bio(req, bio);
if (!is_flush)
bio_endio(bio);
@@ -982,7 +983,8 @@ bool blk_update_request(struct request *req, blk_status_t error,
/* Don't actually finish bio if it's part of flush sequence */
if (!bio->bi_iter.bi_size) {
- blk_zone_update_request_bio(req, bio);
+ if (blk_req_bio_is_zone_append(req, bio))
+ blk_zone_append_update_request_bio(req, bio);
if (!is_flush)
bio_endio(bio);
}
@@ -3169,8 +3171,10 @@ void blk_mq_submit_bio(struct bio *bio)
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
goto queue_exit;
- if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
- goto queue_exit;
+ if (bio_needs_zone_write_plugging(bio)) {
+ if (blk_zone_plug_bio(bio, nr_segs))
+ goto queue_exit;
+ }
new_request:
if (rq) {
@@ -4966,6 +4970,60 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
return ret;
}
+/*
+ * Switch back to the elevator type stored in the xarray.
+ *
+ * The type is looked up in @elv_tbl by q->id and handed to
+ * elv_update_nr_hw_queues(); the lookup result may be NULL, in which case
+ * NULL is passed on and no reference is dropped.
+ */
+static void blk_mq_elv_switch_back(struct request_queue *q,
+ struct xarray *elv_tbl)
+{
+ struct elevator_type *e = xa_load(elv_tbl, q->id);
+
+ /* The elv_update_nr_hw_queues unfreezes the queue. */
+ elv_update_nr_hw_queues(q, e);
+
+ /* Drop the reference acquired in blk_mq_elv_switch_none. */
+ if (e)
+ elevator_put(e);
+}
+
+/*
+ * Remember the queue's current elevator type in @elv_tbl, using q->id as the
+ * index, and then switch the queue to 'none'.  A queue without an elevator is
+ * left untouched.
+ *
+ * Returns 0 on success or the error reported by xa_insert().
+ */
+static int blk_mq_elv_switch_none(struct request_queue *q,
+		struct xarray *elv_tbl)
+{
+	int err;
+
+	lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock);
+
+	/*
+	 * Reading q->elevator without q->elevator_lock is fine here: the
+	 * nr_hw_queue update holds set->update_nr_hwq_lock as a writer, and
+	 * the scheduler update/switch code takes the same lock as a reader,
+	 * so it cannot run concurrently.
+	 */
+	if (!q->elevator)
+		return 0;
+
+	err = xa_insert(elv_tbl, q->id, q->elevator->type, GFP_KERNEL);
+	if (WARN_ON_ONCE(err))
+		return err;
+
+	/*
+	 * Pin the elevator module before dropping to 'none' so that it cannot
+	 * be removed while the nr_hw_queue update is running.  The reference
+	 * is put again when the elevator is switched back.
+	 */
+	__elevator_get(q->elevator->type);
+
+	elevator_set_none(q);
+	return 0;
+}
+
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int nr_hw_queues)
{
@@ -4973,6 +5031,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int prev_nr_hw_queues = set->nr_hw_queues;
unsigned int memflags;
int i;
+ struct xarray elv_tbl;
lockdep_assert_held(&set->tag_list_lock);
@@ -4984,6 +5043,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
return;
memflags = memalloc_noio_save();
+
+ xa_init(&elv_tbl);
+
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_debugfs_unregister_hctxs(q);
blk_mq_sysfs_unregister_hctxs(q);
@@ -4992,11 +5054,17 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_freeze_queue_nomemsave(q);
- if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) {
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_unfreeze_queue_nomemrestore(q);
- goto reregister;
- }
+ /*
+ * Switch IO scheduler to 'none', cleaning up the data associated
+ * with the previous scheduler. We will switch back once we are done
+ * updating the new sw to hw queue mappings.
+ */
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ if (blk_mq_elv_switch_none(q, &elv_tbl))
+ goto switch_back;
+
+ if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
+ goto switch_back;
fallback:
blk_mq_update_queue_map(set);
@@ -5016,12 +5084,11 @@ fallback:
}
blk_mq_map_swqueue(q);
}
-
- /* elv_update_nr_hw_queues() unfreeze queue for us */
+switch_back:
+ /* The blk_mq_elv_switch_back unfreezes queue for us. */
list_for_each_entry(q, &set->tag_list, tag_set_list)
- elv_update_nr_hw_queues(q);
+ blk_mq_elv_switch_back(q, &elv_tbl);
-reregister:
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_sysfs_register_hctxs(q);
blk_mq_debugfs_register_hctxs(q);
@@ -5029,6 +5096,9 @@ reregister:
blk_mq_remove_hw_queues_cpuhp(q);
blk_mq_add_hw_queues_cpuhp(q);
}
+
+ xa_destroy(&elv_tbl);
+
memalloc_noio_restore(memflags);
/* Free the excess tags when nr_hw_queues shrink. */
diff --git a/block/blk-settings.c b/block/blk-settings.c
index a000daafbfb4..91449147bae9 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -14,6 +14,8 @@
#include <linux/jiffies.h>
#include <linux/gfp.h>
#include <linux/dma-mapping.h>
+#include <linux/t10-pi.h>
+#include <linux/crc64.h>
#include "blk.h"
#include "blk-rq-qos.h"
@@ -50,6 +52,8 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_sectors = UINT_MAX;
lim->max_dev_sectors = UINT_MAX;
lim->max_write_zeroes_sectors = UINT_MAX;
+ lim->max_hw_wzeroes_unmap_sectors = UINT_MAX;
+ lim->max_user_wzeroes_unmap_sectors = UINT_MAX;
lim->max_hw_zone_append_sectors = UINT_MAX;
lim->max_user_discard_sectors = UINT_MAX;
}
@@ -114,7 +118,7 @@ static int blk_validate_integrity_limits(struct queue_limits *lim)
{
struct blk_integrity *bi = &lim->integrity;
- if (!bi->tuple_size) {
+ if (!bi->metadata_size) {
if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE ||
bi->tag_size || ((bi->flags & BLK_INTEGRITY_REF_TAG))) {
pr_warn("invalid PI settings.\n");
@@ -135,6 +139,42 @@ static int blk_validate_integrity_limits(struct queue_limits *lim)
return -EINVAL;
}
+	/* The PI tuple is a part of the metadata and cannot be larger than it. */
+	if (bi->pi_tuple_size > bi->metadata_size) {
+		pr_warn("pi_tuple_size (%u) exceeds metadata_size (%u)\n",
+			bi->pi_tuple_size,
+			bi->metadata_size);
+		return -EINVAL;
+	}
+
+	/*
+	 * The PI tuple size has to match the tuple layout implied by the
+	 * checksum type.  The format strings are kept on one line each: a
+	 * backslash-newline inside a string literal would splice the next
+	 * line's indentation into the logged message.
+	 */
+	switch (bi->csum_type) {
+	case BLK_INTEGRITY_CSUM_NONE:
+		if (bi->pi_tuple_size) {
+			pr_warn("pi_tuple_size must be 0 when checksum type is none\n");
+			return -EINVAL;
+		}
+		break;
+	case BLK_INTEGRITY_CSUM_CRC:
+	case BLK_INTEGRITY_CSUM_IP:
+		if (bi->pi_tuple_size != sizeof(struct t10_pi_tuple)) {
+			pr_warn("pi_tuple_size mismatch for T10 PI: expected %zu, got %u\n",
+				sizeof(struct t10_pi_tuple),
+				bi->pi_tuple_size);
+			return -EINVAL;
+		}
+		break;
+	case BLK_INTEGRITY_CSUM_CRC64:
+		if (bi->pi_tuple_size != sizeof(struct crc64_pi_tuple)) {
+			pr_warn("pi_tuple_size mismatch for CRC64 PI: expected %zu, got %u\n",
+				sizeof(struct crc64_pi_tuple),
+				bi->pi_tuple_size);
+			return -EINVAL;
+		}
+		break;
+	}
+
+
if (!bi->interval_exp)
bi->interval_exp = ilog2(lim->logical_block_size);
@@ -181,6 +221,8 @@ static void blk_atomic_writes_update_limits(struct queue_limits *lim)
static void blk_validate_atomic_write_limits(struct queue_limits *lim)
{
unsigned int boundary_sectors;
+ unsigned int atomic_write_hw_max_sectors =
+ lim->atomic_write_hw_max >> SECTOR_SHIFT;
if (!(lim->features & BLK_FEAT_ATOMIC_WRITES))
goto unsupported;
@@ -202,6 +244,10 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim)
lim->atomic_write_hw_max))
goto unsupported;
+ if (WARN_ON_ONCE(lim->chunk_sectors &&
+ atomic_write_hw_max_sectors > lim->chunk_sectors))
+ goto unsupported;
+
boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT;
if (boundary_sectors) {
@@ -333,6 +379,12 @@ int blk_validate_limits(struct queue_limits *lim)
if (!lim->max_segments)
lim->max_segments = BLK_MAX_SEGMENTS;
+ if (lim->max_hw_wzeroes_unmap_sectors &&
+ lim->max_hw_wzeroes_unmap_sectors != lim->max_write_zeroes_sectors)
+ return -EINVAL;
+ lim->max_wzeroes_unmap_sectors = min(lim->max_hw_wzeroes_unmap_sectors,
+ lim->max_user_wzeroes_unmap_sectors);
+
lim->max_discard_sectors =
min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors);
@@ -418,10 +470,11 @@ int blk_set_default_limits(struct queue_limits *lim)
{
/*
* Most defaults are set by capping the bounds in blk_validate_limits,
- * but max_user_discard_sectors is special and needs an explicit
- * initialization to the max value here.
+ * but these limits are special and need an explicit initialization to
+ * the max value here.
*/
lim->max_user_discard_sectors = UINT_MAX;
+ lim->max_user_wzeroes_unmap_sectors = UINT_MAX;
return blk_validate_limits(lim);
}
@@ -589,41 +642,50 @@ static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t,
return true;
}
-
-/* Check stacking of first bottom device */
-static bool blk_stack_atomic_writes_head(struct queue_limits *t,
- struct queue_limits *b)
+static void blk_stack_atomic_writes_chunk_sectors(struct queue_limits *t)
{
- if (b->atomic_write_hw_boundary &&
- !blk_stack_atomic_writes_boundary_head(t, b))
- return false;
+ unsigned int chunk_bytes;
- if (t->io_min <= SECTOR_SIZE) {
- /* No chunk sectors, so use bottom device values directly */
- t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
- t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min;
- t->atomic_write_hw_max = b->atomic_write_hw_max;
- return true;
- }
+ if (!t->chunk_sectors)
+ return;
+
+ /*
+ * If chunk sectors is so large that its value in bytes overflows
+ * UINT_MAX, then just shift it down so it definitely will fit.
+ * We don't support atomic writes of such a large size anyway.
+ */
+ if (check_shl_overflow(t->chunk_sectors, SECTOR_SHIFT, &chunk_bytes))
+ chunk_bytes = t->chunk_sectors;
/*
* Find values for limits which work for chunk size.
* b->atomic_write_hw_unit_{min, max} may not be aligned with chunk
- * size (t->io_min), as chunk size is not restricted to a power-of-2.
+ * size, as the chunk size is not restricted to a power-of-2.
* So we need to find highest power-of-2 which works for the chunk
* size.
- * As an example scenario, we could have b->unit_max = 16K and
- * t->io_min = 24K. For this case, reduce t->unit_max to a value
- * aligned with both limits, i.e. 8K in this example.
+ * As an example scenario, we could have t->unit_max = 16K and
+ * t->chunk_sectors = 24KB. For this case, reduce t->unit_max to a
+ * value aligned with both limits, i.e. 8K in this example.
*/
- t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
- while (t->io_min % t->atomic_write_hw_unit_max)
- t->atomic_write_hw_unit_max /= 2;
+ t->atomic_write_hw_unit_max = min(t->atomic_write_hw_unit_max,
+ max_pow_of_two_factor(chunk_bytes));
- t->atomic_write_hw_unit_min = min(b->atomic_write_hw_unit_min,
+ t->atomic_write_hw_unit_min = min(t->atomic_write_hw_unit_min,
t->atomic_write_hw_unit_max);
- t->atomic_write_hw_max = min(b->atomic_write_hw_max, t->io_min);
+ t->atomic_write_hw_max = min(t->atomic_write_hw_max, chunk_bytes);
+}
+/* Check stacking of first bottom device */
+static bool blk_stack_atomic_writes_head(struct queue_limits *t,
+ struct queue_limits *b)
+{
+ if (b->atomic_write_hw_boundary &&
+ !blk_stack_atomic_writes_boundary_head(t, b))
+ return false;
+
+ t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max;
+ t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min;
+ t->atomic_write_hw_max = b->atomic_write_hw_max;
return true;
}
@@ -651,6 +713,7 @@ static void blk_stack_atomic_writes_limits(struct queue_limits *t,
if (!blk_stack_atomic_writes_head(t, b))
goto unsupported;
+ blk_stack_atomic_writes_chunk_sectors(t);
return;
unsupported:
@@ -708,6 +771,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
b->max_write_zeroes_sectors);
+ t->max_user_wzeroes_unmap_sectors =
+ min(t->max_user_wzeroes_unmap_sectors,
+ b->max_user_wzeroes_unmap_sectors);
+ t->max_hw_wzeroes_unmap_sectors =
+ min(t->max_hw_wzeroes_unmap_sectors,
+ b->max_hw_wzeroes_unmap_sectors);
+
t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors,
b->max_hw_zone_append_sectors);
@@ -875,7 +945,7 @@ bool queue_limits_stack_integrity(struct queue_limits *t,
return true;
if (ti->flags & BLK_INTEGRITY_STACKED) {
- if (ti->tuple_size != bi->tuple_size)
+ if (ti->metadata_size != bi->metadata_size)
goto incompatible;
if (ti->interval_exp != bi->interval_exp)
goto incompatible;
@@ -891,7 +961,7 @@ bool queue_limits_stack_integrity(struct queue_limits *t,
ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) |
(bi->flags & BLK_INTEGRITY_REF_TAG);
ti->csum_type = bi->csum_type;
- ti->tuple_size = bi->tuple_size;
+ ti->metadata_size = bi->metadata_size;
ti->pi_offset = bi->pi_offset;
ti->interval_exp = bi->interval_exp;
ti->tag_size = bi->tag_size;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index c611444480b3..396cded255ea 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -161,6 +161,8 @@ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_discard_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_wzeroes_unmap_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_wzeroes_unmap_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors)
@@ -205,6 +207,24 @@ static int queue_max_discard_sectors_store(struct gendisk *disk,
return 0;
}
+/*
+ * Store handler for the "write_zeroes_unmap_max_bytes" queue attribute.
+ *
+ * @page holds the new limit in bytes.  Only two values are accepted: 0, or
+ * exactly the hardware limit (max_hw_wzeroes_unmap_sectors) expressed in
+ * bytes.  The accepted value is stored, converted to sectors, in
+ * @lim->max_user_wzeroes_unmap_sectors.
+ *
+ * Returns 0 on success, the queue_var_store() error if parsing failed, or
+ * -EINVAL for any other value.
+ */
+static int queue_max_wzeroes_unmap_sectors_store(struct gendisk *disk,
+		const char *page, size_t count, struct queue_limits *lim)
+{
+	unsigned long max_zeroes_bytes, max_hw_zeroes_bytes;
+	ssize_t ret;
+
+	ret = queue_var_store(&max_zeroes_bytes, page, count);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Widen before shifting.  Shifting the sector count in its own
+	 * (narrower) type would drop the high bits of large limits, and the
+	 * truncated result would then never match the byte value written by
+	 * the user.
+	 */
+	max_hw_zeroes_bytes =
+		(unsigned long)lim->max_hw_wzeroes_unmap_sectors << SECTOR_SHIFT;
+	if (max_zeroes_bytes != 0 && max_zeroes_bytes != max_hw_zeroes_bytes)
+		return -EINVAL;
+
+	lim->max_user_wzeroes_unmap_sectors = max_zeroes_bytes >> SECTOR_SHIFT;
+	return 0;
+}
+
static int
queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count,
struct queue_limits *lim)
@@ -514,6 +534,10 @@ QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");
QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes");
+QUEUE_LIM_RO_ENTRY(queue_max_hw_wzeroes_unmap_sectors,
+ "write_zeroes_unmap_max_hw_bytes");
+QUEUE_LIM_RW_ENTRY(queue_max_wzeroes_unmap_sectors,
+ "write_zeroes_unmap_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
@@ -662,6 +686,8 @@ static struct attribute *queue_attrs[] = {
&queue_atomic_write_unit_min_entry.attr,
&queue_atomic_write_unit_max_entry.attr,
&queue_max_write_zeroes_sectors_entry.attr,
+ &queue_max_hw_wzeroes_unmap_sectors_entry.attr,
+ &queue_max_wzeroes_unmap_sectors_entry.attr,
&queue_max_zone_append_sectors_entry.attr,
&queue_zone_write_granularity_entry.attr,
&queue_rotational_entry.attr,
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 351d659280e1..ef43aaca49f4 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -17,6 +17,8 @@
#include <linux/refcount.h>
#include <linux/mempool.h>
+#include <trace/events/block.h>
+
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-mq-debugfs.h"
@@ -177,6 +179,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev)
struct bio bio;
bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
+ trace_blkdev_zone_mgmt(&bio, 0);
return submit_bio_wait(&bio);
}
@@ -240,6 +243,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
cond_resched();
}
+ trace_blkdev_zone_mgmt(bio, nr_sectors);
ret = submit_bio_wait(bio);
bio_put(bio);
@@ -818,6 +822,8 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
* at the tail of the list to preserve the sequential write order.
*/
bio_list_add(&zwplug->bio_list, bio);
+ trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no,
+ bio->bi_iter.bi_sector, bio_sectors(bio));
zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
@@ -1116,25 +1122,7 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
{
struct block_device *bdev = bio->bi_bdev;
- if (!bdev->bd_disk->zone_wplugs_hash)
- return false;
-
- /*
- * If the BIO already has the plugging flag set, then it was already
- * handled through this path and this is a submission from the zone
- * plug bio submit work.
- */
- if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
- return false;
-
- /*
- * We do not need to do anything special for empty flush BIOs, e.g
- * BIOs such as issued by blkdev_issue_flush(). The is because it is
- * the responsibility of the user to first wait for the completion of
- * write operations for flush to have any effect on the persistence of
- * the written data.
- */
- if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
+ if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash))
return false;
/*
@@ -1205,6 +1193,20 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
spin_unlock_irqrestore(&zwplug->lock, flags);
}
+/*
+ * Copy the sector of @rq into @bio's iterator and emit the
+ * blk_zone_append_update_request_bio tracepoint for @rq.
+ */
+void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio)
+{
+ /*
+ * For zone append requests, the request sector indicates the location
+ * at which the BIO data was written. Return this value to the BIO
+ * issuer through the BIO iter sector.
+ * For plugged zone writes, which include emulated zone append, we need
+ * the original BIO sector so that blk_zone_write_plug_bio_endio() can
+ * lookup the zone write plug.
+ */
+ bio->bi_iter.bi_sector = rq->__sector;
+ trace_blk_zone_append_update_request_bio(rq);
+}
+
void blk_zone_write_plug_bio_endio(struct bio *bio)
{
struct gendisk *disk = bio->bi_bdev->bd_disk;
@@ -1299,6 +1301,9 @@ again:
goto put_zwplug;
}
+ trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no,
+ bio->bi_iter.bi_sector, bio_sectors(bio));
+
if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
blk_zone_wplug_bio_io_error(zwplug, bio);
goto again;
diff --git a/block/blk.h b/block/blk.h
index 37ec459fe656..76901a39997f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -13,6 +13,15 @@
struct elevator_type;
+/*
+ * Default upper limit for the software max_sectors limit used for regular I/Os.
+ * This can be increased through sysfs.
+ *
+ * This should not be confused with the max_hw_sectors limit that is entirely
+ * controlled by the block device driver, usually based on hardware limits.
+ */
+#define BLK_DEF_MAX_SECTORS_CAP (SZ_4M >> SECTOR_SHIFT)
+
#define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9)
#define BLK_MIN_SEGMENT_SIZE 4096
@@ -321,7 +330,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
bool blk_insert_flush(struct request *rq);
-void elv_update_nr_hw_queues(struct request_queue *q);
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e);
void elevator_set_default(struct request_queue *q);
void elevator_set_none(struct request_queue *q);
@@ -467,23 +476,15 @@ static inline bool bio_zone_write_plugging(struct bio *bio)
{
return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
}
-void blk_zone_write_plug_bio_merged(struct bio *bio);
-void blk_zone_write_plug_init_request(struct request *rq);
-static inline void blk_zone_update_request_bio(struct request *rq,
- struct bio *bio)
+static inline bool blk_req_bio_is_zone_append(struct request *rq,
+ struct bio *bio)
{
- /*
- * For zone append requests, the request sector indicates the location
- * at which the BIO data was written. Return this value to the BIO
- * issuer through the BIO iter sector.
- * For plugged zone writes, which include emulated zone append, we need
- * the original BIO sector so that blk_zone_write_plug_bio_endio() can
- * lookup the zone write plug.
- */
- if (req_op(rq) == REQ_OP_ZONE_APPEND ||
- bio_flagged(bio, BIO_EMULATES_ZONE_APPEND))
- bio->bi_iter.bi_sector = rq->__sector;
+ return req_op(rq) == REQ_OP_ZONE_APPEND ||
+ bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
}
+void blk_zone_write_plug_bio_merged(struct bio *bio);
+void blk_zone_write_plug_init_request(struct request *rq);
+void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio);
void blk_zone_write_plug_bio_endio(struct bio *bio);
static inline void blk_zone_bio_endio(struct bio *bio)
{
@@ -516,14 +517,19 @@ static inline bool bio_zone_write_plugging(struct bio *bio)
{
return false;
}
+static inline bool blk_req_bio_is_zone_append(struct request *req,
+ struct bio *bio)
+{
+ return false;
+}
static inline void blk_zone_write_plug_bio_merged(struct bio *bio)
{
}
static inline void blk_zone_write_plug_init_request(struct request *rq)
{
}
-static inline void blk_zone_update_request_bio(struct request *rq,
- struct bio *bio)
+static inline void blk_zone_append_update_request_bio(struct request *rq,
+ struct bio *bio)
{
}
static inline void blk_zone_bio_endio(struct bio *bio)
diff --git a/block/elevator.c b/block/elevator.c
index ab22542e6cf0..88f8f36bed98 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -689,21 +689,21 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
* The I/O scheduler depends on the number of hardware queues, this forces a
* reattachment when nr_hw_queues changes.
*/
-void elv_update_nr_hw_queues(struct request_queue *q)
+void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e)
{
struct elv_change_ctx ctx = {};
int ret = -ENODEV;
WARN_ON_ONCE(q->mq_freeze_depth == 0);
- mutex_lock(&q->elevator_lock);
- if (q->elevator && !blk_queue_dying(q) && blk_queue_registered(q)) {
- ctx.name = q->elevator->type->elevator_name;
+ if (e && !blk_queue_dying(q) && blk_queue_registered(q)) {
+ ctx.name = e->elevator_name;
+ mutex_lock(&q->elevator_lock);
/* force to reattach elevator after nr_hw_queue is updated */
ret = elevator_switch(q, &ctx);
+ mutex_unlock(&q->elevator_lock);
}
- mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue_nomemrestore(q);
if (!ret)
WARN_ON_ONCE(elevator_change_done(q, &ctx));
@@ -719,7 +719,8 @@ void elevator_set_default(struct request_queue *q)
.name = "mq-deadline",
.no_uevent = true,
};
- int err = 0;
+ int err;
+ struct elevator_type *e;
/* now we allow to switch elevator */
blk_queue_flag_clear(QUEUE_FLAG_NO_ELV_SWITCH, q);
@@ -732,12 +733,18 @@ void elevator_set_default(struct request_queue *q)
* have multiple queues or mq-deadline is not available, default
* to "none".
*/
- if (elevator_find_get(ctx.name) && (q->nr_hw_queues == 1 ||
- blk_mq_is_shared_tags(q->tag_set->flags)))
+ e = elevator_find_get(ctx.name);
+ if (!e)
+ return;
+
+ if ((q->nr_hw_queues == 1 ||
+ blk_mq_is_shared_tags(q->tag_set->flags))) {
err = elevator_change(q, &ctx);
- if (err < 0)
- pr_warn("\"%s\" elevator initialization, failed %d, "
- "falling back to \"none\"\n", ctx.name, err);
+ if (err < 0)
+ pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n",
+ ctx.name, err);
+ }
+ elevator_put(e);
}
void elevator_set_none(struct request_queue *q)
diff --git a/block/fops.c b/block/fops.c
index 1309861d4c2c..82451ac8ff25 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -496,18 +496,21 @@ static void blkdev_readahead(struct readahead_control *rac)
mpage_readahead(rac, blkdev_get_block);
}
-static int blkdev_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
+static int blkdev_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
+ unsigned len, struct folio **foliop,
+ void **fsdata)
{
return block_write_begin(mapping, pos, len, foliop, blkdev_get_block);
}
-static int blkdev_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied, struct folio *folio,
- void *fsdata)
+static int blkdev_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
int ret;
- ret = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ ret = block_write_end(pos, len, copied, folio);
folio_unlock(folio);
folio_put(folio);
@@ -537,30 +540,42 @@ static void blkdev_readahead(struct readahead_control *rac)
iomap_readahead(rac, &blkdev_iomap_ops);
}
-static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc,
- struct inode *inode, loff_t offset, unsigned int len)
+static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 offset, unsigned int len, u64 end_pos)
{
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read(wpc->inode);
if (WARN_ON_ONCE(offset >= isize))
return -EIO;
- if (offset >= wpc->iomap.offset &&
- offset < wpc->iomap.offset + wpc->iomap.length)
- return 0;
- return blkdev_iomap_begin(inode, offset, isize - offset,
- IOMAP_WRITE, &wpc->iomap, NULL);
+
+ if (offset < wpc->iomap.offset ||
+ offset >= wpc->iomap.offset + wpc->iomap.length) {
+ int error;
+
+ error = blkdev_iomap_begin(wpc->inode, offset, isize - offset,
+ IOMAP_WRITE, &wpc->iomap, NULL);
+ if (error)
+ return error;
+ }
+
+ return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
}
static const struct iomap_writeback_ops blkdev_writeback_ops = {
- .map_blocks = blkdev_map_blocks,
+ .writeback_range = blkdev_writeback_range,
+ .writeback_submit = iomap_ioend_writeback_submit,
};
static int blkdev_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct iomap_writepage_ctx wpc = { };
+ struct iomap_writepage_ctx wpc = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .ops = &blkdev_writeback_ops
+ };
- return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops);
+ return iomap_writepages(&wpc);
}
const struct address_space_operations def_blk_aops = {
@@ -711,7 +726,8 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
{
- return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL);
+ return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL,
+ NULL);
}
/*
@@ -841,7 +857,7 @@ reexpand:
#define BLKDEV_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
- FALLOC_FL_ZERO_RANGE)
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_WRITE_ZEROES)
static long blkdev_fallocate(struct file *file, int mode, loff_t start,
loff_t len)
@@ -850,11 +866,19 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
struct block_device *bdev = I_BDEV(inode);
loff_t end = start + len - 1;
loff_t isize;
+ unsigned int flags;
int error;
/* Fail if we don't recognize the flags. */
if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
return -EOPNOTSUPP;
+ /*
+ * Don't allow writing zeroes if the device does not enable the
+ * unmap write zeroes operation.
+ */
+ if ((mode & FALLOC_FL_WRITE_ZEROES) &&
+ !bdev_write_zeroes_unmap_sectors(bdev))
+ return -EOPNOTSUPP;
/* Don't go off the end of the device. */
isize = bdev_nr_bytes(bdev);
@@ -877,48 +901,46 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
inode_lock(inode);
filemap_invalidate_lock(inode->i_mapping);
- /*
- * Invalidate the page cache, including dirty pages, for valid
- * de-allocate mode calls to fallocate().
- */
switch (mode) {
case FALLOC_FL_ZERO_RANGE:
case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
- error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
- if (error)
- goto fail;
-
- error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
- len >> SECTOR_SHIFT, GFP_KERNEL,
- BLKDEV_ZERO_NOUNMAP);
+ flags = BLKDEV_ZERO_NOUNMAP;
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
- error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
- if (error)
- goto fail;
-
- error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
- len >> SECTOR_SHIFT, GFP_KERNEL,
- BLKDEV_ZERO_NOFALLBACK);
+ flags = BLKDEV_ZERO_NOFALLBACK;
+ break;
+ case FALLOC_FL_WRITE_ZEROES:
+ flags = 0;
break;
default:
error = -EOPNOTSUPP;
+ goto fail;
}
+ /*
+ * Invalidate the page cache, including dirty pages, for valid
+ * de-allocate mode calls to fallocate().
+ */
+ error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
+ if (error)
+ goto fail;
+
+ error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
+ len >> SECTOR_SHIFT, GFP_KERNEL, flags);
fail:
filemap_invalidate_unlock(inode->i_mapping);
inode_unlock(inode);
return error;
}
-static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
+static int blkdev_mmap_prepare(struct vm_area_desc *desc)
{
- struct inode *bd_inode = bdev_file_inode(file);
+ struct file *file = desc->file;
- if (bdev_read_only(I_BDEV(bd_inode)))
- return generic_file_readonly_mmap(file, vma);
+ if (bdev_read_only(I_BDEV(bdev_file_inode(file))))
+ return generic_file_readonly_mmap_prepare(desc);
- return generic_file_mmap(file, vma);
+ return generic_file_mmap_prepare(desc);
}
const struct file_operations def_blk_fops = {
@@ -928,7 +950,7 @@ const struct file_operations def_blk_fops = {
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
.iopoll = iocb_bio_iopoll,
- .mmap = blkdev_mmap,
+ .mmap_prepare = blkdev_mmap_prepare,
.fsync = blkdev_fsync,
.unlocked_ioctl = blkdev_ioctl,
#ifdef CONFIG_COMPAT
diff --git a/block/ioctl.c b/block/ioctl.c
index e472cc1030c6..f7b0006ca45d 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -13,6 +13,7 @@
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/io_uring/cmd.h>
+#include <linux/blk-integrity.h>
#include <uapi/linux/blkdev.h>
#include "blk.h"
#include "blk-crypto-internal.h"
@@ -644,7 +645,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode,
case IOC_PR_CLEAR:
return blkdev_pr_clear(bdev, mode, argp);
default:
- return -ENOIOCTLCMD;
+ return blk_get_meta_cap(bdev, cmd, argp);
}
}
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 851db518ee5e..0c4ed9702146 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -56,7 +56,7 @@ static void t10_pi_generate(struct blk_integrity_iter *iter,
pi->ref_tag = 0;
iter->data_buf += iter->interval;
- iter->prot_buf += bi->tuple_size;
+ iter->prot_buf += bi->metadata_size;
iter->seed++;
}
}
@@ -105,7 +105,7 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
next:
iter->data_buf += iter->interval;
- iter->prot_buf += bi->tuple_size;
+ iter->prot_buf += bi->metadata_size;
iter->seed++;
}
@@ -125,7 +125,7 @@ next:
static void t10_pi_type1_prepare(struct request *rq)
{
struct blk_integrity *bi = &rq->q->limits.integrity;
- const int tuple_sz = bi->tuple_size;
+ const int tuple_sz = bi->metadata_size;
u32 ref_tag = t10_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio;
@@ -177,7 +177,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
{
struct blk_integrity *bi = &rq->q->limits.integrity;
unsigned intervals = nr_bytes >> bi->interval_exp;
- const int tuple_sz = bi->tuple_size;
+ const int tuple_sz = bi->metadata_size;
u32 ref_tag = t10_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio;
@@ -234,7 +234,7 @@ static void ext_pi_crc64_generate(struct blk_integrity_iter *iter,
put_unaligned_be48(0ULL, pi->ref_tag);
iter->data_buf += iter->interval;
- iter->prot_buf += bi->tuple_size;
+ iter->prot_buf += bi->metadata_size;
iter->seed++;
}
}
@@ -289,7 +289,7 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
next:
iter->data_buf += iter->interval;
- iter->prot_buf += bi->tuple_size;
+ iter->prot_buf += bi->metadata_size;
iter->seed++;
}
@@ -299,7 +299,7 @@ next:
static void ext_pi_type1_prepare(struct request *rq)
{
struct blk_integrity *bi = &rq->q->limits.integrity;
- const int tuple_sz = bi->tuple_size;
+ const int tuple_sz = bi->metadata_size;
u64 ref_tag = ext_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio;
@@ -340,7 +340,7 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
{
struct blk_integrity *bi = &rq->q->limits.integrity;
unsigned intervals = nr_bytes >> bi->interval_exp;
- const int tuple_sz = bi->tuple_size;
+ const int tuple_sz = bi->metadata_size;
u64 ref_tag = ext_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio;