Diffstat (limited to 'block')
-rw-r--r--  block/bio-integrity-auto.c  |   4
-rw-r--r--  block/bio-integrity.c       |   3
-rw-r--r--  block/bio.c                 |  20
-rw-r--r--  block/blk-integrity.c       |  70
-rw-r--r--  block/blk-mq-cpumap.c       |  46
-rw-r--r--  block/blk-mq-dma.c          | 161
-rw-r--r--  block/blk-mq.c              |  96
-rw-r--r--  block/blk-settings.c        | 126
-rw-r--r--  block/blk-sysfs.c           |  26
-rw-r--r--  block/blk-zoned.c           |  43
-rw-r--r--  block/blk.h                 |  42
-rw-r--r--  block/elevator.c            |  29
-rw-r--r--  block/fops.c                | 108
-rw-r--r--  block/ioctl.c               |   3
-rw-r--r--  block/t10-pi.c              |  16
15 files changed, 639 insertions(+), 154 deletions(-)
diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c index 9c6657664792..687952f63bbb 100644 --- a/block/bio-integrity-auto.c +++ b/block/bio-integrity-auto.c @@ -54,10 +54,10 @@ static bool bi_offload_capable(struct blk_integrity *bi) { switch (bi->csum_type) { case BLK_INTEGRITY_CSUM_CRC64: - return bi->tuple_size == sizeof(struct crc64_pi_tuple); + return bi->metadata_size == sizeof(struct crc64_pi_tuple); case BLK_INTEGRITY_CSUM_CRC: case BLK_INTEGRITY_CSUM_IP: - return bi->tuple_size == sizeof(struct t10_pi_tuple); + return bi->metadata_size == sizeof(struct t10_pi_tuple); default: pr_warn_once("%s: unknown integrity checksum type:%d\n", __func__, bi->csum_type); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 10912988c8f5..6b077ca937f6 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -128,6 +128,9 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, if (bip->bip_vcnt > 0) { struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1]; + if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) + return 0; + if (bvec_try_merge_hw_page(q, bv, page, len, offset)) { bip->bip_iter.bi_size += len; return len; diff --git a/block/bio.c b/block/bio.c index 3c0a558c90f5..92c512e876c8 100644 --- a/block/bio.c +++ b/block/bio.c @@ -930,8 +930,6 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, return false; if (xen_domain() && !xen_biovec_phys_mergeable(bv, page)) return false; - if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) - return false; if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) { if (IS_ENABLED(CONFIG_KMSAN)) @@ -982,6 +980,9 @@ void __bio_add_page(struct bio *bio, struct page *page, WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); WARN_ON_ONCE(bio_full(bio, len)); + if (is_pci_p2pdma_page(page)) + bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE; + bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; bio->bi_vcnt++; @@ -1022,11 +1023,16 @@ int bio_add_page(struct bio *bio, struct page *page, if (bio->bi_iter.bi_size > UINT_MAX - len) return 0; - if (bio->bi_vcnt > 0 && - bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], - page, len, offset)) { - bio->bi_iter.bi_size += len; - return len; + if (bio->bi_vcnt > 0) { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) + return 0; + + if (bvec_try_merge_page(bv, page, len, offset)) { + bio->bi_iter.bi_size += len; + return len; + } } if (bio->bi_vcnt >= bio->bi_max_vecs) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index e4e2567061f9..056b8948369d 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -13,6 +13,7 @@ #include <linux/scatterlist.h> #include <linux/export.h> #include <linux/slab.h> +#include <linux/t10-pi.h> #include "blk.h" @@ -54,6 +55,73 @@ new_segment: return segments; } +int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, + struct logical_block_metadata_cap __user *argp) +{ + struct blk_integrity *bi = blk_get_integrity(bdev->bd_disk); + struct logical_block_metadata_cap meta_cap = {}; + size_t usize = _IOC_SIZE(cmd); + + if (_IOC_DIR(cmd) != _IOC_DIR(FS_IOC_GETLBMD_CAP) || + _IOC_TYPE(cmd) != _IOC_TYPE(FS_IOC_GETLBMD_CAP) || + _IOC_NR(cmd) != _IOC_NR(FS_IOC_GETLBMD_CAP) || + _IOC_SIZE(cmd) < LBMD_SIZE_VER0) + return -ENOIOCTLCMD; + + if (!bi) + goto out; + + if (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) + meta_cap.lbmd_flags |= LBMD_PI_CAP_INTEGRITY; + if (bi->flags & 
BLK_INTEGRITY_REF_TAG) + meta_cap.lbmd_flags |= LBMD_PI_CAP_REFTAG; + meta_cap.lbmd_interval = 1 << bi->interval_exp; + meta_cap.lbmd_size = bi->metadata_size; + meta_cap.lbmd_pi_size = bi->pi_tuple_size; + meta_cap.lbmd_pi_offset = bi->pi_offset; + meta_cap.lbmd_opaque_size = bi->metadata_size - bi->pi_tuple_size; + if (meta_cap.lbmd_opaque_size && !bi->pi_offset) + meta_cap.lbmd_opaque_offset = bi->pi_tuple_size; + + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_NONE: + meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_NONE; + break; + case BLK_INTEGRITY_CSUM_IP: + meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_IP; + break; + case BLK_INTEGRITY_CSUM_CRC: + meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_CRC16_T10DIF; + break; + case BLK_INTEGRITY_CSUM_CRC64: + meta_cap.lbmd_guard_tag_type = LBMD_PI_CSUM_CRC64_NVME; + break; + } + + if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE) + meta_cap.lbmd_app_tag_size = 2; + + if (bi->flags & BLK_INTEGRITY_REF_TAG) { + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + meta_cap.lbmd_ref_tag_size = + sizeof_field(struct crc64_pi_tuple, ref_tag); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + meta_cap.lbmd_ref_tag_size = + sizeof_field(struct t10_pi_tuple, ref_tag); + break; + default: + break; + } + } + +out: + return copy_struct_to_user(argp, usize, &meta_cap, sizeof(meta_cap), + NULL); +} + /** * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist * @rq: request to map @@ -239,7 +307,7 @@ static ssize_t format_show(struct device *dev, struct device_attribute *attr, { struct blk_integrity *bi = dev_to_bi(dev); - if (!bi->tuple_size) + if (!bi->metadata_size) return sysfs_emit(page, "none\n"); return sysfs_emit(page, "%s\n", blk_integrity_profile_name(bi)); } diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 444798c5374f..705da074ad6c 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -12,16 +12,56 @@ #include <linux/cpu.h> #include <linux/group_cpus.h> #include <linux/device/bus.h> +#include <linux/sched/isolation.h> #include "blk.h" #include "blk-mq.h" +static unsigned int blk_mq_num_queues(const struct cpumask *mask, + unsigned int max_queues) +{ + unsigned int num; + + num = cpumask_weight(mask); + return min_not_zero(num, max_queues); +} + +/** + * blk_mq_num_possible_queues - Calc nr of queues for multiqueue devices + * @max_queues: The maximum number of queues the hardware/driver + * supports. If max_queues is 0, the argument is + * ignored. + * + * Calculates the number of queues to be used for a multiqueue + * device based on the number of possible CPUs. + */ +unsigned int blk_mq_num_possible_queues(unsigned int max_queues) +{ + return blk_mq_num_queues(cpu_possible_mask, max_queues); +} +EXPORT_SYMBOL_GPL(blk_mq_num_possible_queues); + +/** + * blk_mq_num_online_queues - Calc nr of queues for multiqueue devices + * @max_queues: The maximum number of queues the hardware/driver + * supports. If max_queues is 0, the argument is + * ignored. + * + * Calculates the number of queues to be used for a multiqueue + * device based on the number of online CPUs. 
+ */ +unsigned int blk_mq_num_online_queues(unsigned int max_queues) +{ + return blk_mq_num_queues(cpu_online_mask, max_queues); +} +EXPORT_SYMBOL_GPL(blk_mq_num_online_queues); + void blk_mq_map_queues(struct blk_mq_queue_map *qmap) { const struct cpumask *masks; - unsigned int queue, cpu; + unsigned int queue, cpu, nr_masks; - masks = group_cpus_evenly(qmap->nr_queues); + masks = group_cpus_evenly(qmap->nr_queues, &nr_masks); if (!masks) { for_each_possible_cpu(cpu) qmap->mq_map[cpu] = qmap->queue_offset; @@ -29,7 +69,7 @@ void blk_mq_map_queues(struct blk_mq_queue_map *qmap) } for (queue = 0; queue < qmap->nr_queues; queue++) { - for_each_cpu(cpu, &masks[queue]) + for_each_cpu(cpu, &masks[queue % nr_masks]) qmap->mq_map[cpu] = qmap->queue_offset + queue; } kfree(masks); diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 82bae475dfa4..ad283017caef 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2025 Christoph Hellwig */ +#include <linux/blk-mq-dma.h> #include "blk.h" struct phys_vec { @@ -61,6 +62,166 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter, return true; } +/* + * The IOVA-based DMA API wants to be able to coalesce at the minimal IOMMU page + * size granularity (which is guaranteed to be <= PAGE_SIZE and usually 4k), so + * we need to ensure our segments are aligned to this as well. + * + * Note that there is no point in using the slightly more complicated IOVA based + * path for single segment mappings. + */ +static inline bool blk_can_dma_map_iova(struct request *req, + struct device *dma_dev) +{ + return !((queue_virt_boundary(req->q) + 1) & + dma_get_merge_boundary(dma_dev)); +} + +static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec) +{ + iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr); + iter->len = vec->len; + return true; +} + +static bool blk_dma_map_direct(struct request *req, struct device *dma_dev, + struct blk_dma_iter *iter, struct phys_vec *vec) +{ + iter->addr = dma_map_page(dma_dev, phys_to_page(vec->paddr), + offset_in_page(vec->paddr), vec->len, rq_dma_dir(req)); + if (dma_mapping_error(dma_dev, iter->addr)) { + iter->status = BLK_STS_RESOURCE; + return false; + } + iter->len = vec->len; + return true; +} + +static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, struct blk_dma_iter *iter, + struct phys_vec *vec) +{ + enum dma_data_direction dir = rq_dma_dir(req); + unsigned int mapped = 0; + int error; + + iter->addr = state->addr; + iter->len = dma_iova_size(state); + + do { + error = dma_iova_link(dma_dev, state, vec->paddr, mapped, + vec->len, dir, 0); + if (error) + break; + mapped += vec->len; + } while (blk_map_iter_next(req, &iter->iter, vec)); + + error = dma_iova_sync(dma_dev, state, 0, mapped); + if (error) { + iter->status = errno_to_blk_status(error); + return false; + } + + return true; +} + +/** + * blk_rq_dma_map_iter_start - map the first DMA segment for a request + * @req: request to map + * @dma_dev: device to map to + * @state: DMA IOVA state + * @iter: block layer DMA iterator + * + * Start DMA mapping @req to @dma_dev. @state and @iter are provided by the + * caller and don't need to be initialized. @state needs to be stored for use + * at unmap time, @iter is only needed at map time. + * + * Returns %false if there is no segment to map, including due to an error, or + * %true ft it did map a segment. 
+ * + * If a segment was mapped, the DMA address for it is returned in @iter.addr and + * the length in @iter.len. If no segment was mapped the status code is + * returned in @iter.status. + * + * The caller can call blk_rq_dma_map_coalesce() to check if further segments + * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next() + * to try to map the following segments. + */ +bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, struct blk_dma_iter *iter) +{ + unsigned int total_len = blk_rq_payload_bytes(req); + struct phys_vec vec; + + iter->iter.bio = req->bio; + iter->iter.iter = req->bio->bi_iter; + memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); + iter->status = BLK_STS_OK; + + /* + * Grab the first segment ASAP because we'll need it to check for P2P + * transfers. + */ + if (!blk_map_iter_next(req, &iter->iter, &vec)) + return false; + + if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) { + switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, + phys_to_page(vec.paddr))) { + case PCI_P2PDMA_MAP_BUS_ADDR: + return blk_dma_map_bus(iter, &vec); + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + /* + * P2P transfers through the host bridge are treated the + * same as non-P2P transfers below and during unmap. + */ + req->cmd_flags &= ~REQ_P2PDMA; + break; + default: + iter->status = BLK_STS_INVAL; + return false; + } + } + + if (blk_can_dma_map_iova(req, dma_dev) && + dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len)) + return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec); + return blk_dma_map_direct(req, dma_dev, iter, &vec); +} +EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start); + +/** + * blk_rq_dma_map_iter_next - map the next DMA segment for a request + * @req: request to map + * @dma_dev: device to map to + * @state: DMA IOVA state + * @iter: block layer DMA iterator + * + * Iterate to the next mapping after a previous call to + * blk_rq_dma_map_iter_start(). See there for a detailed description of the + * arguments. + * + * Returns %false if there is no segment to map, including due to an error, or + * %true ft it did map a segment. + * + * If a segment was mapped, the DMA address for it is returned in @iter.addr and + * the length in @iter.len. If no segment was mapped the status code is + * returned in @iter.status. 
+ */ +bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, struct blk_dma_iter *iter) +{ + struct phys_vec vec; + + if (!blk_map_iter_next(req, &iter->iter, &vec)) + return false; + + if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) + return blk_dma_map_bus(iter, &vec); + return blk_dma_map_direct(req, dma_dev, iter, &vec); +} +EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next); + static inline struct scatterlist * blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 4806b867e37d..9692fa4c3ef2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -883,7 +883,8 @@ static void blk_complete_request(struct request *req) /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); - blk_zone_update_request_bio(req, bio); + if (blk_req_bio_is_zone_append(req, bio)) + blk_zone_append_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); @@ -982,7 +983,8 @@ bool blk_update_request(struct request *req, blk_status_t error, /* Don't actually finish bio if it's part of flush sequence */ if (!bio->bi_iter.bi_size) { - blk_zone_update_request_bio(req, bio); + if (blk_req_bio_is_zone_append(req, bio)) + blk_zone_append_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); } @@ -3169,8 +3171,10 @@ void blk_mq_submit_bio(struct bio *bio) if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; - if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs)) - goto queue_exit; + if (bio_needs_zone_write_plugging(bio)) { + if (blk_zone_plug_bio(bio, nr_segs)) + goto queue_exit; + } new_request: if (rq) { @@ -4966,6 +4970,60 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) return ret; } +/* + * Switch back to the elevator type stored in the xarray. + */ +static void blk_mq_elv_switch_back(struct request_queue *q, + struct xarray *elv_tbl) +{ + struct elevator_type *e = xa_load(elv_tbl, q->id); + + /* The elv_update_nr_hw_queues unfreezes the queue. */ + elv_update_nr_hw_queues(q, e); + + /* Drop the reference acquired in blk_mq_elv_switch_none. */ + if (e) + elevator_put(e); +} + +/* + * Stores elevator type in xarray and set current elevator to none. It uses + * q->id as an index to store the elevator type into the xarray. + */ +static int blk_mq_elv_switch_none(struct request_queue *q, + struct xarray *elv_tbl) +{ + int ret = 0; + + lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock); + + /* + * Accessing q->elevator without holding q->elevator_lock is safe here + * because we're called from nr_hw_queue update which is protected by + * set->update_nr_hwq_lock in the writer context. So, scheduler update/ + * switch code (which acquires the same lock in the reader context) + * can't run concurrently. + */ + if (q->elevator) { + + ret = xa_insert(elv_tbl, q->id, q->elevator->type, GFP_KERNEL); + if (WARN_ON_ONCE(ret)) + return ret; + + /* + * Before we switch elevator to 'none', take a reference to + * the elevator module so that while nr_hw_queue update is + * running, no one can remove elevator module. We'd put the + * reference to elevator module later when we switch back + * elevator. 
+ */ + __elevator_get(q->elevator->type); + + elevator_set_none(q); + } + return ret; +} + static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { @@ -4973,6 +5031,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int prev_nr_hw_queues = set->nr_hw_queues; unsigned int memflags; int i; + struct xarray elv_tbl; lockdep_assert_held(&set->tag_list_lock); @@ -4984,6 +5043,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, return; memflags = memalloc_noio_save(); + + xa_init(&elv_tbl); + list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_debugfs_unregister_hctxs(q); blk_mq_sysfs_unregister_hctxs(q); @@ -4992,11 +5054,17 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue_nomemsave(q); - if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) { - list_for_each_entry(q, &set->tag_list, tag_set_list) - blk_mq_unfreeze_queue_nomemrestore(q); - goto reregister; - } + /* + * Switch IO scheduler to 'none', cleaning up the data associated + * with the previous scheduler. We will switch back once we are done + * updating the new sw to hw queue mappings. + */ + list_for_each_entry(q, &set->tag_list, tag_set_list) + if (blk_mq_elv_switch_none(q, &elv_tbl)) + goto switch_back; + + if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) + goto switch_back; fallback: blk_mq_update_queue_map(set); @@ -5016,12 +5084,11 @@ fallback: } blk_mq_map_swqueue(q); } - - /* elv_update_nr_hw_queues() unfreeze queue for us */ +switch_back: + /* The blk_mq_elv_switch_back unfreezes queue for us. */ list_for_each_entry(q, &set->tag_list, tag_set_list) - elv_update_nr_hw_queues(q); + blk_mq_elv_switch_back(q, &elv_tbl); -reregister: list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register_hctxs(q); blk_mq_debugfs_register_hctxs(q); @@ -5029,6 +5096,9 @@ reregister: blk_mq_remove_hw_queues_cpuhp(q); blk_mq_add_hw_queues_cpuhp(q); } + + xa_destroy(&elv_tbl); + memalloc_noio_restore(memflags); /* Free the excess tags when nr_hw_queues shrink. 
*/ diff --git a/block/blk-settings.c b/block/blk-settings.c index a000daafbfb4..91449147bae9 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -14,6 +14,8 @@ #include <linux/jiffies.h> #include <linux/gfp.h> #include <linux/dma-mapping.h> +#include <linux/t10-pi.h> +#include <linux/crc64.h> #include "blk.h" #include "blk-rq-qos.h" @@ -50,6 +52,8 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_sectors = UINT_MAX; lim->max_dev_sectors = UINT_MAX; lim->max_write_zeroes_sectors = UINT_MAX; + lim->max_hw_wzeroes_unmap_sectors = UINT_MAX; + lim->max_user_wzeroes_unmap_sectors = UINT_MAX; lim->max_hw_zone_append_sectors = UINT_MAX; lim->max_user_discard_sectors = UINT_MAX; } @@ -114,7 +118,7 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) { struct blk_integrity *bi = &lim->integrity; - if (!bi->tuple_size) { + if (!bi->metadata_size) { if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE || bi->tag_size || ((bi->flags & BLK_INTEGRITY_REF_TAG))) { pr_warn("invalid PI settings.\n"); @@ -135,6 +139,42 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) return -EINVAL; } + if (bi->pi_tuple_size > bi->metadata_size) { + pr_warn("pi_tuple_size (%u) exceeds metadata_size (%u)\n", + bi->pi_tuple_size, + bi->metadata_size); + return -EINVAL; + } + + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_NONE: + if (bi->pi_tuple_size) { + pr_warn("pi_tuple_size must be 0 when checksum type \ + is none\n"); + return -EINVAL; + } + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + if (bi->pi_tuple_size != sizeof(struct t10_pi_tuple)) { + pr_warn("pi_tuple_size mismatch for T10 PI: expected \ + %zu, got %u\n", + sizeof(struct t10_pi_tuple), + bi->pi_tuple_size); + return -EINVAL; + } + break; + case BLK_INTEGRITY_CSUM_CRC64: + if (bi->pi_tuple_size != sizeof(struct crc64_pi_tuple)) { + pr_warn("pi_tuple_size mismatch for CRC64 PI: \ + expected %zu, got %u\n", + sizeof(struct crc64_pi_tuple), + bi->pi_tuple_size); + return -EINVAL; + } + break; + } + if (!bi->interval_exp) bi->interval_exp = ilog2(lim->logical_block_size); @@ -181,6 +221,8 @@ static void blk_atomic_writes_update_limits(struct queue_limits *lim) static void blk_validate_atomic_write_limits(struct queue_limits *lim) { unsigned int boundary_sectors; + unsigned int atomic_write_hw_max_sectors = + lim->atomic_write_hw_max >> SECTOR_SHIFT; if (!(lim->features & BLK_FEAT_ATOMIC_WRITES)) goto unsupported; @@ -202,6 +244,10 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim) lim->atomic_write_hw_max)) goto unsupported; + if (WARN_ON_ONCE(lim->chunk_sectors && + atomic_write_hw_max_sectors > lim->chunk_sectors)) + goto unsupported; + boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT; if (boundary_sectors) { @@ -333,6 +379,12 @@ int blk_validate_limits(struct queue_limits *lim) if (!lim->max_segments) lim->max_segments = BLK_MAX_SEGMENTS; + if (lim->max_hw_wzeroes_unmap_sectors && + lim->max_hw_wzeroes_unmap_sectors != lim->max_write_zeroes_sectors) + return -EINVAL; + lim->max_wzeroes_unmap_sectors = min(lim->max_hw_wzeroes_unmap_sectors, + lim->max_user_wzeroes_unmap_sectors); + lim->max_discard_sectors = min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors); @@ -418,10 +470,11 @@ int blk_set_default_limits(struct queue_limits *lim) { /* * Most defaults are set by capping the bounds in blk_validate_limits, - * but max_user_discard_sectors is special and needs an explicit - * initialization to the max value here. 
+ * but these limits are special and need an explicit initialization to + * the max value here. */ lim->max_user_discard_sectors = UINT_MAX; + lim->max_user_wzeroes_unmap_sectors = UINT_MAX; return blk_validate_limits(lim); } @@ -589,41 +642,50 @@ static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t, return true; } - -/* Check stacking of first bottom device */ -static bool blk_stack_atomic_writes_head(struct queue_limits *t, - struct queue_limits *b) +static void blk_stack_atomic_writes_chunk_sectors(struct queue_limits *t) { - if (b->atomic_write_hw_boundary && - !blk_stack_atomic_writes_boundary_head(t, b)) - return false; + unsigned int chunk_bytes; - if (t->io_min <= SECTOR_SIZE) { - /* No chunk sectors, so use bottom device values directly */ - t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; - t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min; - t->atomic_write_hw_max = b->atomic_write_hw_max; - return true; - } + if (!t->chunk_sectors) + return; + + /* + * If chunk sectors is so large that its value in bytes overflows + * UINT_MAX, then just shift it down so it definitely will fit. + * We don't support atomic writes of such a large size anyway. + */ + if (check_shl_overflow(t->chunk_sectors, SECTOR_SHIFT, &chunk_bytes)) + chunk_bytes = t->chunk_sectors; /* * Find values for limits which work for chunk size. * b->atomic_write_hw_unit_{min, max} may not be aligned with chunk - * size (t->io_min), as chunk size is not restricted to a power-of-2. + * size, as the chunk size is not restricted to a power-of-2. * So we need to find highest power-of-2 which works for the chunk * size. - * As an example scenario, we could have b->unit_max = 16K and - * t->io_min = 24K. For this case, reduce t->unit_max to a value - * aligned with both limits, i.e. 8K in this example. + * As an example scenario, we could have t->unit_max = 16K and + * t->chunk_sectors = 24KB. For this case, reduce t->unit_max to a + * value aligned with both limits, i.e. 8K in this example. 
*/ - t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; - while (t->io_min % t->atomic_write_hw_unit_max) - t->atomic_write_hw_unit_max /= 2; + t->atomic_write_hw_unit_max = min(t->atomic_write_hw_unit_max, + max_pow_of_two_factor(chunk_bytes)); - t->atomic_write_hw_unit_min = min(b->atomic_write_hw_unit_min, + t->atomic_write_hw_unit_min = min(t->atomic_write_hw_unit_min, t->atomic_write_hw_unit_max); - t->atomic_write_hw_max = min(b->atomic_write_hw_max, t->io_min); + t->atomic_write_hw_max = min(t->atomic_write_hw_max, chunk_bytes); +} +/* Check stacking of first bottom device */ +static bool blk_stack_atomic_writes_head(struct queue_limits *t, + struct queue_limits *b) +{ + if (b->atomic_write_hw_boundary && + !blk_stack_atomic_writes_boundary_head(t, b)) + return false; + + t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; + t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min; + t->atomic_write_hw_max = b->atomic_write_hw_max; return true; } @@ -651,6 +713,7 @@ static void blk_stack_atomic_writes_limits(struct queue_limits *t, if (!blk_stack_atomic_writes_head(t, b)) goto unsupported; + blk_stack_atomic_writes_chunk_sectors(t); return; unsupported: @@ -708,6 +771,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, b->max_write_zeroes_sectors); + t->max_user_wzeroes_unmap_sectors = + min(t->max_user_wzeroes_unmap_sectors, + b->max_user_wzeroes_unmap_sectors); + t->max_hw_wzeroes_unmap_sectors = + min(t->max_hw_wzeroes_unmap_sectors, + b->max_hw_wzeroes_unmap_sectors); + t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors, b->max_hw_zone_append_sectors); @@ -875,7 +945,7 @@ bool queue_limits_stack_integrity(struct queue_limits *t, return true; if (ti->flags & BLK_INTEGRITY_STACKED) { - if (ti->tuple_size != bi->tuple_size) + if (ti->metadata_size != bi->metadata_size) goto incompatible; if (ti->interval_exp != bi->interval_exp) goto incompatible; @@ -891,7 +961,7 @@ bool queue_limits_stack_integrity(struct queue_limits *t, ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) | (bi->flags & BLK_INTEGRITY_REF_TAG); ti->csum_type = bi->csum_type; - ti->tuple_size = bi->tuple_size; + ti->metadata_size = bi->metadata_size; ti->pi_offset = bi->pi_offset; ti->interval_exp = bi->interval_exp; ti->tag_size = bi->tag_size; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index c611444480b3..396cded255ea 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -161,6 +161,8 @@ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \ QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_discard_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_wzeroes_unmap_sectors) +QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_wzeroes_unmap_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors) QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors) @@ -205,6 +207,24 @@ static int queue_max_discard_sectors_store(struct gendisk *disk, return 0; } +static int queue_max_wzeroes_unmap_sectors_store(struct gendisk *disk, + const char *page, size_t count, struct queue_limits *lim) +{ + unsigned long max_zeroes_bytes, max_hw_zeroes_bytes; + ssize_t ret; + + ret = 
queue_var_store(&max_zeroes_bytes, page, count); + if (ret < 0) + return ret; + + max_hw_zeroes_bytes = lim->max_hw_wzeroes_unmap_sectors << SECTOR_SHIFT; + if (max_zeroes_bytes != 0 && max_zeroes_bytes != max_hw_zeroes_bytes) + return -EINVAL; + + lim->max_user_wzeroes_unmap_sectors = max_zeroes_bytes >> SECTOR_SHIFT; + return 0; +} + static int queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count, struct queue_limits *lim) @@ -514,6 +534,10 @@ QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes"); +QUEUE_LIM_RO_ENTRY(queue_max_hw_wzeroes_unmap_sectors, + "write_zeroes_unmap_max_hw_bytes"); +QUEUE_LIM_RW_ENTRY(queue_max_wzeroes_unmap_sectors, + "write_zeroes_unmap_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); @@ -662,6 +686,8 @@ static struct attribute *queue_attrs[] = { &queue_atomic_write_unit_min_entry.attr, &queue_atomic_write_unit_max_entry.attr, &queue_max_write_zeroes_sectors_entry.attr, + &queue_max_hw_wzeroes_unmap_sectors_entry.attr, + &queue_max_wzeroes_unmap_sectors_entry.attr, &queue_max_zone_append_sectors_entry.attr, &queue_zone_write_granularity_entry.attr, &queue_rotational_entry.attr, diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 351d659280e1..ef43aaca49f4 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -17,6 +17,8 @@ #include <linux/refcount.h> #include <linux/mempool.h> +#include <trace/events/block.h> + #include "blk.h" #include "blk-mq-sched.h" #include "blk-mq-debugfs.h" @@ -177,6 +179,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev) struct bio bio; bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC); + trace_blkdev_zone_mgmt(&bio, 0); return submit_bio_wait(&bio); } @@ -240,6 +243,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, cond_resched(); } + trace_blkdev_zone_mgmt(bio, nr_sectors); ret = submit_bio_wait(bio); bio_put(bio); @@ -818,6 +822,8 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk, * at the tail of the list to preserve the sequential write order. */ bio_list_add(&zwplug->bio_list, bio); + trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no, + bio->bi_iter.bi_sector, bio_sectors(bio)); zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; @@ -1116,25 +1122,7 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) { struct block_device *bdev = bio->bi_bdev; - if (!bdev->bd_disk->zone_wplugs_hash) - return false; - - /* - * If the BIO already has the plugging flag set, then it was already - * handled through this path and this is a submission from the zone - * plug bio submit work. - */ - if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) - return false; - - /* - * We do not need to do anything special for empty flush BIOs, e.g - * BIOs such as issued by blkdev_issue_flush(). The is because it is - * the responsibility of the user to first wait for the completion of - * write operations for flush to have any effect on the persistence of - * the written data. 
- */ - if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash)) return false; /* @@ -1205,6 +1193,20 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk, spin_unlock_irqrestore(&zwplug->lock, flags); } +void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio) +{ + /* + * For zone append requests, the request sector indicates the location + * at which the BIO data was written. Return this value to the BIO + * issuer through the BIO iter sector. + * For plugged zone writes, which include emulated zone append, we need + * the original BIO sector so that blk_zone_write_plug_bio_endio() can + * lookup the zone write plug. + */ + bio->bi_iter.bi_sector = rq->__sector; + trace_blk_zone_append_update_request_bio(rq); +} + void blk_zone_write_plug_bio_endio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; @@ -1299,6 +1301,9 @@ again: goto put_zwplug; } + trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no, + bio->bi_iter.bi_sector, bio_sectors(bio)); + if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { blk_zone_wplug_bio_io_error(zwplug, bio); goto again; diff --git a/block/blk.h b/block/blk.h index 37ec459fe656..76901a39997f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -13,6 +13,15 @@ struct elevator_type; +/* + * Default upper limit for the software max_sectors limit used for regular I/Os. + * This can be increased through sysfs. + * + * This should not be confused with the max_hw_sector limit that is entirely + * controlled by the block device driver, usually based on hardware limits. + */ +#define BLK_DEF_MAX_SECTORS_CAP (SZ_4M >> SECTOR_SHIFT) + #define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9) #define BLK_MIN_SEGMENT_SIZE 4096 @@ -321,7 +330,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, bool blk_insert_flush(struct request *rq); -void elv_update_nr_hw_queues(struct request_queue *q); +void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e); void elevator_set_default(struct request_queue *q); void elevator_set_none(struct request_queue *q); @@ -467,23 +476,15 @@ static inline bool bio_zone_write_plugging(struct bio *bio) { return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); } -void blk_zone_write_plug_bio_merged(struct bio *bio); -void blk_zone_write_plug_init_request(struct request *rq); -static inline void blk_zone_update_request_bio(struct request *rq, - struct bio *bio) +static inline bool blk_req_bio_is_zone_append(struct request *rq, + struct bio *bio) { - /* - * For zone append requests, the request sector indicates the location - * at which the BIO data was written. Return this value to the BIO - * issuer through the BIO iter sector. - * For plugged zone writes, which include emulated zone append, we need - * the original BIO sector so that blk_zone_write_plug_bio_endio() can - * lookup the zone write plug. 
- */ - if (req_op(rq) == REQ_OP_ZONE_APPEND || - bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) - bio->bi_iter.bi_sector = rq->__sector; + return req_op(rq) == REQ_OP_ZONE_APPEND || + bio_flagged(bio, BIO_EMULATES_ZONE_APPEND); } +void blk_zone_write_plug_bio_merged(struct bio *bio); +void blk_zone_write_plug_init_request(struct request *rq); +void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio); void blk_zone_write_plug_bio_endio(struct bio *bio); static inline void blk_zone_bio_endio(struct bio *bio) { @@ -516,14 +517,19 @@ static inline bool bio_zone_write_plugging(struct bio *bio) { return false; } +static inline bool blk_req_bio_is_zone_append(struct request *req, + struct bio *bio) +{ + return false; +} static inline void blk_zone_write_plug_bio_merged(struct bio *bio) { } static inline void blk_zone_write_plug_init_request(struct request *rq) { } -static inline void blk_zone_update_request_bio(struct request *rq, - struct bio *bio) +static inline void blk_zone_append_update_request_bio(struct request *rq, + struct bio *bio) { } static inline void blk_zone_bio_endio(struct bio *bio) diff --git a/block/elevator.c b/block/elevator.c index ab22542e6cf0..88f8f36bed98 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -689,21 +689,21 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) * The I/O scheduler depends on the number of hardware queues, this forces a * reattachment when nr_hw_queues changes. */ -void elv_update_nr_hw_queues(struct request_queue *q) +void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e) { struct elv_change_ctx ctx = {}; int ret = -ENODEV; WARN_ON_ONCE(q->mq_freeze_depth == 0); - mutex_lock(&q->elevator_lock); - if (q->elevator && !blk_queue_dying(q) && blk_queue_registered(q)) { - ctx.name = q->elevator->type->elevator_name; + if (e && !blk_queue_dying(q) && blk_queue_registered(q)) { + ctx.name = e->elevator_name; + mutex_lock(&q->elevator_lock); /* force to reattach elevator after nr_hw_queue is updated */ ret = elevator_switch(q, &ctx); + mutex_unlock(&q->elevator_lock); } - mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue_nomemrestore(q); if (!ret) WARN_ON_ONCE(elevator_change_done(q, &ctx)); @@ -719,7 +719,8 @@ void elevator_set_default(struct request_queue *q) .name = "mq-deadline", .no_uevent = true, }; - int err = 0; + int err; + struct elevator_type *e; /* now we allow to switch elevator */ blk_queue_flag_clear(QUEUE_FLAG_NO_ELV_SWITCH, q); @@ -732,12 +733,18 @@ void elevator_set_default(struct request_queue *q) * have multiple queues or mq-deadline is not available, default * to "none". 
*/ - if (elevator_find_get(ctx.name) && (q->nr_hw_queues == 1 || - blk_mq_is_shared_tags(q->tag_set->flags))) + e = elevator_find_get(ctx.name); + if (!e) + return; + + if ((q->nr_hw_queues == 1 || + blk_mq_is_shared_tags(q->tag_set->flags))) { err = elevator_change(q, &ctx); - if (err < 0) - pr_warn("\"%s\" elevator initialization, failed %d, " - "falling back to \"none\"\n", ctx.name, err); + if (err < 0) + pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n", + ctx.name, err); + } + elevator_put(e); } void elevator_set_none(struct request_queue *q) diff --git a/block/fops.c b/block/fops.c index 1309861d4c2c..82451ac8ff25 100644 --- a/block/fops.c +++ b/block/fops.c @@ -496,18 +496,21 @@ static void blkdev_readahead(struct readahead_control *rac) mpage_readahead(rac, blkdev_get_block); } -static int blkdev_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata) +static int blkdev_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned len, struct folio **foliop, + void **fsdata) { return block_write_begin(mapping, pos, len, foliop, blkdev_get_block); } -static int blkdev_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct folio *folio, - void *fsdata) +static int blkdev_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { int ret; - ret = block_write_end(file, mapping, pos, len, copied, folio, fsdata); + ret = block_write_end(pos, len, copied, folio); folio_unlock(folio); folio_put(folio); @@ -537,30 +540,42 @@ static void blkdev_readahead(struct readahead_control *rac) iomap_readahead(rac, &blkdev_iomap_ops); } -static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset, unsigned int len) +static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc, + struct folio *folio, u64 offset, unsigned int len, u64 end_pos) { - loff_t isize = i_size_read(inode); + loff_t isize = i_size_read(wpc->inode); if (WARN_ON_ONCE(offset >= isize)) return -EIO; - if (offset >= wpc->iomap.offset && - offset < wpc->iomap.offset + wpc->iomap.length) - return 0; - return blkdev_iomap_begin(inode, offset, isize - offset, - IOMAP_WRITE, &wpc->iomap, NULL); + + if (offset < wpc->iomap.offset || + offset >= wpc->iomap.offset + wpc->iomap.length) { + int error; + + error = blkdev_iomap_begin(wpc->inode, offset, isize - offset, + IOMAP_WRITE, &wpc->iomap, NULL); + if (error) + return error; + } + + return iomap_add_to_ioend(wpc, folio, offset, end_pos, len); } static const struct iomap_writeback_ops blkdev_writeback_ops = { - .map_blocks = blkdev_map_blocks, + .writeback_range = blkdev_writeback_range, + .writeback_submit = iomap_ioend_writeback_submit, }; static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct iomap_writepage_ctx wpc = { }; + struct iomap_writepage_ctx wpc = { + .inode = mapping->host, + .wbc = wbc, + .ops = &blkdev_writeback_ops + }; - return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops); + return iomap_writepages(&wpc); } const struct address_space_operations def_blk_aops = { @@ -711,7 +726,8 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) { - return iomap_file_buffered_write(iocb, from, 
&blkdev_iomap_ops, NULL); + return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL, + NULL); } /* @@ -841,7 +857,7 @@ reexpand: #define BLKDEV_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ - FALLOC_FL_ZERO_RANGE) + FALLOC_FL_ZERO_RANGE | FALLOC_FL_WRITE_ZEROES) static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) @@ -850,11 +866,19 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, struct block_device *bdev = I_BDEV(inode); loff_t end = start + len - 1; loff_t isize; + unsigned int flags; int error; /* Fail if we don't recognize the flags. */ if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) return -EOPNOTSUPP; + /* + * Don't allow writing zeroes if the device does not enable the + * unmap write zeroes operation. + */ + if ((mode & FALLOC_FL_WRITE_ZEROES) && + !bdev_write_zeroes_unmap_sectors(bdev)) + return -EOPNOTSUPP; /* Don't go off the end of the device. */ isize = bdev_nr_bytes(bdev); @@ -877,48 +901,46 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, inode_lock(inode); filemap_invalidate_lock(inode->i_mapping); - /* - * Invalidate the page cache, including dirty pages, for valid - * de-allocate mode calls to fallocate(). - */ switch (mode) { case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: - error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); - if (error) - goto fail; - - error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, - len >> SECTOR_SHIFT, GFP_KERNEL, - BLKDEV_ZERO_NOUNMAP); + flags = BLKDEV_ZERO_NOUNMAP; break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: - error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); - if (error) - goto fail; - - error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, - len >> SECTOR_SHIFT, GFP_KERNEL, - BLKDEV_ZERO_NOFALLBACK); + flags = BLKDEV_ZERO_NOFALLBACK; + break; + case FALLOC_FL_WRITE_ZEROES: + flags = 0; break; default: error = -EOPNOTSUPP; + goto fail; } + /* + * Invalidate the page cache, including dirty pages, for valid + * de-allocate mode calls to fallocate(). 
+ */ + error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); + if (error) + goto fail; + + error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, + len >> SECTOR_SHIFT, GFP_KERNEL, flags); fail: filemap_invalidate_unlock(inode->i_mapping); inode_unlock(inode); return error; } -static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) +static int blkdev_mmap_prepare(struct vm_area_desc *desc) { - struct inode *bd_inode = bdev_file_inode(file); + struct file *file = desc->file; - if (bdev_read_only(I_BDEV(bd_inode))) - return generic_file_readonly_mmap(file, vma); + if (bdev_read_only(I_BDEV(bdev_file_inode(file)))) + return generic_file_readonly_mmap_prepare(desc); - return generic_file_mmap(file, vma); + return generic_file_mmap_prepare(desc); } const struct file_operations def_blk_fops = { @@ -928,7 +950,7 @@ const struct file_operations def_blk_fops = { .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, .iopoll = iocb_bio_iopoll, - .mmap = blkdev_mmap, + .mmap_prepare = blkdev_mmap_prepare, .fsync = blkdev_fsync, .unlocked_ioctl = blkdev_ioctl, #ifdef CONFIG_COMPAT diff --git a/block/ioctl.c b/block/ioctl.c index e472cc1030c6..f7b0006ca45d 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -13,6 +13,7 @@ #include <linux/uaccess.h> #include <linux/pagemap.h> #include <linux/io_uring/cmd.h> +#include <linux/blk-integrity.h> #include <uapi/linux/blkdev.h> #include "blk.h" #include "blk-crypto-internal.h" @@ -644,7 +645,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, case IOC_PR_CLEAR: return blkdev_pr_clear(bdev, mode, argp); default: - return -ENOIOCTLCMD; + return blk_get_meta_cap(bdev, cmd, argp); } } diff --git a/block/t10-pi.c b/block/t10-pi.c index 851db518ee5e..0c4ed9702146 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -56,7 +56,7 @@ static void t10_pi_generate(struct blk_integrity_iter *iter, pi->ref_tag = 0; iter->data_buf += iter->interval; - iter->prot_buf += bi->tuple_size; + iter->prot_buf += bi->metadata_size; iter->seed++; } } @@ -105,7 +105,7 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, next: iter->data_buf += iter->interval; - iter->prot_buf += bi->tuple_size; + iter->prot_buf += bi->metadata_size; iter->seed++; } @@ -125,7 +125,7 @@ next: static void t10_pi_type1_prepare(struct request *rq) { struct blk_integrity *bi = &rq->q->limits.integrity; - const int tuple_sz = bi->tuple_size; + const int tuple_sz = bi->metadata_size; u32 ref_tag = t10_pi_ref_tag(rq); u8 offset = bi->pi_offset; struct bio *bio; @@ -177,7 +177,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { struct blk_integrity *bi = &rq->q->limits.integrity; unsigned intervals = nr_bytes >> bi->interval_exp; - const int tuple_sz = bi->tuple_size; + const int tuple_sz = bi->metadata_size; u32 ref_tag = t10_pi_ref_tag(rq); u8 offset = bi->pi_offset; struct bio *bio; @@ -234,7 +234,7 @@ static void ext_pi_crc64_generate(struct blk_integrity_iter *iter, put_unaligned_be48(0ULL, pi->ref_tag); iter->data_buf += iter->interval; - iter->prot_buf += bi->tuple_size; + iter->prot_buf += bi->metadata_size; iter->seed++; } } @@ -289,7 +289,7 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, next: iter->data_buf += iter->interval; - iter->prot_buf += bi->tuple_size; + iter->prot_buf += bi->metadata_size; iter->seed++; } @@ -299,7 +299,7 @@ next: static void ext_pi_type1_prepare(struct request *rq) { struct blk_integrity *bi = &rq->q->limits.integrity; - const int 
tuple_sz = bi->tuple_size; + const int tuple_sz = bi->metadata_size; u64 ref_tag = ext_pi_ref_tag(rq); u8 offset = bi->pi_offset; struct bio *bio; @@ -340,7 +340,7 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { struct blk_integrity *bi = &rq->q->limits.integrity; unsigned intervals = nr_bytes >> bi->interval_exp; - const int tuple_sz = bi->tuple_size; + const int tuple_sz = bi->metadata_size; u64 ref_tag = ext_pi_ref_tag(rq); u8 offset = bi->pi_offset; struct bio *bio; |
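The blk_get_meta_cap() helper added in block/blk-integrity.c, and wired up as the default case of blkdev_common_ioctl() in block/ioctl.c, reports a device's logical block metadata layout to userspace through FS_IOC_GETLBMD_CAP. A minimal userspace sketch of querying it is shown below; it assumes the uapi definitions of struct logical_block_metadata_cap and FS_IOC_GETLBMD_CAP are visible via <linux/fs.h> on a kernel carrying this change, and it prints only a subset of the fields filled in on the kernel side.

        #include <stdio.h>
        #include <fcntl.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include <linux/fs.h>   /* assumed to provide FS_IOC_GETLBMD_CAP */

        int main(int argc, char **argv)
        {
                struct logical_block_metadata_cap cap = { 0 };
                int fd;

                if (argc != 2) {
                        fprintf(stderr, "usage: %s <blockdev>\n", argv[0]);
                        return 1;
                }

                fd = open(argv[1], O_RDONLY);
                if (fd < 0) {
                        perror("open");
                        return 1;
                }

                if (ioctl(fd, FS_IOC_GETLBMD_CAP, &cap) < 0) {
                        perror("FS_IOC_GETLBMD_CAP");
                        close(fd);
                        return 1;
                }

                /* Field names follow the fill-in code in blk_get_meta_cap(). */
                printf("lbmd_flags:          0x%x\n", (unsigned int)cap.lbmd_flags);
                printf("lbmd_interval:       %u\n", (unsigned int)cap.lbmd_interval);
                printf("lbmd_size:           %u\n", (unsigned int)cap.lbmd_size);
                printf("lbmd_pi_size:        %u\n", (unsigned int)cap.lbmd_pi_size);
                printf("lbmd_pi_offset:      %u\n", (unsigned int)cap.lbmd_pi_offset);
                printf("lbmd_guard_tag_type: %u\n", (unsigned int)cap.lbmd_guard_tag_type);

                close(fd);
                return 0;
        }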
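blk_mq_num_possible_queues() and blk_mq_num_online_queues(), added in block/blk-mq-cpumap.c, clamp a driver's hardware queue budget to the number of possible or online CPUs, with a max_queues argument of 0 meaning no driver-side limit. A rough sketch of how a driver might size its tag set with the new helper follows; the foo_* names, the FOO_MAX_HW_QUEUES constant, and the assumption that the declaration lives in <linux/blk-mq.h> are all illustrative.

        #include <linux/blk-mq.h>   /* assumed home of the new declarations */

        #define FOO_MAX_HW_QUEUES 64   /* hypothetical hardware limit */

        static void foo_init_tag_set(struct blk_mq_tag_set *set)
        {
                /*
                 * Never allocate more hardware queues than there are possible
                 * CPUs; passing 0 instead of FOO_MAX_HW_QUEUES would mean
                 * "no driver-side cap".
                 */
                set->nr_hw_queues = blk_mq_num_possible_queues(FOO_MAX_HW_QUEUES);
        }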
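The blk_rq_dma_map_iter_start()/blk_rq_dma_map_iter_next() pair added in block/blk-mq-dma.c is intended to be driven in a loop by the low-level driver, consuming one DMA segment per call as the kernel-doc above describes. The sketch below follows that calling convention under stated assumptions: foo_append_segment() and the surrounding driver context are hypothetical, and error handling is reduced to returning iter.status.

        #include <linux/blk-mq.h>
        #include <linux/blk-mq-dma.h>

        /* Hypothetical per-command segment consumer provided by the driver. */
        static void foo_append_segment(struct request *req, dma_addr_t addr, u32 len);

        static blk_status_t foo_map_data(struct request *req, struct device *dma_dev,
                                         struct dma_iova_state *state)
        {
                struct blk_dma_iter iter;

                if (!blk_rq_dma_map_iter_start(req, dma_dev, state, &iter))
                        return iter.status;  /* BLK_STS_OK here means nothing to map */

                do {
                        /* Each mapped segment is reported via iter.addr / iter.len. */
                        foo_append_segment(req, iter.addr, iter.len);
                } while (blk_rq_dma_map_iter_next(req, dma_dev, state, &iter));

                /* A failed iteration leaves the error in iter.status. */
                return iter.status;
        }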
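blkdev_fallocate() in block/fops.c now accepts FALLOC_FL_WRITE_ZEROES and returns EOPNOTSUPP when the device does not enable the unmap write zeroes operation, per the bdev_write_zeroes_unmap_sectors() check above. A hedged userspace sketch, assuming the new flag is exposed through <linux/falloc.h> on a kernel with this change:

        #define _GNU_SOURCE
        #include <errno.h>
        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>
        #include <linux/falloc.h>   /* assumed to define FALLOC_FL_WRITE_ZEROES */

        /* Zero a byte range on a block device using the new fallocate mode. */
        int zero_range(const char *dev, off_t start, off_t len)
        {
                int fd = open(dev, O_WRONLY);
                int err = 0;

                if (fd < 0)
                        return -errno;

                if (fallocate(fd, FALLOC_FL_WRITE_ZEROES, start, len) < 0) {
                        err = errno;
                        /* EOPNOTSUPP: device lacks unmapped write-zeroes support. */
                        perror("fallocate(FALLOC_FL_WRITE_ZEROES)");
                }

                close(fd);
                return -err;
        }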