diff options
Diffstat (limited to 'include/linux/blkdev.h')
| -rw-r--r-- | include/linux/blkdev.h | 1193 |
1 files changed, 768 insertions, 425 deletions
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 43d4e073b111..72e34acd439c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -24,6 +24,8 @@ #include <linux/sbitmap.h> #include <linux/uuid.h> #include <linux/xarray.h> +#include <linux/file.h> +#include <linux/lockdep.h> struct module; struct request_queue; @@ -36,19 +38,14 @@ struct blk_flush_queue; struct kiocb; struct pr_ops; struct rq_qos; +struct blk_report_zones_args; struct blk_queue_stats; struct blk_stat_callback; struct blk_crypto_profile; extern const struct device_type disk_type; -extern struct device_type part_type; -extern struct class block_class; - -/* Must be consistent with blk_mq_poll_stats_bkt() */ -#define BLK_MQ_POLL_STATS_BKTS 16 - -/* Doing classic polling */ -#define BLK_MQ_POLL_CLASSIC -1 +extern const struct device_type part_type; +extern const struct class block_class; /* * Maximum number of blkcg policies allowed to be registered concurrently. @@ -110,14 +107,40 @@ enum { struct disk_events; struct badblocks; +enum blk_integrity_checksum { + BLK_INTEGRITY_CSUM_NONE = 0, + BLK_INTEGRITY_CSUM_IP = 1, + BLK_INTEGRITY_CSUM_CRC = 2, + BLK_INTEGRITY_CSUM_CRC64 = 3, +} __packed ; + struct blk_integrity { - const struct blk_integrity_profile *profile; unsigned char flags; - unsigned char tuple_size; + enum blk_integrity_checksum csum_type; + unsigned char metadata_size; + unsigned char pi_offset; unsigned char interval_exp; unsigned char tag_size; + unsigned char pi_tuple_size; }; +typedef unsigned int __bitwise blk_mode_t; + +/* open for reading */ +#define BLK_OPEN_READ ((__force blk_mode_t)(1 << 0)) +/* open for writing */ +#define BLK_OPEN_WRITE ((__force blk_mode_t)(1 << 1)) +/* open exclusively (vs other exclusive openers */ +#define BLK_OPEN_EXCL ((__force blk_mode_t)(1 << 2)) +/* opened with O_NDELAY */ +#define BLK_OPEN_NDELAY ((__force blk_mode_t)(1 << 3)) +/* open for "writes" only for ioctls (specialy hack for floppy.c) */ +#define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4)) +/* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */ +#define BLK_OPEN_RESTRICT_WRITES ((__force blk_mode_t)(1 << 5)) +/* return partition scanning errors */ +#define BLK_OPEN_STRICT_SCAN ((__force blk_mode_t)(1 << 6)) + struct gendisk { /* * major/first_minor/minors should not be set by any new driver, the @@ -150,6 +173,7 @@ struct gendisk { #define GD_ADDED 4 #define GD_SUPPRESS_PART_SCAN 5 #define GD_OWNS_QUEUE 6 +#define GD_ZONE_APPEND_USED 7 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ @@ -161,32 +185,25 @@ struct gendisk { struct list_head slave_bdevs; #endif struct timer_rand_state *random; - atomic_t sync_io; /* RAID */ struct disk_events *ev; -#ifdef CONFIG_BLK_DEV_INTEGRITY - struct kobject integrity_kobj; -#endif /* CONFIG_BLK_DEV_INTEGRITY */ #ifdef CONFIG_BLK_DEV_ZONED /* - * Zoned block device information for request dispatch control. - * nr_zones is the total number of zones of the device. This is always - * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones - * bits which indicates if a zone is conventional (bit set) or - * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones - * bits which indicates if a zone is write locked, that is, if a write - * request targeting the zone was dispatched. - * - * Reads of this information must be protected with blk_queue_enter() / - * blk_queue_exit(). Modifying this information is only allowed while - * no requests are being processed. See also blk_mq_freeze_queue() and - * blk_mq_unfreeze_queue(). + * Zoned block device information. Reads of this information must be + * protected with blk_queue_enter() / blk_queue_exit(). Modifying this + * information is only allowed while no requests are being processed. + * See also blk_mq_freeze_queue() and blk_mq_unfreeze_queue(). */ unsigned int nr_zones; - unsigned int max_open_zones; - unsigned int max_active_zones; - unsigned long *conv_zones_bitmap; - unsigned long *seq_zones_wlock; + unsigned int zone_capacity; + unsigned int last_zone_capacity; + u8 __rcu *zones_cond; + unsigned int zone_wplugs_hash_bits; + atomic_t nr_zone_wplugs; + spinlock_t zone_wplugs_lock; + struct mempool *zone_wplugs_pool; + struct hlist_head *zone_wplugs_hash; + struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ #if IS_ENABLED(CONFIG_CDROM) @@ -196,18 +213,16 @@ struct gendisk { struct badblocks *bb; struct lockdep_map lockdep_map; u64 diskseq; + blk_mode_t open_mode; /* * Independent sector access ranges. This is always NULL for * devices that do not have multiple independent access ranges. */ struct blk_independent_access_ranges *ia_ranges; -}; -static inline bool disk_live(struct gendisk *disk) -{ - return !inode_unhashed(disk->part0->bd_inode); -} + struct mutex rqos_state_mutex; /* rqos state change mutex */ +}; /** * disk_openers - returns how many openers are there for a disk @@ -224,6 +239,19 @@ static inline unsigned int disk_openers(struct gendisk *disk) return atomic_read(&disk->part0->bd_openers); } +/** + * disk_has_partscan - return %true if partition scanning is enabled on a disk + * @disk: disk to check + * + * Returns %true if partitions scanning is enabled for @disk, or %false if + * partition scanning is disabled either permanently or temporarily. + */ +static inline bool disk_has_partscan(struct gendisk *disk) +{ + return !(disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) && + !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state); +} + /* * The gendisk is refcounted by the part0 block_device, and the bd_device * therein is also used for device model presentation in sysfs. @@ -244,9 +272,21 @@ static inline dev_t disk_devt(struct gendisk *disk) return MKDEV(disk->major, disk->first_minor); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * We should strive for 1 << (PAGE_SHIFT + MAX_PAGECACHE_ORDER) + * however we constrain this to what we can validate and test. + */ +#define BLK_MAX_BLOCK_SIZE SZ_64K +#else +#define BLK_MAX_BLOCK_SIZE PAGE_SIZE +#endif + + +/* blk_validate_limits() validates bsize, so drivers don't usually need to */ static inline int blk_validate_block_size(unsigned long bsize) { - if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) + if (bsize < 512 || bsize > BLK_MAX_BLOCK_SIZE || !is_power_of_2(bsize)) return -EINVAL; return 0; @@ -258,29 +298,79 @@ static inline bool blk_op_is_passthrough(blk_opf_t op) return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT; } -/* - * Zoned block device models (zoned limit). - * - * Note: This needs to be ordered from the least to the most severe - * restrictions for the inheritance in blk_stack_limits() to work. - */ -enum blk_zoned_model { - BLK_ZONED_NONE = 0, /* Regular block device */ - BLK_ZONED_HA, /* Host-aware zoned block device */ - BLK_ZONED_HM, /* Host-managed zoned block device */ -}; +/* flags set by the driver in queue_limits.features */ +typedef unsigned int __bitwise blk_features_t; + +/* supports a volatile write cache */ +#define BLK_FEAT_WRITE_CACHE ((__force blk_features_t)(1u << 0)) + +/* supports passing on the FUA bit */ +#define BLK_FEAT_FUA ((__force blk_features_t)(1u << 1)) + +/* rotational device (hard drive or floppy) */ +#define BLK_FEAT_ROTATIONAL ((__force blk_features_t)(1u << 2)) + +/* contributes to the random number pool */ +#define BLK_FEAT_ADD_RANDOM ((__force blk_features_t)(1u << 3)) + +/* do disk/partitions IO accounting */ +#define BLK_FEAT_IO_STAT ((__force blk_features_t)(1u << 4)) + +/* don't modify data until writeback is done */ +#define BLK_FEAT_STABLE_WRITES ((__force blk_features_t)(1u << 5)) + +/* always completes in submit context */ +#define BLK_FEAT_SYNCHRONOUS ((__force blk_features_t)(1u << 6)) + +/* supports REQ_NOWAIT */ +#define BLK_FEAT_NOWAIT ((__force blk_features_t)(1u << 7)) + +/* supports DAX */ +#define BLK_FEAT_DAX ((__force blk_features_t)(1u << 8)) + +/* supports I/O polling */ +#define BLK_FEAT_POLL ((__force blk_features_t)(1u << 9)) + +/* is a zoned device */ +#define BLK_FEAT_ZONED ((__force blk_features_t)(1u << 10)) + +/* supports PCI(e) p2p requests */ +#define BLK_FEAT_PCI_P2PDMA ((__force blk_features_t)(1u << 12)) + +/* skip this queue in blk_mq_(un)quiesce_tagset */ +#define BLK_FEAT_SKIP_TAGSET_QUIESCE ((__force blk_features_t)(1u << 13)) + +/* undocumented magic for bcache */ +#define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ + ((__force blk_features_t)(1u << 15)) + +/* atomic writes enabled */ +#define BLK_FEAT_ATOMIC_WRITES \ + ((__force blk_features_t)(1u << 16)) /* - * BLK_BOUNCE_NONE: never bounce (default) - * BLK_BOUNCE_HIGH: bounce all highmem pages + * Flags automatically inherited when stacking limits. */ -enum blk_bounce { - BLK_BOUNCE_NONE, - BLK_BOUNCE_HIGH, -}; +#define BLK_FEAT_INHERIT_MASK \ + (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \ + BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | \ + BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE) + +/* internal flags in queue_limits.flags */ +typedef unsigned int __bitwise blk_flags_t; + +/* do not send FLUSH/FUA commands despite advertising a write cache */ +#define BLK_FLAG_WRITE_CACHE_DISABLED ((__force blk_flags_t)(1u << 0)) + +/* I/O topology is misaligned */ +#define BLK_FLAG_MISALIGNED ((__force blk_flags_t)(1u << 1)) + +/* passthrough command IO accounting */ +#define BLK_FLAG_IOSTATS_PASSTHROUGH ((__force blk_flags_t)(1u << 2)) struct queue_limits { - enum blk_bounce bounce; + blk_features_t features; + blk_flags_t flags; unsigned long seg_boundary_mask; unsigned long virt_boundary_mask; @@ -288,7 +378,9 @@ struct queue_limits { unsigned int max_dev_sectors; unsigned int chunk_sectors; unsigned int max_sectors; + unsigned int max_user_sectors; unsigned int max_segment_size; + unsigned int max_fast_segment_size; unsigned int physical_block_size; unsigned int logical_block_size; unsigned int alignment_offset; @@ -296,21 +388,37 @@ struct queue_limits { unsigned int io_opt; unsigned int max_discard_sectors; unsigned int max_hw_discard_sectors; + unsigned int max_user_discard_sectors; unsigned int max_secure_erase_sectors; unsigned int max_write_zeroes_sectors; + unsigned int max_wzeroes_unmap_sectors; + unsigned int max_hw_wzeroes_unmap_sectors; + unsigned int max_user_wzeroes_unmap_sectors; + unsigned int max_hw_zone_append_sectors; unsigned int max_zone_append_sectors; unsigned int discard_granularity; unsigned int discard_alignment; unsigned int zone_write_granularity; + /* atomic write limits */ + unsigned int atomic_write_hw_max; + unsigned int atomic_write_max_sectors; + unsigned int atomic_write_hw_boundary; + unsigned int atomic_write_boundary_sectors; + unsigned int atomic_write_hw_unit_min; + unsigned int atomic_write_unit_min; + unsigned int atomic_write_hw_unit_max; + unsigned int atomic_write_unit_max; + unsigned short max_segments; unsigned short max_integrity_segments; unsigned short max_discard_segments; - unsigned char misaligned; - unsigned char discard_misaligned; - unsigned char raid_partial_stripes_expensive; - enum blk_zoned_model zoned; + unsigned short max_write_streams; + unsigned int write_stream_granularity; + + unsigned int max_open_zones; + unsigned int max_active_zones; /* * Drivers that set dma_alignment to less than 511 must be prepared to @@ -318,52 +426,28 @@ struct queue_limits { * due to possible offsets. */ unsigned int dma_alignment; + unsigned int dma_pad_mask; + + struct blk_integrity integrity; }; typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); -void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model); +int disk_report_zone(struct gendisk *disk, struct blk_zone *zone, + unsigned int idx, struct blk_report_zones_args *args); -#ifdef CONFIG_BLK_DEV_ZONED +int blkdev_get_zone_info(struct block_device *bdev, sector_t sector, + struct blk_zone *zone); #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); -unsigned int bdev_nr_zones(struct block_device *bdev); -extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, - sector_t sectors, sector_t nr_sectors, - gfp_t gfp_mask); -int blk_revalidate_disk_zones(struct gendisk *disk, - void (*update_driver_data)(struct gendisk *disk)); - -extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg); -extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg); - -#else /* CONFIG_BLK_DEV_ZONED */ - -static inline unsigned int bdev_nr_zones(struct block_device *bdev) -{ - return 0; -} - -static inline int blkdev_report_zones_ioctl(struct block_device *bdev, - fmode_t mode, unsigned int cmd, - unsigned long arg) -{ - return -ENOTTY; -} - -static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, - fmode_t mode, unsigned int cmd, - unsigned long arg) -{ - return -ENOTTY; -} - -#endif /* CONFIG_BLK_DEV_ZONED */ + unsigned int nr_zones, report_zones_cb cb, void *data); +int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); +int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, + sector_t sectors, sector_t nr_sectors); +int blk_revalidate_disk_zones(struct gendisk *disk); /* * Independent access ranges: struct blk_independent_access_range describes @@ -393,61 +477,55 @@ struct blk_independent_access_ranges { }; struct request_queue { - struct request *last_merge; - struct elevator_queue *elevator; - - struct percpu_ref q_usage_counter; + /* + * The queue owner gets to use this for whatever they like. + * ll_rw_blk doesn't touch it. + */ + void *queuedata; - struct blk_queue_stats *stats; - struct rq_qos *rq_qos; + struct elevator_queue *elevator; const struct blk_mq_ops *mq_ops; /* sw queues */ struct blk_mq_ctx __percpu *queue_ctx; + /* + * various queue flags, see QUEUE_* below + */ + unsigned long queue_flags; + + unsigned int __data_racy rq_timeout; + unsigned int queue_depth; + refcount_t refs; + /* hw dispatch queues */ - struct xarray hctx_table; unsigned int nr_hw_queues; + struct blk_mq_hw_ctx * __rcu *queue_hw_ctx; - /* - * The queue owner gets to use this for whatever they like. - * ll_rw_blk doesn't touch it. - */ - void *queuedata; + struct percpu_ref q_usage_counter; + struct lock_class_key io_lock_cls_key; + struct lockdep_map io_lockdep_map; - /* - * various queue flags, see QUEUE_* below - */ - unsigned long queue_flags; - /* - * Number of contexts that have called blk_set_pm_only(). If this - * counter is above zero then only RQF_PM requests are processed. - */ - atomic_t pm_only; + struct lock_class_key q_lock_cls_key; + struct lockdep_map q_lockdep_map; - /* - * ida allocated id for this queue. Used to index queues from - * ioctx. - */ - int id; + struct request *last_merge; spinlock_t queue_lock; - struct gendisk *disk; + int quiesce_depth; - refcount_t refs; + struct gendisk *disk; /* * mq queue kobject */ struct kobject *mq_kobj; -#ifdef CONFIG_BLK_DEV_INTEGRITY - struct blk_integrity integrity; -#endif /* CONFIG_BLK_DEV_INTEGRITY */ + struct queue_limits limits; #ifdef CONFIG_PM struct device *dev; @@ -455,23 +533,31 @@ struct request_queue { #endif /* + * Number of contexts that have called blk_set_pm_only(). If this + * counter is above zero then only RQF_PM requests are processed. + */ + atomic_t pm_only; + + struct blk_queue_stats *stats; + struct rq_qos *rq_qos; + struct mutex rq_qos_mutex; + + /* + * ida allocated id for this queue. Used to index queues from + * ioctx. + */ + int id; + + /* * queue settings */ unsigned long nr_requests; /* Max # of requests */ - unsigned int dma_pad_mask; - #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct blk_crypto_profile *crypto_profile; struct kobject *crypto_kobject; #endif - unsigned int rq_timeout; - int poll_nsec; - - struct blk_stat_callback *poll_cb; - struct blk_rq_stat *poll_stat; - struct timer_list timeout; struct work_struct timeout_work; @@ -484,13 +570,15 @@ struct request_queue { DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS); struct blkcg_gq *root_blkg; struct list_head blkg_list; + struct mutex blkcg_mutex; #endif - struct queue_limits limits; + int node; - unsigned int required_elevator_features; + spinlock_t requeue_lock; + struct list_head requeue_list; + struct delayed_work requeue_work; - int node; #ifdef CONFIG_BLK_DEV_IO_TRACE struct blk_trace __rcu *blk_trace; #endif @@ -498,13 +586,25 @@ struct request_queue { * for flush operations */ struct blk_flush_queue *fq; + struct list_head flush_list; - struct list_head requeue_list; - spinlock_t requeue_lock; - struct delayed_work requeue_work; + /* + * Protects against I/O scheduler switching, particularly when updating + * q->elevator. Since the elevator update code path may also modify q-> + * nr_requests and wbt latency, this lock also protects the sysfs attrs + * nr_requests and wbt_lat_usec. Additionally the nr_hw_queues update + * may modify hctx tags, reserved-tags and cpumask, so this lock also + * helps protect the hctx sysfs/debugfs attrs. To ensure proper locking + * order during an elevator or nr_hw_queue update, first freeze the + * queue, then acquire ->elevator_lock. + */ + struct mutex elevator_lock; struct mutex sysfs_lock; - struct mutex sysfs_dir_lock; + /* + * Protects queue limits and also sysfs attribute read_ahead_kb. + */ + struct mutex limits_lock; /* * for reusing dead hctx instance in case of updating @@ -520,6 +620,16 @@ struct request_queue { struct throtl_data *td; #endif struct rcu_head rcu_head; +#ifdef CONFIG_LOCKDEP + struct task_struct *mq_freeze_owner; + int mq_freeze_owner_depth; + /* + * Records disk & queue state in current context, used in unfreeze + * queue + */ + bool mq_freeze_disk_dead; + bool mq_freeze_queue_dying; +#endif wait_queue_head_t mq_freeze_wq; /* * Protect concurrent access to q_usage_counter by @@ -527,8 +637,6 @@ struct request_queue { */ struct mutex mq_freeze_lock; - int quiesce_depth; - struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; @@ -539,63 +647,46 @@ struct request_queue { * Serializes all debugfs metadata operations using the above dentries. */ struct mutex debugfs_mutex; - - bool mq_sysfs_init_done; }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ -#define QUEUE_FLAG_STOPPED 0 /* queue is stopped */ -#define QUEUE_FLAG_DYING 1 /* queue being torn down */ -#define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ -#define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ -#define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ -#define QUEUE_FLAG_NONROT 6 /* non-rotational device (SSD) */ -#define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ -#define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */ -#define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ -#define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ -#define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ -#define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ -#define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ -#define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ -#define QUEUE_FLAG_WC 17 /* Write back caching */ -#define QUEUE_FLAG_FUA 18 /* device supports FUA writes */ -#define QUEUE_FLAG_DAX 19 /* device supports DAX */ -#define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ -#define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ -#define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ -#define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ -#define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */ -#define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ -#define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ -#define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ -#define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ -#define QUEUE_FLAG_SKIP_TAGSET_QUIESCE 31 /* quiesce_tagset skip the queue*/ - -#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_IO_STAT) | \ - (1UL << QUEUE_FLAG_SAME_COMP) | \ - (1UL << QUEUE_FLAG_NOWAIT)) +enum { + QUEUE_FLAG_DYING, /* queue being torn down */ + QUEUE_FLAG_NOMERGES, /* disable merge attempts */ + QUEUE_FLAG_SAME_COMP, /* complete on same CPU-group */ + QUEUE_FLAG_FAIL_IO, /* fake timeout */ + QUEUE_FLAG_NOXMERGES, /* No extended merges */ + QUEUE_FLAG_SAME_FORCE, /* force complete on same CPU */ + QUEUE_FLAG_INIT_DONE, /* queue is initialized */ + QUEUE_FLAG_STATS, /* track IO start and completion times */ + QUEUE_FLAG_REGISTERED, /* queue has been registered to a disk */ + QUEUE_FLAG_QUIESCED, /* queue has been quiesced */ + QUEUE_FLAG_RQ_ALLOC_TIME, /* record rq->alloc_time_ns */ + QUEUE_FLAG_HCTX_ACTIVE, /* at least one blk-mq hctx is active */ + QUEUE_FLAG_SQ_SCHED, /* single queue style io dispatch */ + QUEUE_FLAG_DISABLE_WBT_DEF, /* for sched to disable/enable wbt */ + QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */ + QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */ + QUEUE_FLAG_BIO_ISSUE_TIME, /* record bio->issue_time_ns */ + QUEUE_FLAG_MAX +}; + +#define QUEUE_FLAG_MQ_DEFAULT (1UL << QUEUE_FLAG_SAME_COMP) void blk_queue_flag_set(unsigned int flag, struct request_queue *q); void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); -bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); -#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) -#define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) -#define blk_queue_stable_writes(q) \ - test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags) -#define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) -#define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) -#define blk_queue_zone_resetall(q) \ - test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags) -#define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) -#define blk_queue_pci_p2pdma(q) \ - test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags) +#define blk_queue_nonrot(q) (!((q)->limits.features & BLK_FEAT_ROTATIONAL)) +#define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) +#define blk_queue_passthrough_stat(q) \ + ((q)->limits.flags & BLK_FLAG_IOSTATS_PASSTHROUGH) +#define blk_queue_dax(q) ((q)->limits.features & BLK_FEAT_DAX) +#define blk_queue_pci_p2pdma(q) ((q)->limits.features & BLK_FEAT_PCI_P2PDMA) #ifdef CONFIG_BLK_RQ_ALLOC_TIME #define blk_queue_rq_alloc_time(q) \ test_bit(QUEUE_FLAG_RQ_ALLOC_TIME, &(q)->queue_flags) @@ -611,7 +702,11 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) #define blk_queue_skip_tagset_quiesce(q) \ - test_bit(QUEUE_FLAG_SKIP_TAGSET_QUIESCE, &(q)->queue_flags) + ((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE) +#define blk_queue_disable_wbt(q) \ + test_bit(QUEUE_FLAG_DISABLE_WBT_DEF, &(q)->queue_flags) +#define blk_queue_no_elv_switch(q) \ + test_bit(QUEUE_FLAG_NO_ELV_SWITCH, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); @@ -639,29 +734,10 @@ static inline enum rpm_status queue_rpm_status(struct request_queue *q) } #endif -static inline enum blk_zoned_model -blk_queue_zoned_model(struct request_queue *q) -{ - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) - return q->limits.zoned; - return BLK_ZONED_NONE; -} - static inline bool blk_queue_is_zoned(struct request_queue *q) { - switch (blk_queue_zoned_model(q)) { - case BLK_ZONED_HA: - case BLK_ZONED_HM: - return true; - default: - return false; - } -} - -#ifdef CONFIG_BLK_DEV_ZONED -static inline unsigned int disk_nr_zones(struct gendisk *disk) -{ - return blk_queue_is_zoned(disk->queue) ? disk->nr_zones : 0; + return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + (q->limits.features & BLK_FEAT_ZONED); } static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) @@ -671,60 +747,15 @@ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) return sector >> ilog2(disk->queue->limits.chunk_sectors); } -static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector) -{ - if (!blk_queue_is_zoned(disk->queue)) - return false; - if (!disk->conv_zones_bitmap) - return true; - return !test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap); -} - -static inline void disk_set_max_open_zones(struct gendisk *disk, - unsigned int max_open_zones) -{ - disk->max_open_zones = max_open_zones; -} - -static inline void disk_set_max_active_zones(struct gendisk *disk, - unsigned int max_active_zones) -{ - disk->max_active_zones = max_active_zones; -} - -static inline unsigned int bdev_max_open_zones(struct block_device *bdev) -{ - return bdev->bd_disk->max_open_zones; -} - -static inline unsigned int bdev_max_active_zones(struct block_device *bdev) -{ - return bdev->bd_disk->max_active_zones; -} - -#else /* CONFIG_BLK_DEV_ZONED */ -static inline unsigned int disk_nr_zones(struct gendisk *disk) -{ - return 0; -} -static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector) -{ - return false; -} -static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) -{ - return 0; -} static inline unsigned int bdev_max_open_zones(struct block_device *bdev) { - return 0; + return bdev->bd_disk->queue->limits.max_open_zones; } static inline unsigned int bdev_max_active_zones(struct block_device *bdev) { - return 0; + return bdev->bd_disk->queue->limits.max_active_zones; } -#endif /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int blk_queue_depth(struct request_queue *q) { @@ -744,6 +775,9 @@ static inline unsigned int blk_queue_depth(struct request_queue *q) #define for_each_bio(_bio) \ for (; _bio; _bio = _bio->bi_next) +int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups, + struct fwnode_handle *fwnode); int __must_check device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups); static inline int __must_check add_disk(struct gendisk *disk) @@ -755,19 +789,40 @@ void invalidate_disk(struct gendisk *disk); void set_disk_ro(struct gendisk *disk, bool read_only); void disk_uevent(struct gendisk *disk, enum kobject_action action); -static inline int get_disk_ro(struct gendisk *disk) +static inline u8 bdev_partno(const struct block_device *bdev) { - return disk->part0->bd_read_only || + return atomic_read(&bdev->__bd_flags) & BD_PARTNO; +} + +static inline bool bdev_test_flag(const struct block_device *bdev, unsigned flag) +{ + return atomic_read(&bdev->__bd_flags) & flag; +} + +static inline void bdev_set_flag(struct block_device *bdev, unsigned flag) +{ + atomic_or(flag, &bdev->__bd_flags); +} + +static inline void bdev_clear_flag(struct block_device *bdev, unsigned flag) +{ + atomic_andnot(flag, &bdev->__bd_flags); +} + +static inline bool get_disk_ro(struct gendisk *disk) +{ + return bdev_test_flag(disk->part0, BD_READ_ONLY) || test_bit(GD_READ_ONLY, &disk->state); } -static inline int bdev_read_only(struct block_device *bdev) +static inline bool bdev_read_only(struct block_device *bdev) { - return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); + return bdev_test_flag(bdev, BD_READ_ONLY) || get_disk_ro(bdev->bd_disk); } bool set_capacity_and_notify(struct gendisk *disk, sector_t size); -bool disk_force_media_change(struct gendisk *disk, unsigned int events); +void disk_force_media_change(struct gendisk *disk); +void bdev_mark_dead(struct block_device *bdev, bool surprise); void add_disk_randomness(struct gendisk *disk) __latent_entropy; void rand_initialize_disk(struct gendisk *disk); @@ -798,25 +853,137 @@ static inline u64 sb_bdev_nr_blocks(struct super_block *sb) (sb->s_blocksize_bits - SECTOR_SHIFT); } +#ifdef CONFIG_BLK_DEV_ZONED +static inline unsigned int disk_nr_zones(struct gendisk *disk) +{ + return disk->nr_zones; +} + +/** + * bio_needs_zone_write_plugging - Check if a BIO needs to be handled with zone + * write plugging + * @bio: The BIO being submitted + * + * Return true whenever @bio execution needs to be handled through zone + * write plugging (using blk_zone_plug_bio()). Return false otherwise. + */ +static inline bool bio_needs_zone_write_plugging(struct bio *bio) +{ + enum req_op op = bio_op(bio); + + /* + * Only zoned block devices have a zone write plug hash table. But not + * all of them have one (e.g. DM devices may not need one). + */ + if (!bio->bi_bdev->bd_disk->zone_wplugs_hash) + return false; + + /* Only write operations need zone write plugging. */ + if (!op_is_write(op)) + return false; + + /* Ignore empty flush */ + if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + return false; + + /* Ignore BIOs that already have been handled by zone write plugging. */ + if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) + return false; + + /* + * All zone write operations must be handled through zone write plugging + * using blk_zone_plug_bio(). + */ + switch (op) { + case REQ_OP_ZONE_APPEND: + case REQ_OP_WRITE: + case REQ_OP_WRITE_ZEROES: + case REQ_OP_ZONE_FINISH: + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_RESET_ALL: + return true; + default: + return false; + } +} + +bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); + +/** + * disk_zone_capacity - returns the zone capacity of zone containing @sector + * @disk: disk to work with + * @sector: sector number within the querying zone + * + * Returns the zone capacity of a zone containing @sector. @sector can be any + * sector in the zone. + */ +static inline unsigned int disk_zone_capacity(struct gendisk *disk, + sector_t sector) +{ + sector_t zone_sectors = disk->queue->limits.chunk_sectors; + + if (sector + zone_sectors >= get_capacity(disk)) + return disk->last_zone_capacity; + return disk->zone_capacity; +} +static inline unsigned int bdev_zone_capacity(struct block_device *bdev, + sector_t pos) +{ + return disk_zone_capacity(bdev->bd_disk, pos); +} + +bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector); + +#else /* CONFIG_BLK_DEV_ZONED */ +static inline unsigned int disk_nr_zones(struct gendisk *disk) +{ + return 0; +} + +static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) +{ + return false; +} + +static inline bool bio_needs_zone_write_plugging(struct bio *bio) +{ + return false; +} + +static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) +{ + return false; +} +#endif /* CONFIG_BLK_DEV_ZONED */ + +static inline unsigned int bdev_nr_zones(struct block_device *bdev) +{ + return disk_nr_zones(bdev->bd_disk); +} + int bdev_disk_changed(struct gendisk *disk, bool invalidate); void put_disk(struct gendisk *disk); -struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); +struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, + struct lock_class_key *lkclass); /** * blk_alloc_disk - allocate a gendisk structure + * @lim: queue limits to be used for this disk. * @node_id: numa node to allocate on * * Allocate and pre-initialize a gendisk structure for use with BIO based * drivers. * + * Returns an ERR_PTR on error, else the allocated disk. + * * Context: can sleep */ -#define blk_alloc_disk(node_id) \ +#define blk_alloc_disk(lim, node_id) \ ({ \ static struct lock_class_key __key; \ \ - __blk_alloc_disk(node_id, &__key); \ + __blk_alloc_disk(lim, node_id, &__key); \ }) int __register_blkdev(unsigned int major, const char *name, @@ -825,8 +992,7 @@ int __register_blkdev(unsigned int major, const char *name, __register_blkdev(major, name, NULL) void unregister_blkdev(unsigned int major, const char *name); -bool bdev_check_media_change(struct block_device *bdev); -int __invalidate_device(struct block_device *bdev, bool kill_dirty); +bool disk_check_media_change(struct gendisk *disk); void set_capacity(struct gendisk *disk, sector_t size); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED @@ -846,13 +1012,14 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, dev_t part_devt(struct gendisk *disk, u8 partno); void inc_diskseq(struct gendisk *disk); -dev_t blk_lookup_devt(const char *name, int partno); void blk_request_module(dev_t devt); extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); void submit_bio_noacct(struct bio *bio); struct bio *bio_split_to_limits(struct bio *bio); +struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors, + struct bio_set *bs); extern int blk_lld_busy(struct request_queue *q); extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); @@ -864,11 +1031,10 @@ extern const char *blk_op_str(enum req_op op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); +const char *blk_status_to_str(blk_status_t status); /* only poll the hardware once, don't continue until a completion was found */ #define BLK_POLL_ONESHOT (1 << 0) -/* do not sleep to wait for the expected completion time */ -#define BLK_POLL_NOSLEEP (1 << 1) int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags); int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, unsigned int flags); @@ -886,86 +1052,111 @@ static inline unsigned int bio_zone_no(struct bio *bio) return disk_zone_no(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector); } -static inline unsigned int bio_zone_is_seq(struct bio *bio) +static inline bool bio_straddles_zones(struct bio *bio) { - return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector); + return bio_sectors(bio) && + bio_zone_no(bio) != + disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1); } /* - * Return how much of the chunk is left to be used for I/O at a given offset. + * Return how much within the boundary is left to be used for I/O at a given + * offset. + */ +static inline unsigned int blk_boundary_sectors_left(sector_t offset, + unsigned int boundary_sectors) +{ + if (unlikely(!is_power_of_2(boundary_sectors))) + return boundary_sectors - sector_div(offset, boundary_sectors); + return boundary_sectors - (offset & (boundary_sectors - 1)); +} + +/** + * queue_limits_start_update - start an atomic update of queue limits + * @q: queue to update + * + * This functions starts an atomic update of the queue limits. It takes a lock + * to prevent other updates and returns a snapshot of the current limits that + * the caller can modify. The caller must call queue_limits_commit_update() + * to finish the update. + * + * Context: process context. */ -static inline unsigned int blk_chunk_sectors_left(sector_t offset, - unsigned int chunk_sectors) +static inline struct queue_limits +queue_limits_start_update(struct request_queue *q) +{ + mutex_lock(&q->limits_lock); + return q->limits; +} +int queue_limits_commit_update_frozen(struct request_queue *q, + struct queue_limits *lim); +int queue_limits_commit_update(struct request_queue *q, + struct queue_limits *lim); +int queue_limits_set(struct request_queue *q, struct queue_limits *lim); +int blk_validate_limits(struct queue_limits *lim); + +/** + * queue_limits_cancel_update - cancel an atomic update of queue limits + * @q: queue to update + * + * This functions cancels an atomic update of the queue limits started by + * queue_limits_start_update() and should be used when an error occurs after + * starting update. + */ +static inline void queue_limits_cancel_update(struct request_queue *q) +{ + mutex_unlock(&q->limits_lock); +} + +/* + * These helpers are for drivers that have sloppy feature negotiation and might + * have to disable DISCARD, WRITE_ZEROES or SECURE_DISCARD from the I/O + * completion handler when the device returned an indicator that the respective + * feature is not actually supported. They are racy and the driver needs to + * cope with that. Try to avoid this scheme if you can. + */ +static inline void blk_queue_disable_discard(struct request_queue *q) +{ + q->limits.max_discard_sectors = 0; +} + +static inline void blk_queue_disable_secure_erase(struct request_queue *q) +{ + q->limits.max_secure_erase_sectors = 0; +} + +static inline void blk_queue_disable_write_zeroes(struct request_queue *q) { - if (unlikely(!is_power_of_2(chunk_sectors))) - return chunk_sectors - sector_div(offset, chunk_sectors); - return chunk_sectors - (offset & (chunk_sectors - 1)); + q->limits.max_write_zeroes_sectors = 0; + q->limits.max_wzeroes_unmap_sectors = 0; } /* * Access functions for manipulating queue properties */ -void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce limit); -extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); -extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); -extern void blk_queue_max_segments(struct request_queue *, unsigned short); -extern void blk_queue_max_discard_segments(struct request_queue *, - unsigned short); -void blk_queue_max_secure_erase_sectors(struct request_queue *q, - unsigned int max_sectors); -extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); -extern void blk_queue_max_discard_sectors(struct request_queue *q, - unsigned int max_discard_sectors); -extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, - unsigned int max_write_same_sectors); -extern void blk_queue_logical_block_size(struct request_queue *, unsigned int); -extern void blk_queue_max_zone_append_sectors(struct request_queue *q, - unsigned int max_zone_append_sectors); -extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); -void blk_queue_zone_write_granularity(struct request_queue *q, - unsigned int size); -extern void blk_queue_alignment_offset(struct request_queue *q, - unsigned int alignment); -void disk_update_readahead(struct gendisk *disk); -extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); -extern void blk_queue_io_min(struct request_queue *q, unsigned int min); -extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); -extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); -extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, - sector_t offset); -extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); -extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); -extern void blk_queue_virt_boundary(struct request_queue *, unsigned long); -extern void blk_queue_dma_alignment(struct request_queue *, int); -extern void blk_queue_update_dma_alignment(struct request_queue *, int); +void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, + sector_t offset, const char *pfx); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); -extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); struct blk_independent_access_ranges * disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges); void disk_set_independent_access_ranges(struct gendisk *disk, struct blk_independent_access_ranges *iars); -/* - * Elevator features for blk_queue_required_elevator_features: - */ -/* Supports zoned block devices sequential write constraint */ -#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0) - -extern void blk_queue_required_elevator_features(struct request_queue *q, - unsigned int features); -extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q, - struct device *dev); - bool __must_check blk_get_queue(struct request_queue *); extern void blk_put_queue(struct request_queue *); void blk_mark_disk_dead(struct gendisk *disk); +struct rq_list { + struct request *head; + struct request *tail; +}; + #ifdef CONFIG_BLOCK /* * blk_plug permits building a queue of related requests by holding the I/O @@ -979,17 +1170,17 @@ void blk_mark_disk_dead(struct gendisk *disk); * blk_flush_plug() is called. */ struct blk_plug { - struct request *mq_list; /* blk-mq requests */ + struct rq_list mq_list; /* blk-mq requests */ /* if ios_left is > 1, we can batch tag/rq allocations */ - struct request *cached_rq; + struct rq_list cached_rqs; + u64 cur_ktime; unsigned short nr_ios; unsigned short rq_count; bool multiple_queues; bool has_elevator; - bool nowait; struct list_head cb_list; /* md requires an unplug callback */ }; @@ -1014,6 +1205,18 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) __blk_flush_plug(plug, async); } +/* + * tsk == current here + */ +static inline void blk_plug_invalidate_ts(struct task_struct *tsk) +{ + struct blk_plug *plug = tsk->plug; + + if (plug) + plug->cur_ktime = 0; + current->flags &= ~PF_BLOCK_TS; +} + int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ @@ -1037,6 +1240,10 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) { } +static inline void blk_plug_invalidate_ts(struct task_struct *tsk) +{ +} + static inline int blkdev_issue_flush(struct block_device *bdev) { return 0; @@ -1059,6 +1266,7 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ #define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */ +#define BLKDEV_ZERO_KILLABLE (1 << 2) /* interruptible by fatal signals */ extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, @@ -1089,17 +1297,21 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, static inline bool bdev_is_partition(struct block_device *bdev) { - return bdev->bd_partno; + return bdev_partno(bdev) != 0; } enum blk_default_limits { BLK_MAX_SEGMENTS = 128, BLK_SAFE_MAX_SECTORS = 255, - BLK_DEF_MAX_SECTORS = 2560, BLK_MAX_SEGMENT_SIZE = 65536, BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, }; +static inline struct queue_limits *bdev_limits(struct block_device *bdev) +{ + return &bdev_get_queue(bdev)->limits; +} + static inline unsigned long queue_segment_boundary(const struct request_queue *q) { return q->limits.seg_boundary_mask; @@ -1140,18 +1352,20 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q) return q->limits.max_segment_size; } -static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q) +static inline bool queue_emulates_zone_append(struct request_queue *q) { + return blk_queue_is_zoned(q) && !q->limits.max_hw_zone_append_sectors; +} - const struct queue_limits *l = &q->limits; - - return min(l->max_zone_append_sectors, l->max_sectors); +static inline bool bdev_emulates_zone_append(struct block_device *bdev) +{ + return queue_emulates_zone_append(bdev_get_queue(bdev)); } static inline unsigned int bdev_max_zone_append_sectors(struct block_device *bdev) { - return queue_max_zone_append_sectors(bdev_get_queue(bdev)); + return bdev_limits(bdev)->max_zone_append_sectors; } static inline unsigned int bdev_max_segments(struct block_device *bdev) @@ -1159,14 +1373,16 @@ static inline unsigned int bdev_max_segments(struct block_device *bdev) return queue_max_segments(bdev_get_queue(bdev)); } -static inline unsigned queue_logical_block_size(const struct request_queue *q) +static inline unsigned short bdev_max_write_streams(struct block_device *bdev) { - int retval = 512; - - if (q && q->limits.logical_block_size) - retval = q->limits.logical_block_size; + if (bdev_is_partition(bdev)) + return 0; + return bdev_limits(bdev)->max_write_streams; +} - return retval; +static inline unsigned queue_logical_block_size(const struct request_queue *q) +{ + return q->limits.logical_block_size; } static inline unsigned int bdev_logical_block_size(struct block_device *bdev) @@ -1189,7 +1405,7 @@ static inline unsigned int queue_io_min(const struct request_queue *q) return q->limits.io_min; } -static inline int bdev_io_min(struct block_device *bdev) +static inline unsigned int bdev_io_min(struct block_device *bdev) { return queue_io_min(bdev_get_queue(bdev)); } @@ -1199,7 +1415,7 @@ static inline unsigned int queue_io_opt(const struct request_queue *q) return q->limits.io_opt; } -static inline int bdev_io_opt(struct block_device *bdev) +static inline unsigned int bdev_io_opt(struct block_device *bdev) { return queue_io_opt(bdev_get_queue(bdev)); } @@ -1221,28 +1437,29 @@ unsigned int bdev_discard_alignment(struct block_device *bdev); static inline unsigned int bdev_max_discard_sectors(struct block_device *bdev) { - return bdev_get_queue(bdev)->limits.max_discard_sectors; + return bdev_limits(bdev)->max_discard_sectors; } static inline unsigned int bdev_discard_granularity(struct block_device *bdev) { - return bdev_get_queue(bdev)->limits.discard_granularity; + return bdev_limits(bdev)->discard_granularity; } static inline unsigned int bdev_max_secure_erase_sectors(struct block_device *bdev) { - return bdev_get_queue(bdev)->limits.max_secure_erase_sectors; + return bdev_limits(bdev)->max_secure_erase_sectors; } static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) { - struct request_queue *q = bdev_get_queue(bdev); - - if (q) - return q->limits.max_write_zeroes_sectors; + return bdev_limits(bdev)->max_write_zeroes_sectors; +} - return 0; +static inline unsigned int +bdev_write_zeroes_unmap_sectors(struct block_device *bdev) +{ + return bdev_limits(bdev)->max_wzeroes_unmap_sectors; } static inline bool bdev_nonrot(struct block_device *bdev) @@ -1250,68 +1467,122 @@ static inline bool bdev_nonrot(struct block_device *bdev) return blk_queue_nonrot(bdev_get_queue(bdev)); } +static inline bool bdev_synchronous(struct block_device *bdev) +{ + return bdev->bd_disk->queue->limits.features & BLK_FEAT_SYNCHRONOUS; +} + static inline bool bdev_stable_writes(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_STABLE_WRITES, - &bdev_get_queue(bdev)->queue_flags); + struct request_queue *q = bdev_get_queue(bdev); + + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + q->limits.integrity.csum_type != BLK_INTEGRITY_CSUM_NONE) + return true; + return q->limits.features & BLK_FEAT_STABLE_WRITES; +} + +static inline bool blk_queue_write_cache(struct request_queue *q) +{ + return (q->limits.features & BLK_FEAT_WRITE_CACHE) && + !(q->limits.flags & BLK_FLAG_WRITE_CACHE_DISABLED); } static inline bool bdev_write_cache(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_WC, &bdev_get_queue(bdev)->queue_flags); + return blk_queue_write_cache(bdev_get_queue(bdev)); } static inline bool bdev_fua(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_FUA, &bdev_get_queue(bdev)->queue_flags); + return bdev_limits(bdev)->features & BLK_FEAT_FUA; } static inline bool bdev_nowait(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags); + return bdev->bd_disk->queue->limits.features & BLK_FEAT_NOWAIT; } -static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) +static inline bool bdev_is_zoned(struct block_device *bdev) +{ + return blk_queue_is_zoned(bdev_get_queue(bdev)); +} + +static inline unsigned int bdev_zone_no(struct block_device *bdev, sector_t sec) +{ + return disk_zone_no(bdev->bd_disk, sec); +} + +static inline sector_t bdev_zone_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); - if (q) - return blk_queue_zoned_model(q); + if (!blk_queue_is_zoned(q)) + return 0; + return q->limits.chunk_sectors; +} + +static inline sector_t bdev_zone_start(struct block_device *bdev, + sector_t sector) +{ + return sector & ~(bdev_zone_sectors(bdev) - 1); +} - return BLK_ZONED_NONE; +static inline sector_t bdev_offset_from_zone_start(struct block_device *bdev, + sector_t sector) +{ + return sector & (bdev_zone_sectors(bdev) - 1); } -static inline bool bdev_is_zoned(struct block_device *bdev) +static inline sector_t bio_offset_from_zone_start(struct bio *bio) { - struct request_queue *q = bdev_get_queue(bdev); + return bdev_offset_from_zone_start(bio->bi_bdev, + bio->bi_iter.bi_sector); +} - if (q) - return blk_queue_is_zoned(q); +static inline bool bdev_is_zone_start(struct block_device *bdev, + sector_t sector) +{ + return bdev_offset_from_zone_start(bdev, sector) == 0; +} - return false; +/* Check whether @sector is a multiple of the zone size. */ +static inline bool bdev_is_zone_aligned(struct block_device *bdev, + sector_t sector) +{ + return bdev_is_zone_start(bdev, sector); } -static inline bool bdev_op_is_zoned_write(struct block_device *bdev, - blk_opf_t op) +int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask); + +static inline unsigned int queue_dma_alignment(const struct request_queue *q) { - if (!bdev_is_zoned(bdev)) - return false; + return q->limits.dma_alignment; +} - return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES; +static inline unsigned int +queue_atomic_write_unit_max_bytes(const struct request_queue *q) +{ + return q->limits.atomic_write_unit_max; } -static inline sector_t bdev_zone_sectors(struct block_device *bdev) +static inline unsigned int +queue_atomic_write_unit_min_bytes(const struct request_queue *q) { - struct request_queue *q = bdev_get_queue(bdev); + return q->limits.atomic_write_unit_min; +} - if (!blk_queue_is_zoned(q)) - return 0; - return q->limits.chunk_sectors; +static inline unsigned int +queue_atomic_write_boundary_bytes(const struct request_queue *q) +{ + return q->limits.atomic_write_boundary_sectors << SECTOR_SHIFT; } -static inline int queue_dma_alignment(const struct request_queue *q) +static inline unsigned int +queue_atomic_write_max_bytes(const struct request_queue *q) { - return q ? q->limits.dma_alignment : 511; + return q->limits.atomic_write_max_sectors << SECTOR_SHIFT; } static inline unsigned int bdev_dma_alignment(struct block_device *bdev) @@ -1319,17 +1590,17 @@ static inline unsigned int bdev_dma_alignment(struct block_device *bdev) return queue_dma_alignment(bdev_get_queue(bdev)); } -static inline bool bdev_iter_is_aligned(struct block_device *bdev, - struct iov_iter *iter) +static inline unsigned int +blk_lim_dma_alignment_and_pad(struct queue_limits *lim) { - return iov_iter_is_aligned(iter, bdev_dma_alignment(bdev), - bdev_logical_block_size(bdev) - 1); + return lim->dma_alignment | lim->dma_pad_mask; } -static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr, +static inline bool blk_rq_aligned(struct request_queue *q, unsigned long addr, unsigned int len) { - unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask; + unsigned int alignment = blk_lim_dma_alignment_and_pad(&q->limits); + return !(addr & alignment) && !(len & alignment); } @@ -1339,11 +1610,6 @@ static inline unsigned int blksize_bits(unsigned int size) return order_base_2(size >> SECTOR_SHIFT) + SECTOR_SHIFT; } -static inline unsigned int block_size(struct block_device *bdev) -{ - return 1 << bdev->bd_inode->i_blkbits; -} - int kblockd_schedule_work(struct work_struct *work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); @@ -1374,27 +1640,27 @@ enum blk_unique_id { BLK_UID_NAA = 3, }; -#define NFL4_UFLG_MASK 0x0000003F - struct block_device_operations { void (*submit_bio)(struct bio *bio); int (*poll_bio)(struct bio *bio, struct io_comp_batch *iob, unsigned int flags); - int (*open) (struct block_device *, fmode_t); - void (*release) (struct gendisk *, fmode_t); - int (*rw_page)(struct block_device *, sector_t, struct page *, enum req_op); - int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); - int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); + int (*open)(struct gendisk *disk, blk_mode_t mode); + void (*release)(struct gendisk *disk); + int (*ioctl)(struct block_device *bdev, blk_mode_t mode, + unsigned cmd, unsigned long arg); + int (*compat_ioctl)(struct block_device *bdev, blk_mode_t mode, + unsigned cmd, unsigned long arg); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); void (*unlock_native_capacity) (struct gendisk *); - int (*getgeo)(struct block_device *, struct hd_geometry *); + int (*getgeo)(struct gendisk *, struct hd_geometry *); int (*set_read_only)(struct block_device *bdev, bool ro); void (*free_disk)(struct gendisk *disk); /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); + unsigned int nr_zones, + struct blk_report_zones_args *args); char *(*devnode)(struct gendisk *disk, umode_t *mode); /* returns the length of the identifier or a negative errno: */ int (*get_unique_id)(struct gendisk *disk, u8 id[16], @@ -1411,16 +1677,12 @@ struct block_device_operations { }; #ifdef CONFIG_COMPAT -extern int blkdev_compat_ptr_ioctl(struct block_device *, fmode_t, +extern int blkdev_compat_ptr_ioctl(struct block_device *, blk_mode_t, unsigned int, unsigned long); #else #define blkdev_compat_ptr_ioctl NULL #endif -extern int bdev_read_page(struct block_device *, sector_t, struct page *); -extern int bdev_write_page(struct block_device *, sector_t, struct page *, - struct writeback_control *); - static inline void blk_wake_io_task(struct task_struct *waiter) { /* @@ -1434,11 +1696,10 @@ static inline void blk_wake_io_task(struct task_struct *waiter) wake_up_process(waiter); } -unsigned long bdev_start_io_acct(struct block_device *bdev, - unsigned int sectors, enum req_op op, +unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time); void bdev_end_io_acct(struct block_device *bdev, enum req_op op, - unsigned long start_time); + unsigned int sectors, unsigned long start_time); unsigned long bio_start_io_acct(struct bio *bio); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, @@ -1454,8 +1715,8 @@ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev); } -int bdev_read_only(struct block_device *bdev); -int set_blocksize(struct block_device *bdev, int size); +int bdev_validate_blocksize(struct block_device *bdev, int block_size); +int set_blocksize(struct file *file, int size); int lookup_bdev(const char *pathname, dev_t *dev); @@ -1469,22 +1730,52 @@ void blkdev_show(struct seq_file *seqf, off_t offset); #define BLKDEV_MAJOR_MAX 0 #endif -struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, - void *holder); -struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder); -int bd_prepare_to_claim(struct block_device *bdev, void *holder); -void bd_abort_claiming(struct block_device *bdev, void *holder); -void blkdev_put(struct block_device *bdev, fmode_t mode); +struct blk_holder_ops { + void (*mark_dead)(struct block_device *bdev, bool surprise); -/* just for blk-cgroup, don't use elsewhere */ -struct block_device *blkdev_get_no_open(dev_t dev); -void blkdev_put_no_open(struct block_device *bdev); + /* + * Sync the file system mounted on the block device. + */ + void (*sync)(struct block_device *bdev); + + /* + * Freeze the file system mounted on the block device. + */ + int (*freeze)(struct block_device *bdev); + + /* + * Thaw the file system mounted on the block device. + */ + int (*thaw)(struct block_device *bdev); +}; + +/* + * For filesystems using @fs_holder_ops, the @holder argument passed to + * helpers used to open and claim block devices via + * bd_prepare_to_claim() must point to a superblock. + */ +extern const struct blk_holder_ops fs_holder_ops; + +/* + * Return the correct open flags for blkdev_get_by_* for super block flags + * as stored in sb->s_flags. + */ +#define sb_open_mode(flags) \ + (BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \ + (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE)) + +struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops); +struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, + void *holder, const struct blk_holder_ops *hops); +int bd_prepare_to_claim(struct block_device *bdev, void *holder, + const struct blk_holder_ops *hops); +void bd_abort_claiming(struct block_device *bdev, void *holder); -struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); -void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); -int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, - loff_t lend); +struct block_device *file_bdev(struct file *bdev_file); +bool disk_live(struct gendisk *disk); +unsigned int block_size(struct block_device *bdev); #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); @@ -1492,8 +1783,9 @@ int sync_blockdev(struct block_device *bdev); int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend); int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); -void bdev_statx_dioalign(struct inode *inode, struct kstat *stat); +void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask); void printk_all_partitions(void); +int __init early_lookup_bdev(const char *pathname, dev_t *dev); #else static inline void invalidate_bdev(struct block_device *bdev) { @@ -1509,25 +1801,76 @@ static inline int sync_blockdev_nowait(struct block_device *bdev) static inline void sync_bdevs(bool wait) { } -static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) +static inline void bdev_statx(const struct path *path, struct kstat *stat, + u32 request_mask) { } static inline void printk_all_partitions(void) { } +static inline int early_lookup_bdev(const char *pathname, dev_t *dev) +{ + return -EINVAL; +} #endif /* CONFIG_BLOCK */ -int fsync_bdev(struct block_device *bdev); - -int freeze_bdev(struct block_device *bdev); -int thaw_bdev(struct block_device *bdev); +int bdev_freeze(struct block_device *bdev); +int bdev_thaw(struct block_device *bdev); +void bdev_fput(struct file *bdev_file); struct io_comp_batch { - struct request *req_list; + struct rq_list req_list; bool need_ts; void (*complete)(struct io_comp_batch *); }; +static inline bool blk_atomic_write_start_sect_aligned(sector_t sector, + struct queue_limits *limits) +{ + unsigned int alignment = max(limits->atomic_write_hw_unit_min, + limits->atomic_write_hw_boundary); + + return IS_ALIGNED(sector, alignment >> SECTOR_SHIFT); +} + +static inline bool bdev_can_atomic_write(struct block_device *bdev) +{ + struct request_queue *bd_queue = bdev->bd_queue; + struct queue_limits *limits = &bd_queue->limits; + + if (!limits->atomic_write_unit_min) + return false; + + if (bdev_is_partition(bdev)) + return blk_atomic_write_start_sect_aligned(bdev->bd_start_sect, + limits); + + return true; +} + +static inline unsigned int +bdev_atomic_write_unit_min_bytes(struct block_device *bdev) +{ + if (!bdev_can_atomic_write(bdev)) + return 0; + return queue_atomic_write_unit_min_bytes(bdev_get_queue(bdev)); +} + +static inline unsigned int +bdev_atomic_write_unit_max_bytes(struct block_device *bdev) +{ + if (!bdev_can_atomic_write(bdev)) + return 0; + return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev)); +} + +static inline int bio_split_rw_at(struct bio *bio, + const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes) +{ + return bio_split_io_at(bio, lim, segs, max_bytes, lim->dma_alignment); +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ |
